Distinct for several fields in ElasticSearch

I need to get distinct values of several fields from an ElasticSearch index, but they have to be distinct as a set, just like in this MySQL query:
SELECT DISTINCT name, type FROM some_table;
So far I have tried several approaches, but all of them have failed:
1. Aggregation
GET test_index/_search
{
"size": 0,
"track_total_hits": false,
"aggs" : {
"features": {
"terms": {
"field" : "feature.name",
"size" : 10,
"order": {
"_key": "asc"
}
}
}
}
}
2. Script
The query below returns all possible combinations of the two fields, not only the pairs that actually exist.
GET bm_upgraded_visitors/_search
{
"size": 0,
"aggs": {
"t": {
"terms": {
"script": "doc['feature.name'].values + ' | ' + doc['feature.type'].values"
}
}
}
}
Sample code:
PUT test_index
{
"mappings" : {
"_doc" : {
"dynamic" : "false",
"properties" : {
"features" : {
"type": "nested",
"include_in_root": true,
"properties" : {
"name" : {
"type" : "keyword"
},
"value" : {
"type" : "text"
},
"type": {
"type" : "keyword"
}
}
}
}
}
}
}
Sample doc:
PUT test_index/_doc/1
{
"features": [
{
"name": "Feature 1",
"value": "Value 1",
"type": "Type 1"
},
{
"name": "Feature 2",
"value": "Value 1",
"type": "Type 2"
}
]
}
Result required:
buckets" : [
{
"key" : "Feature 1",
"doc_count" : 1,
"types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Type 1",
"doc_count" : 1
}
]
}
},
{
"key" : "Feature 2",
"doc_count" : 1,
"types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Type 2",
"doc_count" : 1
}
]
}
}
]

Then you need another terms sub-aggregation. Try this:
GET test_index/_search
{
"size": 0,
"track_total_hits": false,
"aggs": {
"features": {
"terms": {
"field": "feature.name",
"size": 10,
"min_doc_count": 1,
"order": {
"_key": "asc"
}
},
"aggs": {
"types": {
"terms": {
"field": "feature.type",
"size": 10,
"min_doc_count": 1,
"order": {
"_key": "asc"
}
}
}
}
}
}
}
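One caveat with this approach: because it relies on the flattened (include_in_root) copies of the nested fields, a document with several feature objects loses the pairing between name and type at the root level, so the types sub-aggregation can mix in type values that belong to a different feature of the same document. A sketch of an alternative that preserves the pairing by wrapping both terms aggregations in a nested aggregation on the features path (the aggregation names are illustrative; doc_count then counts nested feature objects rather than parent documents):
GET test_index/_search
{
  "size": 0,
  "aggs": {
    "features_nested": {
      "nested": {
        "path": "features"
      },
      "aggs": {
        "names": {
          "terms": {
            "field": "features.name",
            "size": 10,
            "order": { "_key": "asc" }
          },
          "aggs": {
            "types": {
              "terms": {
                "field": "features.type",
                "size": 10,
                "order": { "_key": "asc" }
              }
            }
          }
        }
      }
    }
  }
}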

Related

How to group documents in Elasticsearch and get the documents in each group?

My Elasticsearch index contains products with a denormalized m:n relationship to categories.
My goal is to derive a categories index from it which contains the same information, but with the relationship inverted.
The index looks like this:
PUT /products
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"article_id": {
"type": "keyword"
},
"categories": {
"type": "nested",
"properties": {
"cat_name": {
"type": "keyword"
}
}
}
}
}
}
containing documents created like this:
POST /products/_doc
{
"name": "radio",
"article_id": "1001",
"categories": [
{ "cat_name": "audio" },
{ "cat_name": "electronics" }
]
}
POST /products/_doc
{
"name": "fridge",
"article_id": "1002",
"categories": [
{ "cat_name": "appliances" },
{ "cat_name": "electronics" }
]
}
I would like to get something like this back from Elasticsearch:
{
"name": "appliances",
"products": [
{
"name": "fridge",
"article_id": "1002"
}
]
},
{
"name": "audio",
"products": [
{
"name": "radio",
"article_id": "1001"
}
]
},
{
"name": "electronics",
"products": [
{
"name": "fridge",
"article_id": "1002"
},
{
"name": "radio",
"article_id": "1001"
}
]
}
which would eventually be put into an index such as:
PUT /categories
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"products": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"article_id": {
"type": "keyword"
}
}
}
}
}
}
I cannot figure out how to do this without loading and grouping all products programmatically.
Here's what I have tried:
Bucket aggregation on field categories.cat_name
This gives me the document count per category but not the product documents. Using a top_hits sub-aggregation seems to be limited to 100 documents.
Group using collapse field with expansion
Collapsing is only possible on a single-valued field.
I'm using Elasticsearch 8.1.
The query you need is this one:
POST products/_search
{
"size": 0,
"aggs": {
"cats": {
"nested": {
"path": "categories"
},
"aggs": {
"categories": {
"terms": {
"field": "categories.cat_name",
"size": 10
},
"aggs": {
"root": {
"reverse_nested": {},
"aggs": {
"products": {
"terms": {
"field": "name",
"size": 10
}
}
}
}
}
}
}
}
}
}
This produces exactly what you need (minus the article_id, but that's easy to add):
"buckets" : [
{
"key" : "electronics",
"doc_count" : 2,
"root" : {
"doc_count" : 2,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "fridge",
"doc_count" : 1
},
{
"key" : "radio",
"doc_count" : 1
}
]
}
}
},
{
"key" : "appliances",
"doc_count" : 1,
"root" : {
"doc_count" : 1,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "fridge",
"doc_count" : 1
}
]
}
}
},
{
"key" : "audio",
"doc_count" : 1,
"root" : {
"doc_count" : 1,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "radio",
"doc_count" : 1
}
]
}
}
}
]
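To also get the article_id back (the part described above as easy to add), one option is to swap the inner terms aggregation for a top_hits aggregation that returns the relevant _source fields. A minimal sketch against the products mapping from the question (aggregation names are illustrative); as the question already notes, top_hits is capped per bucket (100 by default), so very large categories would still need something like a composite aggregation plus client-side grouping:
POST products/_search
{
  "size": 0,
  "aggs": {
    "cats": {
      "nested": {
        "path": "categories"
      },
      "aggs": {
        "categories": {
          "terms": {
            "field": "categories.cat_name",
            "size": 10
          },
          "aggs": {
            "root": {
              "reverse_nested": {},
              "aggs": {
                "products": {
                  "top_hits": {
                    "size": 10,
                    "_source": ["name", "article_id"]
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}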

How to filter nested aggregations in ElasticSearch?

For example, let's assume we have a product index with the following mapping:
{
"product": {
"mappings": {
"producttype": {
"properties": {
"id": {
"type": "keyword"
},
"productAttributes": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "text",
"analyzer": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
}
}
I am trying to find how many products have specific product attributes using the following query (I am using a fuzzy query to allow some edit distance):
{
"size": 0,
"query": {
"nested": {
"query": {
"fuzzy": {
"productAttributes.name": {
"value": "SSD"
}
}
},
"path": "productAttributes"
}
},
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
But it returns all product attributes for each matched document. Here is the response I get:
"aggregations" : {
"product_attribute_nested_agg" : {
"doc_count" : 6,
"terms_nested_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "SSD",
"doc_count" : 3
},
{
"key" : "USB 2.0",
"doc_count" : 3
}
]
}
}
}
Could you please guide me to how to filter buckets to only return matched attributes?
Edit:
Here are some document samples:
"hits" : {
"total" : 12,
"max_score" : 1.0,
"hits" : [
{
"_index" : "product",
"_type" : "producttype",
"_id" : "677d1164-c401-4d36-8a08-6aa14f7f32bb",
"_score" : 1.0,
"_source" : {
"title" : "Dell laptop",
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "4"
},
{
"name" : "SSD",
"value" : "250 GB"
}
]
}
},
{
"_index" : "product",
"_type" : "producttype",
"_id" : "2954935a-7f60-437a-8a54-00da2d71da46",
"_score" : 1.0,
"_source" : {
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "3"
},
{
"name" : "SSD",
"value" : "500 GB"
}
],
"title" : "HP laptop"
}
}
]
}
To keep only the specific attributes, you can use a filter aggregation.
Query:
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}
This is the part that does the trick:
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
}
You need to put the filter inside the aggregation.
Output:
"aggregations": {
"product_attribute_nested_agg": {
"doc_count": 4,
"inner": {
"doc_count": 2,
"terms_nested_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SSD",
"doc_count": 2
}
]
}
}
}
}
Filtering using fuzziness:
GET /product/_search
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"fuzzy": {
"productAttributes.name": {
"value": "SSt",//here will match SSD
"fuzziness": 3//you can remove it to be as Auto
}
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}
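If you also want to restrict which documents are aggregated at all (not just which nested buckets show up), you can keep the nested fuzzy query from the question and repeat the same fuzzy clause inside the filter sub-aggregation. A sketch combining both, reusing the SSD example (aggregation names are illustrative):
GET /product/_search
{
  "size": 0,
  "query": {
    "nested": {
      "path": "productAttributes",
      "query": {
        "fuzzy": {
          "productAttributes.name": {
            "value": "SSD"
          }
        }
      }
    }
  },
  "aggs": {
    "product_attribute_nested_agg": {
      "nested": {
        "path": "productAttributes"
      },
      "aggs": {
        "matching_attributes": {
          "filter": {
            "fuzzy": {
              "productAttributes.name": {
                "value": "SSD"
              }
            }
          },
          "aggs": {
            "terms_nested_agg": {
              "terms": {
                "field": "productAttributes.name"
              }
            }
          }
        }
      }
    }
  }
}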

Elasticsearch bucketing and add-to-list

How do I go about bucketing on a field and then aggregating all the values of a different field into an array? Here's a sample list.
{
"product": "xyz",
"action": "add",
"user": "bob"
},
{
"product": "xyz",
"action": "update",
"user": "bob"
},
{
"product": "xyz",
"action": "add",
"user": "alice"
},
{
"product": "xyz",
"action": "add",
"user": "eve"
},
{
"product": "xyz",
"action": "delete",
"user": "eve"
}
Expected output:
{
"buckets": [
{
"key": "add",
"doc_count": 3,
"user": ["bob", "alice", "eve"]
},
{
"key": "update",
"doc_count": 1,
"user": ["bob"]
},
{
"key": "delete",
"doc_count": 1,
"user": ["eve"]
}
]
}
How do I push the user values into an array in each bucket? Is there something similar to MongoDB's $push or $addToSet in Elasticsearch aggregations? Appreciate the help.
Here's the work-in-progress aggregation.
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
}
}
}
}
}
}
Would this do? I just chained one more terms aggregation, as shown below:
Aggregation Query:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
},
"aggs": {
"myUsers": {
"terms": {
"field": "user",
"size": 10
}
}
}
}
}
}
}
}
Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"product_filter" : {
"doc_count" : 5,
"group_by_action" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "add",
"doc_count" : 3,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "alice",
"doc_count" : 1
},
{
"key" : "bob",
"doc_count" : 1
},
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "delete",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "update",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "bob",
"doc_count" : 1
}
]
}
}
]
}
}
}
}
I'm not sure if it is possible to have them in a single list as you've mentioned.
Hope this helps!
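If you really need the users as a flat array inside each bucket rather than as sub-bucket keys, one option would be a scripted_metric sub-aggregation that collects the values. A rough sketch, assuming user is a keyword field (the users_list name is made up); collecting values in scripts can get expensive on large buckets:
POST <your_index_name>/_search
{
  "size": 0,
  "aggs": {
    "product_filter": {
      "filter": {
        "term": {
          "product": "xyz"
        }
      },
      "aggs": {
        "group_by_action": {
          "terms": {
            "field": "action",
            "size": 1000
          },
          "aggs": {
            "users_list": {
              // collects the distinct user values of each action bucket into one array
              "scripted_metric": {
                "init_script": "state.users = new HashSet()",
                "map_script": "if (doc['user'].size() > 0) { state.users.add(doc['user'].value) }",
                "combine_script": "return state.users",
                "reduce_script": "def all = new HashSet(); for (def s : states) { if (s != null) { all.addAll(s) } } return new ArrayList(all)"
              }
            }
          }
        }
      }
    }
  }
}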

Group by nested and non-nested fields in ES

Hi, I am trying to group by nested and non-nested fields. I want to group by one non-nested field (from_district) and one nested field (truck_number), and take the max of a nested field (truck_number.score).
Requirement: get the max score of each truck in every district where the truck is present, for a given sp_id.
e.g.:
District1, truck1, 0.9
District2, truck1, 0.8
District1, truck2, 1.8
District2, truck3, 0.7
District3, truck4, 1.7
Below is my mapping:
{
"sp_ranked_indent" : {
"mappings" : {
"properties" : {
"from_district" : {
"type" : "keyword"
},
"sp_id" : {
"type" : "long"
},
"to_district" : {
"type" : "keyword"
},
"truck_ranking_document" : {
"type" : "nested",
"properties" : {
"score" : {
"type" : "float"
},
"truck_number" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
Below is the query that I tried, but it does not group by the nested and non-nested fields together, and the max truck score is also incorrect:
{
"size": 0,
"query": {
"terms": {
"sp_id": [650128],
"boost": 1.0
}
},
"aggregations": {
"NESTED_AGG": {
"nested": {
"path": "truck_ranking_document"
},
"aggregations": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"truck_numer": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
},
"fromdistrictagg": {
"reverse_nested": {},
"aggregations": {
"fromDistrict": {
"terms": {
"field": "from_district",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
}
}
}
}
}
}
}
I think this can be done using terms and nested aggregations. The query below will produce output in the following format:
District1
  Truck1
    Max score
  Truck2
    Max score
  Truck3
    Max score
District2
  Truck1
    Max score
  Truck2
    Max score
  Truck3
    Max score
Query:
{
"query": {
"terms": {
"sp_id": [
1
]
}
},
"aggs": {
"district": {
"terms": {
"field": "from_district",
"size": 10
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
},
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"count": "trucks>truck_no._bucket_count"
},
"script": {
"inline": "params.count != 0"
}
}
}
}
}
}
}
Result:
"aggregations" : {
"district" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "District1",
"doc_count" : 1,
"trucks" : {
"doc_count" : 2,
"truck_no" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"max_score" : {
"value" : 2.0
}
},
{
"key" : "3",
"doc_count" : 1,
"max_score" : {
"value" : 3.0
}
}
]
}
}
}
]
}
}
Composite Aggregation
The composite aggregation response contains an after_key:
"after_key" : {
"district" : "District4"
}
You need to use the after parameter to retrieve the next set of results:
{
"aggs": {
"my_buckets": {
"composite": {
"size": 100,
"sources": [
{
"district": {
"terms": {
"field": "from_district"
}
}
}
]
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
}
}
}
}
}
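To fetch the next page of districts, send the same request again, passing the previous response's after_key as the after parameter of the composite aggregation; the nested truck_no/max_score sub-aggregations from the request above can be added back unchanged. Roughly (the District4 value is just the example after_key shown earlier):
{
  "aggs": {
    "my_buckets": {
      "composite": {
        "size": 100,
        "after": {
          "district": "District4"
        },
        "sources": [
          {
            "district": {
              "terms": {
                "field": "from_district"
              }
            }
          }
        ]
      }
    }
  }
}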

Aggregation query and return all fields in Elasticsearch

I have a large (20 GB) CSV file in the following format:
date,ip,dev_type,env,time,cpu_usage
2015-11-09,10.241.121.172,M2,production,11:01,8
2015-11-09,10.241.121.172,M2,production,11:02,9
2015-11-09,10.241.121.243,C1,preproduction,11:01,4
2015-11-09,10.241.121.243,C1,preproduction,11:02,8
2015-11-10,10.241.121.172,M2,production,11:01,3
2015-11-10,10.241.121.172,M2,production,11:02,9
2015-11-10,10.241.121.243,C1,preproduction,11:01,4
2015-11-10,10.241.121.243,C1,preproduction,11:02,8
and I import it into Elasticsearch in the following format:
{
"_index": "cpuusage",
"_type": "logs",
"_id": "AVFOkMS7Q4jUWMFNfSrZ",
"_score": 1,
"_source": {
"date": "2015-11-10",
"ip": "10.241.121.172",
"dev_type": "M2",
"env": "production",
"time": "11:02",
"cpu_usage": "9"
},
"fields": {
"date": [
1447113600000
]
}
}
...
So how could I output all fields (date, ip, dev_type, env, cpu_usage) when finding the maximum value of cpu_usage for each ip on each day?
curl -XGET localhost:9200/cpuusage/_search?pretty -d '{
"size": 0,
"aggs": {
"by_date": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs" : {
"genders" : {
"terms" : {
"field" : "ip",
"size": 100000,
"order" : { "_count" : "asc" }
},
"aggs" : {
"cpu_usage" : { "max" : { "field" : "cpu_usage" } }
}
}
}
}
}
}'
Output (truncated):
"aggregations" : {
"events_by_date" : {
"buckets" : [ {
"key_as_string" : "2015-11-09T00:00:00.000Z",
"key" : 1447027200000,
"doc_count" : 4,
"genders" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "10.241.121.172",
"doc_count" : 2,
"cpu_usage" : {
"value" : 9.0
}
}, {
"key" : "10.241.121.243",
"doc_count" : 2,
"cpu_usage" : {
"value" : 8.0
}
} ]
}
},
You can do that with a top_hits aggregation. Try this:
{
"size": 0,
"aggs": {
"by_date": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"genders": {
"terms": {
"field": "ip",
"size": 100000,
"order": {
"_count": "asc"
}
},
"aggs": {
"cpu_usage": {
"max": {
"field": "cpu_usage"
}
},
"include_source": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"date", "ip", "dev_type", "env", "cpu_usage"
]
}
}
}
}
}
}
}
}
}
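One caveat: a plain top_hits with size 1 returns an arbitrary document from each bucket, not necessarily the one with the maximum cpu_usage. Adding a sort on cpu_usage keeps the returned _source aligned with the max value. A sketch of the same request with the sort added, assuming cpu_usage is mapped as a numeric field (in the sample document it is indexed as a string, so the mapping may need adjusting):
{
  "size": 0,
  "aggs": {
    "by_date": {
      "date_histogram": {
        "field": "date",
        "interval": "day"
      },
      "aggs": {
        "genders": {
          "terms": {
            "field": "ip",
            "size": 100000,
            "order": {
              "_count": "asc"
            }
          },
          "aggs": {
            "cpu_usage": {
              "max": {
                "field": "cpu_usage"
              }
            },
            "include_source": {
              "top_hits": {
                "size": 1,
                "sort": [
                  { "cpu_usage": { "order": "desc" } }
                ],
                "_source": {
                  "include": [
                    "date", "ip", "dev_type", "env", "cpu_usage"
                  ]
                }
              }
            }
          }
        }
      }
    }
  }
}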
Does this help?
