Group by nested and non-nested fields in Elasticsearch

Hi, I am trying to group by nested and non-nested fields. I want to group by one non-nested field (from_district) and one nested field (truck_number), and take the max of a nested field (truck_ranking_document.score).
Requirement: get the max score of each truck in every district where that truck is present, for a given sp_id.
e.g.:
District1 ,truck1, 0.9,
District2 ,truck1, 0.8,
District1 ,truck2, 1.8,
District2 ,truck3, 0.7,
District3 ,truck4, 1.7
Below is my mapping:
{
"sp_ranked_indent" : {
"mappings" : {
"properties" : {
"from_district" : {
"type" : "keyword"
},
"sp_id" : {
"type" : "long"
},
"to_district" : {
"type" : "keyword"
},
"truck_ranking_document" : {
"type" : "nested",
"properties" : {
"score" : {
"type" : "float"
},
"truck_number" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
Below is the query that I tried, but it is not grouping by the nested and non-nested fields together, and the max truck score is also incorrect:
{
"size": 0,
"query": {
"terms": {
"sp_id": [650128],
"boost": 1.0
}
},
"aggregations": {
"NESTED_AGG": {
"nested": {
"path": "truck_ranking_document"
},
"aggregations": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"truck_numer": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
},
"fromdistrictagg": {
"reverse_nested": {},
"aggregations": {
"fromDistrict": {
"terms": {
"field": "from_district",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
}
}
}
}
}
}
}

I think this can be done using terms and nested aggregations. The query below will produce output in the following format:
District1
Truck1
Max score
Truck2
Max score
Truck3
Max score
District2
Truck1
Max score
Truck2
Max score
Truck3
Max score
Query:
{
"query": {
"terms": {
"sp_id": [
1
]
}
},
"aggs": {
"district": {
"terms": {
"field": "from_district",
"size": 10
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
},
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"count": "trucks>truck_no._bucket_count"
},
"script": {
"inline": "params.count != 0"
}
}
}
}
}
}
}
Result:
"aggregations" : {
"district" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "District1",
"doc_count" : 1,
"trucks" : {
"doc_count" : 2,
"truck_no" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"max_score" : {
"value" : 2.0
}
},
{
"key" : "3",
"doc_count" : 1,
"max_score" : {
"value" : 3.0
}
}
]
}
}
}
]
}
Composite Aggregation
The composite aggregation response contains an after_key:
"after_key" : {
"district" : "District4"
}
You need to use the after parameter to retrieve the next set of results (see the follow-up request sketch after the query below):
{
"aggs": {
"my_buckets": {
"composite": {
"size": 100,
"sources": [
{
"district": {
"terms": {
"field": "from_district"
}
}
}
]
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
}
}
}
}
}
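When a page of buckets comes back, the returned after_key can be passed back via the after parameter to fetch the next page. A minimal sketch of the follow-up request, reusing the after_key value shown above (the trucks / truck_no / max_score sub-aggregations from the previous request stay the same and are omitted here):
{
  "aggs": {
    "my_buckets": {
      "composite": {
        "size": 100,
        "sources": [
          {
            "district": {
              "terms": {
                "field": "from_district"
              }
            }
          }
        ],
        "after": {
          "district": "District4"
        }
      }
    }
  }
}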

Related

Filter in aggregation

Say I have this mapping:
{
// ...other fields,
"locations": {
"type": "nested",
"properties": {
"countrySlug": { "type": "keyword" },
"citySlug": { "type": "keyword" }
}
}
}
So this way, each document can have multiple locations:
{
"locations": [
{
"countrySlug": "germany",
"citySlug": "berlin"
},
{
"countrySlug": "germany",
"citySlug": "hamburg"
},
{
"countrySlug": "poland",
"citySlug": "krakow"
},
{
"countrySlug": "italy",
"citySlug": "milan"
}
]
}
Now I want to get an aggregation of city slugs for locations where countrySlug = "germany".
My query looks like this:
{
"_source": false,
"aggs": {
"cities": {
"filter": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"nested": {
"path": "locations",
"query": {
"bool": {
"must": {
"term": {
"locations.countrySlug": "germany"
}
}
}
}
}
}
]
}
}
]
}
},
"aggs": {
"agg": {
"nested": {
"path": "locations"
},
"aggs": {
"slugs": {
"terms": {
"field": "locations.citySlug",
"size": 5
},
"aggs": {
"top_reverse_nested": {
"reverse_nested": {}
}
}
}
}
}
}
}
},
"size": 0
}
But it returns all city slugs that were found, e.g.:
berlin: 2
krakow: 1
milan: 3
My goal is to get just:
berlin: 2
(or other city slugs that are related to a location with countrySlug = "germany")
Am I missing something? How can I make something like a "post filter" for aggregations?
Thanks, PS
Inside the nested aggregation, first filter the nested documents down to those where countrySlug is germany, then run a terms aggregation on the filtered set.
GET /cities/_search
{
"size": 0,
"aggs": {
"cities": {
"nested": {
"path": "locations"
},
"aggs": {
"filter_cities": {
"filter": {
"bool": {
"filter": [
{
"term": {
"locations.countrySlug": "germany"
}
}
]
}
},
"aggs": {
"cities": {
"terms": {
"field": "locations.citySlug"
}
}
}
}
}
}
}
}
The result for the above query:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"cities" : {
"doc_count" : 17,
"filter_cities" : {
"doc_count" : 9,
"cities" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "hamburg",
"doc_count" : 5
},
{
"key" : "berlin",
"doc_count" : 4
}
]
}
}
}
}
}

Elasticsearch cardinality across multiple fields

I have an index which is as follows:
{
"_index" : "r2332",
"_type" : "_doc",
"_id" : "Vl81o3oBs8vUbHSMCZVQ",
"_score" : 1.0,
"_source" : {
"maid" : "d8ee3c5e-babb-4777-9cba-17fb0cd8e8a9",
"date" : "2021-06-09",
"hour" : 5,
"site_id" : 1035
}
},
{
"_index" : "r2332",
"_type" : "_doc",
"_id" : "Xl81o3oBs8vUbHSMCZVQ",
"_score" : 1.0,
"_source" : {
"maid" : "d8ee3c5e-babb-4777-9cba-17fb0cd8e8a9",
"date" : "2021-06-09",
"hour" : 5,
"site_id" : 1897
}
}
I am trying to get the unique count across maid and date. I am able to aggregate on the single field maid but not on both. The following are the queries that I tried.
Trial 1:
{
"size": 0,
"query": {
"bool": {
"filter": [{
"terms": {
"site_id": [7560, 7566]
}
}, {
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}]
}
},
"runtime_mappings": {
"type_and_promoted": {
"type": "keyword",
"script": "emit(doc['maid'].value + ' ' + doc['date'].value)"
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"reverse_nested": {},
"uv": {
"cardinality": {
"field": "runtime_mappings"
}
}
}
}
}
}
}
}
}
}
This is giving an empty output.
Trial 2:
{
"size": 0,
"query": {
"bool": {
"filter": [{
"terms": {
"site_id": [7560, 7566]
}
}, {
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"uv": {
"cardinality": {
"script": "doc['maid'].value + '#' +doc'date'].value"
}
}
}
}
}
}
}
}
}
}
This gives me a syntax error at doc['maid'].value. How do I effectively combine two fields for cardinality? I am using Elasticsearch 7.13.2.
The mapping of my index is as follows:
"r2332" : {
"mappings" : {
"dynamic" : "false",
"properties" : {
"date" : {
"type" : "date"
},
"hour" : {
"type" : "integer"
},
"maid" : {
"type" : "keyword"
},
"reach" : {
"type" : "integer"
},
"site_id" : {
"type" : "keyword"
}
}
}
}
}
Modify your second query as shown below. You need to use maid.keyword instead of the maid field in the cardinality aggregation to avoid a search_phase_execution_exception (note that the script also needs the missing bracket fixed in doc['date']):
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
7560,
7566
]
}
},
{
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}
]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"uv": {
"cardinality": {
"script": "doc['maid.keyword'].value + '#' +doc['date'].value"
}
}
}
}
}
}
}
}
}
}
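As an alternative, the runtime field from Trial 1 should also work, as long as the cardinality aggregation references the runtime field by its name (type_and_promoted) rather than the literal string "runtime_mappings". A rough, untested sketch (the bool/filter query is the same as above and omitted for brevity):
{
  "size": 0,
  "runtime_mappings": {
    "type_and_promoted": {
      "type": "keyword",
      "script": "emit(doc['maid'].value + ' ' + doc['date'].value)"
    }
  },
  "aggs": {
    "group_by": {
      "terms": { "field": "site_id", "size": 100 },
      "aggs": {
        "bydate": {
          "terms": { "field": "date", "size": 100 },
          "aggs": {
            "byhour": {
              "terms": { "field": "hour", "size": 24 },
              "aggs": {
                "uv": { "cardinality": { "field": "type_and_promoted" } }
              }
            }
          }
        }
      }
    }
  }
}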

How to filter nested aggregations in ElasticSearch?

For example, let's assume we have a product index with the following mapping:
{
"product": {
"mappings": {
"producttype": {
"properties": {
"id": {
"type": "keyword"
},
"productAttributes": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "text",
"analyzer": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
}
}
I am trying to find how many products have specific product attributes, using the following query (I am using a fuzzy query to allow some edit distance):
{
"size": 0,
"query": {
"nested": {
"query": {
"fuzzy": {
"productAttributes.name": {
"value": "SSD"
}
}
},
"path": "productAttributes"
}
},
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
But it returns all product attributes for each matched document. Here is the response I get:
"aggregations" : {
"product_attribute_nested_agg" : {
"doc_count" : 6,
"terms_nested_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "SSD",
"doc_count" : 3
},
{
"key" : "USB 2.0",
"doc_count" : 3
}
]
}
}
}
Could you please guide me on how to filter the buckets so that only the matched attributes are returned?
Edit:
Here are some document samples:
"hits" : {
"total" : 12,
"max_score" : 1.0,
"hits" : [
{
"_index" : "product",
"_type" : "producttype",
"_id" : "677d1164-c401-4d36-8a08-6aa14f7f32bb",
"_score" : 1.0,
"_source" : {
"title" : "Dell laptop",
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "4"
},
{
"name" : "SSD",
"value" : "250 GB"
}
]
}
},
{
"_index" : "product",
"_type" : "producttype",
"_id" : "2954935a-7f60-437a-8a54-00da2d71da46",
"_score" : 1.0,
"_source" : {
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "3"
},
{
"name" : "SSD",
"value" : "500 GB"
}
],
"title" : "HP laptop"
}
},
]
}
To filter only the specific attributes, you can use a filter aggregation.
Query:
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}
This is the part that does the trick:
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
}
You need to make the filter part of the aggregation.
Output:
"aggregations": {
"product_attribute_nested_agg": {
"doc_count": 4,
"inner": {
"doc_count": 2,
"terms_nested_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SSD",
"doc_count": 2
}
]
}
}
}
}
Filtering using fuzziness:
GET /product/_search
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"fuzzy": {
"productAttributes.name": {
"value": "SSt",//here will match SSD
"fuzziness": 3//you can remove it to be as Auto
}
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}
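If the goal is to count matching products rather than matching nested attribute documents, a reverse_nested sub-aggregation inside the terms aggregation should give the parent-document count per attribute. A rough sketch along the same lines (the product_count name is only illustrative):
{
  "size": 0,
  "aggs": {
    "product_attribute_nested_agg": {
      "nested": { "path": "productAttributes" },
      "aggs": {
        "inner": {
          "filter": { "terms": { "productAttributes.name": [ "SSD" ] } },
          "aggs": {
            "terms_nested_agg": {
              "terms": { "field": "productAttributes.name" },
              "aggs": {
                "product_count": { "reverse_nested": {} }
              }
            }
          }
        }
      }
    }
  }
}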

Elasticsearch: terms aggregations on doubly nested object

I am trying to do a doubly nested aggregation on a doubly nested object. That is, I have the root document, a child property, and a grand-child property. To be more precise, I have the following mapping:
{
"mappings": {
"root": {
"properties": {
"fields": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"selections": {
"type": "nested",
"properties": {
"value": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
I am trying to aggregate selection value counts per field; in other words, to count the number of occurrences of each value for each field name, across all root objects.
I have this:
{
"query": {
...
},
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "selections"
},
"aggregations": {
"value": {
"terms": {
"field": "selections.value"
}
}
}
}
}
}
}
}
}
}
which gets the field names as I want but for each of them I get no doc counts for the values.
What am I doing wrong?
You need to give the full path for the inner nested field: change "path": "selections" to "path": "fields.selections".
{
"size": 0,
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "fields.selections"
},
"aggregations": {
"value": {
"terms": {
"field": "fields.selections.value"
}
}
}
}
}
}
}
}
}
}
Result:
"aggregations" : {
"fields" : {
"doc_count" : 2,
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "abc",
"doc_count" : 2,
"values" : {
"doc_count" : 2,
"value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 2
}
]
}
}
}
]
}
}
}

Aggregation query that returns all fields in Elasticsearch

I have a large (20 GB) CSV file in the following format:
date,ip,dev_type,env,time,cpu_usage
2015-11-09,10.241.121.172,M2,production,11:01,8
2015-11-09,10.241.121.172,M2,production,11:02,9
2015-11-09,10.241.121.243,C1,preproduction,11:01,4
2015-11-09,10.241.121.243,C1,preproduction,11:02,8
2015-11-10,10.241.121.172,M2,production,11:01,3
2015-11-10,10.241.121.172,M2,production,11:02,9
2015-11-10,10.241.121.243,C1,preproduction,11:01,4
2015-11-10,10.241.121.243,C1,preproduction,11:02,8
and imported it into Elasticsearch in the following format:
{
"_index": "cpuusage",
"_type": "logs",
"_id": "AVFOkMS7Q4jUWMFNfSrZ",
"_score": 1,
"_source": {
"date": "2015-11-10",
"ip": "10.241.121.172",
"dev_type": "M2",
"env": "production",
"time": "11:02",
"cpu_usage": "9"
},
"fields": {
"date": [
1447113600000
]
}
}
...
So how can I output all fields (date, ip, dev_type, env, cpu_usage) when finding the maximum value of cpu_usage for each ip on each day?
curl -XGET localhost:9200/cpuusage/_search?pretty -d '{
"size": 0,
"aggs": {
"by_date": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs" : {
"genders" : {
"terms" : {
"field" : "ip",
"size": 100000,
"order" : { "_count" : "asc" }
},
"aggs" : {
"cpu_usage" : { "max" : { "field" : "cpu_usage" } }
}
}
}
}
}
}'
---- output (truncated) ----
"aggregations" : {
"events_by_date" : {
"buckets" : [ {
"key_as_string" : "2015-11-09T00:00:00.000Z",
"key" : 1447027200000,
"doc_count" : 4,
"genders" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "10.241.121.172",
"doc_count" : 2,
"cpu_usage" : {
"value" : 9.0
}
}, {
"key" : "10.241.121.243",
"doc_count" : 2,
"cpu_usage" : {
"value" : 8.0
}
} ]
}
},
You can do that with a top_hits aggregation. Try this:
{
"size": 0,
"aggs": {
"by_date": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"genders": {
"terms": {
"field": "ip",
"size": 100000,
"order": {
"_count": "asc"
}
},
"aggs": {
"cpu_usage": {
"max": {
"field": "cpu_usage"
}
},
"include_source": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"date", "ip", "dev_type", "env", "cpu_usage"
]
}
}
}
}
}
}
}
}
}
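One caveat: with size 1 and no sort, top_hits returns an arbitrary document from each bucket, not necessarily the row that produced the maximum cpu_usage. If cpu_usage is mapped as a numeric field (the max aggregation above suggests it is), adding a sort should pin the hit to that row; a sketch of just the top_hits part:
"include_source": {
  "top_hits": {
    "size": 1,
    "sort": [
      { "cpu_usage": { "order": "desc" } }
    ],
    "_source": {
      "include": [ "date", "ip", "dev_type", "env", "cpu_usage" ]
    }
  }
}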
Does this help?
