Order by sub-aggregation in composite aggregation (Elasticsearch) - elasticsearch

I need to ordenate the results of an composite aggregation, but the value to be orderned is the sum of a specific field (my index is so much larger, so i need the composite for paginate values).
When send this GET:
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "date": { "date_histogram": { "field": "timestamp", "interval": "1d"} } },
{ "product": { "terms": {"field": "product" } } }
]
},
"aggregations": {
"the_sum": {
"sum": { "field": "price" } <--- i want order by this field aggregation
}
}
}
}
}
How can i get this response? (order by sum of each price)
{
...
"aggregations": {
"my_buckets": {
"after_key": {
"date": 1494374400000,
"product": "mad max"
},
"buckets": [
{
"key": {
"date": 1494460800000,
"product": "apocalypse now"
},
"doc_count": 1,
"the_sum": {
"value": 10.0
}
},
{
"key": {
"date": 1494288000000,
"product" : "mad max"
},
"doc_count": 2,
"the_sum": {
"value": 22.5
}
},
{
"key": {
"date": 1494374400000,
"product": "mad max"
},
"doc_count": 1,
"the_sum": {
"value": 290.0
}
}
]
}
}
}

Related

Elasticsearch daily/monthly aggregation query

I would like to build daily/monthly aggregation query.
Is the only solution to create ranges -> from... to... for each day/month? I can generate ranges but it seems that it can be other way to achieve that.
How can I format from... to... epoch milis to yyyy-mm-dd for each result?
{
"aggs": {
"aggs_sum_amount": {
"filters": {
"filters": {
"Amount1": {
...
},
"Amount2": {
...
}
}
},
"aggs": {
"range": {
"date_range": {
"field": "dateField",
"ranges": [
{
"from": "1613347200000",
"to": "1613433600000"
},
{
"from": "1613433600000",
"to": "1613520000000"
}
...
]
},
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
}
}
}
}
}
}
}
Example response
{
"aggregations": {
"aggs_sum_amountPLN": {
"buckets": {
"Amount1": {
"doc_count": 26,
"range": {
"buckets": [
{
"key": "1613347200000-1613433600000",
"from": 1.6133472E12,
"from_as_string": "1613347200000",
"to": 1.6134336E12,
"to_as_string": "1613433600000",
"doc_count": 0,
"sum_amount": {
"value": 0.0
}
},
{
...
}
]
}
},
"Amount2": {
...
}
}
}
}
}
You can use the date_histogram aggregation
It lets you specify a range and an interval for which you want to get the different buckets for.
This example on the linked page is quite self explanatory.
I've updated it to match your use-case:
POST /sales/_search?size=0
{
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "date",
"calendar_interval": "1M",
"format": "yyyy-MM-dd"
}
}
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
}
}
}
}
Response should be something like this:
{
...
"aggregations": {
"sales_over_time": {
"buckets": [
{
"key_as_string": "2015-01-01",
"key": 1420070400000,
"doc_count": 3,
"sum_amount": {
"value": 15.0
}
},
{
"key_as_string": "2015-02-01",
"key": 1422748800000,
"doc_count": 2,
"sum_amount": {
"value": 10.0
}
},
{
"key_as_string": "2015-03-01",
"key": 1425168000000,
"doc_count": 2,
"sum_amount": {
"value": 25.0
}
}
]
}
}
}

How to sort before aggregation in Elasticsearch?

I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all documents in the index like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data, sorting by result_nums DESC and removing duplicate "content". For example, I used the query like this to get the first two data
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
so I think es needs to be sorted before aggregation, because now it will only be sorted after aggregation, so I got results that did not match expectations.
I tried to use sort before aggregation but no effect
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how to do sort before aggregation?
You need to use max aggregation along with term query to get the data, sorting by result_nums DESC and removing duplicate "content"
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}

How to sort nested aggregation field based on parent document field in elasticsearch?

I have index of stores at various location. With each store I have a nested list of discount coupon.
Now I have query to get list of all unique coupons in a x km of radius sorted by the distance of the nearest applicable coupon on given location
Database :: Elasticsearch
Index Mapping ::
{
"mappings": {
"car_stores": {
"properties": {
"location": {
"type": "geo_point"
},
"discount_coupons": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
}
}
}
}
}
Sample Doc ::
{
"_index": "stores",
"_type": "car_stores",
"_id": "1258c81d-b6f2-400f-a448-bd728f524b55",
"_score": 1.0,
"_source": {
"location": {
"lat": 36.053757,
"lon": 139.525482
},
"discount_coupons": [
{
"name": "c1"
},
{
"name": "c2"
}
]
}
}
Old Query to get unique discount coupon names in x km area for given location ::
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"_key": "asc"
},
"size": 200
}
}
}
}
}
}
Updated Response ::
{
"took": 60,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 245328,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"coupon": {
"doc_count": 657442,
"name": {
"doc_count_error_upper_bound": -1,
"sum_other_doc_count": 641189,
"buckets": [
{
"key": "local20210211",
"doc_count": 1611,
"back_to_base": {
"doc_count": 1611,
"distance_script": {
"value": 160.61034409639765
}
}
},
{
"key": "local20210117",
"doc_count": 1621,
"back_to_base": {
"doc_count": 1621,
"distance_script": {
"value": 77.51459886447356
}
}
},
{
"key": "local20201220",
"doc_count": 1622,
"back_to_base": {
"doc_count": 1622,
"distance_script": {
"value": 84.15734462544432
}
}
},
{
"key": "kisekae1",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 88.23770888201268
}
}
},
{
"key": "local20210206",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 86.78376012847237
}
}
},
{
"key": "local20210106",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 384.12156408078397
}
}
},
{
"key": "local20210113",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 153.61681676703674
}
}
},
{
"key": "local20",
"doc_count": 1629,
"back_to_base": {
"doc_count": 1629,
"distance_script": {
"value": 168.74132991524073
}
}
},
{
"key": "local20210213",
"doc_count": 1630,
"back_to_base": {
"doc_count": 1630,
"distance_script": {
"value": 155.8335679860034
}
}
},
{
"key": "local20210208",
"doc_count": 1632,
"back_to_base": {
"doc_count": 1632,
"distance_script": {
"value": 99.58790590445102
}
}
}
]
}
}
}
}
Now the above query will return first 200 discount coupon default sorted by count but I want to return coupons sorted on distance based to given location i.e. the coupon that is nearest applicable should come first.
Is there any way to sort nested aggregations based on a parent key or can I solve this use case using a different data model?
Update Query ::
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"geo_distance": {
"distance": "100km",
"location": {
"lat": 35.699104,
"lon": 139.825211
}
}
},
{
"nested": {
"path": "discount_coupons",
"query": {
"bool": {
"filter": {
"exists": {
"field": "discount_coupons"
}
}
}
}
}
}
]
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 10
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(35.699104, 139.825211)"
}
}
}
}
}
}
}
}
}
}
}
Interesting question. You can always order a terms aggregation by the result of a numeric sub-aggregation. The trick here is to escape the nested context via a reverse_nested aggregation and then calculate the distance from the pivot using a script:
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 200
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(40, -70)"
}
}
}
}
}
}
}
}
}
}
}

Elasticsearch array value count aggregation

Sample documents:
{
"id": "62655",
"attributes": [
{
"name": "genre",
"value": "comedy"
},
{
"name": "year",
"value": "2016"
}
]
}
{
"id": "62656",
"attributes": [
{
"name": "genre",
"value": "horror"
},
{
"name": "year",
"value": "2016"
}
]
}
{
"id": "62657",
"attributes": [
{
"name": "language",
"value": "english"
},
{
"name": "year",
"value": "2015"
}
]
}
Expected Output:
{
"hits" : {
"total": 3,
"hits": []
},
"aggregations": {
"attribCount": {
"language": 1,
"genre": 2,
"year": 3
},
"attribVals": {
"language": {
"english": 1
},
"genre": {
"comedy": 1,
"horror": 1
},
"year": {
"2016": 2,
"2015": 1
}
}
}
}
My Query:
I could get the "attribCount" aggregation using below query. But I don't know how to get each attribute value count.
{
"query": {
"filtered": {
"query": {
"match_all": {}
}
}
},
"aggs": {
"attribCount": {
"terms": {
"field": "attributes.name",
"size": 0
}
}
},
"size": 0
}
When I aggregate using attributes.value, it gives overall count. But I need it listed under the name value as given in expected output.
As you say the attribute field is nested.
Try this, this will work
{
"size": 0,
"aggs": {
"count": {
"nested": {
"path": "attributes"
},
"aggs": {
"attribCount": {
"terms": {
"field": "attributes.name"
}
},
"attribVal": {
"terms": {
"field": "attributes.name"
},
"aggs": {
"attribval2": {
"terms": {
"field": "attributes.value"
}
}
}
}
}
}
}
}

Elasticsearch range bucket aggregation based on doc_count

I have an elasticsearch aggregation query like this.
{
"aggs": {
"customer": {
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
}
},
"terms": {
"field": "org",
"size": 50000
}
}
}
}
And it results in bucket aggregation like following
{
"aggregations": {
"customer": {
"buckets": [
{
"Total_Sale": { "value": 9999 },
"doc_count": 8,
"key": "cats"
},
{
"Total_Sale": { "value": 8888 },
"doc_count": 6,
"key": "tigers"
},
{
"Total_Sale": { "value": 444},
"doc_count": 5,
"key": "lions"
},
{
"Total_Sale": { "value": 555 },
"doc_count": 2,
"key": "wolves"
}
]
}
}
}
I want another range bucket aggregation based on doc_count. So, final result required is
{
"buckets": [
{
"Sum_of_Total_Sale": 555, // If I can form bucket, I can get this using sum_bucket. So, getting bucket is important.
"Sum_of_doc_count": 2,
"doc_count": 1,
"key": "*-3",
"to": 3.0
},
{
"Sum_of_Total_Sale": 9332,
"Sum_of_doc_count": 11,
"doc_count": 2,
"from": 4.0,
"key": "4-6",
"to": 6.0
},
{
"Sum_of_Total_Sale": 9999,
"Sum_of_doc_count": 8,
"doc_count": 1,
"from": 7.0,
"key": "7-*"
}
]
}
Bucket Selector Aggregation and then using bucket sum aggregation will not work because there is more than one key for range.
Bucket Script Aggregation does calculation within bucket.
Can I add scripted doc field for each document which help me to create these buckets?
There's no aggregation that I know of that can allow you to do this in one shot. however, there is one technique that I use from time to time to overcome this limitation. The idea is to repeat the same terms/sum aggregation and then use a bucket_selector pipeline aggregation for each of the ranges you're interested in.
POST index/_search
{
"size": 0,
"aggs": {
"*-3": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"*-3": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount <= 3"
}
}
}
},
"*-3_Total_Sales": {
"sum_bucket": {
"buckets_path": "*-3>Total_Sale"
}
},
"*-3_Total_Docs": {
"sum_bucket": {
"buckets_path": "*-3>_count"
}
},
"4-6": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"4-6": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 4 && params.docCount <= 6"
}
}
}
},
"4-6_Total_Sales": {
"sum_bucket": {
"buckets_path": "4-6>Total_Sale"
}
},
"4-6_Total_Docs": {
"sum_bucket": {
"buckets_path": "4-6>_count"
}
},
"7-*": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"7-*": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 7"
}
}
}
},
"7-*_Total_Sales": {
"sum_bucket": {
"buckets_path": "7-*>Total_Sale"
}
},
"7_*_Total_Docs": {
"sum_bucket": {
"buckets_path": "7-*>_count"
}
}
}
}
You'll get an answer that looks like this, which contains exactly the figures you're looking for in the xyz_Total_Sales and xyz_Total_Docs results:
"aggregations": {
"*-3": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "wolves",
"doc_count": 2,
"Total_Sale": {
"value": 555
}
}
]
},
"7-*": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cats",
"doc_count": 8,
"Total_Sale": {
"value": 9999
}
}
]
},
"4-6": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "tigers",
"doc_count": 6,
"Total_Sale": {
"value": 8888
}
},
{
"key": "lions",
"doc_count": 5,
"Total_Sale": {
"value": 444
}
}
]
},
"*-3_Total_Sales": {
"value": 555
},
"*-3_Total_Docs": {
"value": 2
},
"4-6_Total_Sales": {
"value": 9332
},
"4-6_Total_Docs": {
"value": 11
},
"7-*_Total_Sales": {
"value": 9999
},
"7_*_Total_Docs": {
"value": 8
}
}

Resources