How to sort before aggregation in Elasticsearch? - elasticsearch

I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all documents in the index like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data, sorting by result_nums DESC and removing duplicate "content". For example, I used the query like this to get the first two data
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
so I think es needs to be sorted before aggregation, because now it will only be sorted after aggregation, so I got results that did not match expectations.
I tried to use sort before aggregation but no effect
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how to do sort before aggregation?

You need to use max aggregation along with term query to get the data, sorting by result_nums DESC and removing duplicate "content"
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}

Related

Order by sub-aggregation in composite aggregation (Elasticsearch)

I need to ordenate the results of an composite aggregation, but the value to be orderned is the sum of a specific field (my index is so much larger, so i need the composite for paginate values).
When send this GET:
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "date": { "date_histogram": { "field": "timestamp", "interval": "1d"} } },
{ "product": { "terms": {"field": "product" } } }
]
},
"aggregations": {
"the_sum": {
"sum": { "field": "price" } <--- i want order by this field aggregation
}
}
}
}
}
How can i get this response? (order by sum of each price)
{
...
"aggregations": {
"my_buckets": {
"after_key": {
"date": 1494374400000,
"product": "mad max"
},
"buckets": [
{
"key": {
"date": 1494460800000,
"product": "apocalypse now"
},
"doc_count": 1,
"the_sum": {
"value": 10.0
}
},
{
"key": {
"date": 1494288000000,
"product" : "mad max"
},
"doc_count": 2,
"the_sum": {
"value": 22.5
}
},
{
"key": {
"date": 1494374400000,
"product": "mad max"
},
"doc_count": 1,
"the_sum": {
"value": 290.0
}
}
]
}
}
}

why aggregation script is not working in elasticsearch?

i have a some problem in elasticsearch.
i want division value with two aggregated values.
this query is working.
{
"query": {
"bool": {
"adjust_pure_negative": true,
"boost": 1.0
}
},
"aggregations": {
"sumPageview": {
"sum": {
"field": "pageview",
"missing": 0
}
},
"sumVisit": {
"sum": {
"field": "visit",
"missing": 0
}
}
}
but this query is not working.
{
"query": {
"bool": {
"adjust_pure_negative": true,
"boost": 1.0
}
},
"aggregations": {
"sumPageview": {
"sum": {
"field": "pageview",
"missing": 0
}
},
"sumVisit": {
"sum": {
"field": "visit",
"missing": 0
}
},
"totalPageviewPerVisit": {
"bucket_script": {
"buckets_path": {
"sumPageview": "sumPageview",
"sumVisit": "sumVisit"
},
"script": {
"source": "params.sumPageview / params.sumVisit",
"lang": "painless"
},
"gap_policy": "skip"
}
}
}
i think this reason is what sum value is not in bucket.
this reason right? help me, please.
Sum aggregation is a single-value metrics aggregation that sums
up numeric values that are extracted from the aggregated documents.
Bucket script aggregation is a parent pipeline aggregation that
executes a script that can perform per bucket computations on
specified metrics in the parent multi-bucket aggregation.
Because sum aggregation, do not create any buckets, so you cannot use bucket script aggregation on it.
Adding a working example with index data, search query, and search result
Index Data:
{
"user_id":1,
"pageview": 1,
"visit": 2
}
{
"user_id":2,
"pageview": 2,
"visit": 3
}
{
"user_id":3,
"pageview": 3,
"visit": 4
}
Search Query:
{
"size": 0,
"aggs": {
"all": {
"terms": {
"field": "user_id"
},
"aggs": {
"sum_1": {
"sum": {
"field": "pageview"
}
},
"sum_2": {
"sum": {
"field": "visit"
}
},
"division": {
"bucket_script": {
"buckets_path": {
"my_var1": "sum_1",
"my_var2": "sum_2"
},
"script": "params.my_var1 / params.my_var2"
}
}
}
}
}
}
Search Result:
"aggregations": {
"all": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 1,
"sum_2": {
"value": 2.0
},
"sum_1": {
"value": 1.0
},
"division": {
"value": 0.5
}
},
{
"key": 2,
"doc_count": 1,
"sum_2": {
"value": 3.0
},
"sum_1": {
"value": 2.0
},
"division": {
"value": 0.6666666666666666
}
},
{
"key": 3,
"doc_count": 1,
"sum_2": {
"value": 4.0
},
"sum_1": {
"value": 3.0
},
"division": {
"value": 0.75
}
}
]
}

Document count greater than 'x', geo_point aggregations elasticsearch

I'm currently doing the following Query to aggregate all points within a certain precision in elastic search.
{
"aggs": {
"coordinates": {
"geohash_grid": {
"field": "properties.Geometry.geo_point",
"precision": 12
},
"aggs": {
"centroid": {
"geo_centroid": {
"field": "properties.Geometry.geo_point"
}
}
}
}
}
}
and the response is
"aggregations": {
"coordinates": {
"buckets": [
{
"key": "s00000000000",
"doc_count": 82571,
"centroid": {
"location": {
"lat": 0,
"lon": 0
},
"count": 82571
}
},
{
"key": "6gyf4bf8m0uh",
"doc_count": 58587,
"centroid": {
"location": {
"lat": -23.55052001774311,
"lon": -46.633309721946716
},
"count": 58587
}
},
{
"key": "7h2y8hz76g8m",
"doc_count": 14551,
"centroid": {
"location": {
"lat": -19.924501832574606,
"lon": -43.93523778766394
},
"count": 14551
}
}
}
I need to get all buckets that have a count greater than a certain number. How can I do that?
You can add the field
GET _search
{
"query": {
"range" : {
"age" : {
"gte" : 5000,
}
}
}
}
as descripe here in the documentation:
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
Solved, I used the following filter:
{
"aggs": {
"coordinates": {
"geohash_grid": {
"field": "properties.Geometry.geo_point",
"precision": 12
},
"aggs": {
"sales_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"doc_count": "centroid.count"
},
"script": "params.doc_count > 500"
}
},
"centroid": {
"geo_centroid": {
"field": "properties.Geometry.geo_point"
}
}
}
}
}
}

Elasticsearch: filter aggregation using bucket value

Not sure how to formulate the question.
I'm using Elasticsearch 2.2.
Let's start with an example of the dataset, made of 5 documents:
[
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": {},
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": { "uuid": "b" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 0
}
}
]
called_entity always has a uuid.
coverage_entity can be empty, or have an uuid.
I use a script to aggregate on either called_entity.uuid or coverage_entity.uuid:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
}
}
}
}
}
So now, the aggregation has generated terms from either header.called_entity.uuid, or header.coverage_entity.uuid.
How can I filter my aggregation using the value of the aggregation key? For example, if I want to count, for each bucket, how many documents have their uuid taken from header.called_entity.uuid only. Something like that:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
},
"from_called_entity": {
"filter": {
"term": { "header.called_entity.uuid": BUCKET_KEY }
}
}
}
}
}
}
Not sure this is possible. The key itself is only available as a sorting option.
Could you use something like this:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script": "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.sucessful_transfers"
}
}
}
},
"called_entity_source": {
"terms": {
"field": "header.called_entity.uuid",
"size": 10
}
},
"coverage_entity_source": {
"terms": {
"field": "header.coverage_entity.uuid",
"size": 10
}
}
}
}
And the output will be something like this:
"called_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 2
}
]
},
"coverage_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 1
}
]
},
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 4,
"successful_transfers": {
"value": 3
}
},
{
"key": "b",
"doc_count": 3,
"successful_transfers": {
"value": 2
}
}
]
}
If you really need to have the json in that specific way, add another final step in your application where you post process the result a bit. The result above does contain the info you need but the keys from coverage_entity_source and called_entity_source are not under the dim aggregation.

Elasticsearch range bucket aggregation based on doc_count

I have an elasticsearch aggregation query like this.
{
"aggs": {
"customer": {
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
}
},
"terms": {
"field": "org",
"size": 50000
}
}
}
}
And it results in bucket aggregation like following
{
"aggregations": {
"customer": {
"buckets": [
{
"Total_Sale": { "value": 9999 },
"doc_count": 8,
"key": "cats"
},
{
"Total_Sale": { "value": 8888 },
"doc_count": 6,
"key": "tigers"
},
{
"Total_Sale": { "value": 444},
"doc_count": 5,
"key": "lions"
},
{
"Total_Sale": { "value": 555 },
"doc_count": 2,
"key": "wolves"
}
]
}
}
}
I want another range bucket aggregation based on doc_count. So, final result required is
{
"buckets": [
{
"Sum_of_Total_Sale": 555, // If I can form bucket, I can get this using sum_bucket. So, getting bucket is important.
"Sum_of_doc_count": 2,
"doc_count": 1,
"key": "*-3",
"to": 3.0
},
{
"Sum_of_Total_Sale": 9332,
"Sum_of_doc_count": 11,
"doc_count": 2,
"from": 4.0,
"key": "4-6",
"to": 6.0
},
{
"Sum_of_Total_Sale": 9999,
"Sum_of_doc_count": 8,
"doc_count": 1,
"from": 7.0,
"key": "7-*"
}
]
}
Bucket Selector Aggregation and then using bucket sum aggregation will not work because there is more than one key for range.
Bucket Script Aggregation does calculation within bucket.
Can I add scripted doc field for each document which help me to create these buckets?
There's no aggregation that I know of that can allow you to do this in one shot. however, there is one technique that I use from time to time to overcome this limitation. The idea is to repeat the same terms/sum aggregation and then use a bucket_selector pipeline aggregation for each of the ranges you're interested in.
POST index/_search
{
"size": 0,
"aggs": {
"*-3": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"*-3": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount <= 3"
}
}
}
},
"*-3_Total_Sales": {
"sum_bucket": {
"buckets_path": "*-3>Total_Sale"
}
},
"*-3_Total_Docs": {
"sum_bucket": {
"buckets_path": "*-3>_count"
}
},
"4-6": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"4-6": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 4 && params.docCount <= 6"
}
}
}
},
"4-6_Total_Sales": {
"sum_bucket": {
"buckets_path": "4-6>Total_Sale"
}
},
"4-6_Total_Docs": {
"sum_bucket": {
"buckets_path": "4-6>_count"
}
},
"7-*": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"7-*": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 7"
}
}
}
},
"7-*_Total_Sales": {
"sum_bucket": {
"buckets_path": "7-*>Total_Sale"
}
},
"7_*_Total_Docs": {
"sum_bucket": {
"buckets_path": "7-*>_count"
}
}
}
}
You'll get an answer that looks like this, which contains exactly the figures you're looking for in the xyz_Total_Sales and xyz_Total_Docs results:
"aggregations": {
"*-3": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "wolves",
"doc_count": 2,
"Total_Sale": {
"value": 555
}
}
]
},
"7-*": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cats",
"doc_count": 8,
"Total_Sale": {
"value": 9999
}
}
]
},
"4-6": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "tigers",
"doc_count": 6,
"Total_Sale": {
"value": 8888
}
},
{
"key": "lions",
"doc_count": 5,
"Total_Sale": {
"value": 444
}
}
]
},
"*-3_Total_Sales": {
"value": 555
},
"*-3_Total_Docs": {
"value": 2
},
"4-6_Total_Sales": {
"value": 9332
},
"4-6_Total_Docs": {
"value": 11
},
"7-*_Total_Sales": {
"value": 9999
},
"7_*_Total_Docs": {
"value": 8
}
}

Resources