Filtering documents after aggregation - elasticsearch

In Elasticsearch, I am storing item state snapshots in an append-only scheme.
For example:
POST /item/item
{
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
POST /item/item
{
"id": "2",
"time": "2018-09-19T00:01:00Z",
"status": "ON_HOLD"
}
POST /item/item
{
"id": "2",
"time": "2018-09-19T00:02:00Z",
"status": "DONE"
}
Now, what I wish to achieve is to answer the following question: which items are still on hold (status == ON_HOLD)?
In this simple case, the answer would be:
{
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
So, in order to get the last state of an item, I use a terms aggregation, on id, like so:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 10
},
"aggs": {
"top_items": {
"top_hits": {
"size": 1,
"sort": [
{
"time": {
"order": "desc"
}
}
],
"_source": {
"includes": ["*"]
}
}
}
}
}
}
}
This gives me the last available state of each item identified by its id:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "2",
"doc_count": 2,
"top_items": {
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "S-5eCGYBNyILygyml2jR",
"_score": null,
"_source": {
"id": "2",
"time": "2018-09-19T00:02:00Z",
"status": "DONE"
},
"sort": [
1537315320000
]
}
]
}
}
},
{
"key": "1",
"doc_count": 1,
"top_items": {
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "Se5eCGYBNyILygymjmg0",
"_score": null,
"_source": {
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
},
"sort": [
1537315200000
]
}
]
}
}
}
]
}
}
}
Now the problem is I would like to filter the result (after aggregation) on Elasticsearch's side (not client).
I tried a bucket_selector aggregation but it complains since the top_hits result is not a number or single value numeric aggregation.
I also tried adding a script_fields entry to get a numeric value, but I cannot seem to use it afterwards:
"script_fields": {
"on_hold": {
"script": {
"lang": "painless",
"source": "doc['status.keyword'].value == 'ON_HOLD' ? 1 : 0"
}
}
}
Is what I want to do even possible on Elasticsearch's side or do I have to do it on the client side?
PS: adding the filter before the aggregation does not provide the correct result, as it would return items that have been ON_HOLD at any point in time.
EDIT:
Alright I am getting somewhere with:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 50
},
"aggs": {
"top_item": {
"terms": {
"size": 1,
"field": "time",
"order": {
"_key": "desc"
}
},
"aggs": {
"on_hold": {
"filter": {
"term": {
"status.keyword": "ON_HOLD"
}
},
"aggs": {
"document": {
"top_hits": {
"size": 1,
"_source": ["*"]
}
}
}
}
}
}
}
}
}
}
The top_hits aggregation is a metrics aggregation, not a bucket aggregation, so it cannot do the filtering itself and must be used last (as the innermost aggregation).
One last problem though: filtered out buckets leave empty leaves:
"hits": []
Is there any way to remove such branches ending in empty leaves from the result tree? Thanks

Alright, I found the complete solution to the problem, including filtering out empty branches in the aggregation tree:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 50
},
"aggs": {
"top_item": {
"terms": {
"size": 1,
"field": "time",
"order": {
"_key": "desc"
}
},
"aggs": {
"on_hold": {
"filter": {
"term": {
"status.keyword": "ON_HOLD"
}
},
"aggs": {
"document": {
"top_hits": {
"size": 1,
"_source": ["*"]
}
}
}
},
"remove_filtered": {
"bucket_selector": {
"buckets_path": {
"count": "on_hold._count"
},
"script": {
"source": "params.count != 0"
}
}
}
}
},
"remove_empty": {
"bucket_selector": {
"buckets_path": {
"count": "top_item._bucket_count"
},
"script": "params.count != 0"
}
}
}
}
}
}
This gives the following output which was expected:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1",
"doc_count": 1,
"top_item": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1537315200000,
"key_as_string": "2018-09-19T00:00:00.000Z",
"doc_count": 1,
"on_hold": {
"doc_count": 1,
"document": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "HvywM2YB5Ei0wOZMeia9",
"_score": 1,
"_source": {
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
}
]
}
}
}
}
]
}
}
]
}
}
}

Related

Elasticsearch top_hits aggregation result and doc_count are different

Query
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"field": "holderInfo.raw",
"size": 50
},
"aggregations": {
"tops": {
"top_hits": {
"_source": {
"includes": ["uid"]
}
}
}
}
}
}
}
Result
{
...
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"498": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "MATSUSHITA ELECTRIC INDUSTRIAL",
"doc_count": 5,
"tops": {
"hits": {
"total": 5,
"max_score": 1,
"hits": [
{
"_index": "someindex",
"_id": "03a3",
"_score": 1,
"_source": {
"uid": "03a3"
}
},
{
"_index": "someindex",
"_id": "08a2",
"_score": 1,
"_source": {
"uid": "08a2"
}
},
{
"_index": "someindex",
"_id": "84a1",
"_score": 1,
"_source": {
"uid": "84a1"
}
}
]
}
}
}
]
}
}
}
"08a2", "08a3", "03a2", "03a3" and "84a1" each clearly have 'MATSUSHITA ELECTRIC INDUSTRIAL' in the holderInfo.raw field.
Therefore, there are 5 cases in the doc_count, but only "03a3", "08a2", and "84a1" are output in the top_hits results, and "08a3" and "03a2" are omitted.
Query
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"script": {
"inline": "doc['holderInfo.raw'].value"
},
"size": 50
}
}
}
}
Result
{
...
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"498": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "MATSUSHITA ELECTRIC INDUSTRIAL",
"doc_count": 3
}
]
}
}
}
In addition, two documents are also missing when aggregating with a script: doc_count is 3 instead of 5.
I'd like to know why some uids are missing.
I'm in a situation where I have to use the elasticsearch version 2.2. I want to know if it's an elasticsearch bug that occurs in an old version or a user's fault.
Thanks!
By default, the top_hits aggregation returns the first 3 top hits. You just need to increase the size parameter:
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"field": "holderInfo.raw",
"size": 50
},
"aggregations": {
"tops": {
"top_hits": {
"size": 5, <---- add this
"_source": {
"includes": ["uid"]
}
}
}
}
}
}
}

Max and min from all index in query

Is there a way to get the max and min for all documents in the index, not only the max and min from the category "game", without making another request to Elasticsearch?
{
"query": {
"bool": {
"must": [
{
"match": {
"category": "game"
}
}
]
}
},
"aggs": {
"maxPoints": {
"max": {
"field": "points"
}
},
"minPoints": {
"min": {
"field": "points"
}
}
}
}
Here is some of the data I have. With the query above, I want to get the 2 docs from the category "game", and min 0, max 100 instead of min 10, max 20.
[
{
"id": 1,
"category": "offer",
"points": 0
},
{
"id": 2,
"category": "game",
"points": 10
},
{
"id": 3,
"category": "game",
"points": 20
},
{
"id": 4,
"category": "offer",
"points": 100
}
]
Yeah, just remove the match clause, and add match_all query to include all the documents in your index. Use post_filter to get the expected results in a single ES call.
{
"query": {
"match_all": {}
},
"aggs": {
"maxPoints": {
"max": {
"field": "points"
}
},
"minPoints": {
"min": {
"field": "points"
}
}
},
"post_filter": { // Note this
"term": {
"category": "game"
}
}
}
Output
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65406564",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"id": 2,
"category": "game",
"points": 10
}
},
{
"_index": "65406564",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"id": 3,
"category": "game",
"points": 20
}
}
]
},
"aggregations": {
"maxPoints": {
"value": 100.0
},
"minPoints": {
"value": 0.0
}
}
}

How can I get ALL the latest records for each group with an Elasticsearch query?

I took this as a reference: how-to-get-latest-values-for-each-group-with-an-elasticsearch-query
Now I run the search, but the aggregation only returns 10 buckets. How can I make it show all matching results? I ONLY show two buckets below since the response is too long. Thanks!
my ES query is :
{
"size" :1,
"aggs": {
"group": {
"terms": {
"field": "studentId"
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
}
}
}
and the result:
{
"took": 32,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": 1.0,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "N-wsrHYB4zCrGLTdS7Ur",
"_score": 1.0,
"_source": {
"studentId": 144,
"timestampstring": "2020-09-02 05:58:04.828",
"type": "data"
}
}
]
},
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 99670,
"buckets": [
{
"key": 131,
"doc_count": 579,
"group_docs": {
"hits": {
"total": {
"value": 579,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "SVVj4HYBlaUrIoJst3-o",
"_score": null,
"_source": {
"studentId": 131,
"timestampstring": "2021-01-08 13:06:34.413",
"type": "data"
},
"sort": [
1609340059767
]
}
]
}
}
},
{
"key": 147,
"doc_count": 529,
"group_docs": {
"hits": {
"total": {
"value": 529,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "SVVj4HYBlaUrIoJst3-o",
"_score": null,
"_source": {
"studentId": 147,
"timestampstring": "2021-01-08 13:06:34.413",
"type": "data"
},
"sort": [
1610082394413
]
}
]
}
}
}
]
}
}
}
You need to add the size param in the terms aggregation (by default only the top 10 buckets are returned):
The size parameter can be set to define how many term buckets should
be returned out of the overall terms list.
{
"size": 1,
"aggs": {
"group": {
"terms": {
"field": "studentId",
"size": 100 // note this
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
}
}
}
Update 1:
You can use the stats bucket aggregation to get the count of unique studentId values:
{
"size": 1,
"aggs": {
"group": {
"terms": {
"field": "studentId",
"size": 100 // note this
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
},
"bucketcount": {
"stats_bucket": {
"buckets_path": "group._count"
}
}
}
}

How to aggregate by id and order the results by title? Elasticsearch 5.5

I started to learn Elasticsearch and faced the problem of sorting the grouped results.
I tried to find something about this in the documentation, but I did not find anything.
That's what I have:
Mapping:
{
"mappings": {
"post": {
"properties": {
"tags": {
"type": "nested"
}
}
}
}
}
Posts:
{
"title": "some title",
"message": "some message",
"tags": [
{
"id": 1,
"title": "some tag title"
},
{
"id": 2,
"title": "some tag title 2"
},
{
"id": 3,
"title": "some tag title 3"
}
]
}
{
"title": "some title 2",
"message": "some message 2",
"tags": [
{
"id": 2,
"title": "some tag title 2"
}
]
}
{
"title": "some title 3",
"message": "some message 3",
"tags": [
{
"id": 1,
"title": "some tag title"
},
{
"id": 2,
"title": "some tag title 2"
}
]
}
My request:
{
"size": 0,
"aggs": {
"group_by_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"tags": {
"terms": {
"field": "tags.id"
},
"aggs": {
"titles": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"tags.title"
]
}
}
}
}
}
}
}
}
}
Response:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"group_by_tags": {
"doc_count": 6,
"tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 3,
"titles": {
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 0
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title 2"
}
}
}
]
}
}
},
{
"key": 1,
"doc_count": 2,
"titles": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 0
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title"
}
}
}
]
}
}
},
{
"key": 3,
"doc_count": 1,
"titles": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 2
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title 3"
}
}
}
]
}
}
}
]
}
}
}
}
As you can see, the resulting aggregations are sorted by doc_count.
How can I get result aggregations sorted by tag.title?
What I want:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"group_by_tags": {
"doc_count": 6,
"tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 2,
"titles": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 0
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title"
}
}
}
]
}
}
},
{
"key": 2,
"doc_count": 3,
"titles": {
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 0
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title 2"
}
}
}
]
}
}
},
{
"key": 3,
"doc_count": 1,
"titles": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_nested": {
"field": "tags",
"offset": 2
},
"_score": 1,
"_source": {
"tags": {
"title": "some tag title 3"
}
}
}
]
}
}
}
]
}
}
}
}
Sorry for my english.
Updated:
I update request:
{
"size": 0,
"aggs": {
"group_by_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"tags": {
"terms": {
"field": "tags.id"
},
"aggs": {
"titles": {
"terms": {
"field": "tags.title.keyword"
}
}
}
}
}
}
}
}
Updated 2:
Request:
{
"size": 0,
"aggs": {
"group_by_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"tags": {
"terms": {
"field": "tags.id",
"order": {
"titles": "asc"
}
},
"aggs": {
"titles": {
"terms": {
"field": "tags.title.keyword"
}
}
}
}
}
}
}
}
With this request I get error:
"reason": {
"type": "aggregation_execution_exception",
"reason": "Invalid terms aggregation order path [titles]. Terms buckets can only be sorted on a sub-aggregator path that is built out of zero or more single-bucket aggregations within the path and a final single-bucket or a metrics aggregation at the path end."
}

summing a bunch of values given a condition in elasticsearch

Given the following elasticsearch document, how would I construct a search that would sum the values of the seconds column for a given datetime range?
See below for my current query.
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "searchdb",
"_type": "profile",
"_id": "1825",
"_score": 1,
"_source": {
"id": 1825,
"market": "Chicago",
"geo_location": {
"lat": 41.1234,
"lon": -87.5678
},
"hourly_values": [
{
"datetime": "1997-07-16T19:00:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
}
]
}
},
{
"_index": "searchdb",
"_type": "profile",
"_id": "1808",
"_score": 1,
"_source": {
"id": 1808,
"market": "Chicago",
"geo_location": {
"lat": 41.1234,
"lon": -87.5678
},
"hourly_values": [
{
"datetime": "1997-07-16T19:00:00.00+00:00",
"seconds": 900
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 800
}
]
}
}
]
}
}
Below is my current query. The problem with it is it doesn't take into consideration the datetime field. I need to be able to sum only the seconds values that fall within a given datetime range in the query.
{
"aggs": {
"Ids": {
"terms": {
"field": "id",
"size": 0
},
"aggs": {
"Nesting": {
"nested": {
"path": "hourly_values"
},
"aggs": {
"availability": {
"sum": {
"field": "hourly_values.seconds"
}
}
}
}
}
}
}
}
I know you can use a range, something like this:
"filter" : {
"range" : { "timestamp" : { "from" : "now/1d+9.5h", "to" : "now/1d+16h" }}
}
but I can't figure out how to integrate that into my query to get the desired output.
For clarity, my desired output is to return each of the objects returned from the query, and the values of the summation of the seconds fields, but I only want to sum the values for the given time range.
I think this can be done with a filter aggregation.
Try this:
{
"aggs": {
"Ids": {
"terms": {
"field": "id",
"size": 0
},
"aggs": {
"Nesting": {
"nested": {
"path": "hourly_values"
},
"aggs": {
"filtered_result": {
"filter": {
"query": {
"range": {
"hourly_values.datetime": {
"gt": "1997-07-16T19:10:00.00+00:00",
"lt": "1997-07-16T19:22:00.00+00:00"
}
}
}
},
"aggs": {
"availability": {
"sum": {
"field": "hourly_values.seconds"
}
}
}
}
}
}
}
}
},
"size": 0
}
The result I get
"aggregations": {
"Ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1808",
"doc_count": 1,
"Nesting": {
"doc_count": 3,
"filtered_result": {
"doc_count": 2,
"availability": {
"value": 2000
}
}
}
},
{
"key": "1825",
"doc_count": 1,
"Nesting": {
"doc_count": 3,
"filtered_result": {
"doc_count": 2,
"availability": {
"value": 2400
}
}
}
}
]
}
}
Does this help?

Resources