Elasticsearch Error get aggregation top_hits source - elasticsearch

I'm trying to use the value returned by top_hits inside a bucket_script aggregation, but I get the following error:
{
"error": {
"root_cause": [
{
"type": "action_request_validation_exception",
"reason": "Validation Failed: 1: No aggregation found for path [first_doc.hits.hits._source.close];"
}
],
"type": "action_request_validation_exception",
"reason": "Validation Failed: 1: No aggregation found for path [first_doc.hits.hits._source.close];"
},
"status": 400
}
It seems that the Elasticsearch aggregation cannot read the object returned by top_hits. My goal is to use the top_hits values in my bucket_script and have the result appear in my aggregation, using this query:
{
"query": {
"bool": {
"must": [
{
// range filter
"range": {
"date": {
"gte": "now-89d",
"lte": "now+1d"
}
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "stock"
},
"aggs": {
"last_doc": {
"top_hits": {
"size": 1,
"sort": [
{
"date": {
"order": "desc"
}
}
],
"_source": {
"includes": ["close", "stock", "date"]
}
}
},
"first_doc": {
"top_hits": {
"size": 1,
"sort": [
{
"date": {
"order": "asc"
}
}
],
"_source": {
"includes": ["close", "stock", "date"]
}
}
},
"ninety_scr":{
"bucket_script":{
"buckets_path": {
"first_doc": "first_doc.hits.hits._source.close",
"last_doc": "last_doc.hits.hits._source.close"
},
"script": "params.first_doc / params.last_doc"
}
}
}
}
}
}
Here's an example of the hits documents returned when I take out the ninety_scr bucket_script object:
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": 1.0,
"_source": {
"date": "2022-06-10 13:23:36",
"fvei": "8732.75",
"pbv": "0.0",
"prev": "81",
"book": "1050093.39",
"roe": "6.21",
"der": "1.85",
"high": "86",
"avg": "79.0",
"fve": "78.0",
"low": "77",
"stock": "ADCP",
"per": "0.0",
"close": "78",
"trade_vol": "19784300",
"group": "IDXPROPERT",
"paid_up_cap_shares": "2000.0",
"trade_val": "1569762300",
"chg": "-3.49",
"change": "-3.0",
"peg_ratio": "0.0",
"eps": "65180.29",
"trade_freq": "1240",
"peg_analysis": "negative growth",
"board": "RG",
"open": "81"
}
}
]
And here's the result aggregation:
"aggregations": {
"group": {
"doc_count_error_upper_bound": 3,
"sum_other_doc_count": 757,
"buckets": [
{
"key": "ADCP",
"doc_count": 1,
"last_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": null,
"_source": {
"date": "2022-06-10 13:23:36",
"stock": "ADCP",
"close": "78"
},
"sort": [
1654867416000
]
}
]
}
},
"first_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": null,
"_source": {
"date": "2022-06-10 13:23:36",
"stock": "ADCP",
"close": "78"
},
"sort": [
1654867416000
]
}
]
}
}
}
]
}
}

Related

Elasticsearch - Is it possible to collapse first then aggregate data of a nested field?

I am using Elasticsearch and I want to group our results by a specific field, returning the top n documents per group. The documents have a nested field, and I want to aggregate the nested field of all the documents in each group.
Example
I have 5 documents, and each has a groupId and a nested field people. I want to group these documents by groupId and then, for each group, get the top 2 people (some documents may contain the same people).
PUT test/_mapping
{
"properties": {
"groupId":{
"type":"keyword"
},
"id":{
"type":"keyword"
},
"name":{
"type":"text"
},
"people":{
"type":"nested",
"properties":{
"email":{
"type":"keyword"
}
}
}
}
}
PUT test/_doc/1
{
"name": "docs1",
"groupId": "1",
"people":[{
"email":"people1#test.com"
}]
}
PUT test/_doc/2
{
"name": "docs2",
"groupId": "1",
"people":[{
"email":"people2.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/3
{
"name": "docs3",
"groupId": "2",
"people":[{
"email":"people3.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/4
{
"name": "docs4",
"groupId": "1",
"people":[{
"email":"people4.1#test.com"
},
{
"email":"people4.2#test.com"
}]
}
PUT test/_doc/5
{
"name": "docs5",
"groupId": "3",
"people":[{
"email":"people5.1#test.com"
},
{
"email":"people5.2#test.com"
}]
}
Search query
GET test/_search
{
"collapse": {
"field": "groupId",
"inner_hits": {
"name":"inner",
"size": 2
}
},
"sort": [
{
"groupId": {
"order": "asc"
}
}
],
"size": 2,
"from": 0
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": null,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
},
"fields": {
"groupId": [
"1"
]
},
"sort": [
"1"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": 0,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
}
},
{
"_index": "test",
"_id": "2",
"_score": 0,
"_source": {
"name": "docs2",
"groupId": "1",
"people": [
{
"email": "people2.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
},
{
"_index": "test",
"_id": "3",
"_score": null,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
},
"fields": {
"groupId": [
"2"
]
},
"sort": [
"2"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "3",
"_score": 0,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
}
]
}
}
The expectation is to aggregate a groupPeople field for each group that contains the top n people of that group (it should not be affected by the inner_hits size; for example, groupId=1 contains 3 documents and 5 people).
The query that you're looking for is this one:
POST test/_search
{
"size": 0,
"aggs": {
"groups": {
"terms": {
"field": "groupId",
"size": 10
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}
If you need pagination, you can achieve the same using the composite aggregation:
POST test/_search
{
"size": 0,
"aggs": {
"pages": {
"composite": {
"sources": [
{
"groups": {
"terms": {
"field": "groupId"
}
}
}
]
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}

How can I search and return only nested documents in ElasticSearch?

Say I have the following author documents, each with book documents:
[
{
"name": "Foo McBarrington",
"books": [
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
},
{
"name": "Bar McFooington",
"books": [
{
"title": "Baz Book",
"published": "2020-06-23"
}
]
}
]
I would like to search for and return books, ignoring anything from the author documents, and sort the books by published field on each book. I should be able to get the books sorted relative to each other:
[
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Baz Book",
"published": "2020-06-23"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
Notice that the book from the second author is sorted in the middle of the two books from the first author.
Is this possible with ElasticSearch? So far I've tried using the top_hits aggregation in a nested aggregation but I'm not sure why it's not working.
Yes, it is possible. You can use a combination of the nested aggregation, the terms aggregation, and the top hits aggregation to achieve your result:
{
"size": 0,
"aggs": {
"resellers": {
"nested": {
"path": "books"
},
"aggs": {
"books": {
"terms": {
"field": "books.title.keyword"
},
"aggs": {
"latest_books": {
"top_hits": {
"sort": [
{
"books.published": {
"order": "asc"
}
}
],
"_source": {
"includes": [
"books.title",
"books.published"
]
},
"size": 1
}
}
}
}
}
}
}
}
The search result will be
"aggregations": {
"resellers": {
"doc_count": 3,
"books": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Bar Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 1
},
"_score": null,
"_source": {
"published": "2021-06-13", // note this
"title": "Bar Book"
},
"sort": [
1623542400000
]
}
]
}
}
},
{
"key": "Baz Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2020-06-23", // note this
"title": "Baz Book"
},
"sort": [
1592870400000
]
}
]
}
}
},
{
"key": "Foo Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2019-05-02", // note this
"title": "Foo Book"
},
"sort": [
1556755200000
]
}
]
}
}
}
]
}
}
}
I found a way to get the books in a paginated format using a composite aggregation:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}
To get the following page, add the after parameter to the same composite aggregation:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"after": {
"published": "<DATE HERE>",
"order": "desc"
},
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}

Deduplicate results in elasticsearch based on a field

I have an elasticsearch index (v6.8) that contains documents that may share a similar value for a field.
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 123,
"owner": "B",
"price": 20
},
{
"siren": 456,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "C",
"price": 30
}
]
I would like to get all documents from owners A and B, but deduplicated on the siren field. I don't care which of the duplicated documents is returned (from owner A or B). The result would be:
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "A",
"price": 10
}
]
Also, I would like my aggregations to count documents deduplicated on the same field.
I have tried
{
"query": {
"bool": {
"must": [
[
{
"terms": {
"owner": [
"A",
"B"
]
}
}
]
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "price",
"size": 20
}
}
}
}
But this counts the "same" document multiple times.
You can use a terms aggregation on the siren field along with a top hits aggregation:
{
"size":0,
"query": {
"bool": {
"must": [
{
"terms": {
"owner.keyword": [
"A",
"B"
]
}
}
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "siren",
"size": 20
},
"aggs": {
"top_sales_hits": {
"top_hits": {
"_source": {
"includes": [
"siren",
"owner",
"price"
]
},
"size": 1
}
}
}
}
}
}
The search result will be:
"aggregations": {
"by_price": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 2,
"top_sales_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 123,
"price": 10
}
}
]
}
}
},
{
"key": 456,
"doc_count": 1,
"top_sales_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 456,
"price": 10
}
}
]
}
}
}
]
}
}

How can I get ALL the latest records for each group with an Elasticsearch query?

I am working from this reference: how-to-get-latest-values-for-each-group-with-an-elasticsearch-query
Now when I run the search, the aggregation only returns 10 docs for me. How can I make it show all matching results? I ONLY show two here since the returned response is too long. Thanks!
My ES query is:
{
"size" :1,
"aggs": {
"group": {
"terms": {
"field": "studentId"
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
}
}
}
and the result:
{
"took": 32,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": 1.0,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "N-wsrHYB4zCrGLTdS7Ur",
"_score": 1.0,
"_source": {
"studentId": 144,
"timestampstring": "2020-09-02 05:58:04.828",
"type": "data"
}
}
]
},
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 99670,
"buckets": [
{
"key": 131,
"doc_count": 579,
"group_docs": {
"hits": {
"total": {
"value": 579,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "SVVj4HYBlaUrIoJst3-o",
"_score": null,
"_source": {
"studentId": 131,
"timestampstring": "2021-01-08 13:06:34.413",
"type": "data"
},
"sort": [
1609340059767
]
}
]
}
}
},
{
"key": 147,
"doc_count": 529,
"group_docs": {
"hits": {
"total": {
"value": 529,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "data",
"_type": "class",
"_id": "SVVj4HYBlaUrIoJst3-o",
"_score": null,
"_source": {
"studentId": 147,
"timestampstring": "2021-01-08 13:06:34.413",
"type": "data"
},
"sort": [
1610082394413
]
}
]
}
}
}
]
}
}
}
You need to add the size param in the terms aggregation:
The size parameter can be set to define how many term buckets should
be returned out of the overall terms list.
{
"size": 1,
"aggs": {
"group": {
"terms": {
"field": "studentId",
"size": 100 // note this
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
}
}
}
Update 1:
You can use the stats bucket aggregation to get the count of unique studentId values:
{
"size": 1,
"aggs": {
"group": {
"terms": {
"field": "studentId",
"size": 100 // note this
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"sort": [
{
"timestamp": {
"order": "desc"
}
}
]
}
}
}
},
"bucketcount": {
"stats_bucket": {
"buckets_path": "group._count"
}
}
}
}

How to perform union operation on an array field after group by aggregation on another field in elasticsearch

I have the following documents in index products
{ "product_name": "prod-1", "meta": [ { "tag": "tag1", "score": "12" }, { "tag": "tag2", "score": "24" } ] }
{ "product_name": "prod-2", "meta": [ { "tag": "tag1", "score": "36" } ] }
{ "product_name": "prod-2", "meta": [ { "tag": "tag2", "score": "44" } ] }
{ "product_name": "prod-3", "meta": [ { "tag": "tag3", "score": "54" } ] }
I know how to group by product_name in es
POST /products/_search
{
"size": 0,
"aggs": {
"by_product": {
"terms": {
"field": "product_name"
}
}
}
}
After grouping by product_name, I want a field called meta in each bucket which has a union of meta from all documents in that bucket like this
[
{
"key": "prod-1",
"meta": [{ "tag": "tag1", "score": "12" }, { "tag": "tag2", "score": "24" }]
},
{
"key": "prod-2",
"meta": [{ "tag": "tag1", "score": "36" }, { "tag": "tag2", "score": "44" }]
},
{
"key": "prod-3",
"meta": [ { "tag": "tag3", "score": "54" } ]
}
]
How can I achieve this in Elasticsearch?
The best way to get your expected search result is to use the top hits
aggregation, which lets you add additional fields to each bucket of the terms
aggregation.
Search Query:
{
"size": 0,
"aggs": {
"by_product": {
"terms": {
"field": "product_name.keyword"
},
"aggs": {
"top_sales_hits": {
"top_hits": {
"_source": {
"includes": [
"meta.tag",
"meta.score"
]
}
}
}
}
}
}
}
Search Result:
"aggregations": {
"by_product": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "prod-2",
"doc_count": 2,
"top_sales_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "64801386",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"meta": [
{
"score": "36",
"tag": "tag1"
}
]
}
},
{
"_index": "64801386",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"meta": [
{
"score": "44",
"tag": "tag2"
}
]
}
}
]
}
}
},
{
"key": "prod-1",
"doc_count": 1,
"top_sales_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "64801386",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"meta": [
{
"score": "12",
"tag": "tag1"
},
{
"score": "24",
"tag": "tag2"
}
]
}
}
]
}
}
},
{
"key": "prod-3",
"doc_count": 1,
"top_sales_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "64801386",
"_type": "_doc",
"_id": "4",
"_score": 1.0,
"_source": {
"meta": [
{
"score": "54",
"tag": "tag3"
}
]
}
}
]
}
}
}
]
}

Resources