Deduplicate results in elasticsearch based on a field - elasticsearch

I have an elasticsearch index (v6.8) that contains documents that may share a similar value for a field.
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 123,
"owner": "B",
"price": 20
},
{
"siren": 456,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "C",
"price": 30
}
]
I would like to get all documents from owners A and B, deduplicated on the siren field. I don't care which of the duplicate documents is returned (the one from owner A or from owner B). The expected result would be:
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "A",
"price": 10
}
]
Also, I would like my aggregations to count documents deduplicated on the same field.
I have tried
{
"query": {
"bool": {
"must": [
[
{
"terms": {
"owner": [
"A",
"B"
]
}
}
]
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "price",
"size": 20
}
}
}
}
But this counts the "same" document multiple times.

You can use a terms aggregation on the siren field along with a top hits aggregation:
{
"size":0,
"query": {
"bool": {
"must": [
{
"terms": {
"owner.keyword": [
"A",
"B"
]
}
}
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "siren",
"size": 20
},
"aggs": {
"top_sales_hits": {
"top_hits": {
"_source": {
"includes": [
"siren",
"owner",
"price"
]
},
"size": 1
}
}
}
}
}
}
Search Result will be
"aggregations": {
"by_price": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 2,
"top_sales_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 123,
"price": 10
}
}
]
}
}
},
{
"key": 456,
"doc_count": 1,
"top_sales_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 456,
"price": 10
}
}
]
}
}
}
]
}
}

Related

Elasticsearch - Is it possible to collapse first then aggregate data of a nested field?

I am using Elasticsearch and I want to group our results by a specific field, returning the top n documents per group. The documents have a nested field, and I want to aggregate that nested field across all the documents in each group.
Example
I have 5 documents, each with a groupId and a nested field people. I want to group these documents by groupId, and then, for each group, get the top 2 people (some documents may contain the same people).
PUT test/_mapping
{
"properties": {
"groupId":{
"type":"keyword"
},
"id":{
"type":"keyword"
},
"name":{
"type":"text"
},
"people":{
"type":"nested",
"properties":{
"email":{
"type":"keyword"
}
}
}
}
}
PUT test/_doc/1
{
"name": "docs1",
"groupId": "1",
"people":[{
"email":"people1#test.com"
}]
}
PUT test/_doc/2
{
"name": "docs2",
"groupId": "1",
"people":[{
"email":"people2.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/3
{
"name": "docs3",
"groupId": "2",
"people":[{
"email":"people3.1#test.com"
},
{
"email":"people2.2#test.com"
}]
}
PUT test/_doc/4
{
"name": "docs4",
"groupId": "1",
"people":[{
"email":"people4.1#test.com"
},
{
"email":"people4.2#test.com"
}]
}
PUT test/_doc/5
{
"name": "docs5",
"groupId": "3",
"people":[{
"email":"people5.1#test.com"
},
{
"email":"people5.2#test.com"
}]
}
Search query
GET test/_search
{
"collapse": {
"field": "groupId",
"inner_hits": {
"name":"inner",
"size": 2
}
},
"sort": [
{
"groupId": {
"order": "asc"
}
}
],
"size": 2,
"from": 0
}
Result
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": null,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
},
"fields": {
"groupId": [
"1"
]
},
"sort": [
"1"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "1",
"_score": 0,
"_source": {
"name": "docs1",
"groupId": "1",
"people": [
{
"email": "people1#test.com"
}
]
}
},
{
"_index": "test",
"_id": "2",
"_score": 0,
"_source": {
"name": "docs2",
"groupId": "1",
"people": [
{
"email": "people2.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
},
{
"_index": "test",
"_id": "3",
"_score": null,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
},
"fields": {
"groupId": [
"2"
]
},
"sort": [
"2"
],
"inner_hits": {
"inner": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0,
"hits": [
{
"_index": "test",
"_id": "3",
"_score": 0,
"_source": {
"name": "docs3",
"groupId": "2",
"people": [
{
"email": "people3.1#test.com"
},
{
"email": "people2.2#test.com"
}
]
}
}
]
}
}
}
}
]
}
}
The expected result is an aggregated groupPeople field for each group, containing the top n people of that group. It should not be affected by the inner_hits size; for example, the group with groupId=1 contains 3 documents and 5 people.
The query that you're looking for is this one:
POST test/_search
{
"size": 0,
"aggs": {
"groups": {
"terms": {
"field": "groupId",
"size": 10
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}
If you need pagination, you can achieve the same using the composite aggregation:
POST test/_search
{
"size": 0,
"aggs": {
"pages": {
"composite": {
"sources": [
{
"groups": {
"terms": {
"field": "groupId"
}
}
}
]
},
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"emails": {
"terms": {
"field": "people.email",
"size": 2
}
}
}
}
}
}
}
}

Elasticsearch Error get aggregation top_hits source

I'm trying to use values from a top_hits aggregation inside a bucket_script, but I get an error like this:
{
"error": {
"root_cause": [
{
"type": "action_request_validation_exception",
"reason": "Validation Failed: 1: No aggregation found for path [first_doc.hits.hits._source.close];"
}
],
"type": "action_request_validation_exception",
"reason": "Validation Failed: 1: No aggregation found for path [first_doc.hits.hits._source.close];"
},
"status": 400
}
It seems the Elasticsearch aggregation cannot read the object from top_hits. My goal is to use the top_hits values in my bucket_script and have the result appear in my aggregation, using this code:
{
"query": {
"bool": {
"must": [
{
// range filter
"range": {
"date": {
"gte": "now-89d",
"lte": "now+1d"
}
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "stock"
},
"aggs": {
"last_doc": {
"top_hits": {
"size": 1,
"sort": [
{
"date": {
"order": "desc"
}
}
],
"_source": {
"includes": ["close", "stock", "date"]
}
}
},
"first_doc": {
"top_hits": {
"size": 1,
"sort": [
{
"date": {
"order": "asc"
}
}
],
"_source": {
"includes": ["close", "stock", "date"]
}
}
},
"ninety_scr":{
"bucket_script":{
"buckets_path": {
"first_doc": "first_doc.hits.hits._source.close",
"last_doc": "last_doc.hits.hits._source.close"
},
"script": "params.first_doc / params.last_doc"
}
}
}
}
}
}
Here's an example of the hits documents when I take out the ninety_scr bucket_script object:
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": 1.0,
"_source": {
"date": "2022-06-10 13:23:36",
"fvei": "8732.75",
"pbv": "0.0",
"prev": "81",
"book": "1050093.39",
"roe": "6.21",
"der": "1.85",
"high": "86",
"avg": "79.0",
"fve": "78.0",
"low": "77",
"stock": "ADCP",
"per": "0.0",
"close": "78",
"trade_vol": "19784300",
"group": "IDXPROPERT",
"paid_up_cap_shares": "2000.0",
"trade_val": "1569762300",
"chg": "-3.49",
"change": "-3.0",
"peg_ratio": "0.0",
"eps": "65180.29",
"trade_freq": "1240",
"peg_analysis": "negative growth",
"board": "RG",
"open": "81"
}
}
]
And here's the result aggregation:
"aggregations": {
"group": {
"doc_count_error_upper_bound": 3,
"sum_other_doc_count": 757,
"buckets": [
{
"key": "ADCP",
"doc_count": 1,
"last_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": null,
"_source": {
"date": "2022-06-10 13:23:36",
"stock": "ADCP",
"close": "78"
},
"sort": [
1654867416000
]
}
]
}
},
"first_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "stocks",
"_id": "8odITIEBRQt2Zq4UUGu4",
"_score": null,
"_source": {
"date": "2022-06-10 13:23:36",
"stock": "ADCP",
"close": "78"
},
"sort": [
1654867416000
]
}
]
}
}
}
]
}
}

How can I search and return only nested documents in ElasticSearch?

Say I have the following author documents, each with book documents:
[
{
"name": "Foo McBarrington",
"books": [
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
},
{
"name": "Bar McFooington",
"books": [
{
"title": "Baz Book",
"published": "2020-06-23"
}
]
}
]
I would like to search for and return books, ignoring anything from the author documents, and sort the books by published field on each book. I should be able to get the books sorted relative to each other:
[
{
"title": "Foo Book",
"published": "2019-05-02"
},
{
"title": "Baz Book",
"published": "2020-06-23"
},
{
"title": "Bar Book",
"published": "2021-06-13"
}
]
Notice that the book from the second author is sorted in the middle of the two books from the first author.
Is this possible with ElasticSearch? So far I've tried using the top_hits aggregation in a nested aggregation but I'm not sure why it's not working.
Yes, it is possible. You can use a combination of nested aggregation, terms aggregation, and top hits aggregation to achieve your result
{
"size": 0,
"aggs": {
"resellers": {
"nested": {
"path": "books"
},
"aggs": {
"books": {
"terms": {
"field": "books.title.keyword"
},
"aggs": {
"latest_books": {
"top_hits": {
"sort": [
{
"books.published": {
"order": "asc"
}
}
],
"_source": {
"includes": [
"books.title",
"books.published"
]
},
"size": 1
}
}
}
}
}
}
}
}
The search result will be
"aggregations": {
"resellers": {
"doc_count": 3,
"books": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Bar Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 1
},
"_score": null,
"_source": {
"published": "2021-06-13", // note this
"title": "Bar Book"
},
"sort": [
1623542400000
]
}
]
}
}
},
{
"key": "Baz Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2020-06-23", // note this
"title": "Baz Book"
},
"sort": [
1592870400000
]
}
]
}
}
},
{
"key": "Foo Book",
"doc_count": 1,
"latest_books": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "68477157",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "books",
"offset": 0
},
"_score": null,
"_source": {
"published": "2019-05-02", // note this
"title": "Foo Book"
},
"sort": [
1556755200000
]
}
]
}
}
}
]
}
}
}
I found a way to get the books in a paginated format using a composite aggregation:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}
To get the following page, specify after in the composite aggregation:
{
"size": 0,
"aggs": {
"authors": {
"nested": {
"path": "books"
},
"aggs": {
"books_composite": {
"composite": {
"size": 25,
"after": {
"published": "<DATE HERE>",
"order": "desc"
},
"sources": [
{
"published": {
"terms": {
"field": "books.published",
"order": "desc"
}
}
}
]
}
}
}
}
}
}

Elastic Search - select DISTINCT value from aggregation result?

In Elasticsearch I have an index named Menu. Each Menu document has an array of Shops, something like this:
{
"menu_id": 1,
"name": 1,
"shops": [
{
"name": "A",
"shop_id: "A",
},
{
"name": "B",
"shop_id: "B",
}
]
}
{
"menu_id": 2,
"name": 2,
"shops": [
{
"name": "C",
"shop_id: "C",
}
]
}
{
"menu_id": 3,
"name": 3,
"shops": [
{
"name": "A",
"shop_id: "A",
}
]
}
{
"menu_id": 4,
"name": 4,
"shops": [
{
"name": "A",
"shop_id: "A",
},
{
"name": "C",
"shop_id: "C",
}
]
}
With my query I want to search for Shops that have the id "A" or "C". I want my result to look like this:
{
"name": "A",
"shop_id: "A",
},
{
"name": "C",
"shop_id: "C",
}
I tried with this query.
{
"_source": "shops",
"query": {
"bool": {
"should": [
{
"match": {
"shops.id": "A"
}
},
{
"match": {
"shops.id": "C"
}
}
]
}
},
"aggs": {
"all_shops": {
"terms": {
"field": "shops.id.keyword",
"min_doc_count": 1
},
"aggs": {
"real_shop": {
"top_hits": {
"_source": [
"shops"
],
"size": 1
}
}
}
}
}
}
And this query.
{
"_source": "shops",
"query": {
"bool": {
"should": [
{
"match": {
"shops.id": "A"
}
},
{
"match": {
"shops.id": "C"
}
}
]
}
},
"aggs": {
"messages": {
"filters": {
"filters": [
{
"match": {
"shops.id": "A"
}
},
{
"match": {
"shops.id": "C"
}
}
]
},
"aggs": {
"real_shop": {
"top_hits": {
"_source": [
"shops"
],
"size": 1
}
}
}
}
}
}
I still get many "A", "B" and "C" results.
How can I get "A" just once and "C" just once?
I cannot search the Shop index directly because I want to use information from Menu in the search.
The final query is "Search shops by the shop's name or the menu's name, with shop ids".
You need to make shops to be of the nested type, to query on each nested field object. You can use inner_hits to return documents that matched exactly with the query. Modify your index mapping as shown below
{
"mappings": {
"properties": {
"shops": {
"type": "nested"
}
}
}
}
Search Query:
{
"query": {
"nested": {
"path": "shops",
"query": {
"terms": {
"shops.shop_id.keyword": [
"A",
"C"
]
}
},
"inner_hits": {}
}
}
}
Search Result:
"hits": [
{
"_index": "66675093",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"menu_id": 1,
"name": 1,
"shops": [
{
"name": "A",
"shop_id": "A"
},
{
"name": "B",
"shop_id": "B"
}
]
},
"inner_hits": {
"shops": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66675093",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "shops",
"offset": 0
},
"_score": 1.0,
"_source": {
"name": "A", // note this
"shop_id": "A"
}
}
]
}
}
}
},
{
"_index": "66675093",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"menu_id": 1,
"name": 1,
"shops": [
{
"name": "C",
"shop_id": "C"
}
]
},
"inner_hits": {
"shops": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66675093",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "shops",
"offset": 0
},
"_score": 1.0,
"_source": {
"name": "C",
"shop_id": "C" // note this
}
}
]
}
}
}
}
]
UPDATE 1:
You can use a filter aggregation along with a nested aggregation to achieve your use case. Try the query below:
{
"size": 0,
"aggs": {
"NAME": {
"nested": {
"path": "shops"
},
"aggs": {
"NAME": {
"filter": {
"terms": {
"shops.shop_id.keyword": ["A","C"]
}
},
"aggs": {
"NAME": {
"terms": {
"field": "shops.shop_id.keyword"
},
"aggs": {
"top_sales_hits": {
"top_hits": {
"size": 1
}
}
}
}
}
}
}
}
}
}
Search Result will be
"aggregations": {
"NAME": {
"doc_count": 6,
"NAME": {
"doc_count": 5,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 3,
"top_sales_hits": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66675093",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "shops",
"offset": 0
},
"_score": 1.0,
"_source": {
"name": "A", // note this
"shop_id": "A"
}
}
]
}
}
},
{
"key": "C",
"doc_count": 2,
"top_sales_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66675093",
"_type": "_doc",
"_id": "2",
"_nested": {
"field": "shops",
"offset": 0
},
"_score": 1.0,
"_source": {
"name": "C", // note this
"shop_id": "C"
}
}
]
}
}
}
]
}
}
}
}

Create the Elastic search query to show Random 5 Questions by category

I have the fields Category and Questions in the table.
My requirement is to write an Elasticsearch query that, for the 3 categories listed below, returns the questions tagged with each category (so I want both the Category and Questions fields in the result).
Category :
OLA
BNA
DRG
GET logstash-sdc-feedback/_search? { "_source":["Category.keyword"], "size": 5, "query":{ "bool": { "must": [ {"match":{"Category.keyword"": "OLA","BNA","DRG"}}
],
}
}, "aggs": { "MyBuckets": { "terms": { "field": "questions.keyword","Category.keyword" "order":{ "_count": "asc" }, "size": "5"
} } } }
You can use terms query along with terms aggregation, to achieve your use case.
Adding a working example
Index Data:
{
"category": "XYZ",
"question": "d"
}
{
"category": "OLA",
"question": "a"
}
{
"category": "BNA",
"question": "b"
}
{
"category": "DRG",
"question": "c"
}
Search Query:
{
"query": {
"bool": {
"must": {
"terms": {
"category.keyword": [
"OLA",
"BNA",
"DRG"
]
}
}
}
},
"aggs": {
"top_tags": {
"terms": {
"field": "category.keyword"
},
"aggs": {
"top_faq_hits": {
"top_hits": {
"_source": {
"includes": [
"question"
]
},
"size": 1
}
}
}
}
}
}
Search Result:
"aggregations": {
"top_tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "BNA", // note this
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"question": "b" // note this
}
}
]
}
}
},
{
"key": "DRG",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"question": "c"
}
}
]
}
}
},
{
"key": "OLA",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "65566020",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"question": "a"
}
}
]
}
}
}
]
}
}

Resources