Elasticsearch - Count occurrences of elements in nested field

Elasticsearch - Count occurrences of elements in nested field - elasticsearch

I have an elasticsearch index with this simplified structure:
{
"id": "group1",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user2"
}
]
},
{
"id": "group2",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
},
]
},
{
"id": "group3",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
},
]
}
I need to get the number of documents where each user appears. Something like this:
[
{
"key": "user1",
"doc_count": 3
},
{
"key": "user2",
"doc_count": 1
},
{
"key": "user3",
"doc_count: 2
}
]

You need to use nested aggregation with the terms
aggregation
Adding a working example with index mapping, search query, and search result
Index Mapping:
{
"mappings":{
"properties":{
"users":{
"type":"nested"
}
}
}
}
Search Query:
{
"size":0,
"aggs": {
"resellers": {
"nested": {
"path": "users"
},
"aggs": {
"unique_user": {
"terms": {
"field": "users.user_id.keyword"
}
}
}
}
}
}
Search Result:
"aggregations": {
"resellers": {
"doc_count": 6,
"unique_user": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user1",
"doc_count": 3
},
{
"key": "user3",
"doc_count": 2
},
{
"key": "user2",
"doc_count": 1
}
]
}
}
}

Related

Order by sub-aggregation in composite aggregation (Elasticsearch)

I need to ordenate the results of an composite aggregation, but the value to be orderned is the sum of a specific field (my index is so much larger, so i need the composite for paginate values).
When send this GET:
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "date": { "date_histogram": { "field": "timestamp", "interval": "1d"} } },
{ "product": { "terms": {"field": "product" } } }
]
},
"aggregations": {
"the_sum": {
"sum": { "field": "price" } <--- i want order by this field aggregation
}
}
}
}
}
How can i get this response? (order by sum of each price)
{
...
"aggregations": {
"my_buckets": {
"after_key": {
"date": 1494374400000,
"product": "mad max"
},
"buckets": [
{
"key": {
"date": 1494460800000,
"product": "apocalypse now"
},
"doc_count": 1,
"the_sum": {
"value": 10.0
}
},
{
"key": {
"date": 1494288000000,
"product" : "mad max"
},
"doc_count": 2,
"the_sum": {
"value": 22.5
}
},
{
"key": {
"date": 1494374400000,
"product": "mad max"
},
"doc_count": 1,
"the_sum": {
"value": 290.0
}
}
]
}
}
}

Elasticsearch use the key of term aggregation as the input in script

I stored the result of each game as a doc. The players and their scores were stored in users and scores arrays.
Sample data :
[
{
"gameId": "game01",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-11T17:00:00.000Z",
"scores": [
4,
1
]
},
{
"gameId": "game02",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-12T17:00:00.000Z",
"scores": [
3,
1
]
},
{
"gameId": "game02",
"users": [
"user02",
"user03"
],
"#timestamp": "2022-08-12T18:00:00.000Z",
"scores": [
2,
4
]
}
]
I expected to use the below query to aggregate the daily total scores of each game of users:
{
"aggs": {
"aggByDate": {
"date_histogram": {
"field": "#timestamp",
"interval": "1d",
"time_zone": "+8",
"min_doc_count": 1
},
"aggs": {
"aggByGame": {
"terms": {
"field": "gameId"
},
"aggs": {
"aggByUser": {
"terms": {
"field": "users"
},
"aggs": {
"totalScore": {
"sum": {
"script": {
"source": """
String targetUser = params.key; <--- I don't know how to get the key here
int i = 0;
for (def user: doc.users) {
if (user == targetUser) break;
i++;
}
return doc.scores[i];
"""
}
}
}
}
}
}
}
}
}
}
}
Expected result:
{
"aggregations": {
"aggByDate": {
"buckets": [
{
"key_as_string": "2022-08-11T00:00:00.000+08:00",
"doc_count": 1,
"aggByGame": {
"buckets": [
{
"key": "game01",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01", <--- this is the value I want to inject into the script
"doc_count": 1,
"totalScore": {
"value": 4
}
},
{
"key": "user02",
"doc_count": 1,
"totalScore": {
"value": 1
}
}
]
}
}
]
}
},
{
"key_as_string": "2022-08-12T00:00:00.000+08:00",
"doc_count": 2,
"aggByGame": {
"buckets": [
{
"key": "game02",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01",
"doc_count": 1,
"totalScore": {
"value": 3
}
},
{
"key": "user02",
"doc_count": 2,
"totalScore": {
"value": 3
}
},
{
"key": "user03",
"doc_count": 1,
"totalScore": {
"value": 4
}
}
]
}
}
]
}
}
]
}
}
}
But since I don't know the userId before the query, I don't know where can I let elasticsearch inject the params.key into the script in the middle of the query.
I also referenced the issue here. Seems it's not possible to access the aggregation data for sub-aggs. Do we have another workaround here? Thanks!!
(I use ElasticSearch v7.10)

How to sort before aggregation in Elasticsearch?

I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all documents in the index like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data, sorting by result_nums DESC and removing duplicate "content". For example, I used the query like this to get the first two data
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
so I think es needs to be sorted before aggregation, because now it will only be sorted after aggregation, so I got results that did not match expectations.
I tried to use sort before aggregation but no effect
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how to do sort before aggregation?

You need to use max aggregation along with term query to get the data, sorting by result_nums DESC and removing duplicate "content"
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}

Complex Elastic Search Query

I have the following documents in the elastic search index.
[{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_en-us",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "hi-in",
"user": "aadab2f531206e9d",
"version": 1
},
{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "en-us",
"user": "aadab2f531206e9d",
"version": 1
}
],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1",
"locale": "en-us",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:36:43.037Z",
"fields": []
}
},
{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_mr-in",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:37:26.205Z",
"locale": "mr-in",
"user": "aadab2f531206e9d",
"version": 1
}],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1 marathi",
"locale": "mr-in",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:37:20.092Z",
"fields": []
}
}
]
And I want Result [] blank from this. As here we can see that uid of both the documents is the same. I am using the following query to get result :
{
"query": {
"bool": {
"must": [{
"bool": {
"must_not": [{
"bool": {
"must": [{
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.environment": "603fe91adbdcff66"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "en-us"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "hi-in"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "mr-in"
}
}
}
}]
}
}]
}
}]
}
}
}
But the above query gives me all 2 documents, but I want results as bank the reason here is here uid is common and that uid contains all three local in publishing details. So is way to get a valid result, Is any aggregation query that helps me here. it is just a sample I have so many documents to filter out. Kindle Helps me here.

{
"aggs": {
"agg1": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"agg2": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
This query will group you by uid first then publish_details.locale
It provides results as below
"aggregations": {
"agg1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"agg2": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"agg2": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
}
}
}
]
I have three docs where two has same id and other one is different.
I will update the query further to remove the first result where you have 3 buckets. You also can proceed further to handle it in the code.
You can do that. 10k documents is fine. But when you have in millions, you should have enough resources to execute this.
{
"size" : 0,
"query":{
"bool" :{
"must_not":{
"match":{
"publish_details.environment":"603fe91adbdcff66"
}
}
}
},
"aggs": {
"uids": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"details": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
},
"unique_locales": {
"value_count": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
Result:
"aggregations": {
"uids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"details": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 3
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"details": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 2
}
}
}
]

ElasticSearch - Aggregations with document details

I need to aggregate the following documents:
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
by actor, and would like to get the following structure:
[
{"name": "Christian Bale"},
{"movies": [
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6"
},
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0"
}, ...
]
Beyong using a standard term aggregation based on the casting.name field, how can I retrieve the releaseDate and imdbRate of the related documents?
For each actor, I also need movies to be sorted by releaseDate asc.
Can I perform this using one single request?

As you have an array of casting objects in your documents you'll need to use the nested type in your mapping. To get the aggregations you want you need a combination of Terms Aggregations, Nested Aggregations and Reverse Nested Aggregations. Below is an example.
Create and index with the mapping:
POST /test
{
"mappings": {
"movie": {
"properties": {
"title": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"releaseDate": {
"type": "string",
"index": "not_analyzed"
},
"casting": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"category": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
}
Index the documents:
POST /test/movie/1
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
POST /test/movie/2
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
And finally search:
POST /test/movie/_search?search_type=count
{
"aggs": {
"nested_path": {
"nested": {
"path": "casting"
},
"aggs": {
"actor_name": {
"terms": {
"field": "casting.name.raw"
},
"aggs": {
"movies": {
"reverse_nested": {},
"aggs": {
"movie_title": {
"terms": {
"field": "title.raw"
},
"aggs": {
"release_date": {
"terms": {
"field": "releaseDate"
}
},
"imdbRate_date": {
"terms": {
"field": "imdbRate"
}
}
}
}
}
}
}
}
}
}
}
}
The response for Christian Bale is:
{
"key": "Christian Bale",
"doc_count": 2,
"movies": {
"doc_count": 2,
"movie_title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "American Psycho",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7/06/2000",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7.6",
"doc_count": 1
}
]
}
},
{
"key": "The Dark Knight",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "13/08/2008",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9.0",
"doc_count": 1
}
]
}
}
]
}
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Elasticsearch - Count occurrences of elements in nested field - elasticsearch

Related

Order by sub-aggregation in composite aggregation (Elasticsearch)

Elasticsearch use the key of term aggregation as the input in script

How to sort before aggregation in Elasticsearch?

Complex Elastic Search Query

ElasticSearch - Aggregations with document details

Categories

Resources