I have an elasticsearch index with this simplified structure:
{
"id": "group1",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user2"
}
]
},
{
"id": "group2",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
}
]
},
{
"id": "group3",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
}
]
}
I need to get the number of documents where each user appears. Something like this:
[
{
"key": "user1",
"doc_count": 3
},
{
"key": "user2",
"doc_count": 1
},
{
"key": "user3",
"doc_count": 2
}
]
You need to use a nested aggregation together with the terms aggregation.
Adding a working example with index mapping, search query, and search result
Index Mapping:
{
"mappings":{
"properties":{
"users":{
"type":"nested"
}
}
}
}
Search Query:
{
"size":0,
"aggs": {
"resellers": {
"nested": {
"path": "users"
},
"aggs": {
"unique_user": {
"terms": {
"field": "users.user_id.keyword"
}
}
}
}
}
}
Search Result:
"aggregations": {
"resellers": {
"doc_count": 6,
"unique_user": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user1",
"doc_count": 3
},
{
"key": "user3",
"doc_count": 2
},
{
"key": "user2",
"doc_count": 1
}
]
}
}
}
Related
I need to order the results of a composite aggregation, but the value to order by is the sum of a specific field (my index is very large, so I need the composite aggregation to paginate the values).
When I send this GET:
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "date": { "date_histogram": { "field": "timestamp", "interval": "1d"} } },
{ "product": { "terms": {"field": "product" } } }
]
},
"aggregations": {
"the_sum": {
"sum": { "field": "price" } <--- i want order by this field aggregation
}
}
}
}
}
How can I get this response (ordered by the sum of each price)?
{
...
"aggregations": {
"my_buckets": {
"after_key": {
"date": 1494374400000,
"product": "mad max"
},
"buckets": [
{
"key": {
"date": 1494460800000,
"product": "apocalypse now"
},
"doc_count": 1,
"the_sum": {
"value": 10.0
}
},
{
"key": {
"date": 1494288000000,
"product" : "mad max"
},
"doc_count": 2,
"the_sum": {
"value": 22.5
}
},
{
"key": {
"date": 1494374400000,
"product": "mad max"
},
"doc_count": 1,
"the_sum": {
"value": 290.0
}
}
]
}
}
}
I stored the result of each game as a doc. The players and their scores were stored in users and scores arrays.
Sample data :
[
{
"gameId": "game01",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-11T17:00:00.000Z",
"scores": [
4,
1
]
},
{
"gameId": "game02",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-12T17:00:00.000Z",
"scores": [
3,
1
]
},
{
"gameId": "game02",
"users": [
"user02",
"user03"
],
"#timestamp": "2022-08-12T18:00:00.000Z",
"scores": [
2,
4
]
}
]
I expected to use the below query to aggregate the daily total scores of each game of users:
{
"aggs": {
"aggByDate": {
"date_histogram": {
"field": "#timestamp",
"interval": "1d",
"time_zone": "+8",
"min_doc_count": 1
},
"aggs": {
"aggByGame": {
"terms": {
"field": "gameId"
},
"aggs": {
"aggByUser": {
"terms": {
"field": "users"
},
"aggs": {
"totalScore": {
"sum": {
"script": {
"source": """
String targetUser = params.key; <--- I don't know how to get the key here
int i = 0;
for (def user: doc.users) {
if (user == targetUser) break;
i++;
}
return doc.scores[i];
"""
}
}
}
}
}
}
}
}
}
}
}
Expected result:
{
"aggregations": {
"aggByDate": {
"buckets": [
{
"key_as_string": "2022-08-11T00:00:00.000+08:00",
"doc_count": 1,
"aggByGame": {
"buckets": [
{
"key": "game01",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01", <--- this is the value I want to inject into the script
"doc_count": 1,
"totalScore": {
"value": 4
}
},
{
"key": "user02",
"doc_count": 1,
"totalScore": {
"value": 1
}
}
]
}
}
]
}
},
{
"key_as_string": "2022-08-12T00:00:00.000+08:00",
"doc_count": 2,
"aggByGame": {
"buckets": [
{
"key": "game02",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01",
"doc_count": 1,
"totalScore": {
"value": 3
}
},
{
"key": "user02",
"doc_count": 2,
"totalScore": {
"value": 3
}
},
{
"key": "user03",
"doc_count": 1,
"totalScore": {
"value": 4
}
}
]
}
}
]
}
}
]
}
}
}
But since I don't know the userId before the query, I don't know how to make Elasticsearch inject params.key into the script in the middle of the query.
I also referenced the issue here. It seems it's not possible to access the parent aggregation's data from sub-aggs. Is there another workaround? Thanks!!
(I use ElasticSearch v7.10)
I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all documents in the index like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data, sorting by result_nums DESC and removing duplicate "content". For example, I used the query like this to get the first two data
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
So I think ES needs to sort before aggregating; right now it only sorts after the aggregation, so I get results that do not match my expectations.
I tried to use sort before the aggregation, but it had no effect:
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how do I sort before aggregation?
You need to use a max aggregation along with a terms aggregation (ordering the terms buckets by the max value) to get the data sorted by result_nums DESC with duplicate "content" removed.
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}
I have the following documents in the elastic search index.
[{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_en-us",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "hi-in",
"user": "aadab2f531206e9d",
"version": 1
},
{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "en-us",
"user": "aadab2f531206e9d",
"version": 1
}
],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1",
"locale": "en-us",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:36:43.037Z",
"fields": []
}
},
{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_mr-in",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:37:26.205Z",
"locale": "mr-in",
"user": "aadab2f531206e9d",
"version": 1
}],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1 marathi",
"locale": "mr-in",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:37:20.092Z",
"fields": []
}
}
]
And I want a blank result ([]) from this, because, as we can see here, the uid of both documents is the same. I am using the following query to get the result:
{
"query": {
"bool": {
"must": [{
"bool": {
"must_not": [{
"bool": {
"must": [{
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.environment": "603fe91adbdcff66"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "en-us"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "hi-in"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "mr-in"
}
}
}
}]
}
}]
}
}]
}
}
}
But the above query gives me both documents, whereas I want a blank result. The reason is that the uid is common to both documents and, across them, that uid contains all three locales in its publish details. So is there a way to get a valid result? Is there any aggregation query that helps here? This is just a sample; I have many documents to filter. Kindly help me here.
{
"aggs": {
"agg1": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"agg2": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
This query will group by uid first, then by publish_details.locale.
It provides results as below:
"aggregations": {
"agg1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"agg2": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"agg2": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
}
}
}
]
I have three docs, where two have the same uid and the other one is different.
I will update the query further to remove the first result, where you have 3 buckets. You can also handle that in your code.
You can do that. 10k documents is fine. But when you have millions of documents, you need enough resources to execute this.
{
"size" : 0,
"query":{
"bool" :{
"must_not":{
"match":{
"publish_details.environment":"603fe91adbdcff66"
}
}
}
},
"aggs": {
"uids": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"details": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
},
"unique_locales": {
"value_count": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
Result:
"aggregations": {
"uids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"details": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 3
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"details": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 2
}
}
}
]
I need to aggregate the following documents:
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
by actor, and would like to get the following structure:
[
{"name": "Christian Bale"},
{"movies": [
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6"
},
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0"
}, ...
]
Beyond using a standard terms aggregation based on the casting.name field, how can I retrieve the releaseDate and imdbRate of the related documents?
For each actor, I also need movies to be sorted by releaseDate asc.
Can I perform this using one single request?
As you have an array of casting objects in your documents you'll need to use the nested type in your mapping. To get the aggregations you want you need a combination of Terms Aggregations, Nested Aggregations and Reverse Nested Aggregations. Below is an example.
Create an index with the mapping:
POST /test
{
"mappings": {
"movie": {
"properties": {
"title": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"releaseDate": {
"type": "string",
"index": "not_analyzed"
},
"casting": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"category": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
}
Index the documents:
POST /test/movie/1
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
POST /test/movie/2
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
And finally search:
POST /test/movie/_search?search_type=count
{
"aggs": {
"nested_path": {
"nested": {
"path": "casting"
},
"aggs": {
"actor_name": {
"terms": {
"field": "casting.name.raw"
},
"aggs": {
"movies": {
"reverse_nested": {},
"aggs": {
"movie_title": {
"terms": {
"field": "title.raw"
},
"aggs": {
"release_date": {
"terms": {
"field": "releaseDate"
}
},
"imdbRate_date": {
"terms": {
"field": "imdbRate"
}
}
}
}
}
}
}
}
}
}
}
}
The response for Christian Bale is:
{
"key": "Christian Bale",
"doc_count": 2,
"movies": {
"doc_count": 2,
"movie_title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "American Psycho",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7/06/2000",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7.6",
"doc_count": 1
}
]
}
},
{
"key": "The Dark Knight",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "13/08/2008",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9.0",
"doc_count": 1
}
]
}
}
]
}
}
}