Elasticsearch: use the key of a terms aggregation as the input to a script - elasticsearch

I stored the result of each game as a doc. The players and their scores were stored in users and scores arrays.
Sample data :
[
{
"gameId": "game01",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-11T17:00:00.000Z",
"scores": [
4,
1
]
},
{
"gameId": "game02",
"users": [
"user01",
"user02"
],
"#timestamp": "2022-08-12T17:00:00.000Z",
"scores": [
3,
1
]
},
{
"gameId": "game02",
"users": [
"user02",
"user03"
],
"#timestamp": "2022-08-12T18:00:00.000Z",
"scores": [
2,
4
]
}
]
I want to use the query below to aggregate the daily total score of each user in each game:
{
"aggs": {
"aggByDate": {
"date_histogram": {
"field": "#timestamp",
"interval": "1d",
"time_zone": "+8",
"min_doc_count": 1
},
"aggs": {
"aggByGame": {
"terms": {
"field": "gameId"
},
"aggs": {
"aggByUser": {
"terms": {
"field": "users"
},
"aggs": {
"totalScore": {
"sum": {
"script": {
"source": """
String targetUser = params.key; <--- I don't know how to get the key here
int i = 0;
for (def user: doc.users) {
if (user == targetUser) break;
i++;
}
return doc.scores[i];
"""
}
}
}
}
}
}
}
}
}
}
}
Expected result:
{
"aggregations": {
"aggByDate": {
"buckets": [
{
"key_as_string": "2022-08-11T00:00:00.000+08:00",
"doc_count": 1,
"aggByGame": {
"buckets": [
{
"key": "game01",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01", <--- this is the value I want to inject into the script
"doc_count": 1,
"totalScore": {
"value": 4
}
},
{
"key": "user02",
"doc_count": 1,
"totalScore": {
"value": 1
}
}
]
}
}
]
}
},
{
"key_as_string": "2022-08-12T00:00:00.000+08:00",
"doc_count": 2,
"aggByGame": {
"buckets": [
{
"key": "game02",
"doc_count": 1,
"aggByUser": {
"buckets": [
{
"key": "user01",
"doc_count": 1,
"totalScore": {
"value": 3
}
},
{
"key": "user02",
"doc_count": 2,
"totalScore": {
"value": 3
}
},
{
"key": "user03",
"doc_count": 1,
"totalScore": {
"value": 4
}
}
]
}
}
]
}
}
]
}
}
}
But since I don't know the user IDs before running the query, I don't know how to make Elasticsearch inject the current bucket key into the script as params.key in the middle of the query.
I also looked at the related issue here. It seems it's not possible to access the aggregation data from sub-aggregations. Is there another workaround? Thanks!
(I use ElasticSearch v7.10)

Related

Getting the count of customers for each version, only including the highest version a customer has

I have an aggregation to get the count of customers for each version:
{
"aggs": {
"2": {
"terms": {
"field": "version.string.keyword",
"order": {
"_key": "desc"
},
"size": 50
},
"aggs": {
"1": {
"cardinality": {
"field": "orgId.keyword"
}
}
}
}
}
The problem with this is that if a customer has two versions running at the same time, the customer will be included in both versions. What I need is for the customer to be included only in the highest version. For example, if I've got documents:
{
"orgId": "A",
"version": {
"string": "1.1",
"major": 1,
"minor": 1
}
}
{
"orgId": "A",
"version": {
"string": "1.2",
"major": 1,
"minor": 2
}
}
{
"orgId": "B",
"version": {
"string": "1.1",
"major": 1,
"minor": 1
}
}
The response should be:
[
{
"1": {
"value": 1
},
"key": "1.1"
},
{
"1": {
"value": 1
},
"key": "1.2"
}
]
instead of:
[
{
"1": {
"value": 2
},
"key": "1.1"
},
{
"1": {
"value": 1
},
"key": "1.2"
}
]
I've tried this query which correctly returns highest version for each customer:
{
"aggs": {
"2": {
"terms": {
"field": "orgId.keyword",
"order": {
"_key": "desc"
},
"size": 50
},
"aggs": {
"sorted_version": {
"top_hits": {
"sort": [
{
"version.major": {
"order": "desc"
},
"version.minor": {
"order": "desc"
}
}
],
"_source": {
"includes": [
"version.string"
]
},
"size": 1
}
}
}
}
}
}
I'm kinda lost now on how to combine these two queries, any help would be appreciated.
Does this result help you?
{
"size": 0,
"aggs": {
"group_by_version_string": {
"terms": {
"field": "version.string.keyword",
"order": {
"_key": "desc"
}
},
"aggs": {
"group_by_orgId": {
"terms": {
"field": "orgId.keyword",
"order": {
"_key": "desc"
}
}
}
}
}
}
}
Response
"buckets": [
{
"key": "1.2",
"doc_count": 1,
"group_by_orgId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 1
}
]
}
},
{
"key": "1.1",
"doc_count": 2,
"group_by_orgId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "B",
"doc_count": 1
},
{
"key": "A",
"doc_count": 1
}
]
}
}
]

Elasticsearch - Count occurrences of elements in nested field

I have an elasticsearch index with this simplified structure:
{
"id": "group1",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user2"
}
]
},
{
"id": "group2",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
}
]
},
{
"id": "group3",
"users": [
{
"user_id": "user1"
},
{
"user_id": "user3"
}
]
}
I need to get the number of documents where each user appears. Something like this:
[
{
"key": "user1",
"doc_count": 3
},
{
"key": "user2",
"doc_count": 1
},
{
"key": "user3",
"doc_count": 2
}
]
You need to use a nested aggregation together with a terms aggregation.
Adding a working example with index mapping, search query, and search result
Index Mapping:
{
"mappings":{
"properties":{
"users":{
"type":"nested"
}
}
}
}
Search Query:
{
"size":0,
"aggs": {
"resellers": {
"nested": {
"path": "users"
},
"aggs": {
"unique_user": {
"terms": {
"field": "users.user_id.keyword"
}
}
}
}
}
}
Search Result:
"aggregations": {
"resellers": {
"doc_count": 6,
"unique_user": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user1",
"doc_count": 3
},
{
"key": "user3",
"doc_count": 2
},
{
"key": "user2",
"doc_count": 1
}
]
}
}
}

Reaggregate on nested aggregation results using Elasticsearch

I want to compute some aggregations (using Elasticsearch 6.2) on products that have criteria. All the criteria are flattened and I want to reuse some aggregation results to reaggregate by a specific criterion.
Here is my index mapping:
PUT my_index
{
"mappings" : {
"_doc" : {
"properties" : {
"contract": {
"properties": {
"products": {
"type": "nested",
"properties": {
"productKey": {
"type": "keyword"
},
"criteria": {
"type": "nested",
"properties": {
"criterionKey": {
"type": "keyword"
},
"criterionValue": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
}
}
I populated my index with the following data:
POST my_index/_doc
{
"contract": {
"products": [
{
"productKey": "PK_0001",
"criteria": [
{
"criterionKey": "CK_AAAA",
"criterionValue": "above_50"
},
{
"criterionKey": "CK_AAAB",
"criterionValue": "all"
}
]
}
]
}
}
POST my_index/_doc
{
"contract": {
"products": [
{
"productKey": "PK_0001",
"criteria": [
{
"criterionKey": "CK_AAAA",
"criterionValue": "below_50"
},
{
"criterionKey": "CK_AAAB",
"criterionValue": "dep"
}
]
}
]
}
}
POST my_index/_doc
{
"contract": {
"products": [
{
"productKey": "PK_0002",
"criteria": [
{
"criterionKey": "CK_AAAA",
"criterionValue": "below_50"
},
{
"criterionKey": "CK_AAAB",
"criterionValue": "dep"
}
]
}
]
}
}
I am able to count the occurrences of all criterion values per product. To do so, I use the following aggregation request:
POST my_index/_doc/_search
{
"size": 0,
"aggs": {
"agg_by_product": {
"nested": {
"path": "contract.products"
},
"aggs": {
"agg_by_product_key": {
"terms": {
"field": "contract.products.productKey"
},
"aggs": {
"agg_by_product_crit": {
"nested": {
"path": "contract.products.criteria"
},
"aggs": {
"agg_by_product_crit_key": {
"terms": {
"field": "contract.products.criteria.criterionKey",
"include": [ "CK_AAAB", "CK_AAAA" ]
},
"aggs": {
"agg_by_product_crit_value": {
"terms": {
"field": "contract.products.criteria.criterionValue"
}
}
}
}
}
}
}
}
}
}
}
}
It returns:
{
// ...
"aggregations": {
"agg_by_product": {
"doc_count": 3,
"agg_by_product_key": {
"buckets": [
{
"key": "PK_0001",
"doc_count": 2,
"agg_by_product_crit": {
"doc_count": 8,
"agg_by_product_crit_key": {
"buckets": [
{
"key": "CK_AAAB",
"doc_count": 2,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "dep",
"doc_count": 1
},
{
"key": "all",
"doc_count": 1
}
]
}
},
{
"key": "CK_AAAA",
"doc_count": 2,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "below_50",
"doc_count": 1
},
{
"key": "above_50",
"doc_count": 1
}
]
}
}
]
}
}
},
{
"key": "PK_0002",
"doc_count": 1,
"agg_by_product_crit": {
"doc_count": 4,
"agg_by_product_crit_key": {
"buckets": [
{
"key": "CK_AAAB",
"doc_count": 1,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "dep",
"doc_count": 1
}
]
}
},
{
"key": "CK_AAAA",
"doc_count": 1,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "below_50",
"doc_count": 1
}
]
}
}
]
}
}
}
]
}
}
}
}
Now I would like to aggregate by criterion values of a specified criterion key, in order to get something like this:
{
// ...
"aggregations": {
"agg_by_product": {
"doc_count": 3,
"agg_by_product_key": {
"buckets": [
{
"key": "PK_0001",
"doc_count": 2,
"agg_by_product_crit": {
"doc_count": 8,
"agg_by_product_crit_key": {
"buckets": [
{
"key": "CK_AAAB",
"doc_count": 2,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "dep",
"doc_count": 1,
"AGG_BY_SOMETHING": {
"buckets": [
{
"key": "CK_AAAA",
"doc_count": 1,
"AGG_BY_SOMETHING_2": {
"buckets": [
{
"key": "below_50",
"doc_count": 1
}
]
}
}
]
}
},
{
"key": "all",
"doc_count": 1,
"AGG_BY_SOMETHING": {
"buckets": [
{
"key": "CK_AAAA",
"doc_count": 1,
"AGG_BY_SOMETHING_2": {
"buckets": [
{
"key": "above_50",
"doc_count": 1
}
]
}
}
]
}
}
]
}
}
]
}
}
},
{
"key": "PK_0002",
"doc_count": 1,
"agg_by_product_crit": {
"doc_count": 4,
"agg_by_product_crit_key": {
"buckets": [
{
"key": "CK_AAAB",
"doc_count": 1,
"agg_by_product_crit_value": {
"buckets": [
{
"key": "dep",
"doc_count": 1,
"AGG_BY_SOMETHING": {
"buckets": [
{
"key": "CK_AAAA",
"doc_count": 1,
"AGG_BY_SOMETHING_2": {
"buckets": [
{
"key": "below_50",
"doc_count": 1
}
]
}
}
]
}
}
]
}
}
]
}
}
}
]
}
}
}
}
What should be the corresponding aggregation request?
Finally I found a solution using a reverse_nested aggregation.
POST my_index/_doc/_search
{
"size": 0,
"aggs": {
"agg_by_product": {
"nested": {
"path": "contract.products"
},
"aggs": {
"agg_by_product_key": {
"terms": {
"field": "contract.products.productKey"
},
"aggs": {
"agg_by_product_crit": {
"nested": {
"path": "contract.products.criteria"
},
"aggs": {
"agg_by_product_crit_key": {
"terms": {
"field": "contract.products.criteria.criterionKey",
"include": [ "CK_AAAB" ]
},
"aggs": {
"agg_by_product_crit_value": {
"terms": {
"field": "contract.products.criteria.criterionValue"
},
"aggs": {
"agg_back_to_root": {
"reverse_nested": {},
"aggs": {
"agg_by_product_crit2": {
"nested": {
"path": "contract.products.criteria"
},
"aggs": {
"agg_by_product_crit_key2": {
"terms": {
"field": "contract.products.criteria.criterionKey",
"include": [ "CK_AAAA" ]
},
"aggs": {
"agg_by_product_crit_value2": {
"terms": {
"field": "contract.products.criteria.criterionValue"
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}

Get all documents of having unique value with elastic

For example, I have many documents like this:
email status
1#123.com open
1#123.com click
2#123.com open
3#123.com open
I want to query all documents whose email has only the status value "open". Because the email "1#123.com" also has a "click" status, "1#123.com" should not be returned.
I tried the query below, but it does not return what I expect:
{
"aggs": {
"hard_bounce_count": {
"filter": {
"term": {
"actionStatus": "open"
}
},
"aggs": {
"email_count": {
"value_count": {
"field": "email"
}
}
}
}
}
}
My expected response looks like this:
2#123.com open
3#123.com open
How can I do this? Thanks.
Here, the outer terms aggregation (named EMAIL_LIST) returns all emails. Within each email bucket, it first counts the documents whose status is "open" (using a filter aggregation named OPEN), and then counts the documents whose status is anything other than "open" (using another filter aggregation named OTHER_THAN_OPEN).
{
"size": 0,
"aggs": {
"EMAIL_LIST": {
"terms": {
"field": "email.keyword"
},
"aggs": {
"OPEN": {
"filter": {
"bool": {
"must": [
{
"term": {
"status": "open"
}
}
]
}
}
},
"OTHER_THAN_OPEN": {
"filter": {
"bool": {
"must_not": [
{
"term": {
"status": "open"
}
}
]
}
}
},
"SELECTION_SCRIPT": {
"bucket_selector": {
"buckets_path": {
"open_count": "OPEN._count",
"other_than_open_count": "OTHER_THAN_OPEN._count"
},
"script": "params.other_than_open_count==0 && params.open_count>0"
}
}
}
}
}
}
The "bucket_selector" aggregation above keeps only those buckets in which every document has the "open" status:
"aggregations": {
"EMAIL_LIST": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "2#123.com",
"doc_count": 1,
"OTHER_THAN_OPEN": {
"doc_count": 0
},
"OPEN": {
"doc_count": 1
}
},
{
"key": "3#123.com",
"doc_count": 1,
"OTHER_THAN_OPEN": {
"doc_count": 0
},
"OPEN": {
"doc_count": 1
}
}
]
}
}
So the final answer will be the emails "2#123.com" and "3#123.com".
I can query this too.
{
"aggs": {
"email": {
"terms": {
"field": "email"
},
"aggs": {
"status_group": {
"terms": {
"field": "status"
}
}
}
}
}
}
response:
"aggregations": {
"email": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "1#123.com",
"doc_count": 2,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "click",
"doc_count": 1
}, {
"key": "open",
"doc_count": 1
}
]
}
}, {
"key": "2#123.com",
"doc_count": 1,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "open",
"doc_count": 1
}
]
}
}, {
"key": "3#123.com",
"doc_count": 1,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "open",
"doc_count": 1
}
]
}
}
]
}
}
But how can I exclude "1#123.com" from the resulting buckets? I eventually need the statistics of all eligible documents.

Elasticsearch range bucket aggregation based on doc_count

I have an elasticsearch aggregation query like this.
{
"aggs": {
"customer": {
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
}
},
"terms": {
"field": "org",
"size": 50000
}
}
}
}
And it results in bucket aggregation like following
{
"aggregations": {
"customer": {
"buckets": [
{
"Total_Sale": { "value": 9999 },
"doc_count": 8,
"key": "cats"
},
{
"Total_Sale": { "value": 8888 },
"doc_count": 6,
"key": "tigers"
},
{
"Total_Sale": { "value": 444},
"doc_count": 5,
"key": "lions"
},
{
"Total_Sale": { "value": 555 },
"doc_count": 2,
"key": "wolves"
}
]
}
}
}
I want another range bucket aggregation based on doc_count. So, final result required is
{
"buckets": [
{
"Sum_of_Total_Sale": 555, // If I can form bucket, I can get this using sum_bucket. So, getting bucket is important.
"Sum_of_doc_count": 2,
"doc_count": 1,
"key": "*-3",
"to": 3.0
},
{
"Sum_of_Total_Sale": 9332,
"Sum_of_doc_count": 11,
"doc_count": 2,
"from": 4.0,
"key": "4-6",
"to": 6.0
},
{
"Sum_of_Total_Sale": 9999,
"Sum_of_doc_count": 8,
"doc_count": 1,
"from": 7.0,
"key": "7-*"
}
]
}
Bucket Selector Aggregation and then using bucket sum aggregation will not work because there is more than one key for range.
Bucket Script Aggregation does calculation within bucket.
Can I add scripted doc field for each document which help me to create these buckets?
There's no aggregation that I know of that allows you to do this in one shot. However, there is one technique that I use from time to time to overcome this limitation. The idea is to repeat the same terms/sum aggregation and then use a bucket_selector pipeline aggregation for each of the ranges you're interested in.
POST index/_search
{
"size": 0,
"aggs": {
"*-3": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"*-3": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount <= 3"
}
}
}
},
"*-3_Total_Sales": {
"sum_bucket": {
"buckets_path": "*-3>Total_Sale"
}
},
"*-3_Total_Docs": {
"sum_bucket": {
"buckets_path": "*-3>_count"
}
},
"4-6": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"4-6": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 4 && params.docCount <= 6"
}
}
}
},
"4-6_Total_Sales": {
"sum_bucket": {
"buckets_path": "4-6>Total_Sale"
}
},
"4-6_Total_Docs": {
"sum_bucket": {
"buckets_path": "4-6>_count"
}
},
"7-*": {
"terms": {
"field": "org",
"size": 1000
},
"aggs": {
"Total_Sale": {
"sum": {
"field": "amount"
}
},
"7-*": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 7"
}
}
}
},
"7-*_Total_Sales": {
"sum_bucket": {
"buckets_path": "7-*>Total_Sale"
}
},
"7_*_Total_Docs": {
"sum_bucket": {
"buckets_path": "7-*>_count"
}
}
}
}
You'll get an answer that looks like this, which contains exactly the figures you're looking for in the xyz_Total_Sales and xyz_Total_Docs results:
"aggregations": {
"*-3": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "wolves",
"doc_count": 2,
"Total_Sale": {
"value": 555
}
}
]
},
"7-*": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cats",
"doc_count": 8,
"Total_Sale": {
"value": 9999
}
}
]
},
"4-6": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "tigers",
"doc_count": 6,
"Total_Sale": {
"value": 8888
}
},
{
"key": "lions",
"doc_count": 5,
"Total_Sale": {
"value": 444
}
}
]
},
"*-3_Total_Sales": {
"value": 555
},
"*-3_Total_Docs": {
"value": 2
},
"4-6_Total_Sales": {
"value": 9332
},
"4-6_Total_Docs": {
"value": 11
},
"7-*_Total_Sales": {
"value": 9999
},
"7_*_Total_Docs": {
"value": 8
}
}

Resources