How to aggregate until a certain value is reached in ElasticSearch? - elasticsearch

I would like to aggregate a list of documents (each of them has two fields - timestamp and amount) by "amount" field until a certain value is reached. For example I would like to get a list of documents sorted by timestamp whose total amount is equal to 100. Is it possible to do in one query?
Here is my query which returns total amount - I would like to add here a condition to stop aggregation when a certain value is reached.
{
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": 1525168583
}
}
}
]
}
},
"aggs": {
"total_amount": {
"sum": {
"field": "amount"
}
}
},
"sort": [
"timestamp"
],
"size": 10000
}
Thank You

It's perfectly possible using a combination of function_score scripting for mimicking sorting, filter aggs for the range gte query and a healthy amount of scripted_metric aggs to limit the summation up to a certain amount.
Let's first set up a mapping and ingest some docs:
PUT summation
{
"mappings": {
"properties": {
"timestamp": {
"type": "date",
"format": "epoch_second"
}
}
}
}
POST summation/_doc
{
"context": "newest",
"timestamp": 1587049128,
"amount": 20
}
POST summation/_doc
{
"context": "2nd newest",
"timestamp": 1586049128,
"amount": 30
}
POST summation/_doc
{
"context": "3rd newest",
"timestamp": 1585049128,
"amount": 40
}
POST summation/_doc
{
"context": "4th newest",
"timestamp": 1585049128,
"amount": 30
}
Then perform the query:
GET summation/_search
{
"size": 0,
"aggs": {
"filtered_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1585049128
}
}
},
{
"function_score": {
"query": {
"match_all": {}
},
"script_score": {
"script": {
"source": "return (params['now'] - doc['timestamp'].date.toMillis())",
"params": {
"now": 1587049676
}
}
}
}
}
]
}
},
"aggs": {
"limited_sum": {
"scripted_metric": {
"init_script": """
state['my_hash'] = new HashMap();
state['my_hash'].put('sum', 0);
state['my_hash'].put('docs', new ArrayList());
""",
"map_script": """
if (state['my_hash']['sum'] <= 100) {
state['my_hash']['sum'] += doc['amount'].value;
state['my_hash']['docs'].add(doc['context.keyword'].value);
}
""",
"combine_script": "return state['my_hash']",
"reduce_script": "return states[0]"
}
}
}
}
}
}
yielding
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"filtered_agg" : {
"meta" : { },
"doc_count" : 4,
"limited_sum" : {
"value" : {
"docs" : [
"newest",
"2nd newest",
"3rd newest",
"4th newest"
],
"sum" : 120
}
}
}
}
}
I've chosen here to only return the doc.contexts but you can adjust it to retrieve whatever you like -- be it IDs, amounts etc.

Related

bucket aggregation/bucket_script computation

How to apply computation using bucket fields via bucket_script? More so, I would like to understand how to aggregate on distinct results.
For example, below is a sample query, and the response.
What I am looking for is to aggregate the following into two fields:
sum of all buckets dist.value from e.g. response (1+2=3)
sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
Query
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"field": "value"
}
}
]
}
},
"aggs":{
"sales_summary":{
"terms":{
"field":"qty",
"size":"100"
},
"aggs":{
"dist":{
"cardinality":{
"field":"somekey.keyword"
}
}
}
}
}
}
Query Result:
{
"aggregations": {
"sales_summary": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 10,
"doc_count": 100,
"dist": {
"value": 1
}
},
{
"key": 20,
"doc_count": 200,
"dist": {
"value": 2
}
}
]
}
}
}
You need to use a sum bucket aggregation, which is a pipeline aggregation to find the sum of response of cardinality aggregation across all the buckets.
Search Query for sum of all buckets dist.value from e.g. response (1+2=3):
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>dist"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
}
}
]
},
"sum_buckets" : {
"value" : 5.0
}
}
For the second requirement, you need to first modify the response of value in the bucket aggregation response, using bucket script aggregation, and then use the modified value to perform bucket sum aggregation on it.
Search Query for sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
},
"format-value-agg": {
"bucket_script": {
"buckets_path": {
"newValue": "dist"
},
"script": "params.newValue * 10"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>format-value-agg"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
},
"format-value-agg" : {
"value" : 20.0
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
},
"format-value-agg" : {
"value" : 30.0
}
}
]
},
"sum_buckets" : {
"value" : 50.0
}
}

How to count number of fields inside nested field? - Elasticsearch

I did the following mapping. I would like to count the number of products in each nested field "products" (for each document separately). I would also like to do a histogram aggregation, so that I would know the number of specific bucket sizes.
PUT /receipts
{
"mappings": {
"properties": {
"id" : {
"type": "integer"
},
"user_id" : {
"type": "integer"
},
"date" : {
"type": "date"
},
"sum" : {
"type": "double"
},
"products" : {
"type": "nested",
"properties": {
"name" : {
"type" : "text"
},
"number" : {
"type" : "double"
},
"price_single" : {
"type" : "double"
},
"price_total" : {
"type" : "double"
}
}
}
}
}
}
I've tried this query, but I get the number of all the products instead of number of products for each document separately.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products"
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count" : 6552,
"bucket_size" : {
"value" : 0
}
}
}
UPDATE
Now I have this code where I make separate buckets for each id and count the number of products inside them.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size" : 0,
"aggs": {
"terms":{
"terms":{
"field": "_id"
},
"aggs": {
"nested": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count_error_upper_bound" : 5,
"sum_other_doc_count" : 490,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"nested" : {
"doc_count" : 21,
"bucket_size" : {
"value" : 21
}
}
},
{
"key" : "10",
"doc_count" : 1,
"nested" : {
"doc_count" : 5,
"bucket_size" : {
"value" : 5
}
}
},
{
"key" : "100",
"doc_count" : 1,
"nested" : {
"doc_count" : 12,
"bucket_size" : {
"value" : 12
}
}
},
...
Is it possible to group these values (21, 5, 12, ...) into buckets to make a histogram of them?
products is only the path to the array of individual products, not an aggregatable field. So you'll need to use it on one of your product's fields -- such as the number:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
Note that if a product has no number, it'll not contribute to the total count. It's therefore best practice to always include an ID in each of them and then aggregate on that field.
Alternatively you could use a script to account for missing values. Luckily value_count does not deduplicate -- meaning if two products are alike and/or have empty values, they'll still be counted as two:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"script": {
"source": "doc['products.number'].toString()"
}
}
}
}
}
}
}
UPDATE
You could also use a nested composite aggregation, which will give you the histogrammed product count with the corresponding receipt id:
GET /receipts/_search
{
"size": 0,
"aggs": {
"my_aggs": {
"nested": {
"path": "products"
},
"aggs": {
"composite_parent": {
"composite": {
"sources": [
{
"receipt_id": {
"terms": {
"field": "_id"
}
}
},
{
"product_number": {
"histogram": {
"field": "products.number",
"interval": 1
}
}
}
]
}
}
}
}
}
}
The interval is modifiable.

elasticsearch return hits found in aggregation

I am trying to get rows from my database that have a unique 'sku' field.
I have a working query which counts this number correctly, my query:
GET _search
{
"size": 0,
"aggs": {
"unique_products":{
"cardinality":{
"field":"sku.keyword"
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
Returns this result:
{
"took" : 129,
"timed_out" : false,
"_shards" : {
"total" : 514,
"successful" : 514,
"skipped" : 98,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 150,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_products" : {
"value" : 38
}
}
}
Which correctly reports the number of unique_products as 38.
I am trying to edit this query so that it will actually return all 38 unique products, but am unsure how. I started by trying to return the top hit from the agg result:
GET _search
{
"size": 0,
"aggs": {
"unique_products":{
"cardinality":{
"field":"sku.keyword"
}
},
"top_hits": {
"size": 1,
"_source": {
"include": [
"sku", "source_store"
]
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
But got an error in my result saying:
{
"error": {
"root_cause": [
{
"type": "parsing_exception",
"reason": "Expected [START_OBJECT] under [size], but got a [VALUE_NUMBER] in [top_hits]",
"line": 10,
"col": 13
}
],
"type": "parsing_exception",
"reason": "Expected [START_OBJECT] under [size], but got a [VALUE_NUMBER] in [top_hits]",
"line": 10,
"col": 13
},
"status": 400
}
Is a cardinality agg still my best bet for returning all 38 unique products? thanks
While the cardinality aggregation gives the unique count, it cannot accept sub-aggs. In other words top_hits cannot be used here directly.
The approach was correct but you may first want to bucketize the skus and then retrieve the underlying docs using top_hits:
{
"size": 0,
"aggs": {
"unique_products": {
"cardinality": {
"field": "sku.keyword"
}
},
"terms_agg": {
"terms": {
"field": "sku.keyword",
"size": 100
},
"aggs": {
"top_hits_agg": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"sku",
"source_store"
]
}
}
}
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
FYI The reason your query threw an exception is that top_hits is an agg type and, just like unique_products, it was missing its own name.

ElasticSearch aggregation using array items as keys

Is it possible to create an aggregation by unnesting an array's elements to use as keys?
Here's an example:
Docs:
[
{
"languages": [ 1, 2 ],
"value": 100
},
{
"languages": [ 1 ],
"value": 50
}
]
its mapping:
{
"documents": {
"mappings": {
"properties": {
"languages": {
"type": "integer"
},
"value": {
"type": "integer"
}
}
}
}
}
and the expected output of a summing aggregation would be:
{
1: 150,
2: 100
}
You can achieve what you want by using a simple terms aggregation. Array elements will be bucketed individually:
POST index/_search
{
"aggs": {
"languages": {
"terms": {
"field": "languages"
},
"aggs": {
"total": {
"sum": {
"field": "value"
}
}
}
}
}
}
Results:
"aggregations" : {
"languages" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 2,
"total" : {
"value" : 150.0
}
},
{
"key" : 2,
"doc_count" : 1,
"total" : {
"value" : 100.0
}
}
]
}
}
The terms agg will sum up the number of occurrences. What you want instead is a script to sum up the values based on the language array items as keys:
GET langs/_search
{
"size": 0,
"aggs": {
"lang_sums": {
"scripted_metric": {
"init_script": "state.lang_sums=[:]",
"map_script": """
for (def lang : doc['languages']) {
def lang_str = lang.toString();
def value = doc['value'].value;
if (state.lang_sums.containsKey(lang_str)) {
state.lang_sums[lang_str] += value;
} else {
state.lang_sums[lang_str] = value;
}
}
""",
"combine_script": "return state",
"reduce_script": "return states"
}
}
}
}
yielding
{
...
"aggregations":{
"lang_sums":{
"value":[
{
"lang_sums":{
"1":150,
"2":100
}
}
]
}
}
}

How can I aggregate over the _score

I tried to run an aggregate query over the _score field on Elastic Search with no results. Seems it is not possible to use the _score field, maybe because it is not a field of the document. How can I aggregate over the _score ?
This is my query:
{
"_source": false, "explain": false, "from": 0, "size": 0,
"aggs" : {
"score_ranges" : {
"range" : {
"field" : "_score",
"ranges" : [
{ "to" : 50 },
{ "from" : 50, "to" : 75 },
{ "from" : 75 }
]
}
}
},
"query": {
"function_score": {
"query": {
"match_all": { }
}
}
}
}
"aggs": {
"scores_histogram": {
"histogram": {
"script": "return _score.doubleValue() * 10",
"interval": 3
}
}
}
or, with ranges:
"aggs": {
"score_ranges": {
"range": {
"script": "_score",
"ranges": [
{
"to": 50
},
{
"from": 50,
"to": 75
},
{
"from": 75
}
]
}
}
}
And you need to enable dynamic scripting.

Resources