Finding intersection of two buckets using Elastic - elasticsearch

I have data structured as the following in an elastic index:
[ { customer_id: 1, date_of_purchase: 01-01-2022 },
{ customer_id: 2, date_of_purchase: 01-02-2022 },
{ customer_id: 1, date_of_purchase: 01-02-2022 },
....
]
I want to find the numbers of users who have bought something in both September and October, but having issues figuring out how to make a query for this. Any suggestions would rock, thanks!

I have used following aggregations
1. Terms aggregation
2. Bucket selector
3. Date Range
In the query I have filtered to documents whose purchase date falls in either Jan or Feb (the query uses Jan/Feb as example months — adjust the two ranges for September and October). This reduces the number of documents the aggregation has to work on. In the aggregation I first group by customer_id (terms aggregation) and then further group each customer's documents by date range (one bucket per month). Then I eliminate month buckets with zero documents (using a bucket selector), i.e. months with no purchase, and finally eliminate customers that are left with one or zero month buckets.
Query
{
"query": {
"bool": {
"should": [
{
"range": {
"date_of_purchase": {
"gte": "2022-01-01",
"lte": "2022-01-31"
}
}
},
{
"range": {
"date_of_purchase": {
"gte": "2022-02-01",
"lte": "2022-02-28"
}
}
}
]
}
},
"aggs": {
"cutomers": {
"terms": {
"field": "customer_id",
"size": 10
},
"aggs": {
"range": {
"date_range": {
"field": "date_of_purchase",
"ranges": [
{
"to": "2022-01-31",
"from": "2022-01-01"
},
{
"to": "2022-02-28",
"from": "2022-02-01"
}
]
},
"aggs": {
"filter_months": {
"bucket_selector": {
"buckets_path": {
"doc_count":"_count"
},
"script": "params.doc_count>=1"
}
}
}
},
"bucket_count":{
"bucket_selector": {
"buckets_path": {
"bucket_count":"range._bucket_count"
},
"script": "params.bucket_count>1"
}
}
}
}
}
}
Results
"aggregations" : {
"cutomers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 2,
"range" : {
"buckets" : [
{
"key" : "2022-01-01T00:00:00.000Z-2022-01-31T00:00:00.000Z",
"from" : 1.6409952E12,
"from_as_string" : "2022-01-01T00:00:00.000Z",
"to" : 1.6435872E12,
"to_as_string" : "2022-01-31T00:00:00.000Z",
"doc_count" : 1
},
{
"key" : "2022-02-01T00:00:00.000Z-2022-02-28T00:00:00.000Z",
"from" : 1.6436736E12,
"from_as_string" : "2022-02-01T00:00:00.000Z",
"to" : 1.6460064E12,
"to_as_string" : "2022-02-28T00:00:00.000Z",
"doc_count" : 1
}
]
}
}
]
}
}

Related

bucket aggregation/bucket_script computation

How to apply computation using bucket fields via bucket_script? More so, I would like to understand how to aggregate on distinct results.
For example, below is a sample query, and the response.
What I am looking for is to aggregate the following into two fields:
sum of all buckets dist.value from e.g. response (1+2=3)
sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
Query
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"field": "value"
}
}
]
}
},
"aggs":{
"sales_summary":{
"terms":{
"field":"qty",
"size":"100"
},
"aggs":{
"dist":{
"cardinality":{
"field":"somekey.keyword"
}
}
}
}
}
}
Query Result:
{
"aggregations": {
"sales_summary": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 10,
"doc_count": 100,
"dist": {
"value": 1
}
},
{
"key": 20,
"doc_count": 200,
"dist": {
"value": 2
}
}
]
}
}
}
You need to use a sum bucket aggregation, which is a pipeline aggregation to find the sum of response of cardinality aggregation across all the buckets.
Search Query for sum of all buckets dist.value from e.g. response (1+2=3):
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>dist"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
}
}
]
},
"sum_buckets" : {
"value" : 5.0
}
}
For the second requirement, you need to first modify the response of value in the bucket aggregation response, using bucket script aggregation, and then use the modified value to perform bucket sum aggregation on it.
Search Query for sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
},
"format-value-agg": {
"bucket_script": {
"buckets_path": {
"newValue": "dist"
},
"script": "params.newValue * 10"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>format-value-agg"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
},
"format-value-agg" : {
"value" : 20.0
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
},
"format-value-agg" : {
"value" : 30.0
}
}
]
},
"sum_buckets" : {
"value" : 50.0
}
}

Get top values from Elasticsearch bucket

I have some items with brand
I want to return N records, but no more than x from each bucket
So far I have my buckets grouped by brand
"aggs": {
"brand": {
"terms": {
"field": "brand"
}
}
}
"aggregations" : {
"brand" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 130
},
{
"key" : "brandB",
"doc_count" : 127
}
]
}
But how do I access specific bucket and get top x values from there?
You can use top hits sub aggregation to get documents under each brand. You can sort those documents and define a size too.
{
"aggs": {
"brand": {
"terms": {
"field": "brand",
"size": 10 --> no of brands
},
"aggs": {
"top_docs": {
"top_hits": {
"sort": [
{
"date": {
"order": "desc"
}
}
],
"size": 1 --> no of documents under each brand
}
}
}
}
}
}

How to filter by sub-aggregated results in Elasticsearch

I've got the following elastic search query in order to get the number of product sales per hour grouped by product id and hour of sale.
POST /my_sales/_search?size=0
{
"aggs": {
"sales_per_hour": {
"date_histogram": {
"field": "event_time",
"fixed_interval": "1h",
"format": "yyyy-MM-dd:HH:mm"
},
"aggs": {
"sales_per_hour_per_product": {
"terms": {
"field": "name.keyword"
}
}
}
}
}
}
One example of data :
{
"#timestamp" : "2020-10-29T18:09:56.921Z",
"name" : "my-beautifull_product",
"event_time" : "2020-10-17T08:01:33.397Z"
}
This query returns several buckets (one per hour and per product) but i would like to only retrieve those who have a doc_count higher than 10 for example, is it possible ?
For those results i would like to know the id of the product and the event_time bucket.
Thanks for your help.
Perhaps using the Bucket Selector feature will help on filtering out the results.
Try out this below search query:
{
"aggs": {
"sales_per_hour": {
"date_histogram": {
"field": "event_time",
"fixed_interval": "1h",
"format": "yyyy-MM-dd:HH:mm"
},
"aggs": {
"sales_per_hour_per_product": {
"terms": {
"field": "name.keyword"
},
"aggs": {
"the_filter": {
"bucket_selector": {
"buckets_path": {
"the_doc_count": "_count"
},
"script": "params.the_doc_count > 10"
}
}
}
}
}
}
}
}
It will keep only the buckets whose document count is greater than 10, based on the script "params.the_doc_count > 10", and filter out the rest.
Thank you for your help this is not far from what i would like but not exactly ; with the bucket selector i have something like this :
"aggregations" : {
"sales_per_hour" : {
"buckets" : [
{
"key_as_string" : "2020-08-31:23:00",
"key" : 1598914800000,
"doc_count" : 16,
"sales_per_hour_per_product" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "my_product_1",
"doc_count" : 2
},
{
"key" : "my_product_2",
"doc_count" : 2
},
{
"key" : "myproduct_3",
"doc_count" : 12
}
]
}
}
]
}
And sometimes none of the buckets are greater than 10, is it possible to have the same thing but with the filter on _count applied to the second level aggregation (sales_per_hour_per_product) and not on the first level (sales_per_hour) ?

I need to get average document count by date in elasticsearch

I want to get average document count by date without getting the whole bunch of buckets data and get average value by hand cause there are years of data and when I group by the date I get too_many_buckets_exception.
So my current query is
{
"query": {
"bool": {
"must": [],
"filter": []
}
},
"aggs": {
"groupByChannle": {
"terms": {
"field": "channel"
},
"aggs": {
"docs_per_day": {
"date_histogram": {
"field": "message_date",
"fixed_interval": "1d"
}
}
}
}
}
}
How can I get an average doc count grouped by message_date(day) and channel without taking buckets array of this data
"buckets" : [
{
"key_as_string" : "2018-03-17 00:00:00",
"key" : 1521244800000,
"doc_count" : 4027
},
{
"key_as_string" : "2018-03-18 00:00:00",
"key" : 1521331200000,
"doc_count" : 10133
},
...thousands of rows
]
my index structure looks like this
"mappings" : {
"properties" : {
"channel" : {
"type" : "keyword"
},
"message" : {
"type" : "text"
},
"message_date" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
}
}
}
By this query, I want to get JUST AN AVERAGE DOC COUNT BY DATE and nothing else
"avg_count": {
"avg_bucket": {
"buckets_path": "docs_per_day>_count"
}
}
Add this after the docs_per_day aggregation ends.
avg_count provides the average count.
_count refers to each bucket's document count.
I think that you can use the stats aggregation with a script:
{
"size": 0,
"aggs": {
"term": {
"terms": {
"field": "chanel"
},
"aggs": {
"stats": {
"stats": {
"field": "message_date"
}
},
"result": {
"bucket_script": {
"buckets_path": {
"max" : "stats.max",
"min" : "stats.min",
"count" : "stats.count"
},
"script": "params.count/((params.max - params.min)/1000/86400)"
}
}
}
}
}
}

Elastic script from buckets and higher level aggregation

I want to compare the daily average of a metric (the frequency of words appearing in texts) to the value of a specific day. This is during a week. My goal is to check whether there's a spike. If the last day is way higher than the daily average, I'd trigger an alarm.
So from my input in Elasticsearch I compute the daily average during the week and find out the value for the last day of that week.
For getting the daily average for the week, I simply cut a week's worth of data using a range query on date field, so all my available data is the given week. I compute the sum and divide by 7 for a daily average.
For getting the last day's value, I did a terms aggregation on the date field with descending order and size 1 as suggested in a different question (How to select the last bucket in a date_histogram selector in Elasticsearch)
The whole output is as follows. Here you can see words "rama0" and "rama1" with their corresponding frequencies.
{
"aggregations" : {
"the_keywords" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "rama0",
"doc_count" : 4200,
"the_last_day" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 3600,
"buckets" : [
{
"key" : 1580169600000,
"key_as_string" : "2020-01-28T00:00:00.000Z",
"doc_count" : 600,
"the_last_day_frequency" : {
"value" : 3000.0
}
}
]
},
"the_weekly_sum" : {
"value" : 21000.0
},
"the_daily_average" : {
"value" : 3000.0
}
},
{
"key" : "rama1",
"doc_count" : 4200,
"the_last_day" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 3600,
"buckets" : [
{
"key" : 1580169600000,
"key_as_string" : "2020-01-28T00:00:00.000Z",
"doc_count" : 600,
"the_last_day_frequency" : {
"value" : 3000.0
}
}
]
},
"the_weekly_sum" : {
"value" : 21000.0
},
"the_daily_average" : {
"value" : 3000.0
}
},
[...]
]
}
}
}
Now I have the_daily_average in a high level of the output, and the_last_day_frequency in the single-element buckets list in the_last_day aggregation. I cannot use a bucket_script to compare those, because I cannot refer to a single bucket (if I place the script outside the_last_day aggregation) and I cannot refer to higher-level aggregations if I place the script inside the_last_day.
IMO the reasonable thing to do would be to put the script outside the aggregation and use a buckets_path using the <AGG_NAME><MULTIBUCKET_KEY> syntax mentioned in the docs, but I have tried "var1": "the_last_day[1580169600000]>the_last_day_frequency" and variations (hardcoding first until it works), but I haven't been able to refer to a particular bucket.
My ultimate goal is to have a list of keywords for which the last day frequency greatly exceeds the daily average.
For anyone interested, my current query is as follows. Notice that the part I'm struggling with is commented out.
body='{
"query": {
"range": {
"date": {
"gte": "START",
"lte": "END"
}
}
},
"aggs": {
"the_keywords": {
"terms": {
"field": "keyword",
"size": 100
},
"aggs": {
"the_weekly_sum": {
"sum": {
"field": "frequency"
}
},
"the_daily_average" : {
"bucket_script": {
"buckets_path": {
"weekly_sum": "the_weekly_sum"
},
"script": {
"inline": "return params.weekly_sum / 7"
}
}
},
"the_last_day": {
"terms": {
"field": "date",
"size": 1,
"order": {"_key": "desc"}
},
"aggs": {
"the_last_day_frequency": {
"sum": {
"field": "frequency"
}
}
}
}/*,
"the_spike": {
"bucket_script": {
"buckets_path": {
"last_day_frequency": "the_last_day>the_last_day_frequency",
"daily_average": "the_daily_average"
},
"script": {
"inline": "return last_day_frequency / daily_average"
}
}
}*/
}
}
}
}'
In your query, the_last_day>the_last_day_frequency points to a bucket, not a single value, so it throws an error. You need to get a single metric value from "the_last_day_frequency"; you can achieve that using max_bucket. Then you can use a bucket_selector aggregation to compare the last day's value with the average value.
Query:
"aggs": {
"the_keywords": {
"terms": {
"field": "keyword",
"size": 100
},
"aggs": {
"the_weekly_sum": {
"sum": {
"field": "frequency"
}
},
"the_daily_average": {
"bucket_script": {
"buckets_path": {
"weekly_sum": "the_weekly_sum"
},
"script": {
"inline": "return params.weekly_sum / 7"
}
}
},
"the_last_day": {
"terms": {
"field": "date",
"size": 1,
"order": {
"_key": "desc"
}
},
"aggs": {
"the_last_day_frequency": {
"sum": {
"field": "frequency"
}
}
}
},
"max_frequency_last_day": {
"max_bucket": {
"buckets_path": "the_last_day>the_last_day_frequency"
}
},
"the_spike": {
"bucket_selector": {
"buckets_path": {
"last_day_frequency": "max_frequency_last_day",
"daily_average": "the_daily_average"
},
"script": {
"inline": "params.last_day_frequency > params.daily_average"
}
}
}
}
}
}

Resources