sum all values in specific hours of date dsl queries - elasticsearch

I have index contains docs looks like this:
"_source": {
"price": "11",
"loggingdate": "15/02/2016 08:56:58",
}
I need a query that sum all values in price values between 10:00 - 12:00 in this year
I need total of each hour(10:00,11:00,12:00) This gives me total 3 hours of the result(only 1 item in bucket) but I need in seperated for each hour(three item in bucket)
{
"size":0,
"query":{
"filtered":{
"filter":{
"bool":{
"must":[
{
"range":{
"loggingdate":{
"gte":"now-1y"
}
}
},
{
},
{
"script":{
"script":"doc.loggingdate.date.getHourOfDay() >= 10 && doc.loggingdate.date.getHourOfDay() <= 12"
}
}
]
}
}
}
},
"aggs": {
"by_hour": {
"date_histogram": {
"field": "loggingdate",
"interval": "hour"
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
}
}
result:
"aggregations": {
"by_hour": {
"buckets": [
{
"key_as_string": "15/01/2016 10:00:00",
"key": 1452852000000,
"doc_count": 58453,
"total": {
"value": 2106110494
}
},
{
"key_as_string": "15/01/2016 11:00:00",
"key": 1452855600000,
"doc_count": 23243,
"total": {
"value": 849522038
}
},
{
"key_as_string": "15/01/2016 12:00:00",
"key": 1452859200000,
"doc_count": 11994,
"total": {
"value": 430906409
}
},
{
"key_as_string": "17/01/2016 10:00:00",
"key": 1453024800000,
"doc_count": 1,
"total": {
"value": 0
}
},...
I think I need use range with date_histogram but how can I sum all price values in other docs date_histogram gives me only docs in range..
Any idea?

You basically need a range filter to select only documents of the desired year and then another script filter in order to only select documents with the hours between 10am and 12am. Finally, you simply need a sum aggregation to sum all the prices of the matching documents.
{
"query": {
"bool": {
"filter": [
{
"range": {
"loggingdate": {
"gte": "2016-01-01",
"lt": "2017-01-01"
}
}
},
{
"script": {
"script": "doc.loggingdate.date.getHourOfDay() >= min && doc.loggingdate.date.getHourOfDay() <= max",
"params": {
"min": 10,
"max": 12
}
}
}
]
}
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
UPDATE
If you need the total by hour, you can use this aggregations instead:
"aggs": {
"by_hour": {
"terms": {
"script": "doc.loggingdate.date.getHourOfDay()"
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
}

Related

How to get hours between Min and Max date in Elasticsearch Aggregation?

How can I calculate hours between max and min dates (same tree level of max and min) in Elasticsearch?
My Query:-
{
"size": 0,
"query": {
"bool": {
"must": []
}
},
"aggs": {
"group_by_areaId": {
"terms": {
"size": 100000,
"field": "areaId.keyword"
},
"aggs": {
"4m": {
"date_histogram": {
"field": "timestamp",
"format": "yyyy-MM-dd'T'HH:mm:ssZZ",
"interval": "4m",
"order": {
"_key": "asc"
}
},
"aggs": {
"maxDate": {
"max": {
"field": "timestamp"
}
},
"minDate": {
"min": {
"field": "timestamp"
}
}
}
}
}
}
}
}
And the response (short) as,
"aggregations": {
"group_by_areaId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "key1",
"doc_count": 15,
"4m": {
"buckets": [
{
"key_as_string": "2020-02-12T06:08:00+0000",
"key": 1581487680000,
"doc_count": 3,
"minDate": {
"value": 1.581487847E12,
"value_as_string": "2020-02-12T06:10:47Z"
},
"maxDate": {
"value": 1.58148791E12,
"value_as_string": "2020-02-12T06:11:50Z"
},
*// Need hours between maxDate and minDate here
//{
// "hours" : "0.0175" (maxDate-minDate)
//}*
}
]
}
}
]
}
}
Anyone please help me to find out the solution?
Thanks in Advance.
You can leverage the bucket_script pipeline aggregation in order to compute the difference between min and max for each bucket.
Simply add the following at the same level as minDate and maxDate:
"hours": {
"bucket_script": {
"buckets_path": {
"min": "minDate",
"max": "maxDate"
},
"script": "(params.max - params.min) / 3600000"
}
}
For your sample data above, the result in this case would be 0.0175 (i.e. roughly 1 minute)

Post Filtering Date histogram aggregation bucket results not working

I have an aggregation query where I am trying to calculate the max standard deviation of the number of destination ips per IP Address for a certain time range. As everyone knows the common problem with the moving function std_dev aggregation function, the first 2 days' std dev values will always be null and 0 respectively due to no data being taken into account previously.
Here is my aggregation query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"exists": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
}
]
}
},
"aggs": {
"range": {
"date_range": {
"field": "Source Time",
"ranges": [
{
"from": "2018-04-25",
"to": "2018-05-02"
}
]
},
"aggs": {
"by ip": {
"terms": {
"field": "IP Address.keyword",
"size": 500
},
"aggs": {
"datehisto": {
"date_histogram": {
"field": "Source Time",
"interval": "day"
},
"aggs": {
"max_dest_ips": {
"sum": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
},
"max_dest_ips_std_dev": {
"moving_fn": {
"buckets_path": "max_dest_ips",
"window": 3,
"script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))"
}
}
}
}
}
}
}
}
},
"post_filter": {
"range": {
"Source Time": {
"gte": "2018-05-01"
}
}
}
}
Here is a snippet of the response:
{
"key": "192.168.0.1",
"doc_count": 6,
"datehisto": {
"buckets": [
{
"key_as_string": "2018-04-25T00:00:00.000Z",
"key": 1524614400000,
"doc_count": 1,
"max_dest_ips": {
"value": 309
},
"max_dest_ips_std_dev": {
"value": null
}
},
{
"key_as_string": "2018-04-26T00:00:00.000Z",
"key": 1524700800000,
"doc_count": 1,
"max_dest_ips": {
"value": 529
},
"max_dest_ips_std_dev": {
"value": 0
}
},
{
"key_as_string": "2018-04-27T00:00:00.000Z",
"key": 1524787200000,
"doc_count": 1,
"max_dest_ips": {
"value": 408
},
"max_dest_ips_std_dev": {
"value": 110
}
},
{
"key_as_string": "2018-04-28T00:00:00.000Z",
"key": 1524873600000,
"doc_count": 1,
"max_dest_ips": {
"value": 187
},
"max_dest_ips_std_dev": {
"value": 89.96419040682551
}
}
]
}
}
What I want is for the first 2 days' bucket data (25th and 26th) to be filtered and removed from the above bucket results. I have tried the post filter above and the normal query filter below:
"filter": {
"range": {
"Source Time": {
"gte": "2018-04-27"
}
}
}
The Post Filter does nothing and doesn't work. The above filter range query makes the buckets start from the 27th but also makes the standard deviation calculations start on 27th as well (resulting in 27th being null and 28th being 0) when I want it to start from the 25th instead.
Any other alternative solutions? Help is greatly appreciated!

Elastic aggregation to identify period A vs B percentage increases

I have some daily sales data indexed into Elasticsearch. I successfully run a number of aggregations to identify top sellers across a date range etc.
I am now trying to write a single query to do the following:
Identify Top n sellers over a date range (Period A)
Take the results of Period A and sum sales for these products over second date range (Period B)
Compare sales in period A to Period B and identify those with percentage increases above X%.
My attempt so far:
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_periods": {
"date_range": {
"field": "date",
"format": "YYYY-MM-dd",
"ranges": [
{
"from": "2017-10-01",
"to": "2017-10-07"
},
{
"from": "2017-10-08",
"to": "2017-10-14"
}
]
},
"aggs": {
"product_id_split": {
"terms": {
"field": "product_id"
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
}
}
}
}
}
}
}
Although this outputs results for two periods, I don't think this is quite what I want as the initial filter is running from Period A start date to Period B end date and I think summing results for that range instead of Period A only. I also don't get the % comparison, I would probably do this at my application level, but I understand could be handled with a scripted Elastic query?
It would be especially awesome if instead of top n results in period A, I could set a sales threshold of say 1,000 sales.
Any pointers would be much appreciated. Thanks in advance!
Currently running Elastic 5.6
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_period1": {
"filter": {
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-07"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"date_period2": {
"filter": {
"range": {
"date": {
"gte": "2017-10-08",
"lte": "2017-10-14"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"percentage_increase": {
"bucket_script": {
"buckets_path": {
"firstPeriod": "date_period1>unit_sum",
"secondPeriod": "date_period2>unit_sum"
},
"script": "(params.secondPeriod-params.firstPeriod)*100/params.firstPeriod"
}
},
"retain_buckets": {
"bucket_selector": {
"buckets_path": {
"percentage": "percentage_increase"
},
"script": "params.percentage > 5"
}
}
}
}
}
}
And a full test data in this gist.
The result of this aggregation is giving you this:
"aggregations": {
"data_split": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 6,
"date_period1": {
"doc_count": 3,
"unit_sum": {
"value": 150
}
},
"date_period2": {
"doc_count": 3,
"unit_sum": {
"value": 160
}
},
"percentage_increase": {
"value": 6.666666666666667
}
},
{
"key": "C",
"doc_count": 2,
"date_period1": {
"doc_count": 1,
"unit_sum": {
"value": 50
}
},
"date_period2": {
"doc_count": 1,
"unit_sum": {
"value": 70
}
},
"percentage_increase": {
"value": 40
}
}
]
}
}
The idea is that you use two filter type of aggregations for the two date intervals. And for each you calculate a sum. Then, using a third aggregation of type bucket_script you calculate the percentage increase (note, though, that it will be a negative number of there is a decrease in sales for example).
Then, using yet another aggregation - of type bucket_selector - you keep the product_ids where the percentage is larger than 5%.

Elasticsearch Date Histogram aggregation with specific time range

We are performing 3 level aggregation for a certain date range we require fetching the distinct "Website" name grouped by distinct "HitCount" vale grouped by "DateTime" intervals. Here, histogram aggregation allows us to fetch the interval based documents, however the "key_as_string" of date is always considered from 12 AM instead of the date range time provided in the query. Depending on the interval period value, the day (24 hrs starting from 12 AM of the from time) is divided and aggregation output is given.
For e.g. we have given the from time as "2015-11-10T11:00:00" and To time as "2015-11-13T11:00:00" with interval of 8 hrs
Following is the query used:
{
"size": 0,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"range": {
"DateTime": {
"from": "2015-11-10T11:00:00",
"to": "2015-11-13T11:00:00"
}
}
}
]
}
}
}
},
"aggs": {
"Website": {
"terms": {
"field": "Website",
"size": 0,
"order": {
"_count": "desc"
}
},
"aggs": {
"HitCount": {
"terms": {
"field": "HitCount",
"size": 0,
"order": {
"_count": "desc"
}
},
"aggs": {
"DateTime": {
"date_histogram": {
"field": "DateTime",
"interval": "8h",
"min_doc_count": 0,
"extended_bounds": {
"min": 1447153200000,
"max": 1447412400000
}
}
}
}
}
}
}
}
}
The query Output wrt 3rd level DateTime aggregation is:
"DateTime": {
"buckets": [
{
"key_as_string": "2015-11-10T08:00:00.000Z",
"key": 1447142400000,
"doc_count": 62698
}
,
{
"key_as_string": "2015-11-10T16:00:00.000Z",
"key": 1447171200000,
"doc_count": 248118
}
,
{
"key_as_string": "2015-11-11T00:00:00.000Z",
"key": 1447200000000,
"doc_count": 224898
}
,
{
"key_as_string": "2015-11-11T08:00:00.000Z",
"key": 1447228800000,
"doc_count": 221663
}
,
{
"key_as_string": "2015-11-11T16:00:00.000Z",
"key": 1447257600000,
"doc_count": 220935
}
,
{
"key_as_string": "2015-11-12T00:00:00.000Z",
"key": 1447286400000,
"doc_count": 219340
}
,
{
"key_as_string": "2015-11-12T08:00:00.000Z",
"key": 1447315200000,
"doc_count": 218452
}
,
{
"key_as_string": "2015-11-12T16:00:00.000Z",
"key": 1447344000000,
"doc_count": 190
}
,
{
"key_as_string": "2015-11-13T00:00:00.000Z",
"key": 1447372800000,
"doc_count": 0
}
,
{
"key_as_string": "2015-11-13T08:00:00.000Z",
"key": 1447401600000,
"doc_count": 0
}
]
}
Expected Output:
Here, we would expect the intervals to be divided and queried as:
2015-11-10T11:00:00 to 2015-11-10T19:00:00
2015-11-10T19:00:00 to 2015-11-11T03:00:00
2015-11-11T03:00:00 to 2015-11-11T11:00:00
2015-11-11T11:00:00 to 2015-11-11T19:00:00
2015-11-11T19:00:00 to 2015-11-12T03:00:00
2015-11-12T03:00:00 to 2015-11-12T11:00:00
2015-11-12T11:00:00 to 2015-11-12T19:00:00
2015-11-12T19:00:00 to 2015-11-13T03:00:00
2015-11-13T03:00:00 to 2015-11-13T11:00:00
ie. the "key_as_string" output value should be 2015-11-10T11:00:00, 2015-11-10T19:00:00, .... and so on
The above is required as we have given a From & to time of 11 AM so that it can be a updated value of every 8 hrs whenever we fire the query rather than getting a fixed range of time for the whole day.
Note: ES 1.7 is used
The documentation explains that you can use the offset parameter.
So
{
"size": 0,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"range": {
"DateTime": {
"from": "2015-11-10T11:00:00",
"to": "2015-11-13T11:00:00"
}
}
}
]
}
}
}
},
"aggs": {
"Website": {
"terms": {
"field": "Website",
"size": 0,
"order": {
"_count": "desc"
}
},
"aggs": {
"HitCount": {
"terms": {
"field": "HitCount",
"size": 0,
"order": {
"_count": "desc"
}
},
"aggs": {
"DateTime": {
"date_histogram": {
"field": "DateTime",
"interval": "8h",
"min_doc_count": 0,
"offset": "+11h"
}
}
}
}
}
}
}
}

Elasticsearch range filter not working in date histogram

I came across a confusion in elasticsearch (version : 1.7.1). As per documentation https://www.elastic.co/guide/en/elasticsearch/guide/current/_filtering_queries_and_aggregations.html ,a filter applied to the query will also be applied to aggregation. When I issued the following query, I am getting unexpected results.
{
"aggregations": {
"outer": {
"aggregations": {
"inner": {
"date_histogram": {
"extended_bounds": {
"min": 0
},
"field": "time",
"interval": "30d",
"min_doc_count": 0,
"order": {
"_key": "desc"
}
}
}
},
"terms": {
"field": "ad_id",
"size": 10
}
}
},
"query": {
"filtered": {
"filter": {
"and": {
"filters": [
{
"range": {
"time": {
"from": 1441619173000,
"include_lower": false,
"include_upper": true,
"to": 1442835370000
}
}
}
]
}
}
}
}
}
A portion of result is here.
{
"buckets": [
{
"key": 203737,
"doc_count": 27,
"inner": {
"buckets": [
{
"key_as_string": "2015-09-02T00:00:00.000Z",
"key": 1441152000000,
"doc_count": 27
},
{
"key_as_string": "1970-01-31T00:00:00.000Z",
"key": 2592000000,
"doc_count": 0
},
...
{
"key_as_string": "1970-01-01T00:00:00.000Z",
"key": 0,
"doc_count": 0
}
]
}
}
]
}
Please note that the aggregation result includes keys outside the range I have applied. Type of the time field is date. I have also tried the following query, but the result was same.
{
"aggs": {
"outer_filter": {
"filter": {
"and": {
"filters": [
{
"range": {
"time": {
"from": 1441619173000,
"include_lower": false,
"include_upper": true,
"to": 1442835370000
}
}
}
]
}
},
"aggs": {
"outer_term": {
"terms": {
"field": "ad_id",
"size": 10
},
"aggs": {
"inner": {
"date_histogram": {
"extended_bounds": {
"min": 0
},
"field": "time",
"interval": "30d",
"min_doc_count": 0,
"order": {
"_key": "desc"
}
}
}
}
}
}
}
}
}
My problem is that the aggregation result includes results outside the filters ("from": 1441619173000,"to": 1442835370000).
Why are the filters not getting applied ?
Can anyone help please.
'extended_bound' min value is the problem. As min is 0 and the field is of type date, buckets starts from 1970 itself.
You appear to have the range filter confused with the range aggregation.
The range filter takes two types of parameters, gte or gt (greater than) and lte or lt (less than).
The from/to parameters are for the range aggregation, which is used to split your results into user defined buckets.

Resources