Elasticsearch Date Histogram with a Point in Time count of documents

I am attempting to create a date histogram showing the number of employees on a monthly basis.
Employee mapping looks something like this:
{
"number": 1234,
"firstName": "Chris",
"lastName": "Smith",
"employmentDates: [
{
"startDate": "2014-10-03T06:00:00Z",
"endDate": "2017-11-04T06:00:00Z"
}
],
"lastPaidOnDate": "2017-11-10T06:00:00Z",
....
}
Given start/end dates like this (for three employees):
|----------------|
|-----------------------------|
|---| |---------------------|
^ ^ ^ ^ ^ ^
I would expect the histogram to be similar to this:
"aggregations": {
"employees_per_month": {
"buckets": [
{
"key_as_string": "2017-01-01",
"doc_count": 1
},
{
"key_as_string": "2017-02-01",
"doc_count": 2
},
{
"key_as_string": "2017-03-01",
"doc_count": 2
},
{
"key_as_string": "2017-04-01",
"doc_count": 3
},
{
"key_as_string": "2017-05-01",
"doc_count": 3
},
{
"key_as_string": "2017-06-01",
"doc_count": 2
}
]
}
}
It seems like I need to have a sub-aggregation on a scripted field, but I'm not sure where to start.
Your assistance is greatly appreciated.

I believe it can be done using a date histogram, but I'm suggesting a simpler approach. You will have to run the query once for each specific month:
{
"size": 0,
"aggregations": {
"bool_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"employmentDates.startDate": {
"lt": "2017-12-01T00:00:00Z"
}
}
},
{
"range": {
"employmentDates.endDate": {
"gte": "2017-11-01T00:00:00Z"
}
}
}
]
}
},
"aggregations": {
"distinct_agg": {
"cardinality": {
"field": "number"
}
}
}
}
}
}
bool_agg: uses a Filter Aggregation to keep only employments active in November
distinct_agg: uses a Cardinality Aggregation to count the distinct employees via the unique field number
Note that if employmentDates contains more than one record, e.g.:
"employmentDates: [
{
"startDate": "2014-10-03T06:00:00Z",
"endDate": "2017-11-04T06:00:00Z"
}
{
"startDate": "2018-03-03T06:00:00Z",
"endDate": "2018-07-04T06:00:00Z"
}
You will have to go nested with the Nested datatype; an example can be found here.
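For reference, a minimal nested mapping might look like this (a sketch only; the index and type names employees/employee are placeholders):

PUT /employees/_mapping/employee
{
  "properties": {
    "employmentDates": {
      "type": "nested",
      "properties": {
        "startDate": { "type": "date" },
        "endDate": { "type": "date" }
      }
    }
  }
}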
And update the query to:
{
"size": 0,
"aggregations": {
"nested_agg": {
"nested": {
"path": "employmentDates"
},
"aggregations": {
"bool_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"employmentDates.startDate": {
"lt": "2017-12-01T00:00:00Z"
}
}
},
{
"range": {
"employmentDates.endDate": {
"gte": "2017-11-01T00:00:00Z"
}
}
}
]
}
},
"aggregations": {
"comment_to_issue": {
"reverse_nested": {},
"aggregations": {
"distinct_agg": {
"cardinality": {
"field": "number"
}
}
}
}
}
}
}
}
}
}

Related

Post Filtering Date histogram aggregation bucket results not working

I have an aggregation query where I am trying to calculate the max standard deviation of the number of destination IPs per IP address for a certain time range. As is well known with the moving function std_dev aggregation, the first two days' std dev values will always be null and 0 respectively, because no prior data is taken into account.
Here is my aggregation query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"exists": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
}
]
}
},
"aggs": {
"range": {
"date_range": {
"field": "Source Time",
"ranges": [
{
"from": "2018-04-25",
"to": "2018-05-02"
}
]
},
"aggs": {
"by ip": {
"terms": {
"field": "IP Address.keyword",
"size": 500
},
"aggs": {
"datehisto": {
"date_histogram": {
"field": "Source Time",
"interval": "day"
},
"aggs": {
"max_dest_ips": {
"sum": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
},
"max_dest_ips_std_dev": {
"moving_fn": {
"buckets_path": "max_dest_ips",
"window": 3,
"script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))"
}
}
}
}
}
}
}
}
},
"post_filter": {
"range": {
"Source Time": {
"gte": "2018-05-01"
}
}
}
}
Here is a snippet of the response:
{
"key": "192.168.0.1",
"doc_count": 6,
"datehisto": {
"buckets": [
{
"key_as_string": "2018-04-25T00:00:00.000Z",
"key": 1524614400000,
"doc_count": 1,
"max_dest_ips": {
"value": 309
},
"max_dest_ips_std_dev": {
"value": null
}
},
{
"key_as_string": "2018-04-26T00:00:00.000Z",
"key": 1524700800000,
"doc_count": 1,
"max_dest_ips": {
"value": 529
},
"max_dest_ips_std_dev": {
"value": 0
}
},
{
"key_as_string": "2018-04-27T00:00:00.000Z",
"key": 1524787200000,
"doc_count": 1,
"max_dest_ips": {
"value": 408
},
"max_dest_ips_std_dev": {
"value": 110
}
},
{
"key_as_string": "2018-04-28T00:00:00.000Z",
"key": 1524873600000,
"doc_count": 1,
"max_dest_ips": {
"value": 187
},
"max_dest_ips_std_dev": {
"value": 89.96419040682551
}
}
]
}
}
What I want is for the first 2 days' bucket data (25th and 26th) to be filtered and removed from the above bucket results. I have tried the post filter above and the normal query filter below:
"filter": {
"range": {
"Source Time": {
"gte": "2018-04-27"
}
}
}
The post_filter has no effect on the aggregation results. The normal filter query makes the buckets start from the 27th, but it also makes the standard deviation calculations start on the 27th (resulting in the 27th being null and the 28th being 0), when I want them to start from the 25th instead.
Any other alternative solutions? Help is greatly appreciated!
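One possible direction (a sketch, untested, and assuming your version supports the special _key buckets_path in pipeline aggregations): keep the query range starting on the 25th so the moving window still sees the early data, but prune the leading buckets with a bucket_selector added inside datehisto, declared after max_dest_ips_std_dev:

"keep_from_27th": {
  "bucket_selector": {
    "buckets_path": {
      "key": "_key"
    },
    "script": "params.key >= 1524787200000L"
  }
}

Here 1524787200000 is the epoch-millis key of the 2018-04-27 bucket taken from the response above; the intent is that the moving window is computed over the full range before the earlier buckets are dropped from the response.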

Elastic aggregation to identify period A vs B percentage increases

I have some daily sales data indexed into Elasticsearch. I successfully run a number of aggregations to identify top sellers across a date range etc.
I am now trying to write a single query to do the following:
Identify Top n sellers over a date range (Period A)
Take the results of Period A and sum sales for these products over second date range (Period B)
Compare sales in period A to Period B and identify those with percentage increases above X%.
My attempt so far:
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_periods": {
"date_range": {
"field": "date",
"format": "YYYY-MM-dd",
"ranges": [
{
"from": "2017-10-01",
"to": "2017-10-07"
},
{
"from": "2017-10-08",
"to": "2017-10-14"
}
]
},
"aggs": {
"product_id_split": {
"terms": {
"field": "product_id"
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
}
}
}
}
}
}
}
Although this outputs results for two periods, I don't think it is quite what I want: the initial filter runs from Period A's start date to Period B's end date, and I believe the sums cover that whole range instead of Period A only. I also don't get the percentage comparison; I would probably do this at the application level, but I understand it could be handled with a scripted Elastic query?
It would be especially awesome if, instead of the top n results in Period A, I could set a sales threshold of, say, 1,000 sales.
Any pointers would be much appreciated. Thanks in advance!
Currently running Elastic 5.6
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_period1": {
"filter": {
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-07"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"date_period2": {
"filter": {
"range": {
"date": {
"gte": "2017-10-08",
"lte": "2017-10-14"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"percentage_increase": {
"bucket_script": {
"buckets_path": {
"firstPeriod": "date_period1>unit_sum",
"secondPeriod": "date_period2>unit_sum"
},
"script": "(params.secondPeriod-params.firstPeriod)*100/params.firstPeriod"
}
},
"retain_buckets": {
"bucket_selector": {
"buckets_path": {
"percentage": "percentage_increase"
},
"script": "params.percentage > 5"
}
}
}
}
}
}
Full test data is in this gist.
The result of this aggregation looks like this:
"aggregations": {
"data_split": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 6,
"date_period1": {
"doc_count": 3,
"unit_sum": {
"value": 150
}
},
"date_period2": {
"doc_count": 3,
"unit_sum": {
"value": 160
}
},
"percentage_increase": {
"value": 6.666666666666667
}
},
{
"key": "C",
"doc_count": 2,
"date_period1": {
"doc_count": 1,
"unit_sum": {
"value": 50
}
},
"date_period2": {
"doc_count": 1,
"unit_sum": {
"value": 70
}
},
"percentage_increase": {
"value": 40
}
}
]
}
}
The idea is that you use two filter-type aggregations for the two date intervals, and for each you calculate a sum. Then, using a third aggregation of type bucket_script, you calculate the percentage increase (note, though, that it will be a negative number if there is, for example, a decrease in sales).
Then, using yet another aggregation - of type bucket_selector - you keep the product_ids where the percentage is larger than 5%.
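As for the threshold idea from the question: instead of (or in addition to) the percentage selector, a second bucket_selector could enforce a Period A sales floor (a sketch, untested; the agg name sales_floor is made up):

"sales_floor": {
  "bucket_selector": {
    "buckets_path": {
      "periodASales": "date_period1>unit_sum"
    },
    "script": "params.periodASales >= 1000"
  }
}

Note that the terms aggregation still picks its top 10 products by doc count first, so you would likely also need to raise its size so candidates aren't trimmed before the selectors run.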

Can ElasticSearch aggregate over top N items in each sorted bucket

I have this query that buckets the records by data source code, and computes an average over all records in each bucket.
How could I modify it so that each bucket is limited to (at most) the top N records when ordered by record.timestamp desc (or any other record field, for that matter)?
The end effect I want is an average per bucket using the most recent N records rather than all records (so the doc_count in each bucket would have an upper limit of N).
I've searched and experimented extensively with no success.
Current query:
{
"size": 0,
"query": {
"constant_score": {
"filter": {
"bool": {
"must": [
{
"term": {
"jobType": "LiveEventScoring"
}
},
{
"term": {
"host": "MTVMDANS"
}
},
{
"term": {
"measurement": "EventDataLoadFromCacheDuration"
}
}
]
}
}
}
},
"aggs": {
"data-sources": {
"terms": {
"field": "dataSourceCode"
},
"aggs": {
"avgDuration": {
"avg": {
"field": "elapsedMs"
}
}
}
}
}
}
Results in:
"aggregations": {
"data-sources": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "AU_VIRT",
"doc_count": 6259,
"avgDuration": {
"value": 3525.683176226234
}
},
{
"key": "AU_HN_VIRT",
"doc_count": 2812,
"avgDuration": {
"value": 3032.0771692745375
}
},
{
"key": "GB_VIRT",
"doc_count": 1845,
"avgDuration": {
"value": 1432.39945799458
}
}
]
}
}
}
Alternatively, if grabbing the top N from a sorted bucket is not possible, I could do multiple queries, one for each dataSourceCode, e.g. for AU_VIRT:
{
"size":0,
"query": {
"constant_score": {
"filter": {
"bool": {
"must": [
{
"term": {
"jobType": "LiveEventScoring"
}
},
{
"term": {
"host": "MTVMDANS"
}
},
{
"term": {
"dataSourceCode": "AU_VIRT"
}
},
{
"term": {
"measurement": "EventDataLoadFromCacheDuration"
}
}
]
}
}
}
},
"aggs": {
"avgDuration": {
"avg": {
"field": "elapsedMs"
}
}
}
}
}
but I am still unsure how to make avgDuration work on only the top N results sorted by timestamp desc.
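If a single query turns out to be impossible, a top_hits sub-aggregation can at least return the most recent N records per bucket, leaving the averaging to the client, since as far as I know metric aggregations cannot consume top_hits output. A sketch (the timestamp field name is assumed from record.timestamp above, and N=100 is arbitrary):

{
  "size": 0,
  "aggs": {
    "data-sources": {
      "terms": {
        "field": "dataSourceCode"
      },
      "aggs": {
        "most_recent": {
          "top_hits": {
            "sort": [
              { "timestamp": { "order": "desc" } }
            ],
            "_source": ["elapsedMs"],
            "size": 100
          }
        }
      }
    }
  }
}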

Elasticsearch: Is it possible to run a not_analyzed aggregation query on an analyzed field?

I have documents which store brand names in analyzed form, e.g. {"name":"Sam-sung"} and {"name":"Motion:Systems"}. There are cases where I would want to aggregate these brands under a timestamp.
My query is as follows:
{
"size": 0,
"aggs": {
"filtered_aggs": {
"filter": {
"range": {
"#timestamp":{
"gte":"2016-07-18T14:23:41.459Z",
"lte":"2016-07-18T14:53:10.017Z"
}
}
},
"aggs": {
"execute_time": {
"terms": {
"field": "brands",
"size": 0
}
}
}
}
}
}
but the returned results are:
{
...
"aggregations": {
"states": {
"buckets": [
{
"key": "Sam",
"doc_count": 5
},
{
"key": "sung",
"doc_count": 5
},
{
"key": "Motion",
"doc_count": 1
},
{
"key": "Systems",
"doc_count": 1
}
]
}
}
}
but the results I want are:
{
...
"aggregations": {
"states": {
"buckets": [
{
"key": "Sam-sung",
"doc_count": 5
},
{
"key": "Motion:Systems",
"doc_count": 1
}
]
}
}
}
Is there any way in which I can run a not_analyzed aggregation on an analyzed field in Elasticsearch?
You need to add a not_analyzed sub-field to your brands field and then aggregate on that sub-field.
PUT /index/_mapping/type
{
"properties": {
"brands": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
Then you need to fully reindex your data in order to populate the new sub-field brands.raw.
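Since adding a sub-field is a non-breaking mapping change, on versions that support it (2.3+) an _update_by_query with no script should be enough to reindex in place and populate brands.raw (a sketch; adjust the index name):

POST /index/_update_by_query?conflicts=proceed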
Finally, you can change your query to this:
POST index/_search
{
"size": 0,
"aggs": {
"filtered_aggs": {
"filter": {
"range": {
"#timestamp":{
"gte":"2016-07-18T14:23:41.459Z",
"lte":"2016-07-18T14:53:10.017Z"
}
}
},
"aggs": {
"execute_time": {
"terms": {
"field": "brands.raw",
"size": 0
}
}
}
}
}
}
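For what it's worth, on Elasticsearch 5.x and later the string type and not_analyzed are gone; the equivalent mapping sketch uses text with a keyword sub-field:

PUT /index/_mapping/type
{
  "properties": {
    "brands": {
      "type": "text",
      "fields": {
        "raw": {
          "type": "keyword"
        }
      }
    }
  }
}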

Ratio with elasticsearch

I have a list of customers with this structure:
{
"name" : "Toya Romano",
"hungry" : false,
"date" : 1420090500020
}
I would like to get the ratio of people who are hungry. How can I do it with an ElasticSearch query? I am running ES 2.3.
A rather hacky approach because of this issue, but this should work:
{
"size": 0,
"aggs": {
"whatever": {
"filters": {
"filters": [{}]
},
"aggs": {
"all_people": {
"filter": {}
},
"hungry_count": {
"filter": {
"term": {
"hungry": true
}
}
},
"hungry_ratio": {
"bucket_script": {
"buckets_path": {
"total_hungry": "hungry_count._count",
"all": "all_people._count"
},
"script": "total_hungry/all"
}
}
}
}
}
}
With a result like this:
"buckets": [
{
"doc_count": 5,
"all_people": {
"doc_count": 5
},
"hungry_count": {
"doc_count": 3
},
"hungry_ratio": {
"value": 0.6
}
}
]
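Alternatively (assuming dynamic scripting is enabled), because ES 2.x exposes boolean doc values to scripts as 1/0, averaging the field's value in a script gives the same ratio without the filters wrapper. A sketch, untested:

{
  "size": 0,
  "aggs": {
    "hungry_ratio": {
      "avg": {
        "script": "doc['hungry'].value"
      }
    }
  }
}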
