Elasticsearch - getting aggregated data based on unique values from field

In my Elasticsearch (7.13) index, I have the following dataset:
maid site_id date hour
m1 1300 2021-06-03 1
m1 1300 2021-06-03 2
m1 1300 2021-06-03 1
m2 1300 2021-06-03 1
I am trying to get the unique count of records for each date and site_id from the above table. The desired result is:
maid site_id date count
m1 1300 2021-06-03 1
m2 1300 2021-06-03 1
I have millions of maids for each site_id and the dates span two years. I am using the following query with a cardinality aggregation on maid, assuming that it will return the unique maids.
GET /r_2332/_search
{
"size":0,
"aggs": {
"site_id": {
"terms": {
"field": "site_id",
"size":100,
"include": [
1171, 1048
]
},"aggs" : {
"bydate" : {
"range" : {
"field": "date","ranges" : [
{
"from": "2021-04-08",
"to": "2021-04-22"
}
]
},"aggs" : {
"rdate" : {
"terms" : {
"field":"date"
},"aggs" :{
"maids" : {
"cardinality": {
"field": "maid"
}
}
}
}
}
}
}
}
}
}
This still returns the data with all the duplicate values. How do I include the maid field in my query so that the data is filtered on unique maid values?

You can use a multi_terms aggregation (available since Elasticsearch 7.12) along with a cardinality aggregation if you want to get unique documents based on site_id and maid:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
"1300",
"1301"
]
}
},
{
"range": {
"date": {
"gte": "2021-06-02",
"lte": "2021-06-03"
}
}
}
]
}
},
"aggs": {
"group_by": {
"multi_terms": {
"terms": [
{
"field": "site_id"
},
{
"field": "maid.keyword"
}
]
},
"aggs": {
"type_count": {
"cardinality": {
"field": "site_id"
}
}
}
}
}
}
Search Result:
"aggregations": {
"group_by": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
1300,
"m1"
],
"key_as_string": "1300|m1",
"doc_count": 3,
"type_count": {
"value": 1 // note this
}
},
{
"key": [
1300,
"m2"
],
"key_as_string": "1300|m2",
"doc_count": 1,
"type_count": {
"value": 1 // note this
}
}
]
}
}
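If you also need the split per date (as in the desired result above), one option, sketched here and not verified against the original index, is to add the date field as a third term source in the same multi_terms aggregation:
"multi_terms": {
  "terms": [
    {
      "field": "site_id"
    },
    {
      "field": "maid.keyword"
    },
    {
      "field": "date"
    }
  ]
}
Each bucket then corresponds to one site_id/maid/date combination, and you can read the doc_count (or the cardinality sub-aggregation) per combination.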

Related

Elastic aggregation on specific values from within one field

I am migrating my DB from Postgres to Elasticsearch. My Postgres query looks like this:
select site_id, count(*) from r_2332 where site_id in ('1300','1364') and date >= '2021-01-25' and date <= '2021-01-30' group by site_id
The expected result is as follows:
site_id count
1300 1234
1364 2345
I am trying to derive the same result from Elasticsearch aggs. I have tried the following:
GET /r_2332/_search
{
"query": {
"bool" : {
"should" : [
{"match" : {"site_id": "1300"}},
{"match" : {"site_id": "1364"}}
],"minimum_should_match": 1
}
},
"aggs" : {
"footfall" : {
"range" : {
"field" : "date",
"ranges" : [
{
"from":"2021-01-21",
"to":"2021-01-30"
}
]
}
}
}
}
This gives me the result as follows:
"aggregations":{"footfall":{"buckets":[{"key":"2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z","from":1.6111872E12,"from_as_string":"2021-01-21T00:00:00.000Z","to":1.6119648E12,"to_as_string":"2021-01-30T00:00:00.000Z","doc_count":2679}]}
and this:
GET /r_2332/_search
{
"query": {
"terms": {
"site_id": [ "1300", "1364" ],
"boost": 1.0
}
},
"aggs" : {
"footfall" : {
"range" : {
"field" : "date",
"ranges" : [
{
"from":"2021-01-21",
"to":"2021-01-30"
}
]
}
}
}
}
This provided the same result:
"aggregations":{"footfall":{"buckets":[{"key":"2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z","from":1.6111872E12,"from_as_string":"2021-01-21T00:00:00.000Z","to":1.6119648E12,"to_as_string":"2021-01-30T00:00:00.000Z","doc_count":2679}]}
How do I get the result separately for each site_id?
You can use a combination of terms and range aggregations to achieve your task.
Adding a working example with index data, search query, and search result.
Index Data:
{
"site_id":1365,
"date":"2021-01-24"
}
{
"site_id":1300,
"date":"2021-01-22"
}
{
"site_id":1300,
"date":"2020-01-22"
}
{
"site_id":1364,
"date":"2021-01-24"
}
Search Query:
{
"size": 0,
"aggs": {
"siteId": {
"terms": {
"field": "site_id",
"include": [
1300,
1364
]
},
"aggs": {
"footfall": {
"range": {
"field": "date",
"ranges": [
{
"from": "2021-01-21",
"to": "2021-01-30"
}
]
}
}
}
}
}
}
Search Result:
"aggregations": {
"siteId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1300,
"doc_count": 2,
"footfall": {
"buckets": [
{
"key": "2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z",
"from": 1.6111872E12,
"from_as_string": "2021-01-21T00:00:00.000Z",
"to": 1.6119648E12,
"to_as_string": "2021-01-30T00:00:00.000Z",
"doc_count": 1 // note this
}
]
}
},
{
"key": 1364,
"doc_count": 1,
"footfall": {
"buckets": [
{
"key": "2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z",
"from": 1.6111872E12,
"from_as_string": "2021-01-21T00:00:00.000Z",
"to": 1.6119648E12,
"to_as_string": "2021-01-30T00:00:00.000Z",
"doc_count": 1 // note this
}
]
}
}
]
}
}
This might perform better
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
"1300",
"1365"
]
}
},
{
"range": {
"date": {
"gte": "2021-01-21",
"lte": "2021-01-24"
}
}
}
]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id"
}
}
}
}

Stats Aggregation with Min Mode in ElasticSearch

I have the below mapping in ElasticSearch
{
"properties":{
"Costs":{
"type":"nested",
"properties":{
"price":{
"type":"integer"
}
}
}
}
}
So every document has an array field Costs, which contains many elements, and each element has a price in it. I want to find the min and max price, with the condition that from each array only the element with the minimum price is considered. So it is basically the min/max among the minimum value of each array.
Let's say I have 2 documents with the Costs field as
Costs: [
{
"price": 100
},
{
"price": 200
}
]
and
Costs: [
{
"price": 300
},
{
"price": 400
}
]
So I need to find the stats over those per-document minimums.
This is the query I am currently using:
{
"costs_stats":{
"nested":{
"path":"Costs"
},
"aggs":{
"price_stats_new":{
"stats":{
"field":"Costs.price"
}
}
}
}
}
And it gives me this:
"min" : 100,
"max" : 400
But I need to find the stats after taking only the minimum element of each array into consideration.
So this is what I need:
"min" : 100,
"max" : 300
Like the "mode" option we have in sort, is there something similar in the stats aggregation, or any other way of achieving this, maybe using a script? Please suggest, I am really stuck here.
Let me know if anything else is required.
Update 1:
Query for finding min/max among minimums
{
"_source":false,
"timeout":"5s",
"from":0,
"size":0,
"aggs":{
"price_1":{
"terms":{
"field":"id"
},
"aggs":{
"price_2":{
"nested":{
"path":"Costs"
},
"aggs":{
"filtered":{
"aggs":{
"price_3":{
"min":{
"field":"Costs.price"
}
}
},
"filter":{
"bool":{
"filter":{
"range":{
"Costs.price":{
"gte":100
}
}
}
}
}
}
}
}
}
},
"minValue":{
"min_bucket":{
"buckets_path":"price_1>price_2>filtered>price_3"
}
}
}
}
Only a few buckets are coming back, and hence the min/max is computed among those, which is not correct. Is there any size limit?
One way to achieve your use case is to add one more field, id, to each document. With the help of the id field, a terms aggregation can be performed, so buckets will be built dynamically, one per unique value.
Then we can apply a min aggregation, which will return the minimum value among the numeric values extracted from the aggregated documents.
Adding a working example with index mapping, index data, search query, and search result.
Index Mapping:
{
"mappings": {
"properties": {
"Costs": {
"type": "nested"
}
}
}
}
Index Data:
{
"id":1,
"Costs": [
{
"price": 100
},
{
"price": 200
}
]
}
{
"id":2,
"Costs": [
{
"price": 300
},
{
"price": 400
}
]
}
Search Query:
{
"size": 0,
"aggs": {
"id_terms": {
"terms": {
"field": "id",
"size": 15 <-- note this
},
"aggs": {
"nested_entries": {
"nested": {
"path": "Costs"
},
"aggs": {
"min_position": {
"min": {
"field": "Costs.price"
}
}
}
}
}
}
}
}
Search Result:
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 100.0
}
}
},
{
"key": 2,
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 300.0
}
}
}
]
}
}
This can also be achieved using the stats aggregation (if you add one more field, id, that uniquely identifies your document):
{
"size": 0,
"aggs": {
"id_terms": {
"terms": {
"field": "id",
"size": 15 <-- note this
},
"aggs": {
"costs_stats": {
"nested": {
"path": "Costs"
},
"aggs": {
"price_stats_new": {
"stats": {
"field": "Costs.price"
}
}
}
}
}
}
}
}
Update 1:
To find the maximum value among those minimums (as seen in the above query), you can use a max_bucket aggregation:
{
"size": 0,
"aggs": {
"id_terms": {
"terms": {
"field": "id",
"size": 15 <-- note this
},
"aggs": {
"nested_entries": {
"nested": {
"path": "Costs"
},
"aggs": {
"min_position": {
"min": {
"field": "Costs.price"
}
}
}
}
}
},
"maxValue": {
"max_bucket": {
"buckets_path": "id_terms>nested_entries>min_position"
}
}
}
}
Search Result:
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 100.0
}
}
},
{
"key": 2,
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 300.0
}
}
}
]
},
"maxValue": {
"value": 300.0,
"keys": [
"2"
]
}
}
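For completeness, the minimum among those per-document minimums (100.0 in this example) can be obtained the same way with a min_bucket pipeline aggregation, reusing the aggregation names from the query above as the bucket path (a sketch added for symmetry with the max_bucket above):
"minValue": {
  "min_bucket": {
    "buckets_path": "id_terms>nested_entries>min_position"
  }
}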

Post Filtering Date histogram aggregation bucket results not working

I have an aggregation query where I am trying to calculate the max standard deviation of the number of destination IPs per IP address for a certain time range. As is the well-known problem with the moving-function std_dev aggregation, the first 2 days' std dev values will always be null and 0 respectively, because no previous data is taken into account.
Here is my aggregation query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"exists": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
}
]
}
},
"aggs": {
"range": {
"date_range": {
"field": "Source Time",
"ranges": [
{
"from": "2018-04-25",
"to": "2018-05-02"
}
]
},
"aggs": {
"by ip": {
"terms": {
"field": "IP Address.keyword",
"size": 500
},
"aggs": {
"datehisto": {
"date_histogram": {
"field": "Source Time",
"interval": "day"
},
"aggs": {
"max_dest_ips": {
"sum": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
},
"max_dest_ips_std_dev": {
"moving_fn": {
"buckets_path": "max_dest_ips",
"window": 3,
"script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))"
}
}
}
}
}
}
}
}
},
"post_filter": {
"range": {
"Source Time": {
"gte": "2018-05-01"
}
}
}
}
Here is a snippet of the response:
{
"key": "192.168.0.1",
"doc_count": 6,
"datehisto": {
"buckets": [
{
"key_as_string": "2018-04-25T00:00:00.000Z",
"key": 1524614400000,
"doc_count": 1,
"max_dest_ips": {
"value": 309
},
"max_dest_ips_std_dev": {
"value": null
}
},
{
"key_as_string": "2018-04-26T00:00:00.000Z",
"key": 1524700800000,
"doc_count": 1,
"max_dest_ips": {
"value": 529
},
"max_dest_ips_std_dev": {
"value": 0
}
},
{
"key_as_string": "2018-04-27T00:00:00.000Z",
"key": 1524787200000,
"doc_count": 1,
"max_dest_ips": {
"value": 408
},
"max_dest_ips_std_dev": {
"value": 110
}
},
{
"key_as_string": "2018-04-28T00:00:00.000Z",
"key": 1524873600000,
"doc_count": 1,
"max_dest_ips": {
"value": 187
},
"max_dest_ips_std_dev": {
"value": 89.96419040682551
}
}
]
}
}
What I want is for the first 2 days' bucket data (the 25th and 26th) to be filtered out and removed from the above bucket results. I have tried the post filter above and the normal query filter below:
"filter": {
"range": {
"Source Time": {
"gte": "2018-04-27"
}
}
}
The post_filter does nothing and doesn't work. The range filter query above makes the buckets start from the 27th, but it also makes the standard deviation calculations start on the 27th (resulting in the 27th being null and the 28th being 0), when I want them to start from the 25th instead.
Any other alternative solutions? Help is greatly appreciated!

Elastic aggregation to identify period A vs B percentage increases

I have some daily sales data indexed into Elasticsearch. I successfully run a number of aggregations to identify top sellers across a date range etc.
I am now trying to write a single query to do the following:
1. Identify the top n sellers over a date range (Period A)
2. Take the results of Period A and sum sales for these products over a second date range (Period B)
3. Compare sales in Period A to Period B and identify those with percentage increases above X%.
My attempt so far:
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_periods": {
"date_range": {
"field": "date",
"format": "YYYY-MM-dd",
"ranges": [
{
"from": "2017-10-01",
"to": "2017-10-07"
},
{
"from": "2017-10-08",
"to": "2017-10-14"
}
]
},
"aggs": {
"product_id_split": {
"terms": {
"field": "product_id"
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
}
}
}
}
}
}
}
Although this outputs results for two periods, I don't think this is quite what I want, as the initial filter runs from the Period A start date to the Period B end date, and I think it sums results for that whole range instead of Period A only. I also don't get the % comparison; I would probably do this at my application level, but I understand it could be handled with a scripted Elastic query?
It would be especially awesome if, instead of the top n results in Period A, I could set a sales threshold of, say, 1,000 sales.
Any pointers would be much appreciated. Thanks in advance!
Currently running Elastic 5.6
{
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-14"
}
}
}
]
}
},
"size": 0,
"aggs": {
"data_split": {
"terms": {
"size": 10,
"field": "product_id"
},
"aggs": {
"date_period1": {
"filter": {
"range": {
"date": {
"gte": "2017-10-01",
"lte": "2017-10-07"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"date_period2": {
"filter": {
"range": {
"date": {
"gte": "2017-10-08",
"lte": "2017-10-14"
}
}
},
"aggs": {
"unit_sum": {
"sum": {
"field": "units"
}
}
}
},
"percentage_increase": {
"bucket_script": {
"buckets_path": {
"firstPeriod": "date_period1>unit_sum",
"secondPeriod": "date_period2>unit_sum"
},
"script": "(params.secondPeriod-params.firstPeriod)*100/params.firstPeriod"
}
},
"retain_buckets": {
"bucket_selector": {
"buckets_path": {
"percentage": "percentage_increase"
},
"script": "params.percentage > 5"
}
}
}
}
}
}
And full test data is in this gist.
The result of this aggregation gives you this:
"aggregations": {
"data_split": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 6,
"date_period1": {
"doc_count": 3,
"unit_sum": {
"value": 150
}
},
"date_period2": {
"doc_count": 3,
"unit_sum": {
"value": 160
}
},
"percentage_increase": {
"value": 6.666666666666667
}
},
{
"key": "C",
"doc_count": 2,
"date_period1": {
"doc_count": 1,
"unit_sum": {
"value": 50
}
},
"date_period2": {
"doc_count": 1,
"unit_sum": {
"value": 70
}
},
"percentage_increase": {
"value": 40
}
}
]
}
}
The idea is that you use two filter aggregations for the two date intervals, and for each you calculate a sum. Then, using a third aggregation of type bucket_script, you calculate the percentage increase (note, though, that it will be a negative number if there is a decrease in sales, for example).
Then, using yet another aggregation, of type bucket_selector, you keep only the product_ids where the percentage is larger than 5%.
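As for the sales threshold mentioned in the question: one possible sketch, not tested against the gist data, is to add another bucket_selector alongside retain_buckets that keeps only products whose Period A sum is at least 1,000 units (the aggregation names below are the ones from the query above; the threshold is an assumed example value):
"sales_threshold": {
  "bucket_selector": {
    "buckets_path": {
      "periodASales": "date_period1>unit_sum"
    },
    "script": "params.periodASales >= 1000"
  }
}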

sum all values in specific hours of date dsl queries

I have an index containing docs that look like this:
"_source": {
"price": "11",
"loggingdate": "15/02/2016 08:56:58",
}
I need a query that sums all price values between 10:00 and 12:00 in this year.
I need the total for each hour (10:00, 11:00, 12:00). This gives me one total for all 3 hours (only 1 item in the bucket), but I need it separated for each hour (three items in the bucket).
{
"size":0,
"query":{
"filtered":{
"filter":{
"bool":{
"must":[
{
"range":{
"loggingdate":{
"gte":"now-1y"
}
}
},
{
"script":{
"script":"doc.loggingdate.date.getHourOfDay() >= 10 && doc.loggingdate.date.getHourOfDay() <= 12"
}
}
]
}
}
}
},
"aggs": {
"by_hour": {
"date_histogram": {
"field": "loggingdate",
"interval": "hour"
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
}
}
result:
"aggregations": {
"by_hour": {
"buckets": [
{
"key_as_string": "15/01/2016 10:00:00",
"key": 1452852000000,
"doc_count": 58453,
"total": {
"value": 2106110494
}
},
{
"key_as_string": "15/01/2016 11:00:00",
"key": 1452855600000,
"doc_count": 23243,
"total": {
"value": 849522038
}
},
{
"key_as_string": "15/01/2016 12:00:00",
"key": 1452859200000,
"doc_count": 11994,
"total": {
"value": 430906409
}
},
{
"key_as_string": "17/01/2016 10:00:00",
"key": 1453024800000,
"doc_count": 1,
"total": {
"value": 0
}
},...
I think I need to use a range with date_histogram, but how can I sum all price values in the other docs? The date_histogram gives me only the docs in range.
Any idea?
You basically need a range filter to select only documents of the desired year, and then a script filter in order to only select documents with hours between 10:00 and 12:00. Finally, you simply need a sum aggregation to sum all the prices of the matching documents:
{
"query": {
"bool": {
"filter": [
{
"range": {
"loggingdate": {
"gte": "2016-01-01",
"lt": "2017-01-01"
}
}
},
{
"script": {
"script": "doc.loggingdate.date.getHourOfDay() >= min && doc.loggingdate.date.getHourOfDay() <= max",
"params": {
"min": 10,
"max": 12
}
}
}
]
}
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
UPDATE
If you need the total by hour, you can use this aggregation instead:
"aggs": {
"by_hour": {
"terms": {
"script": "doc.loggingdate.date.getHourOfDay()"
},
"aggs": {
"total": {
"sum": {
"field": "price"
}
}
}
}
}
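Note that the original query above uses the old filtered query and Groovy-style date access from Elasticsearch 1.x/2.x. On a recent cluster (7.x, where doc date values expose getHour() in Painless), a rough sketch of the same per-hour grouping, assuming the same field names, might look like this:
"aggs": {
  "by_hour": {
    "terms": {
      "script": {
        "lang": "painless",
        "source": "doc['loggingdate'].value.getHour()"
      }
    },
    "aggs": {
      "total": {
        "sum": {
          "field": "price"
        }
      }
    }
  }
}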
