How to exclude the buckets having doc count equal to 0 - elasticsearch

I want to exclude those buckets from the date histogram aggregation response, whose doc count is equal to 0. And then, get the count of the filtered buckets.
The query is :
GET metricbeat-*/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"host.cpu.usage": {
"gte": 0.8
}
}
},
{
"range": {
"#timestamp": {
"gte": "2022-09-22T10:16:00.000Z",
"lte": "2022-09-22T10:18:00.000Z"
}
}
}
]
}
},
"aggs": {
"hostName": {
"terms": {
"field": "host.name"
},
"aggs": {
"docsOverTimeFrame": {
"date_histogram": {
"field": "#timestamp",
"fixed_interval": "10s"
}
},
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"count": "docsOverTimeFrame._bucket_count"
},
"script": {
"source": "params.count == 12"
}
}
}
}
}
}
}
The response that I get right now is :
{
"took" : 8,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 38,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"hostName" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "datahot01",
"doc_count" : 3,
"docsOverTimeFrame" : {
"buckets" : [
{
"key_as_string" : "2022-09-22T10:16:00.000Z",
"key" : 1663841760000,
"doc_count" : 1
},
{
"key_as_string" : "2022-09-22T10:16:10.000Z",
"key" : 1663841770000,
"doc_count" : 1
},
{
"key_as_string" : "2022-09-22T10:16:20.000Z",
"key" : 1663841780000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:16:30.000Z",
"key" : 1663841790000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:16:40.000Z",
"key" : 1663841800000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:16:50.000Z",
"key" : 1663841810000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:00.000Z",
"key" : 1663841820000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:10.000Z",
"key" : 1663841830000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:20.000Z",
"key" : 1663841840000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:30.000Z",
"key" : 1663841850000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:40.000Z",
"key" : 1663841860000,
"doc_count" : 0
},
{
"key_as_string" : "2022-09-22T10:17:50.000Z",
"key" : 1663841870000,
"doc_count" : 0
}
]
}
}
]
}
}
}
So, if I am able to exclude those buckets that have doc count = 0, then on the basis of the number of buckets (that is bucket count), I want to check whether the count of buckets formed is equal to 12 or not (which I am doing using the bucket selector aggregation).
Is there some way to exclude the buckets having doc count = 0, and get the bucket count = 2 instead of 12

I was able to solve the above use case, by using a pipeline aggregation (i.e a bucket_selector aggregation) inside of the date histogram aggregation.
The modified query is :
{
"query": {
"bool": {
"must": [
{
"range": {
"#timestamp": {
"gte": "2022-09-22T10:16:00.000Z",
"lte": "2022-09-22T10:22:00.000Z"
}
}
},
{
"range": {
"system.cpu.total.norm.pct": {
"gte": 0.8
}
}
}
]
}
},
"aggs": {
"hostName": {
"terms": {
"field": "host.name"
},
"aggs": {
"docsOverTimeFrame": {
"date_histogram": {
"field": "#timestamp",
"fixed_interval": "10s"
},
"aggs": {
"histogram_doc_count": {
"bucket_selector": {
"buckets_path": {
"the_doc_count": "_count"
},
"script": "params.the_doc_count > 0"
}
}
}
},
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"count": "docsOverTimeFrame._bucket_count"
},
"script": {
"source": "params.count == 12"
}
}
}
}
}
}
}

Related

Elasticsearch How to count total docs by date

As my theme, I wanna count docs the day and before by date, it's sample to understand that the chart.
{"index":{"_index":"login-2015.12.23","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-23T12:00:00Z"}
{"index":{"_index":"login-2015.12.23","_type":"logs"}}
{"uid":"2","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-23T12:00:00Z"}
{"index":{"_index":"login-2015.12.24","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-24T12:00:00Z"}
{"index":{"_index":"login-2015.12.25","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-25T12:00:00Z"}
As you see, index login-2015.12.23 has two docs, index login-2015.12.24 has one doc, index login-2015.12.23 has one doc.
And now I wanna get the result
{
"hits" : {
"total" : 6282,
"max_score" : 1.0,
"hits" : []
},
"aggregations" : {
"group_by_date" : {
"buckets" : [
{
"key_as_string" : "2015-12-23T12:00:00Z",
"key" : 1662163200000,
"doc_count" : 2,
},
{
"key_as_string" : "2015-12-24T12:00:00Z",
"key" : 1662163200000,
"doc_count" : 3,
},
{
"key_as_string" : "2015-12-25T12:00:00Z",
"key" : 1662163200000,
"doc_count" : 4,
}
]
}
If I count the date 2015-12-24T12:00:00Z and it means I must count day 2015-12-23T12:00:00Z and 2015-12-24T12:00:00Z at the same time.
In my project I have many indices like that, and I searching many ways to make this goal come true but not, this is my demo:
{
"query": {"match_all": {}},
"size": 0,
"aggs": {
"group_by_date": {
"date_histogram": {
"field": "timestamp",
"interval": "day"
},
"aggs": {
"intersect": {
"scripted_metric": {
"init_script": "state.inner=[]",
"map_script": "state.inner.add(params.param1 == 3 ? params.param2 * params.param1 : params.param1 * params.param2)",
"combine_script": "return state.inner",
"reduce_script": "return states",
"params": {
"param1": 3,
"param2": 5
}
}
}
}
}
}
}
I wanna group by date, and use scripted_metric to iter the date list, not the second iteration just can in its bucket and not for all the document, so do anyone has better idea to solve this problem?
You can simply use the cumulative sum pipeline aggregation
{
"query": {"match_all": {}},
"size": 0,
"aggs": {
"group_by_date": {
"date_histogram": {
"field": "login_time",
"interval": "day"
},
"aggs": {
"cumulative_docs": {
"cumulative_sum": {
"buckets_path": "_count"
}
}
}
}
}
}
And the results will look like this:
"aggregations" : {
"group_by_date" : {
"buckets" : [
{
"key_as_string" : "2015-12-23T00:00:00.000Z",
"key" : 1450828800000,
"doc_count" : 2,
"cumulative_docs" : {
"value" : 2.0
}
},
{
"key_as_string" : "2015-12-24T00:00:00.000Z",
"key" : 1450915200000,
"doc_count" : 1,
"cumulative_docs" : {
"value" : 3.0
}
},
{
"key_as_string" : "2015-12-25T00:00:00.000Z",
"key" : 1451001600000,
"doc_count" : 1,
"cumulative_docs" : {
"value" : 4.0
}
}
]
}
}

Query filter for searching rollup index works with epoch time fails with date math

`How do we query (filter) a rollup index?
For example, based on the query here
Request:
{
"size": 0,
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}
Response:
{
"took" : 93,
"timed_out" : false,
"terminated_early" : false,
"_shards" : ... ,
"hits" : {
"total" : {
"value": 0,
"relation": "eq"
},
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"timeline" : {
"buckets" : [
{
"key_as_string" : "2018-01-18T00:00:00.000Z",
"key" : 1516233600000,
"doc_count" : 6,
"nodes" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "a",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 5.1499998569488525
}
},
{
"key" : "b",
"doc_count" : 2,
"max_temperature" : {
"value" : 201.0
},
"avg_voltage" : {
"value" : 5.700000047683716
}
},
{
"key" : "c",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 4.099999904632568
}
}
]
}
}
]
}
}
}
How to filter say last 3 days, is it possible?
For a test case, I used fixed_interval rate of 1m (one minute, and also 60 minutes) and I tried the following and the error was all query shards failed. Is it possible to query filter rollup agggregations?
Test Query for searching rollup index
{
"size": 0,
"query": {
"range": {
"timestamp": {
"gte": "now-3d/d",
"lt": "now/d"
}
}
}
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}

Nested Aggregation for AND Query Not Working

Please can someone help with the below Question.
https://discuss.elastic.co/t/nested-aggregation-with-and-always-return-0-match/315722?u=chattes
I have used following aggregations
1. Terms aggregation
2. Bucket selector
3. Nested aggregation
First I have grouped by user id using terms aggregation. Then further grouped by skill Id. Using bucket selector I have filtered users which have documents under two skills.
Query
GET index5/_search
{
"size": 0,
"aggs": {
"users": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"skills": {
"nested": {
"path": "skills"
},
"aggs": {
"filter_skill": {
"terms": {
"field": "skills.id",
"size": 10,
"include": [
553,
426
]
}
}
}
},
"bucket_count": {
"bucket_selector": {
"buckets_path": {
"skill_count": "skills>filter_skill._bucket_count"
},
"script": "params.skill_count ==2"
}
}
}
}
}
}
Results
"aggregations" : {
"users" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 1,
"skills" : {
"doc_count" : 3,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
},
{
"key" : 2,
"doc_count" : 1,
"skills" : {
"doc_count" : 2,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
}
]
}

Elasticsearch sub agregation

With the following query, I get the minimum value in each chunk of 15 minutes. I use the moving_fn function. Now I need to get the maximum value in each chunk in 1 hour from the previous request. As I understand it cannot be used for aggregation after moving_fn. How can you do this?
This is my query:
GET logstash-2021.12.2*/_search
{
"query": {
"bool": {
"filter": [
{
"range": {
"#timestamp": {
"gte": "now-24h"
}
}
},
{
"bool": {
"should": [
{
"match_phrase": {
"company": "BLAH-BLAH"
}
}
]
}
}
]
}
},
"size": 0,
"aggs": {
"myDatehistogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"offset": "+30s"
}, "aggs": {
"the_count": {
"moving_fn": {
"buckets_path": "_count",
"window": 15,
"script": "MovingFunctions.min(values)"
}
}
}
}
}
}
My response:
"aggregations" : {
"myDatehistogram" : {
"buckets" : [
{
"key_as_string" : "2021-12-25T05:58:30.000Z",
"key" : 1640411910000,
"doc_count" : 1196,
"the_count" : {
"value" : null
}
},
{
"key_as_string" : "2021-12-25T05:59:30.000Z",
"key" : 1640411970000,
"doc_count" : 1942,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:00:30.000Z",
"key" : 1640412030000,
"doc_count" : 1802,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:01:30.000Z",
"key" : 1640412090000,
"doc_count" : 1735,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:02:30.000Z",
"key" : 1640412150000,
"doc_count" : 1699,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:03:30.000Z",
"key" : 1640412210000,
"doc_count" : 1506,
"the_count" : {
"value" : 1196.0
}
}
From this response, I need to get the maximum value for each hour. Thank you in advance
Just add a second agg:
"myDatehistogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"offset": "+30s"
}, "aggs": {
"min_15": {
"moving_fn": {
"buckets_path": "_count",
"window": 15,
"script": "MovingFunctions.min(values)"
}
}
"max_60": {
"moving_fn": {
"buckets_path": "_count",
"window": 60,
"script": "MovingFunctions.max(values)"
}
}
}
}

Getting avg sub aggregation

I'd like to get the avg of a sub aggregation. For example, i have daily profit of each branch. I want to sum them so that i can get total daily profit. and then i want to get the monthly or week average of that daily profit. So far i have done this
{
"size" : 0,
"aggs" : {
"group_by_month": {
"date_histogram": {
"field": "Profit_Day",
"interval": "month",
"format" : "MM-yyyy"
},
"aggs": {
"avgProf": {
"avg": {
"field": "ProfitValue"
}
},
"group_by_day": {
"date_histogram": {
"field": "Profit_Day",
"interval": "day",
"format" : "yyyy-MM-dd"
},
"aggs": {
"prof": {
"sum": {
"field": "ProfitValue"
}
}
}
}
}
}
}
}
Issue is i am getting daaily sum which is correct
but instead of getting monthly average of daily sum
i am getting monthly average of profit from each branch.
You need to use average bucket aggragetion
Query:
GET sales1/_search
{
"size": 0,
"aggs": {
"group_by_month": {
"date_histogram": {
"field": "proffit_day",
"interval": "month",
"format": "MM-yyyy"
},
"aggs": {
"group_by_day": {
"date_histogram": {
"field": "proffit_day",
"interval": "day",
"format": "yyyy-MM-dd"
},
"aggs": {
"prof": {
"sum": {
"field": "proffit_value"
}
}
}
},
"avg_monthly_sales": {
"avg_bucket": {
"buckets_path": "group_by_day>prof"
}
}
}
}
}
}
Response:
{
"group_by_month" : {
"buckets" : [
{
"key_as_string" : "09-2019",
"key" : 1567296000000,
"doc_count" : 2,
"group_by_day" : {
"buckets" : [
{
"key_as_string" : "2019-09-25",
"key" : 1569369600000,
"doc_count" : 2,
"prof" : {
"value" : 15.0
}
}
]
},
"avg_monthly_sales" : {
"value" : 15.0
}
},
{
"key_as_string" : "10-2019",
"key" : 1569888000000,
"doc_count" : 2,
"group_by_day" : {
"buckets" : [
{
"key_as_string" : "2019-10-01",
"key" : 1569888000000,
"doc_count" : 1,
"prof" : {
"value" : 10.0
}
},
{
"key_as_string" : "2019-10-02",
"key" : 1569974400000,
"doc_count" : 0,
"prof" : {
"value" : 0.0
}
},
{
"key_as_string" : "2019-10-03",
"key" : 1570060800000,
"doc_count" : 1,
"prof" : {
"value" : 15.0
}
}
]
},
"avg_monthly_sales" : {
"value" : 12.5
}
}
]
}
}
}

Resources