Elasticsearch sub-aggregation - elasticsearch

With the following query, I get the minimum value in each 15-minute chunk, using the moving_fn function. Now I also need the maximum value in each one-hour chunk from the same request. As I understand it, the output of moving_fn cannot be used as the input of another aggregation. How can this be done?
This is my query:
GET logstash-2021.12.2*/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "range": {
            "@timestamp": {
              "gte": "now-24h"
            }
          }
        },
        {
          "bool": {
            "should": [
              {
                "match_phrase": {
                  "company": "BLAH-BLAH"
                }
              }
            ]
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "myDatehistogram": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "1m",
        "offset": "+30s"
      },
      "aggs": {
        "the_count": {
          "moving_fn": {
            "buckets_path": "_count",
            "window": 15,
            "script": "MovingFunctions.min(values)"
          }
        }
      }
    }
  }
}
My response:
"aggregations" : {
"myDatehistogram" : {
"buckets" : [
{
"key_as_string" : "2021-12-25T05:58:30.000Z",
"key" : 1640411910000,
"doc_count" : 1196,
"the_count" : {
"value" : null
}
},
{
"key_as_string" : "2021-12-25T05:59:30.000Z",
"key" : 1640411970000,
"doc_count" : 1942,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:00:30.000Z",
"key" : 1640412030000,
"doc_count" : 1802,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:01:30.000Z",
"key" : 1640412090000,
"doc_count" : 1735,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:02:30.000Z",
"key" : 1640412150000,
"doc_count" : 1699,
"the_count" : {
"value" : 1196.0
}
},
{
"key_as_string" : "2021-12-25T06:03:30.000Z",
"key" : 1640412210000,
"doc_count" : 1506,
"the_count" : {
"value" : 1196.0
}
}
From this response, I need to get the maximum value for each hour. Thanks in advance.

Just add a second agg:
"myDatehistogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"offset": "+30s"
}, "aggs": {
"min_15": {
"moving_fn": {
"buckets_path": "_count",
"window": 15,
"script": "MovingFunctions.min(values)"
}
}
"max_60": {
"moving_fn": {
"buckets_path": "_count",
"window": 60,
"script": "MovingFunctions.max(values)"
}
}
}
}
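Note that moving_fn uses a trailing window: max_60 above is the maximum of the per-minute counts over the previous 60 one-minute buckets, not over calendar hours. If you want the maximum per calendar hour instead, a sketch of one alternative (per_hour, per_minute, and max_per_minute_count are illustrative names; fixed_interval assumes Elasticsearch 7.x, use interval on older versions) is to nest the minute histogram inside an hourly one and take a max_bucket pipeline over it:

"aggs": {
  "per_hour": {
    "date_histogram": {
      "field": "@timestamp",
      "fixed_interval": "1h"
    },
    "aggs": {
      "per_minute": {
        "date_histogram": {
          "field": "@timestamp",
          "fixed_interval": "1m"
        }
      },
      "max_per_minute_count": {
        "max_bucket": {
          "buckets_path": "per_minute>_count"
        }
      }
    }
  }
}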

Related

Query filter for searching rollup index works with epoch time, fails with date math

How do we query (filter) a rollup index?
For example, based on the query here:
Request:
{
  "size": 0,
  "aggregations": {
    "timeline": {
      "date_histogram": {
        "field": "timestamp",
        "fixed_interval": "7d"
      },
      "aggs": {
        "nodes": {
          "terms": {
            "field": "node"
          },
          "aggs": {
            "max_temperature": {
              "max": {
                "field": "temperature"
              }
            },
            "avg_voltage": {
              "avg": {
                "field": "voltage"
              }
            }
          }
        }
      }
    }
  }
}
Response:
{
  "took" : 93,
  "timed_out" : false,
  "terminated_early" : false,
  "_shards" : ... ,
  "hits" : {
    "total" : {
      "value": 0,
      "relation": "eq"
    },
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "timeline" : {
      "buckets" : [
        {
          "key_as_string" : "2018-01-18T00:00:00.000Z",
          "key" : 1516233600000,
          "doc_count" : 6,
          "nodes" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "a",
                "doc_count" : 2,
                "max_temperature" : {
                  "value" : 202.0
                },
                "avg_voltage" : {
                  "value" : 5.1499998569488525
                }
              },
              {
                "key" : "b",
                "doc_count" : 2,
                "max_temperature" : {
                  "value" : 201.0
                },
                "avg_voltage" : {
                  "value" : 5.700000047683716
                }
              },
              {
                "key" : "c",
                "doc_count" : 2,
                "max_temperature" : {
                  "value" : 202.0
                },
                "avg_voltage" : {
                  "value" : 4.099999904632568
                }
              }
            ]
          }
        }
      ]
    }
  }
}
How do I filter, say, the last 3 days? Is it possible?
For a test case, I used a fixed_interval of 1m (one minute, and also 60 minutes). I tried the following, and the error was that all query shards failed. Is it possible to query-filter rollup aggregations?
Test query for searching the rollup index:
{
  "size": 0,
  "query": {
    "range": {
      "timestamp": {
        "gte": "now-3d/d",
        "lt": "now/d"
      }
    }
  },
  "aggregations": {
    "timeline": {
      "date_histogram": {
        "field": "timestamp",
        "fixed_interval": "7d"
      },
      "aggs": {
        "nodes": {
          "terms": {
            "field": "node"
          },
          "aggs": {
            "max_temperature": {
              "max": {
                "field": "temperature"
              }
            },
            "avg_voltage": {
              "avg": {
                "field": "voltage"
              }
            }
          }
        }
      }
    }
  }
}
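One detail worth checking (an assumption based on the rollup docs, not something confirmed above): rolled-up data is normally queried through the _rollup_search endpoint rather than a plain _search, and that endpoint accepts only a limited query subset (term, terms, range, match_all, and some compound queries) on fields that were grouped in the rollup job. A minimal sketch, assuming the job grouped on timestamp and that my_rollup_index is a placeholder name:

GET my_rollup_index/_rollup_search
{
  "size": 0,
  "query": {
    "range": {
      "timestamp": {
        "gte": "now-3d/d",
        "lt": "now/d"
      }
    }
  },
  "aggregations": {
    "timeline": {
      "date_histogram": {
        "field": "timestamp",
        "fixed_interval": "7d"
      }
    }
  }
}

The date_histogram interval in the search must also be a multiple of the interval configured in the rollup job.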

How to exclude buckets having a doc count equal to 0

I want to exclude those buckets whose doc count is equal to 0 from the date histogram aggregation response, and then get the count of the remaining buckets.
The query is:
GET metricbeat-*/_search
{
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        {
          "range": {
            "host.cpu.usage": {
              "gte": 0.8
            }
          }
        },
        {
          "range": {
            "@timestamp": {
              "gte": "2022-09-22T10:16:00.000Z",
              "lte": "2022-09-22T10:18:00.000Z"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "hostName": {
      "terms": {
        "field": "host.name"
      },
      "aggs": {
        "docsOverTimeFrame": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "10s"
          }
        },
        "min_bucket_selector": {
          "bucket_selector": {
            "buckets_path": {
              "count": "docsOverTimeFrame._bucket_count"
            },
            "script": {
              "source": "params.count == 12"
            }
          }
        }
      }
    }
  }
}
The response that I get right now is:
{
  "took" : 8,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 38,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "hostName" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "datahot01",
          "doc_count" : 3,
          "docsOverTimeFrame" : {
            "buckets" : [
              {
                "key_as_string" : "2022-09-22T10:16:00.000Z",
                "key" : 1663841760000,
                "doc_count" : 1
              },
              {
                "key_as_string" : "2022-09-22T10:16:10.000Z",
                "key" : 1663841770000,
                "doc_count" : 1
              },
              {
                "key_as_string" : "2022-09-22T10:16:20.000Z",
                "key" : 1663841780000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:16:30.000Z",
                "key" : 1663841790000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:16:40.000Z",
                "key" : 1663841800000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:16:50.000Z",
                "key" : 1663841810000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:00.000Z",
                "key" : 1663841820000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:10.000Z",
                "key" : 1663841830000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:20.000Z",
                "key" : 1663841840000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:30.000Z",
                "key" : 1663841850000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:40.000Z",
                "key" : 1663841860000,
                "doc_count" : 0
              },
              {
                "key_as_string" : "2022-09-22T10:17:50.000Z",
                "key" : 1663841870000,
                "doc_count" : 0
              }
            ]
          }
        }
      ]
    }
  }
}
So, if I can exclude the buckets that have doc count = 0, then, on the basis of the number of buckets (that is, the bucket count), I want to check whether the count of buckets formed is equal to 12 or not (which I am doing using the bucket_selector aggregation).
Is there some way to exclude the buckets having doc count = 0, and get a bucket count of 2 instead of 12?
I was able to solve the above use case by using a pipeline aggregation (a bucket_selector aggregation) inside the date histogram aggregation.
The modified query is:
{
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "@timestamp": {
              "gte": "2022-09-22T10:16:00.000Z",
              "lte": "2022-09-22T10:22:00.000Z"
            }
          }
        },
        {
          "range": {
            "system.cpu.total.norm.pct": {
              "gte": 0.8
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "hostName": {
      "terms": {
        "field": "host.name"
      },
      "aggs": {
        "docsOverTimeFrame": {
          "date_histogram": {
            "field": "@timestamp",
            "fixed_interval": "10s"
          },
          "aggs": {
            "histogram_doc_count": {
              "bucket_selector": {
                "buckets_path": {
                  "the_doc_count": "_count"
                },
                "script": "params.the_doc_count > 0"
              }
            }
          }
        },
        "min_bucket_selector": {
          "bucket_selector": {
            "buckets_path": {
              "count": "docsOverTimeFrame._bucket_count"
            },
            "script": {
              "source": "params.count == 12"
            }
          }
        }
      }
    }
  }
}

Nested filter in Elasticsearch aggregation query

I am running the following aggregation query with a nested filter:
GET <indexname>/_search
{
  "aggs": {
    "NAME": {
      "nested": {
        "path": "crm.LeadStatusHistory"
      },
      "aggs": {
        "agg_filter": {
          "filter": {
            "bool": {
              "must": [
                {
                  "nested": {
                    "path": "crm",
                    "query": {
                      "terms": {
                        "crm.City.keyword": [
                          "Rewa"
                        ]
                      }
                    }
                  }
                },
                {
                  "nested": {
                    "path": "crm",
                    "query": {
                      "terms": {
                        "crm.LeadID": [
                          27961
                        ]
                      }
                    }
                  }
                }
              ]
            }
          },
          "aggs": {
            "agg_terms": {
              "terms": {
                "field": "crm.LeadStatusHistory.StatusID",
                "size": 1000
              }
            }
          }
        }
      }
    }
  }
}
I have the following document:
{
  "_index" : "crm",
  "_type" : "_doc",
  "_id" : "4478",
  "_score" : 1.0,
  "_source" : {
    "crm" : [
      {
        "LeadStatusHistory" : [
          { "StatusID" : 3 },
          { "StatusID" : 2 },
          { "StatusID" : 1 }
        ],
        "LeadID" : 27961,
        "City" : "Rewa"
      },
      {
        "LeadStatusHistory" : [
          { "StatusID" : 1 },
          { "StatusID" : 3 },
          { "StatusID" : 2 }
        ],
        "LeadID" : 27959,
        "City" : "Rewa"
      }
    ]
  }
}
However, in the response I am getting the following result:
"aggregations" : {
"NAME" : {
"doc_count" : 4332,
"agg_filter" : {
"doc_count" : 1,
"agg_terms" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 1
}
]
}
}
}
}
Question: As per the source document, I have 3 nested 'crm.LeadStatusHistory' documents for crm.LeadID = 27961. However, the result shows a doc_count of 1 for agg_filter instead of 3. Can you please explain the reason for this?
Your agg_filter runs in the crm.LeadStatusHistory nested context, so it targets only 1 doc (each LeadStatusHistory entry is its own nested doc, which in your case links back to the other docs).
I built a query that shows this, and I think it will answer your problem. You will see the different doc_count for each aggregation.
{
  "size": 0,
  "aggs": {
    "NAME": {
      "nested": {
        "path": "crm"
      },
      "aggs": {
        "agg_LeadID": {
          "terms": {
            "field": "crm.LeadID"
          },
          "aggs": {
            "agg_LeadStatusHistory": {
              "nested": {
                "path": "crm.LeadStatusHistory"
              },
              "aggs": {
                "home_type_name": {
                  "terms": {
                    "field": "crm.LeadStatusHistory.StatusID"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
With this one you can count them with a script (and filter if needed):
{
  "size": 0,
  "aggs": {
    "NAME": {
      "nested": {
        "path": "crm"
      },
      "aggs": {
        "agg_LeadID": {
          "terms": {
            "field": "crm.LeadID"
          },
          "aggs": {
            "agg_LeadStatusHistory": {
              "nested": {
                "path": "crm.LeadStatusHistory"
              },
              "aggs": {
                "agg_LeadStatusHistory_sum": {
                  "sum": {
                    "script": "doc['crm.LeadStatusHistory.StatusID'].values.length"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
Note: if you want to get the number of nested documents, take a look at inner_hits:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-inner-hits
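A minimal sketch of that, with inner_hits added to a nested query (match_all here is just a placeholder for whatever query you need):

GET crm/_search
{
  "query": {
    "nested": {
      "path": "crm.LeadStatusHistory",
      "query": {
        "match_all": {}
      },
      "inner_hits": {}
    }
  }
}

Each hit then carries an inner_hits section whose total shows how many nested docs matched.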
I disagree with the claim that 'crm.LeadStatusHistory' is one doc. I have run an aggregation query on crm.LeadStatusHistory without filters:
GET crm/_search
{
  "_source": ["crm.LeadID", "crm.LeadStatusHistory.StatusID", "crm.City"],
  "size": 10000,
  "query": {
    "nested": {
      "path": "crm",
      "query": {
        "match": {
          "crm.LeadID": "27961"
        }
      }
    }
  },
  "aggs": {
    "agg_statuscount": {
      "nested": {
        "path": "crm.LeadStatusHistory"
      },
      "aggs": {
        "agg_terms": {
          "terms": {
            "field": "crm.LeadStatusHistory.StatusID",
            "size": 1000
          }
        }
      }
    }
  }
}
I get the following response from the above query, which shows 'agg_statuscount' as 6 docs without filters:
{
  "took" : 6,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "crm",
        "_type" : "_doc",
        "_id" : "4478",
        "_score" : 1.0,
        "_source" : {
          "crm" : [
            {
              "LeadStatusHistory" : [
                { "StatusID" : 3 },
                { "StatusID" : 2 },
                { "StatusID" : 1 }
              ],
              "LeadID" : 27961,
              "City" : "Rewa"
            },
            {
              "LeadStatusHistory" : [
                { "StatusID" : 1 },
                { "StatusID" : 3 },
                { "StatusID" : 2 }
              ],
              "LeadID" : 27959,
              "City" : "Rewa"
            }
          ]
        }
      }
    ]
  },
  "aggregations" : {
    "agg_statuscount" : {
      "doc_count" : 6,
      "agg_terms" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : 1,
            "doc_count" : 2
          },
          {
            "key" : 2,
            "doc_count" : 2
          },
          {
            "key" : 3,
            "doc_count" : 2
          }
        ]
      }
    }
  }
}
Hence, with crm.LeadID = 27961 in the aggregation filter, I expected 3 'crm.LeadStatusHistory' docs. Currently the response is 1, as in my original question.
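For what it's worth, a sketch of one way to get all 3 'crm.LeadStatusHistory' docs for a single lead, assuming both crm and crm.LeadStatusHistory are mapped as nested (crm_docs, lead_filter, and status_history are illustrative names): filter inside the crm nested context first, and only then step down into crm.LeadStatusHistory:

GET crm/_search
{
  "size": 0,
  "aggs": {
    "crm_docs": {
      "nested": {
        "path": "crm"
      },
      "aggs": {
        "lead_filter": {
          "filter": {
            "bool": {
              "must": [
                { "term": { "crm.City.keyword": "Rewa" } },
                { "term": { "crm.LeadID": 27961 } }
              ]
            }
          },
          "aggs": {
            "status_history": {
              "nested": {
                "path": "crm.LeadStatusHistory"
              },
              "aggs": {
                "agg_terms": {
                  "terms": {
                    "field": "crm.LeadStatusHistory.StatusID",
                    "size": 1000
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

With the sample document above, lead_filter should match the one crm doc with LeadID 27961, and status_history should then report a doc_count of 3.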

Getting avg sub aggregation

I'd like to get the average of a sub-aggregation. For example, I have the daily profit of each branch. I want to sum them so that I can get the total daily profit, and then get the monthly or weekly average of that daily profit. So far I have done this:
{
  "size": 0,
  "aggs": {
    "group_by_month": {
      "date_histogram": {
        "field": "Profit_Day",
        "interval": "month",
        "format": "MM-yyyy"
      },
      "aggs": {
        "avgProf": {
          "avg": {
            "field": "ProfitValue"
          }
        },
        "group_by_day": {
          "date_histogram": {
            "field": "Profit_Day",
            "interval": "day",
            "format": "yyyy-MM-dd"
          },
          "aggs": {
            "prof": {
              "sum": {
                "field": "ProfitValue"
              }
            }
          }
        }
      }
    }
  }
}
The issue is that I am getting the daily sum, which is correct, but instead of getting the monthly average of the daily sum, I am getting the monthly average of the profit of each branch.
You need to use the avg_bucket (average bucket) aggregation.
Query:
GET sales1/_search
{
  "size": 0,
  "aggs": {
    "group_by_month": {
      "date_histogram": {
        "field": "proffit_day",
        "interval": "month",
        "format": "MM-yyyy"
      },
      "aggs": {
        "group_by_day": {
          "date_histogram": {
            "field": "proffit_day",
            "interval": "day",
            "format": "yyyy-MM-dd"
          },
          "aggs": {
            "prof": {
              "sum": {
                "field": "proffit_value"
              }
            }
          }
        },
        "avg_monthly_sales": {
          "avg_bucket": {
            "buckets_path": "group_by_day>prof"
          }
        }
      }
    }
  }
}
Response:
{
  "group_by_month" : {
    "buckets" : [
      {
        "key_as_string" : "09-2019",
        "key" : 1567296000000,
        "doc_count" : 2,
        "group_by_day" : {
          "buckets" : [
            {
              "key_as_string" : "2019-09-25",
              "key" : 1569369600000,
              "doc_count" : 2,
              "prof" : {
                "value" : 15.0
              }
            }
          ]
        },
        "avg_monthly_sales" : {
          "value" : 15.0
        }
      },
      {
        "key_as_string" : "10-2019",
        "key" : 1569888000000,
        "doc_count" : 2,
        "group_by_day" : {
          "buckets" : [
            {
              "key_as_string" : "2019-10-01",
              "key" : 1569888000000,
              "doc_count" : 1,
              "prof" : {
                "value" : 10.0
              }
            },
            {
              "key_as_string" : "2019-10-02",
              "key" : 1569974400000,
              "doc_count" : 0,
              "prof" : {
                "value" : 0.0
              }
            },
            {
              "key_as_string" : "2019-10-03",
              "key" : 1570060800000,
              "doc_count" : 1,
              "prof" : {
                "value" : 15.0
              }
            }
          ]
        },
        "avg_monthly_sales" : {
          "value" : 12.5
        }
      }
    ]
  }
}

Elasticsearch: Selecting multiple values in aggregates

In Elasticsearch I have the following index with 'allocated_bytes', 'total_bytes', and other fields:
{
  "_index" : "metrics-blockstore_capacity-2017_06",
  "_type" : "datapoint",
  "_id" : "AVzHwgsi9KuwEU6jCXy5",
  "_score" : 1.0,
  "_source" : {
    "timestamp" : 1498000001000,
    "resource_guid" : "2185d15c-5298-44ac-8646-37575490125d",
    "allocated_bytes" : 1.159196672E9,
    "resource_type" : "machine",
    "total_bytes" : 1.460811776E11,
    "machine" : "2185d15c-5298-44ac-8646-37575490125d"
  }
}
I have the following query to:
1) get a point for each 30-minute interval using date_histogram,
2) group by the resource_guid field,
3) find the max value with a max aggregation.
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "timestamp": {
              "gte": 1497992400000,
              "lte": 1497996000000
            }
          }
        }
      ]
    }
  },
  "aggregations": {
    "groupByTime": {
      "date_histogram": {
        "field": "timestamp",
        "interval": "30m",
        "order": {
          "_key": "desc"
        }
      },
      "aggregations": {
        "groupByField": {
          "terms": {
            "size": 1000,
            "field": "resource_guid"
          },
          "aggregations": {
            "maxValue": {
              "max": {
                "field": "allocated_bytes"
              }
            }
          }
        },
        "sumUnique": {
          "sum_bucket": {
            "buckets_path": "groupByField>maxValue"
          }
        }
      }
    }
  }
}
With this query I am able to get only allocated_bytes, but I need both allocated_bytes and total_bytes at the result point.
The following is the result from the above query:
{
  "key_as_string" : "2017-06-20T21:00:00.000Z",
  "key" : 1497992400000,
  "doc_count" : 9,
  "groupByField" : {
    "doc_count_error_upper_bound" : 0,
    "sum_other_doc_count" : 0,
    "buckets" : [
      {
        "key" : "2185d15c-5298-44ac-8646-37575490125d",
        "doc_count" : 3,
        "maxValue" : {
          "value" : 1.156182016E9
        }
      },
      {
        "key" : "c3513cdd-58bb-4f8e-9b4c-467230b4f6e2",
        "doc_count" : 3,
        "maxValue" : {
          "value" : 1.156165632E9
        }
      },
      {
        "key" : "eff13403-9737-4d08-9dca-fb6c12c3a6fa",
        "doc_count" : 3,
        "maxValue" : {
          "value" : 1.156182016E9
        }
      }
    ]
  },
  "sumUnique" : {
    "value" : 3.468529664E9
  }
}
I do need both allocated_bytes and total_bytes. How do I get multiple fields (allocated_bytes, total_bytes) for each point? For example:
"sumUnique" : {
  "Allocatedvalue" : 3.468529664E9,
  "TotalValue" : 9.468529664E9
}
or like this:
"allocatedBytessumUnique" : {
  "value" : 3.468529664E9
},
"totalBytessumUnique" : {
  "value" : 9.468529664E9
}
You can just add another aggregation:
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "timestamp": {
              "gte": 1497992400000,
              "lte": 1497996000000
            }
          }
        }
      ]
    }
  },
  "aggregations": {
    "groupByTime": {
      "date_histogram": {
        "field": "timestamp",
        "interval": "30m",
        "order": {
          "_key": "desc"
        }
      },
      "aggregations": {
        "groupByField": {
          "terms": {
            "size": 1000,
            "field": "resource_guid"
          },
          "aggregations": {
            "maxValueAllocated": {
              "max": {
                "field": "allocated_bytes"
              }
            },
            "maxValueTotal": {
              "max": {
                "field": "total_bytes"
              }
            }
          }
        },
        "sumUniqueAllocatedBytes": {
          "sum_bucket": {
            "buckets_path": "groupByField>maxValueAllocated"
          }
        },
        "sumUniqueTotalBytes": {
          "sum_bucket": {
            "buckets_path": "groupByField>maxValueTotal"
          }
        }
      }
    }
  }
}
Note that sum_bucket operates on sibling aggregations only; in this case it gives the sum of the max values, not the sum of total_bytes. If you want the sum of total_bytes, you can use a sum aggregation.
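For example, a minimal sketch of such a sum, added as one more sibling aggregation next to sumUniqueAllocatedBytes (totalBytesSum is an illustrative name):

"totalBytesSum": {
  "sum": {
    "field": "total_bytes"
  }
}

Note this sums total_bytes over every document in the 30m bucket, so repeated datapoints per machine are each counted, unlike the max-per-machine approach above.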
