How to use pipeline aggs on elasticsearch top aggs - elasticsearch

I'd like to filter the Elasticsearch aggregation result on the first aggregation. At first I thought that a bucket_selector placed in the sub-aggregations would filter the inner aggregation, but I found that it actually works on the buckets of the first (outer) aggregation, so I wondered how I could filter on the inner aggregation instead. I tried putting the bucket_selector as a sibling of the first aggregation, which didn't work; I also tried putting it as a sibling of the inner aggregation and adjusting the buckets_path, but couldn't find a way to make it work.
My working aggregation, which filters the result of the first (outer) aggregation:
GET network/_search
{
"size": 0,
"aggs": {
"asns": {
"terms": {
"field": "asn",
"size": 100000
},
"aggs": {
"users": {
"terms": {
"field": "user.keyword",
"size": 1000
}
},
"asns_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"num": "_count"
},
"script": "params.num <= 200"
}
}
}
}
}
}
I read the buckets_path syntax documentation at https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-pipeline.html#buckets-path-syntax and tried several variations, such as this one:
GET network/_search
{
"size": 0,
"aggs": {
"asns": {
"terms": {
"field": "asn",
"size": 100000
},
"aggs": {
"users": {
"terms": {
"field": "user.keyword",
"size": 1000
}
},
"asns_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"num": "asns>users._count"
},
"script": "params.num <= 200"
}
}
}
}
}
}

Related

How to define percentage of result items with specific field in Elasticsearch query?

I have a search query that returns all items matching users that have type manager or lead.
{
"from": 0,
"size": 20,
"query": {
"bool": {
"should": [
{
"terms": {
"type": ["manager", "lead"]
}
}
]
}
}
}
Is there a way to define what percentage of the results should be of type "manager"?
In other words, I want the results to have 80% of users with type manager and 20% with type lead.
I suggest using a bucket_script pipeline aggregation with a buckets_path. As far as I know, this aggregation needs to run as a sub-aggregation of a multi-bucket aggregation such as a histogram. Since you have such a field in your mapping, I think this query should work for you:
{
"size": 0,
"aggs": {
"NAME": {
"date_histogram": {
"field": "my_datetime",
"interval": "month"
},
"aggs": {
"role_type": {
"terms": {
"field": "type",
"size": 10
},
"aggs": {
"count": {
"value_count": {
"field": "_id"
}
}
}
},
"role_1_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_1 / (params.role_1+params.role_2)*100"
}
},
"role_2_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_2 / (params.role_1+params.role_2)*100"
}
}
}
}
}
}
Please let me know if it didn't work well for you.

ElasticSearch: Filter by distinct count during aggregation

The following query returns distinct Ids in order by largest distinct count of Ids. What I would like to do is "include only those IDs for which total number of documents is less than 2000"
{
"size": "0",
"query": {
"range": {
"@timestamp": {
"gte": "2020-10-20T00:00:00",
"lt": "2020-10-21T00:00:00"
}
}
},
"aggs": {
"ids": {
"terms": {
"field": "Id.keyword",
"size": 1000
}
}
}
}
I tried adding filter by 'doc_count' but that didn't help. How do I do this?
You can filter the buckets using the bucket_selector aggregation.
From the documentation: the Bucket Selector Aggregation is "a parent pipeline aggregation which executes a script which determines whether the current bucket will be retained in the parent multi-bucket aggregation."
{
"size": "0",
"query": {
"range": {
"@timestamp": {
"gte": "2020-10-20T00:00:00",
"lt": "2020-10-21T00:00:00"
}
}
},
"aggs": {
"ids": {
"terms": {
"field": "Id.keyword",
"size": 1000
},
"aggs": {
"count_filter": {
"bucket_selector": {
"buckets_path": {
"values": "_count"
},
"script": "params.values < 2000" <-- note this
}
}
}
}
}
}

How to mention from and size for the first level of elastic search aggregation in nested aggregation?

I have written a query that gets the buckets based on id and then sorts them. This works fine. But how can I make it return the buckets from position 100 to 200 for the aggregation_by_id aggregation?
{
"query": {
"match_all": {}
},
"size": 0,
"aggregations": {
"aggregation_by_id": {
"terms": {
"field": "id.keyword",
"size" : 200
},
"aggs": {
"sort_timestamp": {
"top_hits": {
"sort": [{
"timestamp": {
"order": "desc",
"unmapped_type": "long"
}
}],
"size": 1
}
}
}
}
}
}

Elasticsearch term aggregation and range with timestamp

I'm trying to count # of logs grouped by user agent.
This is what I have.
GET /myindex/_search
{
"size": 30,
"stored_fields": ["req.headers.user-agent.keyword"],
"aggs": {
"group_by_userAgent": {
"terms": {
"field": "req.headers.user-agent.keyword"
}
}
}
}
I wanted to add a "Query last 15 mins" feature. I tried to add a 'range' query and ended up with the following query, which does not work.
GET /myindex/_search
{
"size": 30,
"stored_fields": ["req.headers.user-agent.keyword"],
"aggs": {
"group_by_userAgent": {
"terms": {
"field": "req.headers.user-agent.keyword"
},
"range": {
"timestamp": {
"gt": "now-15m"
}
}
}
}
}
How do I combine a terms aggregation with a range restriction using the "now-15m" date-math syntax?
The range should go inside the query section, not inside aggs. The time range itself is good as it is.
I think what you're looking for is this: the number of docs in the first 30 user-agent buckets, i.e. the top 30 user agents producing the most logs.
GET /myindex/_search
{
"size": 0,
"query": {
"range": {
"@timestamp": {
"gt": "now-15m"
}
}
},
"aggs": {
"group_by_userAgent": {
"terms": {
"field": "req.headers.user-agent.keyword",
"size": 30
}
}
}
}
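You can achieve the aggregation results for user-agent in two ways: either wrap the terms aggregation in a filter aggregation that applies the range, or put the range in the top-level query.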
POST phrase_index/_search
{
"aggs": {
"date_range_filtered_agg": {
"filter": {
"range": {
"timestamp": {
"gte": "now-15m/m"
}
}
},
"aggs": {
"group_by_userAgent": {
"terms": {
"field": "req.headers.user-agent.keyword",
"size": 10
}
}
}
}
},
"size": 30,
"stored_fields": ["req.headers.user-agent.keyword"]
}
POST phrase_index/_search
{
"query": {
"range": {
"timestamp": {
"gte": "now-15m/m"
}
}
},
"aggs": {
"group_by_userAgent": {
"terms": {
"field": "req.headers.user-agent.keyword",
"size": 10
}
}
},
"size": 30,
"stored_fields": ["req.headers.user-agent.keyword"]
}
You need a filter aggregation first to apply the range query, then add a terms sub-aggregation.
See: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-filter-aggregation.html

bucket script not working - elasticsearch 2.4.2

I have tried to subtract the value of one aggregation from another:
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"total_query_id": {
"sum": {
"field": "query_id"
}
},
"total_num_results": {
"sum": {
"field": "num_results"
}
},
"minus_value": {
"bucket_script": {
"buckets_path": {
"qid": "total_query_id",
"nrs": "total_num_results"
},
"script": "qid - nrs"
}
}
}
}
It throws the following error:
"reason": "Invalid pipeline aggregation named [minus_value] of type [bucket_script]. Only sibling pipeline aggregations are allowed at the top level"
I have moved the minus_value node back and forth within the aggs node, but that does not solve my problem.
Can anyone help me with this?
The idea is that pipeline aggregations must work on a parent bucket aggregation.
It is not the case in your example, so you must have one parent aggregation. Since you have a match_all query, you could try using a global bucket aggregation and then embed your 3 aggregations inside it, like this:
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"all": {
"global": {},
"aggs": {
"total_query_id": {
"sum": {
"field": "query_id"
}
},
"total_num_results": {
"sum": {
"field": "num_results"
}
},
"minus_value": {
"bucket_script": {
"buckets_path": {
"qid": "total_query_id",
"nrs": "total_num_results"
},
"script": "qid - nrs"
}
}
}
}
}
}

Resources