Bucket_selector aggregation and size. optimizations - elasticsearch

I have a question about the bucket_selector aggregation.
(Environment tested: ES6.8 and ES7 basic on centos7)
In my use case I need to drop documents that are duplicates by a selected property. The index is not big: about 2 million records.
Query to find those records looks like this:
GET index_id1/_search
{
"size": 0,
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"sameIds": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
},
"size": 1000
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totalCount": "byId._count"
},
"script": {
"source": "params.totalCount > 1"
}
}
}
}
}
}
}
}
}
I get the buckets back. To reduce the load of the query, I run it with size: 1000, and then issue the next query to get more dupes, repeating until no buckets come back.
The problem, however, is that this finds too few dupes. I checked the result of the query by setting size: 2000000:
GET index_id1/_search
{
"size": 0,
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"sameIds": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
},
"size": 2000000 <-- too big
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totalCount": "byId._count"
},
"script": {
"source": "params.totalCount > 1"
}
}
}
}
}
}
}
}
}
As I understand it, the first step actually creates all the buckets as stated in the query, and only then does bucket_selector filter them down to what I need. That is why I see this kind of behavior. In order to get all the buckets I have to raise "search.max_buckets" to 2000000.
I then converted it to a query with a composite aggregation:
GET index_id1/_search
{
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"compositeAgg": {
"composite": {
"after": {
"termsAgg": "03f10a7d-0162-4409-8647-c643274d6727"
},
"size": 1000,
"sources": [
{
"termsAgg": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
}
}
}
}
]
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"script": {
"source": "params.totalCount > 1"
},
"buckets_path": {
"totalCount": "byId._count"
}
}
}
}
}
}
}
},
"size": 0
}
As I understand it, this does the same thing, except that I need to make 2000 calls (size: 1000 each) to go over the whole index.
Does the composite aggregation cache the results, or why else is this better?
Is there a better approach in this case?

Related

How to define percentage of result items with specific field in Elasticsearch query?

I have a search query that returns all items matching users that have type manager or lead.
{
"from": 0,
"size": 20,
"query": {
"bool": {
"should": [
{
"terms": {
"type": ["manager", "lead"]
}
}
]
}
}
}
Is there a way to define what percentage of the results should be of type "manager"?
In other words, I want the results to have 80% of users with type manager and 20% with type lead.
I would suggest using the bucket_script pipeline aggregation (which references other aggregations through buckets_path). As far as I know, this aggregation needs to run as a sub-aggregation of a histogram aggregation. Since you have such a field in your mapping, I think this query should work for you:
{
"size": 0,
"aggs": {
"NAME": {
"date_histogram": {
"field": "my_datetime",
"interval": "month"
},
"aggs": {
"role_type": {
"terms": {
"field": "type",
"size": 10
},
"aggs": {
"count": {
"value_count": {
"field": "_id"
}
}
}
},
"role_1_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_1 / (params.role_1+params.role_2)*100"
}
},
"role_2_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_2 / (params.role_1+params.role_2)*100"
}
}
}
}
}
}
Please let me know if it didn't work well for you.

Percentile based filtering elastic search

I'm trying to calculate the 15th and 75th percentiles on an aggregated, derived field (latency), and then retrieve those records whose field value is > (p75-p15). I am able to calculate the aggs and the thresholds, but I am unable to filter out the required values. I tried the query below and am running into "buckets_path must reference either a number value or a single value numeric metric aggregation, got: java.lang.Object[]". I'm just trying to retrieve records with average latency > threshold. Any pointers?
"aggs": {
"by_name": {
"terms": {
"script": "doc['name'].value + ',' + doc['valf'].value",
"size": 5000
},
"aggs": {
"single_round_block": {
"date_histogram": {
"field": "start_time",
"interval": "300s"
},
"aggs": {
"overallSumLatency": {
"sum": {
"field": "sum_latency_ms"
}
},
"overallNumLatencyMeasurements": {
"sum": {
"field": "num_valid_latency_measurements"
}
},
"avgLatency": {
"bucket_script": {
"buckets_path": {
"sumLatency": "overallSumLatency",
"numPoints": "overallNumLatencyMeasurements"
},
"script": "(params.numPoints == 0)?0:(params.sumLatency / params.numPoints)"
}
}
}
},
"percentiles_vals": {
"percentiles_bucket": {
"buckets_path": "single_round_block>avgLatency",
"percents": [ 15.0,75.0]
}
},
"threshold":{
"bucket_script": {
"buckets_path": {
"perc75":"percentiles_vals[75.0]",
"perc15":"percentiles_vals[15.0]"
},
"script": "Math.abs(params.perc75 - params.perc15)"
}
},
"filter_out_records": {
"bucket_selector": {
"buckets_path": {
"threshold":"threshold",
"avgLatency":"single_round_block>avgLatency"
},
"script": "params.avgLatency > params.threshold"
}
}
}
}
}
}

Elasticsearch : How get result buckets size

Here is my query:
GET _search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"serviceName.keyword": "directory-view-service"
}
},
{
"match": {
"path": "thewall"
}
},
{
"range": {
"#timestamp": {
"from": "now-31d",
"to": "now"
}
}
}
]
}
},
"aggs": {
"by_day": {
"date_histogram": {
"field": "date",
"interval": "7d"
},
"aggs": {
"byUserUid": {
"terms": {
"field": "token_userId.keyword",
"size": 150000
},
"aggs": {
"filterByCallNumber": {
"bucket_selector": {
"buckets_path": {
"doc_count": "_count"
},
"script": {
"inline": "params.doc_count <= 1"
}
}
}
}
}
}
}
}
}
I want my query to return all users who called my endpoint at least once within a 1-month range, split into 7-day intervals. Up to this point everything works.
But my result is a list of buckets with 370 elements, and I just need to know the size of that array...
Is there a keyword for this, or how else can I handle it?
Thanks

How to display only the key from the bucket

I have an index with millions of documents. Suppose each of my documents has some code, and I need to find the list of codes matching some criteria. The only way I found of doing that is to use a whole lot of aggregations, so I created an ugly query which does exactly what I want:
POST my-index/_search
{
"query": {
"range": {
"timestamp": {
"gte": "2017-08-01T00:00:00.000",
"lt": "2017-08-08T00:00:00.000"
}
}
},
"size": 0,
"aggs": {
"codes": {
"terms": {
"field": "code",
"size": 10000
},
"aggs": {
"days": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"format": "dd"
},
"aggs": {
"hours": {
"date_histogram": {
"field": "timestamp",
"interval": "hour",
"format": "yyyy-MM-dd:HH"
},
"aggs": {
"hour_income": {
"sum": {
"field": "price"
}
}
}
},
"max_income": {
"max_bucket": {
"buckets_path": "hours>hour_income"
}
},
"day_income": {
"sum_bucket": {
"buckets_path": "hours.hour_income"
}
},
"more_than_sixty_percent": {
"bucket_script": {
"buckets_path": {
"dayIncome": "day_income",
"maxIncome": "max_income"
},
"script": "params.maxIncome - params.dayIncome * 60 / 100 > 0 ? 1 : 0"
}
}
}
},
"amount_of_days": {
"sum_bucket": {
"buckets_path": "days.more_than_sixty_percent"
}
},
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amountOfDays": "amount_of_days"
},
"script": "params.amountOfDays >= 3"
}
}
}
}
}
}
The response I get is a few million lines of JSON, consisting of buckets. Each bucket has more than 700 lines (and buckets of its own), but all I need is its key, so that I have my list of codes. I guess it's not good to have a response a few thousand times larger than necessary, and there might be problems with parsing. So I wanted to ask: is there any way to hide the other info in the bucket and get only the keys?
Thanks.

Can We Apply Bucket Selector Aggregation on Nested Aggregation in ElasticSearch?

I want to apply a pipeline aggregation (the bucket selector aggregation) to a nested field aggregation in Elasticsearch 2.4. I want to do something similar to the query below, but I have not been successful. Could you please tell me whether it is possible to use a pipeline aggregation on a nested field?
{
"size": 0,
"aggregations": {
"totalPaidAmount": {
"nested": {
"path": "count"
},
"aggregations": {
"paidAmountTotal": {
"sum": {
"field": "count.totalPaidAmount"
}
},
"paidAmount_filter": {
"bucket_selector": {
"script": {
"inline": "amount > 5000000"
},
"buckets_path": {
"amount": "paidAmountTotal"
}
}
}
}
}
}
}
I found the solution for the query. The bucket selector aggregation should actually be a sibling of (parallel to) the nested aggregation, and the path into the nested aggregation should be referenced with '>', as shown below:
{
"size": 0,
"aggregations": {
"amount": {
"terms": {
"field": "countId",
"size": 0
},
"aggregations": {
"totalPaidAmount": {
"nested": {
"path": "count"
},
"aggregations": {
"paidAmountTotal": {
"sum": {
"field": "count.totalPaidAmount"
}
}
}
},
"paidAmount_filter": {
"bucket_selector": {
"script": {
"inline": "amount > 1000"
},
"buckets_path": {
"amount": "totalPaidAmount>paidAmountTotal"
}
}
}
}
}
}
}
You are missing the params prefix in the script value. So paidAmount_filter should look like:
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amount": "paidAmountTotal"
},
"script": "params.amount > 5000000"
}
}

Resources