I have a question about the bucket_selector aggregation.
(Environment tested: ES6.8 and ES7 basic on centos7)
In my use case I need to drop documents if there are dupes by a selected property. The index is not big — about 2 million records.
Query to find those records looks like this:
GET index_id1/_search
{
"size": 0,
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"sameIds": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
},
"size": 1000
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totalCount": "byId._count"
},
"script": {
"source": "params.totalCount > 1"
}
}
}
}
}
}
}
}
}
I get the buckets back, but to relax the query and reduce the load, I do it with size: 1000. So the next query is issued to get more dupes, and so on until zero are returned.
The problem, however, is that the amount of dupes returned is too small. I checked the result of the query by setting size: 2000000:
GET index_id1/_search
{
"size": 0,
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"sameIds": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
},
"size": 2000000 <-- too big
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totalCount": "byId._count"
},
"script": {
"source": "params.totalCount > 1"
}
}
}
}
}
}
}
}
}
As I understand it, the first step is: it actually creates the buckets as stated in the query, and then bucket_selector filters only what I need. And that's why I see this kind of behavior. In order to get all the buckets I have to adjust "search.max_buckets" to 2000000.
Converted to query with composite aggregation:
GET index_id1/_search
{
"aggs": {
"byNested": {
"nested": {
"path": "nestedObjects"
},
"aggs": {
"compositeAgg": {
"composite": {
"after": {
"termsAgg": "03f10a7d-0162-4409-8647-c643274d6727"
},
"size": 1000,
"sources": [
{
"termsAgg": {
"terms": {
"script": {
"lang": "painless",
"source": "return doc['nestedObjects.id'].value"
}
}
}
}
]
},
"aggs": {
"byId": {
"reverse_nested": {}
},
"byId_bucket_filter": {
"bucket_selector": {
"script": {
"source": "params.totalCount > 1"
},
"buckets_path": {
"totalCount": "byId._count"
}
}
}
}
}
}
}
},
"size": 0
}
As I understand it, this does the same thing, except that I need to make 2000 calls (size: 1000 each) to go over the whole index.
Does the composite agg cache the results, or why is this better?
Maybe there is a better approach in this case?
Related
I have a search query that returns all items matching users that have type manager or lead.
{
"from": 0,
"size": 20,
"query": {
"bool": {
"should": [
{
"terms": {
"type": ["manager", "lead"]
}
}
]
}
}
}
Is there a way to define what percentage of the results should be of type "manager"?
In other words, I want the results to have 80% of users with type manager and 20% with type lead.
I want to suggest using the bucket_script aggregation. As far as I know, this aggregation needs to run in the sub-aggs of a histogram aggregation. Since you have such a field in your mapping, I think this query should work for you:
{
"size": 0,
"aggs": {
"NAME": {
"date_histogram": {
"field": "my_datetime",
"interval": "month"
},
"aggs": {
"role_type": {
"terms": {
"field": "type",
"size": 10
},
"aggs": {
"count": {
"value_count": {
"field": "_id"
}
}
}
},
"role_1_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_1 / (params.role_1+params.role_2)*100"
}
},
"role_2_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_2 / (params.role_1+params.role_2)*100"
}
}
}
}
}
}
Please let me know if it doesn't work well for you.
I'm trying to calculate the 15th and 75th percentiles on an aggregated derived field (latency) -> and trying to retrieve those records with field value > (p75-p15). I am able to calculate the aggs and the thresholds but unable to filter out the required values. Tried the below query and am running into "buckets_path must reference either a number value or a single value numeric metric aggregation, got: java.lang.Object[]". I'm just trying to retrieve records with average latency > threshold. Any pointers?
"aggs": {
"by_name": {
"terms": {
"script": "doc['name'].value + ',' + doc['valf'].value",
"size": 5000
},
"aggs": {
"single_round_block": {
"date_histogram": {
"field": "start_time",
"interval": "300s"
},
"aggs": {
"overallSumLatency": {
"sum": {
"field": "sum_latency_ms"
}
},
"overallNumLatencyMeasurements": {
"sum": {
"field": "num_valid_latency_measurements"
}
},
"avgLatency": {
"bucket_script": {
"buckets_path": {
"sumLatency": "overallSumLatency",
"numPoints": "overallNumLatencyMeasurements"
},
"script": "(params.numPoints == 0)?0:(params.sumLatency / params.numPoints)"
}
}
}
},
"percentiles_vals": {
"percentiles_bucket": {
"buckets_path": "single_round_block>avgLatency",
"percents": [ 15.0,75.0]
}
},
"threshold":{
"bucket_script": {
"buckets_path": {
"perc75":"percentiles_vals[75.0]",
"perc15":"percentiles_vals[15.0]"
},
"script": "Math.abs(params.perc75 - params.perc15)"
}
},
"filter_out_records": {
"bucket_selector": {
"buckets_path": {
"threshold":"threshold",
"avgLatency":"single_round_block>avgLatency"
},
"script": "params.avgLatency > params.threshold"
}
}
}
}
}
}
Here is my query result
GET _search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"serviceName.keyword": "directory-view-service"
}
},
{
"match": {
"path": "thewall"
}
},
{
"range": {
"#timestamp": {
"from": "now-31d",
"to": "now"
}
}
}
]
}
},
"aggs": {
"by_day": {
"date_histogram": {
"field": "date",
"interval": "7d"
},
"aggs": {
"byUserUid": {
"terms": {
"field": "token_userId.keyword",
"size": 150000
},
"aggs": {
"filterByCallNumber": {
"bucket_selector": {
"buckets_path": {
"doc_count": "_count"
},
"script": {
"inline": "params.doc_count <= 1"
}
}
}
}
}
}
}
}
}
I want my query to return all users who called my endpoint at least once, over a 1-month range with a 7-day interval; up to that point everything is good.
But my result is a set of buckets with 370 elements, and I just need to know the array size...
Is there any keyword for this, or how can I handle it?
Thanks
I have an index with millions of documents. Suppose each of my documents has some code, and I need to find the list of codes matching some criteria. The only way I found doing that, is using whole lot of aggregations, so I created an ugly query which does exactly what I want:
POST my-index/_search
{
"query": {
"range": {
"timestamp": {
"gte": "2017-08-01T00:00:00.000",
"lt": "2017-08-08T00:00:00.000"
}
}
},
"size": 0,
"aggs": {
"codes": {
"terms": {
"field": "code",
"size": 10000
},
"aggs": {
"days": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"format": "dd"
},
"aggs": {
"hours": {
"date_histogram": {
"field": "timestamp",
"interval": "hour",
"format": "yyyy-MM-dd:HH"
},
"aggs": {
"hour_income": {
"sum": {
"field": "price"
}
}
}
},
"max_income": {
"max_bucket": {
"buckets_path": "hours>hour_income"
}
},
"day_income": {
"sum_bucket": {
"buckets_path": "hours.hour_income"
}
},
"more_than_sixty_percent": {
"bucket_script": {
"buckets_path": {
"dayIncome": "day_income",
"maxIncome": "max_income"
},
"script": "params.maxIncome - params.dayIncome * 60 / 100 > 0 ? 1 : 0"
}
}
}
},
"amount_of_days": {
"sum_bucket": {
"buckets_path": "days.more_than_sixty_percent"
}
},
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amountOfDays": "amount_of_days"
},
"script": "params.amountOfDays >= 3"
}
}
}
}
}
}
The response I get is a few million lines of JSON, consisting of buckets. Each bucket has more than 700 lines (and buckets of its own), but all I need is its key, so that I have my list of codes. I guess it's not good having a response a few thousand times larger than necessary, and there might be problems with parsing. So I wanted to ask, is there any way to hide the other info in the bucket and get only the keys?
Thanks.
I want to use PipeLine Aggregation(Bucket Selector Aggregation) to Nested Field Aggregation in ElasticSearch 2.4. I want to do something similar to below but I am not successful. Could you please suggest me if it is possible to do the PipeLine Aggregation in the nested field?
{
"size": 0,
"aggregations": {
"totalPaidAmount": {
"nested": {
"path": "count"
},
"aggregations": {
"paidAmountTotal": {
"sum": {
"field": "count.totalPaidAmount"
}
},
"paidAmount_filter": {
"bucket_selector": {
"script": {
"inline": "amount > 5000000"
},
"buckets_path": {
"amount": "paidAmountTotal"
}
}
}
}
}
}
}
I found the solution for the query. Actually, bucket selector Aggregation should be parallel to the nested aggregation and path should be referenced by '>' as shown below:
{
"size": 0,
"aggregations": {
"amount": {
"terms": {
"field": "countId",
"size": 0
},
"aggregations": {
"totalPaidAmount": {
"nested": {
"path": "count"
},
"aggregations": {
"paidAmountTotal": {
"sum": {
"field": "count.totalPaidAmount"
}
}
}
},
"paidAmount_filter": {
"bucket_selector": {
"script": {
"inline": "amount > 1000"
},
"buckets_path": {
"amount": "totalPaidAmount>paidAmountTotal"
}
}
}
}
}
}
}
You are missing "params" in the script value, so paidAmount_filter should look like:
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amount": "paidAmountTotal"
},
"script": "params.amount > 5000000"
}
}