ElasticSearch group and distribute to buckets - elasticsearch

I am quite new to elasticsearch but it seems that there is no easy way to create aggregation and distribute doc_count to buckets once previous aggregation is done.
For example I have below set of data and I would like to create 4 buckets and group profiles that have specific numbers of transactions between the buckets.
Total number of profiles should be distributed to below buckets, where each bucket outlines min and max number of transactions that one profile could have.
number of profiles that has 0-1 transaction
number of profiles that has 2-5 transactions
number of profiles that has 6-20 transactions
number of profiles that has 20+ transactions
[
{
"profileId": "AVdiZnj6YuzD-vV0m9lx",
"transactionId": "sdsfsdghfd"
},
{
"profileId": "SRGDDUUDaasaddsaf",
"transactionId": "asdadscfdvdvd"
},
{
"profileId": "AVdiZnj6YuzD-vV0m9lx",
"transactionId": "sdsacfsfcsafcs"
}
]
Below request would show number of transactions per each profile but additional bucket grouping is required in order to group profiles to respective buckets using doc_cont.
{ "size":0,
"aggs" : {
"profileTransactions" : {
"terms" : {
"field" : "profileId"
}
}
}
}
"buckets": [
{
"key": "AVdiZnj6YuzD-vV0m9lx",
"doc_count": 2
},
{
"key": "SRGDDUUDaasaddsaf",
"doc_count": 1
}
]
Any Ideas?

You could do additional grouping with the help of pipeline bucket selector aggregation. The value count aggregation is used since bucket aggregation is checked against a numeric field. This query will require ES 2.x version.
{
"size": 0,
"aggs": {
"unique_profileId0": {
"terms": {
"field": "profileId"
},
"aggs": {
"total_profile_count": {
"value_count": {
"field": "profileId"
}
},
"range_0-1_bucket": {
"bucket_selector": {
"buckets_path": {
"totalTransaction": "total_profile_count"
},
"script": "totalTransaction < 2"
}
}
}
},
"unique_profileId1": {
"terms": {
"field": "profileId"
},
"aggs": {
"total_profile_count": {
"value_count": {
"field": "profileId"
}
},
"range_2-5_bucket": {
"bucket_selector": {
"buckets_path": {
"totalTransaction": "total_profile_count"
},
"script": "totalTransaction >= 2 && totalTransaction <= 5"
}
}
}
},
"unique_profileId2": {
"terms": {
"field": "profileId"
},
"aggs": {
"total_profile_count": {
"value_count": {
"field": "profileId"
}
},
"range_6-20_bucket": {
"bucket_selector": {
"buckets_path": {
"totalTransaction": "total_profile_count"
},
"script": "totalTransaction >= 6 && totalTransaction <= 20"
}
}
}
},
"unique_profileId3": {
"terms": {
"field": "profileId"
},
"aggs": {
"total_profile_count": {
"value_count": {
"field": "profileId"
}
},
"range_20_more_bucket": {
"bucket_selector": {
"buckets_path": {
"totalTransaction": "total_profile_count"
},
"script": "totalTransaction > 20"
}
}
}
}
}
}
You need to enable dynamic scripting for this to work, add following two lines to the YML file
script.inline: on
script.indexed: on
and restart each node.
Hope it helps!

Related

How to define percentage of result items with specific field in Elasticsearch query?

I have a search query that returns all items matching users that have type manager or lead.
{
"from": 0,
"size": 20,
"query": {
"bool": {
"should": [
{
"terms": {
"type": ["manager", "lead"]
}
}
]
}
}
}
Is there a way to define what percentage of the results should be of type "manager"?
In other words, I want the results to have 80% of users with type manager and 20% with type lead.
I want to make a suggestion to use bucket_path aggregation. As I know this aggregation needs to be run in sub-aggs of a histogram aggregation. As you have such field in your mapping so I think this query should work for you:
{
"size": 0,
"aggs": {
"NAME": {
"date_histogram": {
"field": "my_datetime",
"interval": "month"
},
"aggs": {
"role_type": {
"terms": {
"field": "type",
"size": 10
},
"aggs": {
"count": {
"value_count": {
"field": "_id"
}
}
}
},
"role_1_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_1 / (params.role_1+params.role_2)*100"
}
},
"role_2_ratio": {
"bucket_script": {
"buckets_path": {
"role_1": "role_type['manager']>count",
"role_2": "role_type['lead']>count"
},
"script": "params.role_2 / (params.role_1+params.role_2)*100"
}
}
}
}
}
}
Please let me know if it didn't work well for you.

Percentile based filtering elastic search

I'm trying to calculate the 15th and 75th percentiles on an aggregrated dervied field(latency) -> and trying to retrieve those records with field value > (p75-p15). I am able to calculate the aggs and the thresholds but unable to filter out the required values. Tried the below query and am running into "buckets_path must reference either a number value or a single value numeric metric aggregation, got: java.lang.Object[]". I'm just trying to retrieve records with average latency > threshold. Any pointers?
"aggs": {
"by_name": {
"terms": {
"script": "doc['name'].value + ',' + doc['valf'].value ,
"size": 5000
},
"aggs": {
"single_round_block": {
"date_histogram": {
"field": "start_time",
"interval": "300s"
},
"aggs": {
"overallSumLatency": {
"sum": {
"field": "sum_latency_ms"
}
},
"overallNumLatencyMeasurements": {
"sum": {
"field": "num_valid_latency_measurements"
}
},
"avgLatency": {
"bucket_script": {
"buckets_path": {
"sumLatency": "overallSumLatency",
"numPoints": "overallNumLatencyMeasurements"
},
"script": "(params.numPoints == 0)?0:(params.sumLatency / params.numPoints)"
}
}
}
},
"percentiles_vals": {
"percentiles_bucket": {
"buckets_path": "single_round_block>avgLatency",
"percents": [ 15.0,75.0]
}
},
"threshold":{
"bucket_script": {
"buckets_path": {
"perc75":"percentiles_vals[75.0]",
"perc15":"percentiles_vals[15.0]"
},
"script": "Math.abs(params.perc75 - params.perc15)"
}
},
"filter_out_records": {
"bucket_selector": {
"buckets_path": {
"threshold":"threshold",
"avgLatency":"single_round_block>avgLatency"
},
"script": "params.avgLatency > params.threshold"
}
}
}
}
}
}

How to display only the key from the bucket

I have an index with millions of documents. Suppose each of my documents has some code, and I need to find the list of codes matching some criteria. The only way I found doing that, is using whole lot of aggregations, so I created an ugly query which does exactly what I want:
POST my-index/_search
{
"query": {
"range": {
"timestamp": {
"gte": "2017-08-01T00:00:00.000",
"lt": "2017-08-08T00:00:00.000"
}
}
},
"size": 0,
"aggs": {
"codes": {
"terms": {
"field": "code",
"size": 10000
},
"aggs": {
"days": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"format": "dd"
},
"aggs": {
"hours": {
"date_histogram": {
"field": "timestamp",
"interval": "hour",
"format": "yyyy-MM-dd:HH"
},
"aggs": {
"hour_income": {
"sum": {
"field": "price"
}
}
}
},
"max_income": {
"max_bucket": {
"buckets_path": "hours>hour_income"
}
},
"day_income": {
"sum_bucket": {
"buckets_path": "hours.hour_income"
}
},
"more_than_sixty_percent": {
"bucket_script": {
"buckets_path": {
"dayIncome": "day_income",
"maxIncome": "max_income"
},
"script": "params.maxIncome - params.dayIncome * 60 / 100 > 0 ? 1 : 0"
}
}
}
},
"amount_of_days": {
"sum_bucket": {
"buckets_path": "days.more_than_sixty_percent"
}
},
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amountOfDays": "amount_of_days"
},
"script": "params.amountOfDays >= 3"
}
}
}
}
}
}
The response I get is a few millions lines of JSON, consisting of buckets. Each bucket has more than 700 lines (and buckets of its own), but all I need is its key, so that I have my list of codes. I guess it's not good having a response a few thousand times larger than neccessary, and there might be problems with parsing. So I wanted to ask, is there any way to hide the other info in the bucket and get only the keys?
Thanks.

How to use cumulative_sum with a previous aggregation?

I would like to plot a cumulative sum of some events, per day. The cumulative sum aggregation seems to be the way to go so I tried to reuse the example given in the docs.
The first aggregation works fine, the following query
{
"aggs": {
"vulns_day" : {
"date_histogram" :{
"field": "HOST_START_iso",
"interval": "day"
}
}
}
}
gives replies such as
(...)
{
"key_as_string": "2016-09-08T00:00:00.000Z",
"key": 1473292800000,
"doc_count": 76330
},
{
"key_as_string": "2016-09-09T00:00:00.000Z",
"key": 1473379200000,
"doc_count": 37712
},
(...)
I then wanted to query the cumulative sum of doc_count above via
{
"aggs": {
"vulns_day" : {
"date_histogram" :{
"field": "HOST_START_iso",
"interval": "day"
}
},
"aggs": {
"vulns_cumulated": {
"cumulative_sum": {
"buckets_path": "doc_count"
}
}
}
}
}
but it gives an error:
"reason": {
"type": "search_parse_exception",
"reason": "Could not find aggregator type [vulns_cumulated] in [aggs]",
I see that bucket_path should point to the elements to be summed and the example for cumulative aggregations created a specific intermediate sum but I do not have anything to sum (beside doc_count).
I guess, you should change your query like this:
{
"aggs": {
"vulns_day": {
"date_histogram": {
"field": "HOST_START_iso",
"interval": "day"
},
"aggs": {
"document_count": {
"value_count": {
"field": "HOST_START_iso"
}
},
"vulns_cumulated": {
"cumulative_sum": {
"buckets_path": "document_count"
}
}
}
}
}
}
I found the solution. Since doc_count did not seem to be available, I tried to retrieve stats for the time parameter, and use its count value. It worked:
{
"size": 0,
"aggs": {
"vulns_day": {
"date_histogram": {
"field": "HOST_START_iso",
"interval": "day"
},
"aggs": {
"dates_stats": {
"stats": {
"field": "HOST_START_iso"
}
},
"vulns_cumulated": {
"cumulative_sum": {
"buckets_path": "dates_stats.count"
}
}
}
}
}
}

For each country/colour/brand combination , find sum of number of items in elasticsearch

This is a portion of the data I have indexed in elasticsearch:
{
"country" : "India",
"colour" : "white",
"brand" : "sony"
"numberOfItems" : 3
}
I want to get the total sum of numberOfItems on a per country basis, per colour basis and per brand basis. Is there any way to do this in elasticsearch?
The following should land you straight to the answer.
Make sure you enable scripting before using it.
{
"aggs": {
"keys": {
"terms": {
"script": "doc['country'].value + doc['color'].value + doc['brand'].value"
},
"aggs": {
"keySum": {
"sum": {
"field": "numberOfItems"
}
}
}
}
}
}
To get a single result you may use sum aggregation applied to a filtered query with term (terms) filter, e.g.:
{
"query": {
"filtered": {
"filter": {
"term": {
"country": "India"
}
}
}
},
"aggs": {
"total_sum": {
"sum": {
"field": "numberOfItems"
}
}
}
}
To get statistics for all countries/colours/brands in a single pass over the data you may use the following query with 3 multi-bucket aggregations, each of them containing a single-bucket sum sub-aggregation:
{
"query": {
"match_all": {}
},
"aggs": {
"countries": {
"terms": {
"field": "country"
},
"aggs": {
"country_sum": {
"sum": {
"field": "numberOfItems"
}
}
}
},
"colours": {
"terms": {
"field": "colour"
},
"aggs": {
"colour_sum": {
"sum": {
"field": "numberOfItems"
}
}
}
},
"brands": {
"terms": {
"field": "brand"
},
"aggs": {
"brand_sum": {
"sum": {
"field": "numberOfItems"
}
}
}
}
}
}

Resources