Elasticsearch analytics percent - elasticsearch

I am using Elasticsearch 1.7.3 to accumulate data for analytics reports.
I have an index that holds documents where each document has a numeric field called 'duration' (how many milliseconds the request took), and a string field called 'component'. There can be many documents with the same component name.
Eg.
{"component": "A", "duration": 10}
{"component": "B", "duration": 27}
{"component": "A", "duration": 5}
{"component": "C", "duration": 2}
I would like to produce a report that states for each component:
The sum of all 'duration' fields for this component.
A: 15
B: 27
C: 2
The percentage of this sum out of the total sum of duration of all documents. In my example
A: (10+5) / (10+27+5+2) * 100
B: 27 / (10+27+5+2) * 100
C: 2 / (10+27+5+2) * 100
The percentage of the documents for each component, out of the total components.
A: 2 / 4 * 100
B: 1 / 4 * 100
C: 1 / 4 * 100
How do I do that with Elasticsearch 1.7.3?

With ES 1.7.3, there is no way to compute data based on the results of two different aggregations, this is something that can be done in ES 2.0 with pipeline aggregations, though.
However, what you're asking is not too complicated to do on the client-side with 1.7.3. If you use the query below, you'll get all you need to get the figures you expect:
POST components/_search
{
"size": 0,
"aggs": {
"total_duration": {
"sum": {
"field": "duration"
}
},
"components": {
"terms": {
"field": "component"
},
"aggs": {
"duration_sum": {
"sum": {
"field": "duration"
}
}
}
}
}
}
The results would look like this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"total_duration": {
"value": 44
},
"components": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2,
"duration_sum": {
"value": 15
}
},
{
"key": "b",
"doc_count": 1,
"duration_sum": {
"value": 27
}
},
{
"key": "c",
"doc_count": 1,
"duration_sum": {
"value": 2
}
}
]
}
}
}
Now all you need to do would be the following. I'm using JavaScript, but you can do it in any other language that can read JSON.
var response = ...the JSON response above...
var total_duration = response.aggregations.total_duration.value;
var total_docs = response.hits.total;
response.aggregations.components.buckets.forEach(function(comp_stats) {
// total duration for the component
var total_duration_comp = comp_stats.duration_sum.value;
// percentage duration of the component
var perc_duration_comp = total_duration_comp / total_duration * 100;
// percentage documents for the component
var perc_doc_comp = comp_stats.doc_count / total_docs * 100;
});

In ElasticSearch[2.x], You can use the bucket script aggregation, which is perfectly meet your needs!
eg:
{
"bucket_script": {
"buckets_path": {
"my_var1": "the_sum",
"my_var2": "the_value_count"
},
"script": "my_var1 / my_var2"
}
}
detail:
POST /sales/_search
{
"size": 0,
"aggs" : {
"sales_per_month" : {
"date_histogram" : {
"field" : "date",
"interval" : "month"
},
"aggs": {
"total_sales": {
"sum": {
"field": "price"
}
},
"t-shirts": {
"filter": {
"term": {
"type": "t-shirt"
}
},
"aggs": {
"sales": {
"sum": {
"field": "price"
}
}
}
},
"t-shirt-percentage": {
"bucket_script": {
"buckets_path": {
"tShirtSales": "t-shirts>sales",
"totalSales": "total_sales"
},
"script": "params.tShirtSales / params.totalSales * 100"
}
}
}
}
}
}

Related

Interval search for messages in Elasticsearch

I need to split the found messages into intervals. Can this be done with Elasticsearch?
For example. There are 10 messages, you need to divide them into 3 intervals. It should look like this...
[0,1,2,3,4,5,6,7,8,9] => {[0,1,2], [3,4,5,6], [7,8,9]}.
I'm only interested in the beginning of the intervals. For example: {[count - 3, min 0], [count - 4, min 3], [count - 3, min - 7]}
Example.
PUT /test_index
{
"mappings": {
"properties": {
"id": {
"type": "long"
}
}
}
}
POST /test_index/_doc/0
{
"id": 0
}
POST /test_index/_doc/1
{
"id": 1
}
POST /test_index/_doc/2
{
"id": 2
}
POST /test_index/_doc/3
{
"id": 3
}
POST /test_index/_doc/4
{
"id": 4
}
POST /test_index/_doc/5
{
"id": 5
}
POST /test_index/_doc/6
{
"id": 6
}
POST /test_index/_doc/7
{
"id": 7
}
POST /test_index/_doc/8
{
"id": 8
}
POST /test_index/_doc/9
{
"id": 9
}
It is necessary to divide the values ​​into 3 intervals with the same number of elements in each interval:
{
...
"aggregations": {
"result": {
"buckets": [
{
"min": 0.0,
"doc_count": 3
},
{
"min": 3.0,
"doc_count": 4
},
{
"min": 7.0,
"doc_count": 3
}
]
}
}
}
There is a similar function: "variable width histogram":
GET /test_index/_search?size=0
{
"aggs": {
"result": {
"variable_width_histogram": {
"field": "id",
"buckets": 3
}
}
},
"query": {
"match_all": {}
}
}
But "variable width histogram" separates documents by id value, not by the number of elements in the bucket
Assuming your mapping is like:
{
"some_numeric_field" : {"type" : "integer"}
}
Then you can build histograms out of it with fixed interval sizes:
POST /my_index/_search?size=0
{
"aggs": {
"some_numeric_field": {
"histogram": {
"field": "some_numeric_field",
"interval": 7
}
}
}
}
Results:
{
...
"aggregations": {
"prices": {
"buckets": [
{
"key": 0.0,
"doc_count": 7
},
{
"key": 7.0,
"doc_count": 7
},
{
"key": 14.0,
"doc_count": 7
}
]
}
}
}
To get the individual values inside each bucket, just add a sub-aggregation, maybe "top_hits" or anything else like a "terms"
aggregation.
Without knowing more about your data, I really cannot help further.

Elasticsearch count doc_count occurrences on aggs

I have an elasticsearch aggregation query like this.
{
"size":0,
"aggs": {
"Domains": {
"terms": {
"field": "domains",
"size": 0
},
"aggs":{
"Identifier": {
"terms": {
"field":"alertIdentifier",
"size": 0
}
}
}
}
}
}
And it results in bucket aggregation like following:
"aggregations": {
"Domains": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "IT",
"doc_count": 147,
"Identifier": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "-2623493027134706869",
"doc_count": 7
},
{
"key": "-6590617724257725266",
"doc_count": 7
},
{
"key": "1106147277275983835",
"doc_count": 4
},
{
"key": "-3070527890944301111",
"doc_count": 4
},
{
"key": "-530975388352676402",
"doc_count": 3
},
{
"key": "-6225620509938623294",
"doc_count": 2
},
{
"key": "1652134630535374656",
"doc_count": 1
},
{
"key": "4191687133126999365",
"doc_count": 8
},
{
"key": "6882920925888555081",
"doc_count": 2
}
]
}
}
What I need is to count the number of doc_counts occurrences like this:
1 times: 0
2 times: 2
3 times: 1
equal or more than 4 times: 5
any idea how to build the ES query to count the occurrences of doc_count?
Thanks in advance.
below the ES query:
POST /xt-history*/_search
{
"query": {
"filtered": {"query": {"match_all": {} },
"filter": {
"and": [
{"term": {"type": "10"}}
]
}
}
},
"size": 0,
"aggs": {
"repetitions": {
"scripted_metric": {
"init_script" : "_agg['all'] = []; _agg['all2'] = [];",
"map_script" : "_agg['all'].add(_source['alert']['alertIdentifier'])",
"combine_script" : "for (alertId in _agg['all']) { _agg['all2'].add(alertId); }; return _agg['all2']",
"reduce_script" : "all3 = []; answer = {}; answer['one'] = []; answer['two'] = []; answer['three'] = []; answer['four'] = []; answer['five'] = []; answer['five_plus'] = []; for (alertIds in _aggs) { for (alertId1 in alertIds) { all3.add(alertId1); }; }; for (alertId in all3) { if (answer['five_plus'].contains(alertId)) { } else if(answer['five'].contains(alertId)) {answer['five'].remove(alertId); answer['five_plus'].add(alertId);} else if(answer['four'].contains(alertId)) {answer['four'].remove(alertId); answer['five'].add(alertId);} else if(answer['three'].contains(alertId)) {answer['three'].remove(alertId); answer['four'].add(alertId);} else if(answer['two'].contains(alertId)) {answer['two'].remove(alertId); answer['three'].add(alertId);} else if(answer['one'].contains(alertId)) {answer['one'].remove(alertId); answer['two'].add(alertId);} else {answer['one'].add(alertId);}; }; fans = []; fans.add(answer['one'].size()); fans.add(answer['two'].size()); fans.add(answer['three'].size()); fans.add(answer['four'].size()); fans.add(answer['five'].size()); fans.add(answer['five_plus'].size()); return fans"
}
}
}
}
query output:
{
"took": 4770,
"timed_out": false,
"_shards": {
"total": 190,
"successful": 189,
"failed": 0
},
"hits": {
"total": 334,
"max_score": 0,
"hits": []
},
"aggregations": {
"repetitions": {
"value": [
63,
39,
3,
10,
2,
13
]
}
}
}
where first value is the number of repetitions for doc_count=1, second value is the number of repetitions for doc_count=2, ... last value is the number of repetition for doc_count >=5

Elasticsearch Histogram of visits

I'm quite new to Elasticsearch and I fail to build a histogram based on ranges of visits. I am not even sure that it's possible to create this kind of chart by using a single query in Elasticsearch, but I'm the feeling that could be possible with pipeline aggregation or may be scripted aggregation.
Here is a test dataset with which I'm working:
PUT /test_histo
{ "settings": { "number_of_shards": 1 }}
PUT /test_histo/_mapping/visit
{
"properties": {
"user": {"type": "string" },
"datevisit": {"type": "date"},
"page": {"type": "string"}
}
}
POST test_histo/visit/_bulk
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Jean","page":"productXX.hmtl","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Robert","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Mary","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Mary","page":"media_center.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"media_center.html","datevisit":"2015-11-26"}
If we consider the ranges [1,2[, [2,3[, [3, inf.[
The expected result should be :
[1,2[ = 2
[2,3[ = 1
[3, inf.[ = 1
All my efforts to find the histogram showing a customer visit frequency remained to date unsuccessful. I would be pleased to have a few tips, tricks or ideas to get a response to my problem.
There are two ways you can do it.
First is doing it in ElasticSearch which will require Scripted Metric Aggregation. You can read more about it here.
Your query would look like this
{
"size": 0,
"aggs": {
"visitors_over_time": {
"date_histogram": {
"field": "datevisit",
"interval": "week"
},
"aggs": {
"no_of_visits": {
"scripted_metric": {
"init_script": "_agg['values'] = new java.util.HashMap();",
"map_script": "if (_agg.values[doc['user'].value]==null) {_agg.values[doc['user'].value]=1} else {_agg.values[doc['user'].value]+=1;}",
"combine_script": "someHashMap = new java.util.HashMap();for(x in _agg.values.keySet()) {value=_agg.values[x];if(value<3){key='[' + value +',' + (value + 1) + '[';}else{key='[' + value +',inf[';}; if(someHashMap[key]==null){someHashMap[key] = 1}else{someHashMap[key] += 1}}; return someHashMap;"
}
}
}
}
}
}
where you can change period of time in date_histogram object in the field interval by values like day, week, month.
Your response would look like this
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"visitors_over_time": {
"buckets": [
{
"key_as_string": "2015-11-23T00:00:00.000Z",
"key": 1448236800000,
"doc_count": 7,
"no_of_visits": {
"value": [
{
"[2,3[": 1,
"[3,inf[": 1,
"[1,2[": 2
}
]
}
}
]
}
}
}
Second method is to the work of scripted_metric in client side. You can use the result of Terms Aggregation. You can read more about it here.
Your query will look like this
GET test_histo/visit/_search
{
"size": 0,
"aggs": {
"visitors_over_time": {
"date_histogram": {
"field": "datevisit",
"interval": "week"
},
"aggs": {
"no_of_visits": {
"terms": {
"field": "user",
"size": 10
}
}
}
}
}
}
and the response will be
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"visitors_over_time": {
"buckets": [
{
"key_as_string": "2015-11-23T00:00:00.000Z",
"key": 1448236800000,
"doc_count": 7,
"no_of_visits": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "john",
"doc_count": 3
},
{
"key": "mary",
"doc_count": 2
},
{
"key": "jean",
"doc_count": 1
},
{
"key": "robert",
"doc_count": 1
}
]
}
}
]
}
}
}
where on the response you can do count for each doc_count for each period.
Have a look at:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-datehistogram-aggregation.html
If you whant to show it in fancy already fixed UI use Kibana.
A query like this:
GET _search
{
"query": {
"match_all": {}
},
{
"aggs" : {
"visits" : {
"date_histogram" : {
"field" : "datevisit",
"interval" : "month"
}
}
}
}
}
Should give you a histogram, I don't have elastic here at the moment so I might have some fat finggered typos.
Then you could ad query terms to only show histogram for specific page our you could have an aouter aggregation bucket wich aggregates / page or user.
Something like this:
GET _search
{
"query": {
"match_all": {}
},
{
{
"aggs" : {
"users" : {
"terms" : {
"field" : "user",
},
"aggs" : {
"visits" : {
"date_histogram" : {
"field" : "datevisit",
"interval" : "month"
}
}
}
}
}
Have a look to this solution:
{
"query": {
"match_all": {}
},
"aggs": {
"periods": {
"filters": {
"filters": {
"1-2": {
"range": {
"datevisit": {
"gte": "2015-11-25",
"lt": "2015-11-26"
}
}
},
"2-3": {
"range": {
"datevisit": {
"gte": "2015-11-26",
"lt": "2015-11-27"
}
}
},
"3-": {
"range": {
"datevisit": {
"gte": "2015-11-27",
}
}
}
}
},
"aggs": {
"users": {
"terms": {"field": "user"}
}
}
}
}
}
Step by step:
Filter aggregation: You can define ranged values for the next aggregation, in this case we define 3 periods based on date range filter
Nested Users aggregation: This aggregation returns as many results as filters you'd defined. So, in this case, you'll get 3 values using range date filtering
You'll get a result like this:
{
...
"aggregations" : {
"periods" : {
"buckets" : {
"1-2" : {
"users" : {
"buckets" : [
{"key" : XXX,"doc_count" : NNN},
{"key" : YYY,"doc_count" : NNN},
]
}
},
"2-3" : {
"users" : {
"buckets" : [
{"key" : XXX1,"doc_count" : NNN1},
{"key" : YYY1,"doc_count" : NNN1},
]
}
},
"3-" : {
"users" : {
"buckets" : [
{"key" : XXX2,"doc_count" : NNN2},
{"key" : YYY2,"doc_count" : NNN2},
]
}
},
}
}
}
}
Try it, and tell if it works

Limit aggregations to list of values

Can I limit aggregations to return only specific list of values? I have something like this:
{ "aggs" : {
"province" : {
"terms" : {
"field" : "province"
}
}
},
"query": {
"bool": {
//my query..
But let's say I know list of province for which I want make count ({'province1', 'province2', 'province3'}). Is it possible to restrict returned list of province without influence on my query results?
I want to get:
//list of hits..
//
"aggregations": {
"province": {
"buckets": [
{
"key": "province1",
"doc_count": 200
},
{
"key": "province2",
"doc_count": 162
},
{
"key": "province3",
"doc_count": 162
}
// even if there is more possible provinces
// I don't want to see them
Sure, just use term filters.
Here's an example. Let's say I have visit stats for a bunch of different IP addresses, but I only want to get counts of document for two of them, I could do this:
POST /test_index/_search?search_type=count
{
"aggregations": {
"ip": {
"terms": {
"field": "ip",
"size": 10,
"include": [
"146.233.189.126",
"193.33.153.89"
]
}
}
}
}
and get back something like:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"ip": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "146.233.189.126",
"doc_count": 3
},
{
"key": "193.33.153.89",
"doc_count": 3
}
]
}
}
}
Here is some code I used to play around with it:
http://sense.qbox.io/gist/68697646ef7afc9f0375995b6f84181a7ac4cba9
So your example might look like:
{
"aggs": {
"province": {
"terms": {
"field": "province",
"include": [
"province1",
"province2",
"province3"
]
}
}
}
}

Elasticsearch Terms or Cardinality Aggregation - Order by number of distinct values

Friends,
I am doing some analysis to find unique pairs from 100s of millions of documents. The mock example is as shown below:
doc field1 field2
AAA : BBB
AAA : CCC
PPP : QQQ
PPP : QQQ
XXX : YYY
XXX : YYY
MMM : NNN
90% of the document contains an unique pair as shown above in doc 3, 4, 5, 6 and 7 which I am not interested on my aggregation result. I am interested to aggregate doc 1 and 2.
Terms Aggregation Query:
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
Term Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "QQQ",
"doc_count": 2
}
]
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "YYY",
"doc_count": 2
}
]
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "BBB",
"doc_count": 1
},
{
"key": "CCC",
"doc_count": 1
}
]
}
}
]
}
}
I am interested only on key AAA to be in the aggregation result. What is the best way to filter the aggregation result containing distinct pairs?
I tried with cardinality aggregation which result unque value count. However I am not able to filter out what I am not interested from the aggregation results.
Cardinality Aggregation Query
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"cardinality": {
"field": "FIELD2"
}
}
}
}
}
Cardinality Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"value" : 2
}
}
]
}
}
Atleast if I could sort by cardinal value, that would be help me to find some workarounds. Please help me in this regard.
P.S: Writing a spark/mapreduce program to post process/filter the aggregation result is not expected solution for this issue.
I suggest to use filter query along with aggregations, since you are only interested in field1=AAA.
I have a similar example here.
For example, I have an index of all patients in my hospital. I store their drug use in a nested object DRUG. Each patient could take different drugs, and each could take a single drug for multiple times.
Now if I wanted to find the number of patients who took aspirin at least once, the query could be:
{
"size": 0,
"_source": false,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "DRUG",
"filter": {
"bool": {
"must": [{ "term": { "DRUG.NAME": "aspirin" } }]
}}}}}},
"aggs": {
"DRUG_FACETS": {
"nested": {
"path": "DRUG"
},
"aggs": {
"DRUG_NAME_FACETS": {
"terms": { "field": "DRUG.NAME", "size": 0 },
"aggs": {
"DISTINCT": { "cardinality": { "field": "DRUG.PATIENT" } }
}
}}}}
}
Sample result:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"DRUG_FACETS": {
"doc_count": 11,
"DRUG_NAME_FACETS": {
"buckets": [
{
"key": "aspirin",
"doc_count": 6,
"DISTINCT": {
"value": 6
}
},
{
"key": "vitamin-b",
"doc_count": 3,
"DISTINCT": {
"value": 2
}
},
{
"key": "vitamin-c",
"doc_count": 2,
"DISTINCT": {
"value": 2
}
}
]
}
}
}
}
The first one in the buckets would be aspirin. But you can see other 2 patients had also taken vitamin-b when they took aspirin.
If you change the field value of DRUG.NAME to another drug name for example "vitamin-b", I suppose you would get vitamin-b in the first position of the buckets.
Hopefully this is helpful to your question.
A bit late, hope it would help for others.
A simple approach is to filter only 'AAA' records in top aggregation:
{
"size": 0,
"aggregations": {
"filterAAA": {
"filter": {
"term": {
"FIELD1": "AAA"
}
},
"aggregations": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggregations": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
}
}
}

Resources