Elasticsearch count doc_count occurrences on aggs - elasticsearch

I have an elasticsearch aggregation query like this.
{
"size":0,
"aggs": {
"Domains": {
"terms": {
"field": "domains",
"size": 0
},
"aggs":{
"Identifier": {
"terms": {
"field":"alertIdentifier",
"size": 0
}
}
}
}
}
}
And it results in a bucket aggregation like the following:
"aggregations": {
"Domains": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "IT",
"doc_count": 147,
"Identifier": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "-2623493027134706869",
"doc_count": 7
},
{
"key": "-6590617724257725266",
"doc_count": 7
},
{
"key": "1106147277275983835",
"doc_count": 4
},
{
"key": "-3070527890944301111",
"doc_count": 4
},
{
"key": "-530975388352676402",
"doc_count": 3
},
{
"key": "-6225620509938623294",
"doc_count": 2
},
{
"key": "1652134630535374656",
"doc_count": 1
},
{
"key": "4191687133126999365",
"doc_count": 8
},
{
"key": "6882920925888555081",
"doc_count": 2
}
]
}
}
What I need is to count the number of doc_counts occurrences like this:
1 time: 1
2 times: 2
3 times: 1
equal or more than 4 times: 5
Any idea how to build the ES query to count the occurrences of each doc_count value?
Thanks in advance.

Below is an ES query that does this with a scripted_metric aggregation:
POST /xt-history*/_search
{
"query": {
"filtered": {"query": {"match_all": {} },
"filter": {
"and": [
{"term": {"type": "10"}}
]
}
}
},
"size": 0,
"aggs": {
"repetitions": {
"scripted_metric": {
"init_script" : "_agg['all'] = []; _agg['all2'] = [];",
"map_script" : "_agg['all'].add(_source['alert']['alertIdentifier'])",
"combine_script" : "for (alertId in _agg['all']) { _agg['all2'].add(alertId); }; return _agg['all2']",
"reduce_script" : "all3 = []; answer = {}; answer['one'] = []; answer['two'] = []; answer['three'] = []; answer['four'] = []; answer['five'] = []; answer['five_plus'] = []; for (alertIds in _aggs) { for (alertId1 in alertIds) { all3.add(alertId1); }; }; for (alertId in all3) { if (answer['five_plus'].contains(alertId)) { } else if(answer['five'].contains(alertId)) {answer['five'].remove(alertId); answer['five_plus'].add(alertId);} else if(answer['four'].contains(alertId)) {answer['four'].remove(alertId); answer['five'].add(alertId);} else if(answer['three'].contains(alertId)) {answer['three'].remove(alertId); answer['four'].add(alertId);} else if(answer['two'].contains(alertId)) {answer['two'].remove(alertId); answer['three'].add(alertId);} else if(answer['one'].contains(alertId)) {answer['one'].remove(alertId); answer['two'].add(alertId);} else {answer['one'].add(alertId);}; }; fans = []; fans.add(answer['one'].size()); fans.add(answer['two'].size()); fans.add(answer['three'].size()); fans.add(answer['four'].size()); fans.add(answer['five'].size()); fans.add(answer['five_plus'].size()); return fans"
}
}
}
}
query output:
{
"took": 4770,
"timed_out": false,
"_shards": {
"total": 190,
"successful": 189,
"failed": 0
},
"hits": {
"total": 334,
"max_score": 0,
"hits": []
},
"aggregations": {
"repetitions": {
"value": [
63,
39,
3,
10,
2,
13
]
}
}
}
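where the first value is the number of identifiers with doc_count = 1, the second value is the number of identifiers with doc_count = 2, and so on up to the fifth value (doc_count = 5); the last value is the number of identifiers with doc_count > 5 (the 'five_plus' list in the reduce script).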

Related

elasticsearch date_histogram offset and extended_bounds don't work together?

I'm doing a date_histogram with a week interval. I need my weeks to start on Sundays rather than Mondays, and I need the result to include weeks in which there are no docs (empty records).
To get that, I use offset = -1d to change the start to Sunday, and extended_bounds to get the empty records.
Elasticsearch nicely figures out the first day of the interval, so if I supply a start date that is, say, a Wednesday, I get a record for the week starting the previous Sunday.
The problem is that if I set offset = -1d, I get an extra week. My hypothesis is that it calculates the first day of the interval without taking the offset into account.
In the example shown, I would not expect to get the 2017-09-24 record:
query:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"utility.utility_uuid.orig": "17245998142979832061"
}
},
{
"range": {
"user.date_created": {
"gte": "2017-10-01",
"lt": "2017-10-31"
}
}
}
]
}
},
"aggs": {
"eow_accounts_and_users": {
"date_histogram": {
"format": "yyyy-MM-dd",
"interval": "week",
"offset": "-1d",
"time_zone": "US/Pacific",
"field": "user.date_created",
"min_doc_count": 0,
"extended_bounds": {
"min": "2017-10-01",
"max": "2017-10-31"
}
}
}
}
}
result:
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 0,
"max_score": 0,
"hits": []
},
"aggregations": {
"eow_accounts_and_users": {
"buckets": [
{
"key_as_string": "2017-09-24",
"key": 1506236400000,
"doc_count": 0
},
{
"key_as_string": "2017-10-01",
"key": 1506841200000,
"doc_count": 0
},
{
"key_as_string": "2017-10-08",
"key": 1507446000000,
"doc_count": 0
},
{
"key_as_string": "2017-10-15",
"key": 1508050800000,
"doc_count": 0
},
{
"key_as_string": "2017-10-22",
"key": 1508655600000,
"doc_count": 0
},
{
"key_as_string": "2017-10-29",
"key": 1509260400000,
"doc_count": 0
}
]
}
}
}
Add an extra day to the minimum of the extended bounds, so that the offset no longer pulls in the previous week:
"extended_bounds": {
"min": dateParams.startTime+86400000, // an extra day
"max": dateParams.endTime
}

Elastic search document count passing array

I am new to Elasticsearch. I want to count documents by ID, but I want to pass an array of IDs, like "myId":[1,2,3,4,5],
and get a separate count for every ID.
Current input
GET /probedb_v1/probe/_count
{
"query": {
"match_phrase": {
"myId": 1
}
}
}
Current output
{ "count": 6929,
"_shards":{ "total": 1,
"successful": 1,
"failed": 0
}
}
What input do I need to get the following?
Required output
{ "count": [6929,5222,65241,5241,6521],
"_shards":{ "total": 1,
"successful": 1,
"failed": 0
}
}
I also need the equivalent code for the Elasticsearch Java API.
You can do it like this:
GET /probedb_v1/probe/_search
{
"size": 0,
"query": {
"terms": {
"myId": [123, 44]
}
},
"aggs": {
"NAME": {
"terms": {
"field": "myId",
"size": 50
}
}
}
}
This will give you this output:
"aggregations": {
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 3
},
{
"key": 44,
"doc_count": 2
}
]
}
}

Elasticsearch analytics percent

I am using Elasticsearch 1.7.3 to accumulate data for analytics reports.
I have an index that holds documents where each document has a numeric field called 'duration' (how many milliseconds the request took), and a string field called 'component'. There can be many documents with the same component name.
Eg.
{"component": "A", "duration": 10}
{"component": "B", "duration": 27}
{"component": "A", "duration": 5}
{"component": "C", "duration": 2}
I would like to produce a report that states for each component:
The sum of all 'duration' fields for this component.
A: 15
B: 27
C: 2
The percentage of this sum out of the total sum of duration of all documents. In my example
A: (10+5) / (10+27+5+2) * 100
B: 27 / (10+27+5+2) * 100
C: 2 / (10+27+5+2) * 100
The percentage of the documents for each component, out of the total components.
A: 2 / 4 * 100
B: 1 / 4 * 100
C: 1 / 4 * 100
How do I do that with Elasticsearch 1.7.3?
With ES 1.7.3, there is no way to compute data based on the results of two different aggregations. This is something that can be done in ES 2.0 with pipeline aggregations, though.
However, what you're asking is not too complicated to do on the client-side with 1.7.3. If you use the query below, you'll get all you need to get the figures you expect:
POST components/_search
{
"size": 0,
"aggs": {
"total_duration": {
"sum": {
"field": "duration"
}
},
"components": {
"terms": {
"field": "component"
},
"aggs": {
"duration_sum": {
"sum": {
"field": "duration"
}
}
}
}
}
}
The results would look like this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"total_duration": {
"value": 44
},
"components": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2,
"duration_sum": {
"value": 15
}
},
{
"key": "b",
"doc_count": 1,
"duration_sum": {
"value": 27
}
},
{
"key": "c",
"doc_count": 1,
"duration_sum": {
"value": 2
}
}
]
}
}
}
Now all you need to do would be the following. I'm using JavaScript, but you can do it in any other language that can read JSON.
// Client-side post-processing of the search response shown above: for each
// component bucket, derive its share of the total duration and its share of
// the total number of documents.
var response = ...the JSON response above...
// grand total of 'duration', taken from the top-level total_duration sum aggregation
var total_duration = response.aggregations.total_duration.value;
// total number of hits, used as the denominator for the document percentage
var total_docs = response.hits.total;
response.aggregations.components.buckets.forEach(function(comp_stats) {
// total duration for the component (the duration_sum sub-aggregation of this bucket)
var total_duration_comp = comp_stats.duration_sum.value;
// percentage of the overall duration contributed by this component
var perc_duration_comp = total_duration_comp / total_duration * 100;
// percentage of all documents that belong to this component
var perc_doc_comp = comp_stats.doc_count / total_docs * 100;
// NOTE: the three values above are only computed here; store or output them as needed
});
In Elasticsearch 2.x and later, you can use the bucket_script pipeline aggregation, which meets your needs perfectly.
E.g.:
{
"bucket_script": {
"buckets_path": {
"my_var1": "the_sum",
"my_var2": "the_value_count"
},
"script": "my_var1 / my_var2"
}
}
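A more detailed example (note: the `params.` prefix in the script below is the Painless syntax used from Elasticsearch 5.x onward; on 2.x, reference the buckets_path variables directly, as in the snippet above):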
POST /sales/_search
{
"size": 0,
"aggs" : {
"sales_per_month" : {
"date_histogram" : {
"field" : "date",
"interval" : "month"
},
"aggs": {
"total_sales": {
"sum": {
"field": "price"
}
},
"t-shirts": {
"filter": {
"term": {
"type": "t-shirt"
}
},
"aggs": {
"sales": {
"sum": {
"field": "price"
}
}
}
},
"t-shirt-percentage": {
"bucket_script": {
"buckets_path": {
"tShirtSales": "t-shirts>sales",
"totalSales": "total_sales"
},
"script": "params.tShirtSales / params.totalSales * 100"
}
}
}
}
}
}

Limit aggregations to list of values

Can I limit aggregations to return only a specific list of values? I have something like this:
{ "aggs" : {
"province" : {
"terms" : {
"field" : "province"
}
}
},
"query": {
"bool": {
//my query..
But let's say I know the list of provinces for which I want counts ({'province1', 'province2', 'province3'}). Is it possible to restrict the returned list of provinces without affecting my query results?
I want to get:
//list of hits..
//
"aggregations": {
"province": {
"buckets": [
{
"key": "province1",
"doc_count": 200
},
{
"key": "province2",
"doc_count": 162
},
{
"key": "province3",
"doc_count": 162
}
// even if there is more possible provinces
// I don't want to see them
Sure, just filter the terms with the `include` option of the terms aggregation.
Here's an example. Let's say I have visit stats for a bunch of different IP addresses, but I only want to get document counts for two of them. I could do this:
POST /test_index/_search?search_type=count
{
"aggregations": {
"ip": {
"terms": {
"field": "ip",
"size": 10,
"include": [
"146.233.189.126",
"193.33.153.89"
]
}
}
}
}
and get back something like:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"ip": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "146.233.189.126",
"doc_count": 3
},
{
"key": "193.33.153.89",
"doc_count": 3
}
]
}
}
}
Here is some code I used to play around with it:
http://sense.qbox.io/gist/68697646ef7afc9f0375995b6f84181a7ac4cba9
So your example might look like:
{
"aggs": {
"province": {
"terms": {
"field": "province",
"include": [
"province1",
"province2",
"province3"
]
}
}
}
}

Elasticsearch Terms or Cardinality Aggregation - Order by number of distinct values

Friends,
I am doing some analysis to find unique pairs from 100s of millions of documents. The mock example is as shown below:
doc field1 field2
AAA : BBB
AAA : CCC
PPP : QQQ
PPP : QQQ
XXX : YYY
XXX : YYY
MMM : NNN
90% of the documents contain a unique pair, as shown above in docs 3, 4, 5, 6 and 7, which I am not interested in seeing in my aggregation result. I am interested in aggregating docs 1 and 2.
Terms Aggregation Query:
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
Term Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "QQQ",
"doc_count": 2
}
]
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "YYY",
"doc_count": 2
}
]
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "BBB",
"doc_count": 1
},
{
"key": "CCC",
"doc_count": 1
}
]
}
}
]
}
}
I am interested only in having key AAA in the aggregation result. What is the best way to filter the aggregation result so that it contains only the distinct pairs?
I tried a cardinality aggregation, which returns the unique value count. However, I am not able to filter out what I am not interested in from the aggregation results.
Cardinality Aggregation Query
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"cardinality": {
"field": "FIELD2"
}
}
}
}
}
Cardinality Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"value" : 2
}
}
]
}
}
If I could at least sort by the cardinality value, that would help me find some workarounds. Please help me in this regard.
P.S.: Writing a Spark/MapReduce program to post-process/filter the aggregation result is not the expected solution for this issue.
I suggest using a filter query along with the aggregations, since you are only interested in field1=AAA.
I have a similar example here.
For example, I have an index of all patients in my hospital. I store their drug use in a nested object DRUG. Each patient could take different drugs, and each could take a single drug multiple times.
Now if I wanted to find the number of patients who took aspirin at least once, the query could be:
{
"size": 0,
"_source": false,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "DRUG",
"filter": {
"bool": {
"must": [{ "term": { "DRUG.NAME": "aspirin" } }]
}}}}}},
"aggs": {
"DRUG_FACETS": {
"nested": {
"path": "DRUG"
},
"aggs": {
"DRUG_NAME_FACETS": {
"terms": { "field": "DRUG.NAME", "size": 0 },
"aggs": {
"DISTINCT": { "cardinality": { "field": "DRUG.PATIENT" } }
}
}}}}
}
Sample result:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"DRUG_FACETS": {
"doc_count": 11,
"DRUG_NAME_FACETS": {
"buckets": [
{
"key": "aspirin",
"doc_count": 6,
"DISTINCT": {
"value": 6
}
},
{
"key": "vitamin-b",
"doc_count": 3,
"DISTINCT": {
"value": 2
}
},
{
"key": "vitamin-c",
"doc_count": 2,
"DISTINCT": {
"value": 2
}
}
]
}
}
}
}
The first one in the buckets would be aspirin. But you can see that 2 other patients had also taken vitamin-b when they took aspirin.
If you change the field value of DRUG.NAME to another drug name for example "vitamin-b", I suppose you would get vitamin-b in the first position of the buckets.
Hopefully this is helpful to your question.
A bit late, but I hope this helps others.
A simple approach is to filter only 'AAA' records in top aggregation:
{
"size": 0,
"aggregations": {
"filterAAA": {
"filter": {
"term": {
"FIELD1": "AAA"
}
},
"aggregations": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggregations": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
}
}
}

Resources