Elasticsearch - Count unique values with a condition

I would like a query that returns the number of times a field is repeated, grouped by the unique values of another field.
I have this JSON:
{
"name" : "james",
"city" : "chicago" <----------- same
},
{
"name" : "james",
"city" : "san francisco"
},
{
"name" : "james",
"city" : "chicago" <----------- same
},
{
"name" : "Mike",
"city" : "chicago"
},
{
"name" : "Mike",
"city" : "texas" <----------- same
},
{
"name" : "Mike",
"city" : "texas" <----------- same
},
{
"name" : "Peter",
"city" : "chicago"
}
I want to make a query where I count based on the unique combination of the two fields.
For example, james equals 2, because two documents share the same combination (name: james, city: chicago) and one document has a different combination (name: james, city: san francisco).
The output would then be the following:
{
"key" : "james",
"doc_count" : 2
},
{
"key" : "Mike",
"doc_count" : 2
},
{
"key" : "Peter",
"doc_count" : 1
},
Is it possible to do a distinct count over the combination of two fields?

You can do a two-level terms aggregation:
{
"size": 0,
"aggs": {
"names": {
"terms": {
"field": "name.keyword",
"size": 10
},
"aggs": {
"citys_by_name": {
"terms": {
"field": "city.keyword",
"size": 10,
"min_doc_count": 2
}
}
}
}
}
}
The response will look like this:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james",
"doc_count" : 15,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "chicago",
"doc_count" : 14
}
]
}
},
{
"key" : "Peter",
"doc_count" : 2,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "chicago",
"doc_count" : 2
}
]
}
},
{
"key" : "mike",
"doc_count" : 2,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
]
}
}
Or you can concatenate fields:
GET test/_search
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
}
}
}
The response will look like this:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james chicago",
"doc_count" : 14
},
{
"key" : "Peter chicago",
"doc_count" : 2
}
]
}
}
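On newer Elasticsearch versions (7.11+), a runtime field defined directly in the search request can replace the scripted terms aggregation; this is a minimal sketch, assuming the same name.keyword and city.keyword sub-fields as above:
GET test/_search
{
  "size": 0,
  "runtime_mappings": {
    "name_city": {
      "type": "keyword",
      "script": {
        "source": "emit(doc['name.keyword'].value + ' ' + doc['city.keyword'].value)"
      }
    }
  },
  "aggs": {
    "names": {
      "terms": {
        "field": "name_city",
        "size": 10,
        "min_doc_count": 2
      }
    }
  }
}
The buckets then come out keyed by the concatenated value, exactly as in the scripted version.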
If you want more stats on the buckets, use the stats_bucket pipeline aggregation:
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
},
"names_stats":{
"stats_bucket": {
"buckets_path":"names._count"
}
}
}
}
The result will look like this:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james PARIS",
"doc_count" : 15
},
{
"key" : "james chicago",
"doc_count" : 13
},
{
"key" : "samuel PARIS",
"doc_count" : 11
},
{
"key" : "fred PARIS",
"doc_count" : 2
}
]
},
"names_stats" : {
"count" : 4,
"min" : 2.0,
"max" : 15.0,
"avg" : 10.25,
"sum" : 41.0
}
}

This is the solution that solved the problem for me:
GET test/_search?filter_path=aggregations.count
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
},
"count":{
"cardinality": {"script": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value"
}
}
}
}
Output:
{
"aggregations" : {
"count" : {
"value" : 2
}
}
}
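If the goal is exactly the output shown in the question (one bucket per name with the number of distinct cities), a terms aggregation on name.keyword with a cardinality sub-aggregation on city.keyword should also work. A minimal sketch, assuming the same keyword sub-fields:
GET test/_search
{
  "size": 0,
  "aggs": {
    "names": {
      "terms": {
        "field": "name.keyword",
        "size": 10
      },
      "aggs": {
        "distinct_cities": {
          "cardinality": {
            "field": "city.keyword"
          }
        }
      }
    }
  }
}
Each names bucket then carries a distinct_cities.value of 2 for james, 2 for Mike and 1 for Peter (cardinality is approximate in general, but exact at counts this small).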

Related

Elasticsearch How to count total docs by date

For my use case, I want to count the docs for each day plus all previous days (a cumulative count by date), which is easy to understand from a chart.
{"index":{"_index":"login-2015.12.23","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-23T12:00:00Z"}
{"index":{"_index":"login-2015.12.23","_type":"logs"}}
{"uid":"2","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-23T12:00:00Z"}
{"index":{"_index":"login-2015.12.24","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-24T12:00:00Z"}
{"index":{"_index":"login-2015.12.25","_type":"logs"}}
{"uid":"1","register_time":"2015-12-23T12:00:00Z","login_time":"2015-12-25T12:00:00Z"}
As you can see, index login-2015.12.23 has two docs, index login-2015.12.24 has one doc, and index login-2015.12.25 has one doc.
And now I want to get this result:
{
"hits" : {
"total" : 6282,
"max_score" : 1.0,
"hits" : []
},
"aggregations" : {
"group_by_date" : {
"buckets" : [
{
"key_as_string" : "2015-12-23T12:00:00Z",
"key" : 1450872000000,
"doc_count" : 2
},
{
"key_as_string" : "2015-12-24T12:00:00Z",
"key" : 1450958400000,
"doc_count" : 3
},
{
"key_as_string" : "2015-12-25T12:00:00Z",
"key" : 1451044800000,
"doc_count" : 4
}
]
}
}
}
For the date 2015-12-24T12:00:00Z, that means I must count the days 2015-12-23T12:00:00Z and 2015-12-24T12:00:00Z together.
In my project I have many indices like these, and I have searched for many ways to achieve this without success. This is my demo:
{
"query": {"match_all": {}},
"size": 0,
"aggs": {
"group_by_date": {
"date_histogram": {
"field": "timestamp",
"interval": "day"
},
"aggs": {
"intersect": {
"scripted_metric": {
"init_script": "state.inner=[]",
"map_script": "state.inner.add(params.param1 == 3 ? params.param2 * params.param1 : params.param1 * params.param2)",
"combine_script": "return state.inner",
"reduce_script": "return states",
"params": {
"param1": 3,
"param2": 5
}
}
}
}
}
}
}
I want to group by date and use scripted_metric to iterate over the date list, but the inner aggregation only sees the documents inside its own bucket, not all of the documents. Does anyone have a better idea to solve this problem?
You can simply use the cumulative_sum pipeline aggregation:
{
"query": {"match_all": {}},
"size": 0,
"aggs": {
"group_by_date": {
"date_histogram": {
"field": "login_time",
"interval": "day"
},
"aggs": {
"cumulative_docs": {
"cumulative_sum": {
"buckets_path": "_count"
}
}
}
}
}
}
And the results will look like this:
"aggregations" : {
"group_by_date" : {
"buckets" : [
{
"key_as_string" : "2015-12-23T00:00:00.000Z",
"key" : 1450828800000,
"doc_count" : 2,
"cumulative_docs" : {
"value" : 2.0
}
},
{
"key_as_string" : "2015-12-24T00:00:00.000Z",
"key" : 1450915200000,
"doc_count" : 1,
"cumulative_docs" : {
"value" : 3.0
}
},
{
"key_as_string" : "2015-12-25T00:00:00.000Z",
"key" : 1451001600000,
"doc_count" : 1,
"cumulative_docs" : {
"value" : 4.0
}
}
]
}
}
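Note: on Elasticsearch 7.2 and later the interval parameter of date_histogram is deprecated (and removed in 8.x) in favour of calendar_interval / fixed_interval, so the same request would be written roughly like this:
{
  "query": {"match_all": {}},
  "size": 0,
  "aggs": {
    "group_by_date": {
      "date_histogram": {
        "field": "login_time",
        "calendar_interval": "day"
      },
      "aggs": {
        "cumulative_docs": {
          "cumulative_sum": {
            "buckets_path": "_count"
          }
        }
      }
    }
  }
}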

Nested Aggregation for AND Query Not Working

Can someone please help with the question below?
https://discuss.elastic.co/t/nested-aggregation-with-and-always-return-0-match/315722?u=chattes
I have used the following aggregations:
1. Terms aggregation
2. Bucket selector
3. Nested aggregation
First I grouped by user id using a terms aggregation, then further grouped by skill id. Using a bucket selector I filtered the users that have documents under both skills.
Query
GET index5/_search
{
"size": 0,
"aggs": {
"users": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"skills": {
"nested": {
"path": "skills"
},
"aggs": {
"filter_skill": {
"terms": {
"field": "skills.id",
"size": 10,
"include": [
553,
426
]
}
}
}
},
"bucket_count": {
"bucket_selector": {
"buckets_path": {
"skill_count": "skills>filter_skill._bucket_count"
},
"script": "params.skill_count ==2"
}
}
}
}
}
}
Results
"aggregations" : {
"users" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 1,
"skills" : {
"doc_count" : 3,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
},
{
"key" : 2,
"doc_count" : 1,
"skills" : {
"doc_count" : 2,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
}
]
}
}

How to get word count in docs as an aggregate over time in Elasticsearch?

I am trying to get word count trends in docs as an aggregate result. Using the following approach I am able to get the doc count aggregation result, but I am not able to find any resources on how to get the word count for the months of jan, feb & mar.
PUT test/_doc/1
{
"description" : "one two three four",
"month" : "jan"
}
PUT test/_doc/2
{
"description" : "one one test test test",
"month" : "feb"
}
PUT test/_doc/3
{
"description" : "one one one test",
"month" : "mar"
}
GET test/_search
{
"size": 0,
"query": {
"match": {
"description": {
"query": "one"
}
}
},
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
}
}
}
}
OUTPUT
{
"took" : 706,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1
},
{
"key" : "jan",
"doc_count" : 1
},
{
"key" : "mar",
"doc_count" : 1
}
]
}
}
}
EXPECTED WORD COUNT OVER MONTH
"aggregations" : {
"monthly_count" : {
"buckets" : [
{
"key" : "feb",
"word_count" : 2
},
{
"key" : "jan",
"word_count" : 1
},
{
"key" : "mar",
"word_count" : 3
}
]
}
}
Maybe this query can help you:
GET test/_search
{
"size": 0,
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
},
"aggs": {
"count_word_one": {
"terms": {
"script": {
"source": """
def str = doc['description.keyword'].value;
def array = str.splitOnToken(' ');
int i = 0;
for (item in array) {
if(item == 'one'){
i++;
}
}
return i;
"""
},
"size": 10
}
}
}
}
}
}
Response:
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "2",
"doc_count" : 1
}
]
}
},
{
"key" : "jan",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1
}
]
}
},
{
"key" : "mar",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 1
}
]
}
}
]
}
}
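If you want the count of the word as a single number per month (closer to the expected output) instead of as bucket keys, a sum aggregation driven by the same script should work as well. A sketch, assuming the description.keyword sub-field exists as above:
GET test/_search
{
  "size": 0,
  "aggs": {
    "monthly_count": {
      "terms": {
        "field": "month.keyword"
      },
      "aggs": {
        "word_count": {
          "sum": {
            "script": {
              "source": """
              def array = doc['description.keyword'].value.splitOnToken(' ');
              int i = 0;
              for (item in array) {
                if (item == 'one') {
                  i++;
                }
              }
              return i;
              """
            }
          }
        }
      }
    }
  }
}
With the sample documents, each monthly_count bucket then carries word_count.value directly: 1.0 for jan, 2.0 for feb and 3.0 for mar.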

ElasticSearch - Sort on the subaggregation

I am quite new to Elasticsearch. I am trying to sort on a sub-aggregation; that is, my results should be sorted based on the sub-aggregation first. I have tried a lot of things to enable this sort but it isn't working. Can anyone help with this?
{
"aggs": {
"distinct_part": {
"terms": {
"field": "part",
"size": 1000
}
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer",
"size": 1000
}
}
}
}
}
I am trying to sort on the manufacturer; my entire result should be sorted on that. Can someone point me to how I can achieve this?
I did a test locally with your query, with a small correction, if I understood your issue correctly. I ingested the following data into the index "subsorting":
"part": "car",
"manufacturer": "brandA"
"part": "car",
"manufacturer": "brandB"
"part": "car",
"manufacturer": "brandC"
"part": "motor",
"manufacturer": "brandA"
"part": "motor",
"manufacturer": "brandB"
"part": "motor",
"manufacturer": "brandC"
Note: Both part and manufacturer are mapped as text.
GET subsorting/_search
{
"size": 0,
"aggs": {
"distinct_part": {
"terms": {
"field": "part.keyword",
"size": 1000
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer.keyword",
"order": {
"_key": "asc"
},
"size": 1000
}
}
}
}
}
}
If both fields "part" and "manufacturer" are mapped as keyword, remove the ".keyword" suffix from the query.
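For reference, the query above assumes the default dynamic mapping for strings, i.e. a text field with a keyword sub-field; a sketch of that mapping for the subsorting index used here:
PUT subsorting
{
  "mappings": {
    "properties": {
      "part": {
        "type": "text",
        "fields": {
          "keyword": { "type": "keyword" }
        }
      },
      "manufacturer": {
        "type": "text",
        "fields": {
          "keyword": { "type": "keyword" }
        }
      }
    }
  }
}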
The response from the above query, sorted in ascending order, is as follows:
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 2
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
}
]
}
}
If you need the result in descending order, here is the response with "_key": "desc":
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 2
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 1
}
]
}
}
]
}
}
Links:
https://www.elastic.co/guide/en/elasticsearch/reference/7.9/search-aggregations-bucket-terms-aggregation.html
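If what you actually need is to sort the outer part buckets by something computed from the manufacturers (rather than only sorting the inner buckets), a terms aggregation can be ordered by a single-value metric sub-aggregation. A sketch using a cardinality on manufacturer.keyword (my own choice of metric, not from the answer above):
GET subsorting/_search
{
  "size": 0,
  "aggs": {
    "distinct_part": {
      "terms": {
        "field": "part.keyword",
        "size": 1000,
        "order": { "manufacturer_count": "desc" }
      },
      "aggs": {
        "manufacturer_count": {
          "cardinality": { "field": "manufacturer.keyword" }
        },
        "distinct_manufacturer": {
          "terms": {
            "field": "manufacturer.keyword",
            "order": { "_key": "asc" },
            "size": 1000
          }
        }
      }
    }
  }
}
Ordering by a multi-bucket sub-aggregation (such as distinct_manufacturer itself) is not supported, which is why a single-value metric like cardinality is used here.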

Wrong terms aggregate doc_count in elasticsearch

I am trying to find duplicates in the index by aggregating users by an array field + a regular field using a script.
My question is: why does the terms aggregation only count one document for a given key (smith#gmail.com_SMITH)? And is it possible to change this behavior?
Data:
POST users/user
{
"name" :"SMITH",
"emails" : [
"smith#gmail.com"
]
}
POST users/user
{
"name" :"SMITH",
"emails" : [
"mrsmith#gmail.com",
"smith#gmail.com"
]
}
Distinct query:
POST users/_search
{
"size": 0,
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"inline": "doc['emails.keyword'].value + '_' + doc['name.keyword'].value"
}
}
}
}
}
Result:
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mrsmith#gmail.com_SMITH",
"doc_count": 1
},
{
"key": "smith#gmail.com_SMITH",
"doc_count": 1
}
]
}
}
It seems you can only get the right terms aggregation count with "terms" + "field".
If you try out this query, you can see the difference between "terms" + "field" and "terms" + "script":
{
"from" : 0,
"size" : 0,
"_source" : true,
"query" : {
"bool" : {
"must" : [ {
"match" : {
"name" : {
"query" : "SMITH",
"operator" : "OR",
"fuzziness" : "AUTO",
"prefix_length" : 1,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1
}
}
} ]
}
},
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"inline": "doc['emails.keyword'].value + '_' + doc['name.keyword'].value"
}
}
},
"duplicateCount2": {
"terms": {
"field": "emails.keyword"
}
}
}
}
Here are the results. See duplicateCount2:
{
"took" : 53,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"duplicateCount2" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "smith#gmail.com",
"doc_count" : 2
}, {
"key" : "mrsmith#gmail.com",
"doc_count" : 1
} ]
},
"duplicateCount" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "mrsmith#gmail.com_SMITH",
"doc_count" : 1
}, {
"key" : "smith#gmail.com_SMITH",
"doc_count" : 1
} ]
}
}
}
OK. So I worked around it by iterating over the emails array in the script and manually creating the desired keys:
def keys = [];
for (p in doc['emails.keyword'].values) {
keys.add(p + '_' + doc['name.keyword'].value);
}
return keys;
Here's the result:
"buckets": [
{
"key": "smith#gmail.com_SMITH",
"doc_count": 2
},
{
"key": "mrsmith#gmail.com_SMITH",
"doc_count": 1
}
]
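For completeness, this is roughly how that workaround script plugs back into the original aggregation request (a sketch, using the same index, fields and the older inline script syntax as above):
POST users/_search
{
  "size": 0,
  "aggs": {
    "duplicateCount": {
      "terms": {
        "script": {
          "inline": "def keys = []; for (p in doc['emails.keyword'].values) { keys.add(p + '_' + doc['name.keyword'].value); } return keys;",
          "lang": "painless"
        }
      }
    }
  }
}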
