Wrong terms aggregate doc_count in elasticsearch

Wrong terms aggregate doc_count in elasticsearch - elasticsearch

I am trying to find duplicates in the index by aggregating users on an array field (emails) combined with a scalar field (name), using a script.
My question is: why does the terms aggregation count only one document for a given key (smith#gmail.com_SMITH), even though two documents contain that email? And is it possible to change this behavior?
Data:
POST users/user
{
"name" :"SMITH",
"emails" : [
"smith#gmail.com"
]
}
POST users/user
{
"name" :"SMITH",
"emails" : [
"mrsmith#gmail.com",
"smith#gmail.com"
]
}
Distinct query:
POST users/_search
{
"size": 0,
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"inline": "doc['emails.keyword'].value + '_' + doc['name.keyword'].value"
}
}
}
}
}
Result:
"aggregations": {
"duplicateCount": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mrsmith#gmail.com_SMITH",
"doc_count": 1
},
{
"key": "smith#gmail.com_SMITH",
"doc_count": 1
}
]
}
}

It seems you can only get the right terms aggregation count with "terms" + "field".
If you try out this query, you can see the difference of "terms" + "field" and "terms" + "script":
{
"from" : 0,
"size" : 0,
"_source" : true,
"query" : {
"bool" : {
"must" : [ {
"match" : {
"name" : {
"query" : "SMITH",
"operator" : "OR",
"fuzziness" : "AUTO",
"prefix_length" : 1,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1
}
}
} ]
}
},
"aggs": {
"duplicateCount": {
"terms": {
"script": {
"inline": "doc['emails.keyword'].value + '_' + doc['name.keyword'].value"
}
}
},
"duplicateCount2": {
"terms": {
"field": "emails.keyword"
}
}
}
}
Here are the results. See duplicateCount2:
{
"took" : 53,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"duplicateCount2" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "smith#gmail.com",
"doc_count" : 2
}, {
"key" : "mrsmith#gmail.com",
"doc_count" : 1
} ]
},
"duplicateCount" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "mrsmith#gmail.com_SMITH",
"doc_count" : 1
}, {
"key" : "smith#gmail.com_SMITH",
"doc_count" : 1
} ]
}
}
}

Ok. So I worked around it by iterating over the values of the emails array and manually creating the desired keys:
def keys = [];
for (p in doc['emails.keyword'].values) {
keys.add(p + '_' + doc['name.keyword'].value);
}
return keys;
Here's the result:
"buckets": [
{
"key": "smith#gmail.com_SMITH",
"doc_count": 2
},
{
"key": "mrsmith#gmail.com_SMITH",
"doc_count": 1
}
]

Related

Nested Aggregation for AND Query Not Working

Please can someone help with the below Question.
https://discuss.elastic.co/t/nested-aggregation-with-and-always-return-0-match/315722?u=chattes

I have used following aggregations
1. Terms aggregation
2. Bucket selector
3. Nested aggregation
First, I grouped by user id using a terms aggregation, then further grouped by skill id. Using a bucket selector, I filtered for users that have documents under both skills.
Query
GET index5/_search
{
"size": 0,
"aggs": {
"users": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"skills": {
"nested": {
"path": "skills"
},
"aggs": {
"filter_skill": {
"terms": {
"field": "skills.id",
"size": 10,
"include": [
553,
426
]
}
}
}
},
"bucket_count": {
"bucket_selector": {
"buckets_path": {
"skill_count": "skills>filter_skill._bucket_count"
},
"script": "params.skill_count ==2"
}
}
}
}
}
}
Results
"aggregations" : {
"users" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 1,
"skills" : {
"doc_count" : 3,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
},
{
"key" : 2,
"doc_count" : 1,
"skills" : {
"doc_count" : 2,
"filter_skill" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "426",
"doc_count" : 1
},
{
"key" : "553",
"doc_count" : 1
}
]
}
}
}
]
}

How to get word count in docs as a aggregate over time in elastic search?

I am trying to get word count trends in docs as an aggregate result. Using the following approach I am able to get the doc count aggregation result, but I am not able to find any resources showing how to get the word count for the months of jan, feb and mar.
PUT test/_doc/1
{
"description" : "one two three four",
"month" : "jan"
}
PUT test/_doc/2
{
"description" : "one one test test test",
"month" : "feb"
}
PUT test/_doc/3
{
"description" : "one one one test",
"month" : "mar"
}
GET test/_search
{
"size": 0,
"query": {
"match": {
"description": {
"query": "one"
}
}
},
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
}
}
}
}
OUTPUT
{
"took" : 706,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1
},
{
"key" : "jan",
"doc_count" : 1
},
{
"key" : "mar",
"doc_count" : 1
}
]
}
}
}
EXPECTED WORD COUNT OVER MONTH
"aggregations" : {
"monthly_count" : {
"buckets" : [
{
"key" : "feb",
"word_count" : 2
},
{
"key" : "jan",
"word_count" : 1
},
{
"key" : "mar",
"word_count" : 3
}
]
}
}

Maybe this query can help you:
GET test/_search
{
"size": 0,
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
},
"aggs": {
"count_word_one": {
"terms": {
"script": {
"source": """
def str = doc['description.keyword'].value;
def array = str.splitOnToken(' ');
int i = 0;
for (item in array) {
if(item == 'one'){
i++
}
}
return i;
"""
},
"size": 10
}
}
}
}
}
}
Response:
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "2",
"doc_count" : 1
}
]
}
},
{
"key" : "jan",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1
}
]
}
},
{
"key" : "mar",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 1
}
]
}
}
]
}
}

ELASTICSEARCH - Count unique value with a condition

I would like a query that returns the number of times a field is repeated, according to the unique values of another field.
I have this json:
"name" : james,
"city" : "chicago" <----------- same
},
{
"name" : james,
"city" : "san francisco"
},
{
"name" : james,
"city" : "chicago" <-----------same
},
{
"name" : Mike,
"city" : "chicago"
},
{
"name" : Mike,
"city" : "texas"<-----------same
},
{
"name" : Mike,
"city" : "texas"<-----------same
},
{
"name" : Peter,
"city" : "chicago"
},
I want to make a query where I count based on the unique value of two fields.
For example, james is equal to 2, because there are two identical documents (name: james, city: chicago), which count once, and one different document (name: james, city: san francisco).
The output would then be the following:
{
"key" : "james",
"doc_count" : 2
},
{
"key" : "Mike",
"doc_count" : 2
},
{
"key" : "Peter",
"doc_count" : 1
},
Is it possible to do a unique-value count over two fields?

You can do a two level terms aggregation:
{
"size": 0,
"aggs": {
"names": {
"terms": {
"field": "name.keyword",
"size": 10
},
"aggs": {
"citys_by_name": {
"terms": {
"field": "city.keyword",
"size": 10,
"min_doc_count": 2
}
}
}
}
}
}
The response will look like this:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james",
"doc_count" : 15,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "chicago",
"doc_count" : 14
}
]
}
},
{
"key" : "Peter",
"doc_count" : 2,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "chicago",
"doc_count" : 2
}
]
}
},
{
"key" : "mike",
"doc_count" : 2,
"citys_by_name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
]
}
}
Or you can concatenate fields:
GET test/_search
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
}
}
}
The response will look like this:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james chicago",
"doc_count" : 14
},
{
"key" : "Peter chicago",
"doc_count" : 2
}
]
}
}
If you want more stats on buckets, use the stats_buckets aggregation:
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
},
"names_stats":{
"stats_bucket": {
"buckets_path":"names._count"
}
}
}
}
Will result:
"aggregations" : {
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "james PARIS",
"doc_count" : 15
},
{
"key" : "james chicago",
"doc_count" : 13
},
{
"key" : "samuel PARIS",
"doc_count" : 11
},
{
"key" : "fred PARIS",
"doc_count" : 2
}
]
},
"names_stats" : {
"count" : 4,
"min" : 2.0,
"max" : 15.0,
"avg" : 10.25,
"sum" : 41.0
}
}

This was the solution that solved the problem for me
GET test/_search?filter_path=aggregations.count
{
"size": 0,
"aggs": {
"names": {
"terms": {
"script": {
"source": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value",
"lang": "painless"
},
"field": "name.keyword",
"size": 10,
"min_doc_count": 2
}
},
"count":{
"cardinality": {"script": "return doc['name.keyword'].value + ' ' + doc['city.keyword'].value"
}
}
}
}
Output:
{
"aggregations" : {
"count" : {
"value" : 2
}
}
}

About elasticsearch group by two fields and then filter or order

There is a shareholder index, and I want to get the following information:
Which holder invested in the same company the most times:
select hld_id, com_id, count(*) from shareholder group by hld_id, com_id order by count(*) desc;
Which holders invested in a company exactly two times (possibly duplicate records):
select hld_id, com_id from shareholder group by hld_id, com_id having count(*) = 2;
So how to implement above requirements by elasticsearch search query?

Below is the sample mapping, documents and aggregation query. I've figured three possible ways this can be done/achieved.
Mapping:
PUT shareholder
{
"mappings": {
"properties": {
"hld_id": {
"type": "keyword"
},
"com_id":{
"type": "keyword"
}
}
}
}
Documents:
POST shareholder/_doc/1
{
"hld_id": "001",
"com_id": "001"
}
POST shareholder/_doc/2
{
"hld_id": "001",
"com_id": "002"
}
POST shareholder/_doc/3
{
"hld_id": "002",
"com_id": "001"
}
POST shareholder/_doc/4
{
"hld_id": "002",
"com_id": "002"
}
POST shareholder/_doc/5
{
"hld_id": "002",
"com_id": "002" <--- Note I've changed this
}
Solution 1: Using Elasticsearch's aggregation
Aggregation Query: 1
Note that I've simply nested two Terms aggregations, first on hld_id and then on com_id.
POST shareholder/_search
{
"size": 0,
"aggs": {
"share_hoder": {
"terms": {
"field": "hld_id"
},
"aggs": {
"com_aggs": {
"terms": {
"field": "com_id",
"order": {
"_count": "desc"
}
}
}
}
}
}
}
Below is how the response appears:
Response:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"share_hoder" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002",
"doc_count" : 3,
"com_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002",
"doc_count" : 2 <---- Count you are looking for
},
{
"key" : "001",
"doc_count" : 1
}
]
}
},
{
"key" : "001",
"doc_count" : 2,
"com_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "001",
"doc_count" : 1
},
{
"key" : "002",
"doc_count" : 1
}
]
}
}
]
}
}
}
Of course you may not get the representation of the result exactly as you are looking for because of the way Elasticsearch's aggregation works.
Aggregation Query: 2
For this, most of it is the same as Aggregation Query 1, where I've used two nested Terms aggregations, but I've additionally added a Bucket Selector aggregation that keeps only the buckets whose count is 2, i.e. count(*) == 2. (No Cardinality aggregation is needed for this; the built-in `_count` of each com_id bucket supplies the count.)
POST shareholder/_search
{
"size": 0,
"aggs": {
"share_holder": {
"terms": {
"field": "hld_id",
"order": {
"_key": "desc"
}
},
"aggs": {
"com_aggs": {
"terms": {
"field": "com_id"
},
"aggs": {
"count_filter":{
"bucket_selector": {
"buckets_path": {
"count_path": "_count"
},
"script": "params.count_path == 2"
}
}
}
}
}
}
}
}
Below is how the response appears.
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"share_holder" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002",
"doc_count" : 3,
"com_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002",
"doc_count" : 2 <---- Count == 2
}
]
}
},
{
"key" : "001",
"doc_count" : 2,
"com_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
]
}
}
}
Note that the second bucket is empty. I'm trying to see if I can filter the above query so that "key": "001" doesn't appear in the first place.
Solution 2: Using Elasticsearch SQL:
If you have the x-pack version of Kibana, you can probably execute the below queries in SQLish style:
Query:1
POST /_sql?format=txt
{
"query": "SELECT hld_id, com_id, count(*) FROM shareholder GROUP BY hld_id, com_id ORDER BY count(*) desc"
}
Response:
hld_id | com_id | count(*)
---------------+---------------+---------------
002 |002 |2
001 |001 |1
001 |002 |1
002 |001 |1
Query 2:
POST /_sql?format=txt
{
"query": "SELECT hld_id, com_id FROM shareholder GROUP BY hld_id, com_id HAVING count(*) = 2"
}
Response:
hld_id | com_id
---------------+---------------
002 |002
Solution 3: Using Script in Terms Aggregation
Aggregation Query:
POST shareholder/_search
{
"size": 0,
"aggs": {
"query_groupby_count": {
"terms": {
"script": {
"source": """
doc['hld_id'].value + ", " + doc['com_id'].value
"""
}
}
},
"query_groupby_count_equals_2": {
"terms": {
"script": {
"source": """
doc['hld_id'].value + ", " + doc['com_id'].value
"""
}
},
"aggs": {
"myaggs": {
"bucket_selector": {
"buckets_path": {
"count": "_count"
},
"script": "params.count == 2"
}
}
}
}
}
}
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"query_groupby_count_equals_2" : { <---- Group By Query For Count == 2
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002, 002",
"doc_count" : 2
}
]
},
"query_groupby_count" : { <---- Group By Query
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "002, 002",
"doc_count" : 2
},
{
"key" : "001, 001",
"doc_count" : 1
},
{
"key" : "001, 002",
"doc_count" : 1
},
{
"key" : "002, 001",
"doc_count" : 1
}
]
}
}
}
Using CURL:
First let us save the query in a .txt or .json file.
For example, I created a file called query.json and copied and pasted only the query into that file.
{
"query": "SELECT hld_id, com_id, count(*) FROM shareholder GROUP BY hld_id, com_id ORDER BY count(*) desc"
}
Now execute the below curl command where you'd refer the file as shown below:
curl -XGET "http://localhost:9200/_sql?format=txt" -H "Content-Type: application/json" -d @query.json
Hope this helps!

Is it possible with aggregation to amalgamate all values of an array property from all grouped documents into the coalesced document?

I have documents with the format similar to the following:
[
{
"name": "fred",
"title": "engineer",
"division_id": 20
"skills": [
"walking",
"talking"
]
},
{
"name": "ed",
"title": "ticket-taker",
"division_id": 20
"skills": [
"smiling"
]
}
]
I would like to run an aggs query that would show the complete set of skills for the division: ie,
{
"aggs":{
"distinct_skills":{
"cardinality":{
"field":"division_id"
}
}
},
"_source":{
"includes":[
"division_id",
"skills"
]
}
}
.. so that the resulting hit would look like:
{
"division_id": 20,
"skills": [
"walking",
"talking",
"smiling"
]
}
I know I can retrieve inner_hits and iterate through the list and amalgamate values "manually". I assume it would perform better if I could do it in a query.

Just nest two Terms aggregations as shown below:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"my_division_ids": {
"terms": {
"field": "division_id",
"size": 10
},
"aggs": {
"my_skills": {
"terms": {
"field": "skills", <---- If it is not keyword field use `skills.keyword` field if using dynamic mapping.
"size": 10
}
}
}
}
}
}
Below is the sample response:
Response:
{
"took" : 490,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"my_division_ids" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 20, <---- division_id
"doc_count" : 2,
"my_skills" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ <---- Skills
{
"key" : "smiling",
"doc_count" : 1
},
{
"key" : "talking",
"doc_count" : 1
},
{
"key" : "walking",
"doc_count" : 1
}
]
}
}
]
}
}
}
Hope this helps!

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Wrong terms aggregate doc_count in elasticsearch - elasticsearch

Related

Nested Aggregation for AND Query Not Working

How to get word count in docs as a aggregate over time in elastic search?

ELASTICSEARCH - Count unique value with a condition

About elasticsearch group by two fields and then filter or order

Is it possible with aggregation to amalgamate all values of an array property from all grouped documents into the coalesced document?

Categories

Resources