Elasticsearch nested aggregations return duplicate results [duplicate] - elasticsearch

This question already has an answer here:
how to return the count of unique documents by using elasticsearch aggregation
(1 answer)
Closed 5 years ago.
With this mapping:
PUT pizzas
{
"mappings": {
"pizza": {
"properties": {
"name": {
"type": "keyword"
},
"types": {
"type": "nested",
"properties": {
"topping": {
"type": "keyword"
},
"base": {
"type": "keyword"
}
}
}
}
}
}
}
And this data:
PUT pizzas/pizza/1
{
"name": "meat",
"types": [
{
"topping": "bacon",
"base": "normal"
},
{
"topping": "pepperoni",
"base": "normal"
}
]
}
PUT pizzas/pizza/2
{
"name": "veg",
"types": [
{
"topping": "broccoli",
"base": "normal"
}
]
}
If I run this nested aggregation query:
GET pizzas/_search
{
"size": 0,
"aggs": {
"types_agg": {
"nested": {
"path": "types"
},
"aggs": {
"base_agg": {
"terms": {
"field": "types.base"
}
}
}
}
}
}
I get this result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"types_agg": {
"doc_count": 3,
"base_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "normal",
"doc_count": 3
}
]
}
}
}
}
I expected my aggregation to return a doc_count of 2, because only two documents match my query. However, since each nested object is indexed as its own hidden document, the nested aggregation sees 3 nested docs and therefore reports a count of 3.
Is there any way to get it to return unique document counts?
(tested in Elasticsearch 5.4.3)

Just discovered the answer shortly after asking the question.
Changing the aggregation query to be:
GET pizzas/_search
{
"size": 0,
"aggs": {
"types_agg": {
"nested": {
"path": "types"
},
"aggs": {
"base_agg": {
"terms": {
"field": "types.base"
},
"aggs": {
"top_reverse_nested": {
"reverse_nested": {}
}
}
}
}
}
}
}
Yields the result:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"types_agg": {
"doc_count": 3,
"base_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "normal",
"doc_count": 3,
"top_reverse_nested": {
"doc_count": 2
}
}
]
}
}
}
}
The important part which was added to the query was:
"aggs": {
"top_reverse_nested": {
"reverse_nested": {}
}
}
reverse_nested joins back to the root of the document, so each bucket reports unique parent document counts.
You can read about reverse_nested here.
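For reference, the doc_count inside the reverse_nested bucket is already the number of unique parent documents. If you also want to de-duplicate by a specific root-level field, a cardinality sub-aggregation can sit inside the reverse_nested block; here is a rough sketch using the name field from the mapping above (my addition, not part of the original question):
GET pizzas/_search
{
  "size": 0,
  "aggs": {
    "types_agg": {
      "nested": { "path": "types" },
      "aggs": {
        "base_agg": {
          "terms": { "field": "types.base" },
          "aggs": {
            "top_reverse_nested": {
              "reverse_nested": {},
              "aggs": {
                "unique_pizza_names": {
                  "cardinality": { "field": "name" }
                }
              }
            }
          }
        }
      }
    }
  }
}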

Related

How to count number of values per group?

I have an index with the following mapping:
"my_index":{
"mapping": {
"properties": {
"rec_values": {
"type": "nested",
"properties": {
"name": {
"type:" "keyword"
},
"schm_p": {
"type:" "keyword"
},
"tbl_p": {
"type:" "keyword"
},
I want to count the number of values for each schm_p,
something like:
select count(*)
from my_index
group by rec_values.schm_p
How can I do it?
You need to do a Composite Aggregation, like this:
{
"size": 0,
"aggs": {
"parameters": {
"nested": {
"path": "rec_values"
},
"aggs": {
"group": {
"composite": {
"size": 100, // your size
"sources": [{
"count_schm_p": {
"terms": {
"field": "rec_values.schm_p"
}
}
}]
}
}
}
}
}
}
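If there can be more than size distinct schm_p values, composite aggregations are designed to be paged: take after_key from each response and feed it back in the next request via after. A sketch of the follow-up page, assuming the same index and field names (the after value is a placeholder):
GET my_index/_search
{
  "size": 0,
  "aggs": {
    "parameters": {
      "nested": { "path": "rec_values" },
      "aggs": {
        "group": {
          "composite": {
            "size": 100,
            "sources": [
              { "count_schm_p": { "terms": { "field": "rec_values.schm_p" } } }
            ],
            "after": { "count_schm_p": "<after_key from the previous response>" }
          }
        }
      }
    }
  }
}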
You can also use a plain terms aggregation for this, something like:
GET my_index/_search
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"count_schm_p": {
"terms": {
"field": "rec_values.schm_p.keyword",
"size": 100
}
}
}
}
This query would return a response like this:
{
"took": 561,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"count_schm_p": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 829099,
"buckets": [
{
"key": "type_a",
"doc_count": 1704640
},
{
"key": "type_b",
"doc_count": 1454079
},
{
"key": "type_c",
"doc_count": 894678
},
{
"key": "type_d",
"doc_count": 208489
}
]
}
}
}
The count of each schm_p is inside the aggregation buckets.
Note: the size inside your query needs to be at least the number of distinct schm_p values you have.
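If you don't know how many distinct schm_p values there are, a cardinality aggregation gives an (approximate) distinct count you can use to size the terms aggregation; a small sketch assuming the nested mapping shown in the question:
GET my_index/_search
{
  "size": 0,
  "aggs": {
    "rec": {
      "nested": { "path": "rec_values" },
      "aggs": {
        "distinct_schm_p": {
          "cardinality": { "field": "rec_values.schm_p" }
        }
      }
    }
  }
}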

ElasticSearch - aggregations on nested fields to return additional field in buckets

I don't know if it is possible to return additional fields in the response for each bucket.
The current request returns correct results, but I'm missing additional field information required for later processing.
{
"query": {
"bool": {
"must": {
"match_all": {}
}
}
},
"track_total_hits": true,
"from": 0,
"size": 0,
"aggs": {
"strings": {
"nested": {
"path": "filter_data.string_facet"
},
"aggs": {
"names": {
"terms": {
"field": "filter_data.string_facet.facet-name"
},
"aggs": {
"values": {
"terms": {
"field": "filter_data.string_facet.facet-value"
}
}
}
}
}
}
}
Here is the result. Note how the nested fields inside filter_data are structured.
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1,
"hits": [{
"_index": "my_index",
"_type": "_doc",
"_id": "7000043",
"_score": 1,
"_source": {
"item_data": {
"doc_id": 7000043,
"id": 7000043,
"live_state": 1,
"item_sku": "7000043",
"manufacturer_id": 1394
},
"filter_data": {
"string_facet": [{
"facet-name": "Thread size",
"facet-value": "G1/2",
"facet-name-id": 12,
"facet-value-id": 34
}]
}
}
}]
},
"aggregations": {
"strings": {
"doc_count": 5,
"names": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Thread size",
"doc_count": 2,
"values": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "G1 1/4",
"doc_count": 1
}, {
"key": "G1/2",
"doc_count": 1
}]
}
}]
}
}
}
Is it possible to add additional fields to each bucket? It would be ideal to have a format like the following in the response, basically adding the fields facet-name-id and facet-value-id to each bucket.
....
"buckets": [{
"key": "Thread size",
"doc_count": 2,
"values": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "G1 1/4",
"facet-name-id": 12,
"facet-value-id": 34
"doc_count": 1
}, {
"key": "G1/2",
"facet-name-id": 12,
"facet-value-id": 35
"doc_count": 1
}]
}
}]
...
If this is not possible, what would you recommend?
Thanx.
Sure, you can use top_hits as a sub-aggregation of your deepest facet-value aggregation:
POST my_index/_search?filter_path=aggregations.*.*.buckets.key,aggregations.*.*.buckets.values.buckets.key,aggregations.*.*.buckets.values.buckets.*.hits.hits._source
{
"query": {
"bool": {
"must": {
"match_all": {}
}
}
},
"track_total_hits": true,
"from": 0,
"size": 0,
"aggs": {
"strings": {
"nested": {
"path": "filter_data.string_facet"
},
"aggs": {
"names": {
"terms": {
"field": "filter_data.string_facet.facet-name"
},
"aggs": {
"values": {
"terms": {
"field": "filter_data.string_facet.facet-value"
},
"aggs": {
"my_top_hits": {
"top_hits": {
"size": 10,
"_source": ["filter_data.string_facet"]
}
}
}
}
}
}
}
}
}
}
which'd yield:
{
"aggregations" : {
"strings" : {
"names" : {
"buckets" : [
{
"key" : "Thread size",
"values" : {
"buckets" : [
{
"key" : "G1/2",
"my_top_hits" : {
"hits" : {
"hits" : [
{
"_source" : {
"facet-value" : "G1/2",
"facet-name" : "Thread size",
"facet-value-id" : 34,
"facet-name-id" : 12
}
}
]
}
}
}
]
}
}
]
}
}
}
}
Notice that my_top_hits is an array of string_facet objects instead of an object as you requested. That's because although you're already 2 facets deep (facet-name and then facet-value), there may still be multiple different facet-value-id and facet-name-id combinations covered by a given facet-value bucket.
Having said that, you can of course limit the top_hits count with the size parameter, but then you wouldn't be able to say with certainty whether or not the first top hit's facets are representative of the whole bucket.
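If facet-name-id and facet-value-id map one-to-one onto the name and value (an assumption about your data), a lighter alternative sketch replaces top_hits with terms sub-aggregations on the id fields, so the ids come back as bucket keys instead of hits:
POST my_index/_search
{
  "size": 0,
  "aggs": {
    "strings": {
      "nested": { "path": "filter_data.string_facet" },
      "aggs": {
        "names": {
          "terms": { "field": "filter_data.string_facet.facet-name" },
          "aggs": {
            "name_ids": {
              "terms": { "field": "filter_data.string_facet.facet-name-id", "size": 1 }
            },
            "values": {
              "terms": { "field": "filter_data.string_facet.facet-value" },
              "aggs": {
                "value_ids": {
                  "terms": { "field": "filter_data.string_facet.facet-value-id", "size": 1 }
                }
              }
            }
          }
        }
      }
    }
  }
}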

Get count of particular field in a document using Elasticsearch

Requirement:
I want to find the count of aId values for a particular categoryId
(i.e. for categoryId 2532 I want the count to be 2, meaning it is assigned to two aIds).
I tried aggregations, but with them I can only get the document count rather than the field count.
Mappings
"List": {
"properties": {
"aId": {
"type": "long"
},
"CategoryList": {
"properties": {
"categoryId": {
"type": "long"
},
"categoryName": {
"type": "string"
}
}
}
}
}
Sample Document:
"List": [
{
"aId": 33074,
"CategoryList": [
{
"categoryId": 2532,
"categoryName": "VODAFONE"
}
]
},
{
"aId": 12074,
"CategoryList": [
{
"categoryId": 2532,
"categoryName": "VODAFONE"
}
]
},
{
"aId": 120755,
"CategoryList": [
{
"categoryId": 1234,
"categoryName": "SMPLKE"
}
]
}
]
Using a cardinality aggregation will not get you the desired result. Cardinality returns the count of distinct values for a field, whereas you want to count how many times a given field value appears.
You can use the following query. Here you first filter the documents on CategoryList.categoryId and then run a simple terms aggregation on that field:
POST index_name1111/_search
{
"query": {
"bool": {
"must": [{
"term": {
"CategoryList.categoryId": {
"value": 2532
}
}
}]
}
},
"aggs": {
"count_is": {
"terms": {
"field": "CategoryList.categoryId",
"size": 10
}
}
}
}
Response of the above query:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"count_is": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2532,
"doc_count": 2
}
]
}
}
}
Or you can drop the filter entirely; running just the aggregation will return all categoryId values with their counts of appearance.
POST index_name1111/_search
{
"size": 0,
"aggs": {
"count_is": {
"terms": {
"field": "CategoryList.categoryId",
"size": 10
}
}
}
}
Response of the above query:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"count_is": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2532,
"doc_count": 2
},
{
"key": 1234,
"doc_count": 1
}
]
}
}
}
Using a cardinality aggregation, you will get the following response with the following query:
POST index_name1111/_search
{
"size": 0,
"query": {
"bool": {
"must": [{
"term": {
"CategoryList.categoryId": {
"value": 2532
}
}
}]
}
},
"aggs": {
"id_count": {
"cardinality": {
"field": "CategoryList.categoryId"
}
}
}
}
Response of the above query, which doesn't give the desired result: two documents matched, both with categoryId 2532, so the count of distinct values is 1.
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"id_count": {
"value": 1
}
}
}
Hope this helps
Thanks

In elasticsearch, how to group by value inside nested array

Say, I have following documents:
1st doc:
{
productName: "product1",
tags: [
{
"name":"key1",
"value":"value1"
},
{
"name":"key2",
"value":"value2"
}
]
}
2nd doc:
{
productName: "product2",
tags: [
{
"name":"key1",
"value":"value1"
},
{
"name":"key2",
"value":"value3"
}
]
}
I know if I want to group by productName, I could use a terms aggregation
"terms": {
"field": "productName"
}
which will give me two buckets with two different keys "product1", "product2".
However, what should the query be if I would like to group by tag key? I.e., if I group by the tag with name==key1, I expect one bucket with key "value1"; whereas if I group by the tag with name==key2, I expect two buckets with keys "value2" and "value3".
What should the query look like if I would like to group by the 'value' inside a nested array, but not group by the 'key'? Any suggestions?
It sounds like a nested terms aggregation is what you're looking for.
With the two documents you posted, this query:
POST /test_index/_search
{
"size": 0,
"aggs": {
"product_name_terms": {
"terms": {
"field": "product_name"
}
},
"nested_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"tags_name_terms": {
"terms": {
"field": "tags.name"
}
},
"tags_value_terms": {
"terms": {
"field": "tags.value"
}
}
}
}
}
}
returns this:
{
"took": 67,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"product_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"nested_tags": {
"doc_count": 4,
"tags_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "key1",
"doc_count": 2
},
{
"key": "key2",
"doc_count": 2
}
]
},
"tags_value_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "value1",
"doc_count": 2
},
{
"key": "value2",
"doc_count": 1
},
{
"key": "value3",
"doc_count": 1
}
]
}
}
}
}
Here is some code I used to test it:
http://sense.qbox.io/gist/a9a172f41dbd520d5e61063a9686055681110522
EDIT: Filter by Nested Value
As per your comment, if you want to filter the nested results by a nested value, you can add another "layer" of aggregation using the filter aggregation, as follows:
POST /test_index/_search
{
"size": 0,
"aggs": {
"nested_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"filter_tag_name": {
"filter": {
"term": {
"tags.name": "key1"
}
},
"aggs": {
"tags_name_terms": {
"terms": {
"field": "tags.name"
}
},
"tags_value_terms": {
"terms": {
"field": "tags.value"
}
}
}
}
}
}
}
}
which returns:
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_tags": {
"doc_count": 4,
"filter_tag_name": {
"doc_count": 2,
"tags_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "key1",
"doc_count": 2
}
]
},
"tags_value_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "value1",
"doc_count": 2
}
]
}
}
}
}
}
Here's the updated code:
http://sense.qbox.io/gist/507c3aabf36b8f6ed8bb076c8c1b8552097c5458

query for elasticsearch returning count

I am struggling to create the query/rule that will help me create an alerting script. I want to query the elasticsearch API for counts on a specific index so that I can get alerted when the count reaches a certain threshold.
The following query is an attempt as I have no experience with this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"analyze_wildcard": true,
"query": "*"
}
},
"filter": {
"bool": {
"must": [
{
"query": {
"match": {
"PStream": {
"query": "*",
"type": "phrase"
}
}
}
},
{
"range": {
"#timestamp": {
"gte": 1447789445320,
"lte": 1447793045320
}
}
}
],
"must_not": []
}
}
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
},
"size": 500,
"sort": [
{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}
],
"aggs": {
"2": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"pre_zone": "-05:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 0,
"extended_bounds": {
"min": 1447789445317,
"max": 1447793045317
}
}
}
}
}
The field PStream is the field that I am focused on.
EDIT:
An example of the data going to the index:
{
"_index": "logstash-2015.11.17",
"_type": "logs",
"_id": "AVEXMKu2YVnF1NOjr9YT",
"_score": null,
"_source": {
"authorUrl": "",
"postUrl": "",
"pubDate": "2015-11-17T15:18:24",
"scrapeDate": "2015-11-17T15:44:03",
"clientId": "136902834",
"query": "Jenny Balatsinou",
"PType": "post",
"tLatency": 1539,
"PLang": "en",
"PStream": "864321",
"PName": "xStackOverflow",
"#version": "1",
"#timestamp": "2015-11-17T20:44:03.400Z"
},
"fields": {
"#timestamp": [
1447793043400
],
"pubDate": [
1447773504000
],
"scrapeDate": [
1447775043000
]
},
"sort": [
1447793043400
]
There are about 20 million of these messages getting indexed daily into Elasticsearch. I have created a dashboard in Kibana where I view this data and stats. I would like to write the proper query that I can use in a Java program that periodically runs and checks this index. It should return the hourly total count grouped by the PStream variable, which has multiple values, so that any time a count is 0 it can send an alert.
E.g. output:
"result": {
"total": 74,
"successful": 63,
"failed": 11,
{
{
"index": "logstash-2015.11.08",
"PStream": "37647338933",
"Count": 1234532
},
{
"index": "logstash-2015.11.08",
"PStream": "45345343566",
"Count": 156532
},
As a quick example (per comments above), I just set up a trivial index:
DELETE /test_index
PUT /test_index
added some (simplified) data:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":2}}
{"PStream": "864321","#timestamp": "2015-11-17T21:44:03.400Z"}
{"index":{"_id":3}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":4}}
{"PStream": "864322","#timestamp": "2015-11-17T21:44:03.400Z"}
And now I can get the "PStream" terms inside an hour histogram:
POST /test_index/_search
{
"size": 0,
"aggs" : {
"timestamp_histogram" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
}
}
}
}
}
}
...
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 1
},
{
"key": "864322",
"doc_count": 1
}
]
}
}
]
}
}
}
or the other way around:
POST /test_index/_search
{
"size": 0,
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
},
"aggs": {
"timestamp_histogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "hour"
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 3,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
},
{
"key": "864322",
"doc_count": 1,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/6c0c30db1cf0fb8529bcfec21c0ce5c02a5ae94c
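For the alerting part of the question (flag a PStream whose hourly count drops to 0), one option is min_doc_count: 0 on the terms aggregation, which also returns empty buckets for PStream values that match nothing in the queried window. A sketch against the same test index, restricted to the last hour (my addition, untested):
POST /test_index/_search
{
  "size": 0,
  "query": {
    "range": {
      "#timestamp": { "gte": "now-1h" }
    }
  },
  "aggs": {
    "pstream_terms": {
      "terms": {
        "field": "PStream",
        "min_doc_count": 0
      }
    }
  }
}
Any bucket that comes back with a doc_count of 0 is a stream that went silent during that hour.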
