Multiple-field aggregation on documents with multiple elements gives unexpected result - Elasticsearch

I have documents with the following structure (very much simplified for the example):
"documents": [
{
"name": "Document 1",
"collections" : [
{
"id": 30,
"title" : "Research"
},
{
"id": 45,
"title" : "Events"
},
{
"id" : 52,
"title" : "International"
}
]
},
{
"name": "Document 2",
"collections" : [
{
"id": 45,
"title" : "Events"
},
{
"id" : 63,
"title" : "Development"
}
]
}
]
I want an aggregation of the collections. It works fine when I do it like this:
"aggs": {
"collections": {
"terms": {
"field": "collections.title",
"size": 30
}
}
}
I get a nice result as expected:
"aggregations" : {
"collections" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Research",
"doc_count" : 18
},
{
"key" : "Events",
"doc_count" : 14
},
{
"key" : "International",
"doc_count" : 13
},
{
"key" : "Development",
"doc_count" : 8
}
]
}
}
However, I want the id included as well. So I tried this:
"aggs": {
"collections": {
"terms": {
"field": "collections.title",
"size": 30
}
},
"aggs": {
"id": {
"terms": {
"field": "collections.id",
"size": 1
}
}
}
}
This is the result:
"aggregations" : {
"collections" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Research",
"doc_count" : 18,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "30",
"doc_count" : 1
}
]
}
},
{
"key" : "Events",
"doc_count" : 14,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "45",
"doc_count" : 1
}
]
}
},
{
"key" : "International",
"doc_count" : 13,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "52",
"doc_count" : 1
}
]
}
},
{
"key" : "Development",
"doc_count" : 8,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "45",
"doc_count" : 1
}
]
}
}
]
}
}
At first glance it looks good. But take a closer look at the last element, Development (scroll down): the id should be 63, but it is 45.
I have a vague idea why this happens, but I cannot find a solution for it. I also tried multi_terms, but it gives a similar result. I think the issue has to do with the fact that there are multiple collections within the document.
Does anyone know the correct way to solve this?

The reason is that in an object type mapping there is no relation between "title" and "id"; everything is flattened by Elasticsearch under the hood, so:
"collections" : [
{
"id": 30,
"title" : "Research"
},
{
"id": 45,
"title" : "Events"
},
{
"id" : 52,
"title" : "International"
}
]
Becomes:
"collections.id": [30,45,52],
"collections.title": [Research, Events, International]
Elasticsearch doesn't know that id 30 belongs to Research, or that id 45 belongs to Events.
You must use the "nested" field type to keep the relation between the properties of each object.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Solution: Use nested field type
Mappings
PUT test_nestedaggs
{
"mappings": {
"properties": {
"name": {
"type": "text"
},
"collections": {
"type": "nested",
"properties": {
"title": {
"type": "keyword"
},
"id": {
"type": "keyword"
}
}
}
}
}
}
Documents
POST test_nestedaggs/_doc
{
"name": "Document 1",
"collections": [
{
"id": 30,
"title": "Research"
},
{
"id": 45,
"title": "Events"
},
{
"id": 52,
"title": "International"
}
]
}
POST test_nestedaggs/_doc
{
"name": "Document 2",
"collections": [
{
"id": 45,
"title": "Events"
},
{
"id": 63,
"title": "Development"
}
]
}
Query
POST test_nestedaggs/_search?size=0
{
"aggs": {
"nested_collections": {
"nested": {
"path": "collections"
},
"aggs": {
"collections": {
"terms": {
"field": "collections.title"
},
"aggs": {
"ids": {
"terms": {
"field": "collections.id"
}
}
}
}
}
}
}
}
Results
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"nested_collections": {
"doc_count": 5,
"collections": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Events",
"doc_count": 2,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "45",
"doc_count": 2
}
]
}
},
{
"key": "Development",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "63",
"doc_count": 1
}
]
}
},
{
"key": "International",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "52",
"doc_count": 1
}
]
}
},
{
"key": "Research",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "30",
"doc_count": 1
}
]
}
}
]
}
}
}
}
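As an aside, the multi_terms attempt from the question also works once it runs inside the nested aggregation, because the title/id pairing is preserved per nested document. A sketch (assuming Elasticsearch 7.12+, where multi_terms is available):
POST test_nestedaggs/_search?size=0
{
  "aggs": {
    "nested_collections": {
      "nested": {
        "path": "collections"
      },
      "aggs": {
        "collections": {
          "multi_terms": {
            "terms": [
              { "field": "collections.title" },
              { "field": "collections.id" }
            ]
          }
        }
      }
    }
  }
}
Each bucket key then carries both values (e.g. ["Events", "45"]), so no sub-aggregation is needed.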
You can read an article I wrote about that for details:
https://opster.com/guides/elasticsearch/data-architecture/elasticsearch-nested-field-object-field/
NOTE: If the number of child documents is too big and you are doing a lot of updates, consider changing the data model: each child document is an independent document in the index, so every update to a child document reindexes the whole structure, which may affect performance. There are also limits on the maximum number of nested documents you can add. If the number is small, as in the example, then it's fine.
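For reference, that cap is the index.mapping.nested_objects.limit setting (default 10000), which bounds how many nested JSON objects a single document may contain across all of its nested fields. A minimal sketch of setting it explicitly at index-creation time (the index name here is hypothetical):
PUT test_nestedaggs_big
{
  "settings": {
    "index.mapping.nested_objects.limit": 10000
  },
  "mappings": {
    "properties": {
      "collections": {
        "type": "nested"
      }
    }
  }
}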

Related

Is this aggregation possible in Elasticsearch?

I have a question about aggregation.
I want to aggregate on a field declared as an object array: not an aggregation per element, but an aggregation over the whole array value.
I have following documents:
PUT value-list-index
{
"mappings": {
"properties": {
"server": {
"type": "keyword"
},
"users": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
}
PUT value-list-index/_doc/1
{
"server": "server1",
"users": ["user1"]
}
PUT value-list-index/_doc/2
{
"server": "server2",
"users": ["user1","user2"]
}
PUT value-list-index/_doc/3
{
"server": "server3",
"users": ["user2", "user3"]
}
PUT value-list-index/_doc/4
{
"server": "server4",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/5
{
"server": "server5",
"users": ["user2", "user3","user4"]
}
PUT value-list-index/_doc/6
{
"server": "server6",
"users": ["user3","user4"]
}
PUT value-list-index/_doc/7
{
"server": "server7",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/8
{
"server": "server8",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/9
{
"server": "server9",
"users": ["user1","user2", "user3","user4"]
}
GET value-list-index/_search
{
"size" : 0,
"aggs": {
"words": {
"terms": {
"field": "users"
},
"aggs": {
"total": {
"value_count": {
"field": "users"
}
}
}
}
}
}
I want the following:
"aggregations" : {
"words" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
**"key" : "user1",
"doc_count" : 1,**
"total" : {
"value" : xx
}
},
{
**"key" : "user1","user2",
"doc_count" : 1,**
"total" : {
"value" : xx
}
},
{
"key" : "user1","user2","user3","user4",
"doc_count" : 4,
"total" : {
"value" : xx
}
}
]
}
}
but it returns a per-element grouping result like this:
"aggregations" : {
"words" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "user2",
"doc_count" : 7,
"total" : {
"value" : 23
}
},
{
"key" : "user3",
"doc_count" : 7,
"total" : {
"value" : 23
}
},
{
"key" : "user1",
"doc_count" : 6,
"total" : {
"value" : 19
}
},
{
"key" : "user4",
"doc_count" : 6,
"total" : {
"value" : 21
}
}
]
}
}
Is the aggregation I want possible?
Maybe this aggregation can help you: the frequent items aggregation.
But be careful with the performance.
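A sketch of such a query (this assumes a recent release where the aggregation is named frequent_item_sets; it first shipped as frequent_items in 8.4's technical preview. The minimum_set_size and minimum_support values are assumptions chosen to surface the combinations shown below):
GET value-list-index/_search
{
  "size": 0,
  "aggs": {
    "words": {
      "frequent_item_sets": {
        "minimum_set_size": 1,
        "minimum_support": 0.4,
        "fields": [
          { "field": "users" }
        ]
      }
    }
  }
}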
Look at these results:
"aggregations": {
"words": {
"buckets": [
{
"key": {
"users": [
"user2"
]
},
"doc_count": 7,
"support": 0.7777777777777778
},
{
"key": {
"users": [
"user2",
"user3"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user3",
"user4"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user1"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user2",
"user3",
"user4"
]
},
"doc_count": 5,
"support": 0.5555555555555556
},
{
"key": {
"users": [
"user2",
"user1"
]
},
"doc_count": 5,
"support": 0.5555555555555556
},
{
"key": {
"users": [
"user2",
"user3",
"user4",
"user1"
]
},
"doc_count": 4,
"support": 0.4444444444444444
}
]
}
}

Elasticsearch bucketing and add-to-list

How do you bucket on a field and then aggregate all the values of a different field into an array? Here's a sample list.
{
"product": "xyz",
"action": "add",
"user": "bob"
},
{
"product": "xyz",
"action": "update",
"user": "bob"
},
{
"product": "xyz",
"action": "add",
"user": "alice"
},
{
"product": "xyz",
"action": "add",
"user": "eve"
},
{
"product": "xyz",
"action": "delete",
"user": "eve"
}
Expected output:
{
"buckets": [
{
"key": "add",
"doc_count": 3,
"user": ["bob", "alice", "eve"]
},
{
"key": "update",
"doc_count": 1,
"user": ["bob"]
},
{
"key": "delete",
"doc_count": 1,
"user": ["eve"]
}
]
}
How can I push user values to an array in each bucket? Is there something similar to MongoDB's $push or $addToSet in Elasticsearch aggregations? Appreciate the help.
Here's the work-in-progress aggregation.
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
}
}
}
}
}
}
Would this do? I just chained one more terms aggregation, as mentioned below:
Aggregation Query:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
},
"aggs": {
"myUsers": {
"terms": {
"field": "user",
"size": 10
}
}
}
}
}
}
}
}
Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"product_filter" : {
"doc_count" : 5,
"group_by_action" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "add",
"doc_count" : 3,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "alice",
"doc_count" : 1
},
{
"key" : "bob",
"doc_count" : 1
},
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "delete",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "update",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "bob",
"doc_count" : 1
}
]
}
}
]
}
}
}
}
I'm not sure if it is possible to have them in a single list as you've mentioned.
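That said, if a flat list per bucket is a hard requirement, one workaround is to fetch the raw documents per bucket with a top_hits sub-aggregation and collect the user values client-side. A sketch (the size values here are assumptions):
POST <your_index_name>/_search
{
  "size": 0,
  "aggs": {
    "product_filter": {
      "filter": {
        "term": {
          "product": "xyz"
        }
      },
      "aggs": {
        "group_by_action": {
          "terms": {
            "field": "action",
            "size": 1000
          },
          "aggs": {
            "users": {
              "top_hits": {
                "_source": ["user"],
                "size": 100
              }
            }
          }
        }
      }
    }
  }
}
Each bucket then carries hits whose _source contains only the user field, which maps directly to the desired array on the client.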
Hope this helps!

Fetch all-time date_histogram buckets results

I have the below query to fetch aggregations using Elasticsearch 7.1.
{
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"match": {
"viewedInFeed": true
}
}
]
}
}
]
}
},
"size": 0,
"aggs": {
"viewed_in_feed_by_day": {
"date_histogram": {
"field": "createdDate",
"interval" : "day",
"format" : "yyyy-MM-dd",
"min_doc_count": 1
}
}
}
}
The results are greater than 10,000 and I am not sure how to proceed, since scroll is not available for aggregations. See the response below.
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"viewed_in_feed_by_day": {
"buckets": [
{
"key_as_string": "2020-03-19",
"key": 1584576000000,
"doc_count": 3028
},
{
"key_as_string": "2020-03-20",
"key": 1584662400000,
"doc_count": 5384
},
{
"key_as_string": "2020-03-21",
"key": 1584748800000,
"doc_count": 3521
}
]
}
}
}
When using _count, the count of documents is greater than 10,000, and even without "min_doc_count": 1 the query doesn't return more results; I know there is more data anyway.
Building on top of Jaspreet's comments, I suggest the following:
Use track_total_hits=true to get the exact counts (available since 7.0) while keeping size=0 to only aggregate.
Use the stats aggregation to gain more insight before running your histograms.
GET dates/_search
{
"track_total_hits": true,
"size": 0,
"aggs": {
"dates_insights": {
"stats": {
"field": "createdDate"
}
},
"viewed_in_feed_by_day": {
"date_histogram": {
"field": "createdDate",
"interval" : "month",
"format" : "yyyy-MM-dd",
"min_doc_count": 1
}
}
}
}
yielding
...
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"viewed_in_feed_by_day" : {
"buckets" : [
{
"key_as_string" : "2020-01-01",
"key" : 1577836800000,
"doc_count" : 1
},
{
"key_as_string" : "2020-02-01",
"key" : 1580515200000,
"doc_count" : 1
},
{
"key_as_string" : "2020-03-01",
"key" : 1583020800000,
"doc_count" : 1
}
]
},
"dates_insights" : {
"count" : 3,
...
"min_as_string" : "2020-01-22T13:09:21.588Z",
"max_as_string" : "2020-03-22T13:09:21.588Z",
...
}
}
...
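On the bucket side, note that a single response's bucket count is capped by search.max_buckets (default 10000 in 7.x). If you ever need to page through more buckets than that, the composite aggregation supports cursor-style pagination via after. A sketch against the same field (using the 7.1-era interval parameter; the index name follows the example above):
GET dates/_search
{
  "size": 0,
  "aggs": {
    "viewed_in_feed_by_day": {
      "composite": {
        "size": 1000,
        "sources": [
          {
            "day": {
              "date_histogram": {
                "field": "createdDate",
                "interval": "1d",
                "format": "yyyy-MM-dd"
              }
            }
          }
        ]
      }
    }
  }
}
Each response includes an after_key, which you pass back as "after" in the next request to fetch the next page of buckets.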

Elasticsearch aggregation by nested object

I'm trying to build a product search with facet filtering for an eCommerce app. For the product brand I have the following structure:
"brand": {
"type": "nested",
"properties": {
"name": {
"type": "text"
},
"id": {
"type": "integer"
}
}
}
I want to make an aggregation by brand id and return the whole object and the count of the documents. Something like this:
"brands" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : {
"name": "Apple",
"id": 1
},
"doc_count" : 34
},
{
"key" : {
"name": "Samsung",
"id": 2
},
"doc_count" : 23
}
]
}
Currently I'm writing the aggregation like this:
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
}
}
}
}
}
and the result looks like this:
"aggregations" : {
"brands" : {
"doc_count" : 15,
"brandIds" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 4
},
{
"key" : 2,
"doc_count" : 2
}
]
}
}
}
You can use a terms aggregation within another terms aggregation, like this:
GET {index_name}/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
},
"aggs": {
"by name": {
"terms": {
"field": "brand.name.keyword",
"size": 10
}
}
}
}
}
}
}
}
This would result in something like this:
"aggregations": {
"brands": {
"doc_count": 68,
"brandIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 46,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Apple",
"doc_count": 46
}
]
}
},
{
"key": 2,
"doc_count": 22,
"ny id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Samsung",
"doc_count": 22
}
]
}
}
]
}
}
}
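On more recent versions (7.12+), a multi_terms aggregation inside the nested context is another option; it puts both id and name into a single bucket key, closer to the desired output. A sketch (reusing the brand.name.keyword subfield assumed by the query above):
GET {index_name}/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "nested": {
        "path": "brand"
      },
      "aggs": {
        "brandIdName": {
          "multi_terms": {
            "terms": [
              { "field": "brand.id" },
              { "field": "brand.name.keyword" }
            ]
          }
        }
      }
    }
  }
}
Each bucket key is then an array like [1, "Apple"], with the doc_count alongside it.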
Hope this helps!!

Condition on `terms` aggregation

I would like to put a condition on the aggregation, in other words, filter data based on aggregated data.
Currently, I have this query:
GET sense/_search
{
"size": 0,
"aggs": {
"dates": {
"date_histogram": {
"field": "#timestamp",
"interval": "1d",
"format": "yyyy-MM-dd",
"offset": "+4h"
},
"aggs": {
"unique_sessions": {
"terms": {
"field": "sessionId"
}
}
}
}
}
}
which returns this kind of data
{
"aggregations" : {
"dates" : {
"buckets" : [
{
"key_as_string" : "2019-03-31",
"key" : 1554004800000,
"doc_count" : 14,
"unique_sessions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "83e1c3a4-341c-4ac3-a81e-f00336ee1dfb",
"doc_count" : 3
},
{
"key" : "99c4d312-2477-4bf7-ad02-ef76f50443f9",
"doc_count" : 3
},
{
"key" : "425b840f-9604-4f1d-ab18-96a9a7ae44e0",
"doc_count" : 1
},
{
"key" : "580b1f6c-6256-4f38-9803-2cc79a0a63d7",
"doc_count" : 2
},
{
"key" : "8929d75d-153c-4b66-8dd7-2eacb7974b95",
"doc_count" : 1
},
{
"key" : "8da5d732-d1e7-4a63-8f02-2b84a8bdcb62",
"doc_count" : 2
}
]
}
},
{
"key_as_string" : "2019-04-01",
"key" : 1554091200000,
"doc_count" : 1,
"unique_sessions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "513d4532-304d-44c7-bdc7-398795800383",
"doc_count" : 1
},
{
"key" : "8da5d732-d1e7-4a63-8f02-2791poc34gq1",
"doc_count" : 2
}
]
}
}
]
}
}
}
So I would like to retrieve the count of unique sessionIds where doc_count equals 1.
That means I expect the date histogram bucket with key "2019-03-31" to show 2 (because the unique_sessions aggregation has only two sessions with doc_count equal to one in its buckets) and, accordingly, "2019-04-01" to show 1.
I have no clue how to realize this aggregation.
You would need to make use of Bucket Selector Aggregation on the terms aggregation that you have.
Below is how your query would appear:
Sample Query
POST <your_index_name>/_search
{
"size":0,
"aggs":{
"dates":{
"date_histogram":{
"field":"#timestamp",
"interval":"1d",
"format":"yyyy-MM-dd",
"offset":"+4h"
},
"aggs":{
"unique_sessions":{
"terms":{
"field":"sessionId"
},
"aggs":{
"unique_buckets":{
"bucket_selector":{
"buckets_path":{
"count":"_count"
},
"script":"params.count==1"
}
}
}
}
}
}
}
}
Note that you'd end up with empty buckets in that situation, as shown in the response below.
Sample Response
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 9,
"max_score": 0,
"hits": []
},
"aggregations": {
"dates": {
"buckets": [
{
"key_as_string": "2018-12-31",
"key": 1546228800000,
"doc_count": 3,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "83e1c3a4-3AFA1c-4ac3-a81e-f00336ee1dfb",
"doc_count": 1
}
]
}
},
{
"key_as_string": "2019-01-01",
"key": 1546315200000,
"doc_count": 0,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key_as_string": "2019-01-02",
"key": 1546401600000,
"doc_count": 3,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key_as_string": "2019-01-03",
"key": 1546488000000,
"doc_count": 3,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "83e1c3a4-3AFA1c-4ab3-a81e-f00336ee1dfb",
"doc_count": 1
}
]
}
}
]
}
}
}
In that case, if you want to filter the buckets to only show the parent buckets whose child buckets have count==1, just make use of the below query, where I've added another bucket selector clause.
Note carefully the structure of the query.
Refined Query Solution:
POST <your_index_name>/_search
{
"size":0,
"aggs":{
"dates":{
"date_histogram":{
"field":"#timestamp",
"interval":"1d",
"format":"yyyy-MM-dd",
"offset":"+4h"
},
"aggs":{
"unique_sessions":{
"terms":{
"field":"sessionId"
},
"aggs":{
"unique_buckets":{
"bucket_selector":{
"buckets_path":{
"count":"_count"
},
"script":"params.count==1"
}
}
}
},
"terms_bucket_clause": {
"bucket_selector": {
"buckets_path": {
"count": "unique_sessions._bucket_count"
},
"script": "params.count>0"
}
}
}
}
}
}
Refined Query Response
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 9,
"max_score": 0,
"hits": []
},
"aggregations": {
"dates": {
"buckets": [
{
"key_as_string": "2018-12-31",
"key": 1546228800000,
"doc_count": 3,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "83e1c3a4-3AFA1c-4ac3-a81e-f00336ee1dfb",
"doc_count": 1
}
]
}
},
{
"key_as_string": "2019-01-03",
"key": 1546488000000,
"doc_count": 3,
"unique_sessions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "83e1c3a4-3AFA1c-4ab3-a81e-f00336ee1dfb",
"doc_count": 1
}
]
}
}
]
}
}
}
Do note the difference in the results between the two queries. Hope this helps!
