Get count distinct by nested objects from Elasticsearch - elasticsearch

I have an index with the following mapping:
{
"mappings": {
"properties": {
"typed_obj": {
"type": "nested",
"properties": {
"id": {"type": "keyword"},
"type": {"type": "keyword"}
}
}
}
}
}
and these documents:
{"index" : {}}
{"typed_obj": [{"id": "1", "type": "one"}, {"id": "2", "type": "two"}]}
{"index" : {}}
{"typed_obj": [{"id": "1", "type": "one"}, {"id": "2", "type": "one"}]}
{"index" : {}}
{"typed_obj": [{"id": "1", "type": "one"}, {"id": "3", "type": "one"}]}
{"index" : {}}
{"typed_obj": [{"id": "1", "type": "one"}, {"id": "4", "type": "two"}]}
How can I group typed_obj by type and count the unique ids within each type?
Something like:
{
"type": "one",
"count": 3
},
{
"type": "two",
"count": 2
}
I came up with this query, which uses a multi_terms aggregation:
{
"query": {
"match_all": {}
},
"aggs": {
"obj_nested": {
"nested": {
"path": "typed_obj"
},
"aggs": {
"by_type_and_id": {
"multi_terms": {
"terms": [
{
"field": "typed_obj.type"
},
{
"field": "typed_obj.id"
}
]
}
}
}
}
},
"size": 0
}
and it returns
"buckets": [
{
"key": [
"one",
"1"
],
"key_as_string": "one|1",
"doc_count": 4
},
{
"key": [
"one",
"2"
],
"key_as_string": "one|2",
"doc_count": 1
},
{
"key": [
"one",
"3"
],
"key_as_string": "one|3",
"doc_count": 1
},
{
"key": [
"two",
"2"
],
"key_as_string": "two|2",
"doc_count": 1
},
{
"key": [
"two",
"4"
],
"key_as_string": "two|4",
"doc_count": 1
}
]
In my backend app I can group the keys by their first element (the typed_obj type) and then take the length of each group, but my question is: is it possible to get the per-type counts without fetching every id+type pair from the index?

You need to use Cardinality aggregation to count distinct values.
Query:
{
"query": {
"match_all": {}
},
"aggs": {
"obj_nested": {
"nested": {
"path": "typed_obj"
},
"aggs": {
"type":{
"terms": {
"field": "typed_obj.type",
"size": 10
},
"aggs": {
"id": {
"cardinality": {
"field": "typed_obj.id"
}
}
}
}
}
}
},
"size": 0
}
Response
"aggregations" : {
"obj_nested" : {
"doc_count" : 8,
"type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "one",
"doc_count" : 6,
"id" : {
"value" : 3
}
},
{
"key" : "two",
"doc_count" : 2,
"id" : {
"value" : 2
}
}
]
}
}
}
Note:
The cardinality aggregation is a single-value metrics aggregation that calculates an approximate
count of distinct values, so the result may not be exact for high-cardinality fields.

Related

Is this Elasticsearch aggregation possible?

I have a question about aggregation.
I want to aggregate on a field that holds an array of values.
I do not want an aggregation per element, but an aggregation over the whole array value.
I have the following index and documents:
PUT value-list-index
{
"mappings": {
"properties": {
"server": {
"type": "keyword"
},
"users": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
}
PUT value-list-index/_doc/1
{
"server": "server1",
"users": ["user1"]
}
PUT value-list-index/_doc/2
{
"server": "server2",
"users": ["user1","user2"]
}
PUT value-list-index/_doc/3
{
"server": "server3",
"users": ["user2", "user3"]
}
PUT value-list-index/_doc/4
{
"server": "server4",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/5
{
"server": "server5",
"users": ["user2", "user3","user4"]
}
PUT value-list-index/_doc/6
{
"server": "server6",
"users": ["user3","user4"]
}
PUT value-list-index/_doc/7
{
"server": "server7",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/8
{
"server": "server8",
"users": ["user1","user2", "user3","user4"]
}
PUT value-list-index/_doc/9
{
"server": "server9",
"users": ["user1","user2", "user3","user4"]
}
get value-list-index/_search
{
"size" : 0,
"aggs": {
"words": {
"terms": {
"field": "users"
},
"aggs": {
"total": {
"value_count": {
"field": "users"
}
}
}
}
}
}
I want the following result:
"aggregations" : {
"words" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "user1",
"doc_count" : 1,
"total" : {
"value" : xx
}
},
{
"key" : "user1","user2",
"doc_count" : 1,
"total" : {
"value" : xx
}
},
{
"key" : "user1","user2","user3","user4",
"doc_count" : 4,
"total" : {
"value" : xx
}
}
]
}
}
but it returns one bucket per element instead, like this:
"aggregations" : {
"words" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "user2",
"doc_count" : 7,
"total" : {
"value" : 23
}
},
{
"key" : "user3",
"doc_count" : 7,
"total" : {
"value" : 23
}
},
{
"key" : "user1",
"doc_count" : 6,
"total" : {
"value" : 19
}
},
{
"key" : "user4",
"doc_count" : 6,
"total" : {
"value" : 21
}
}
]
}
}
Is the aggregation I want possible?
Maybe this aggregation can help you: the Frequent items aggregation.
But be careful with its performance.
Look at these results:
"aggregations": {
"words": {
"buckets": [
{
"key": {
"users": [
"user2"
]
},
"doc_count": 7,
"support": 0.7777777777777778
},
{
"key": {
"users": [
"user2",
"user3"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user3",
"user4"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user1"
]
},
"doc_count": 6,
"support": 0.6666666666666666
},
{
"key": {
"users": [
"user2",
"user3",
"user4"
]
},
"doc_count": 5,
"support": 0.5555555555555556
},
{
"key": {
"users": [
"user2",
"user1"
]
},
"doc_count": 5,
"support": 0.5555555555555556
},
{
"key": {
"users": [
"user2",
"user3",
"user4",
"user1"
]
},
"doc_count": 4,
"support": 0.4444444444444444
}
]
}
}

How to group documents in Elasticsearch and get the documents in each group?

My Elasticsearch index contains products with a denormalized m:n relationship to categories.
My goal is to derive a categories index from it which contains the same information, but with the relationship inverted.
The index looks like this:
PUT /products
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"article_id": {
"type": "keyword"
},
"categories": {
"type": "nested",
"properties": {
"cat_name": {
"type": "keyword"
}
}
}
}
}
}
containing documents created like this:
POST /products/_doc
{
"name": "radio",
"article_id": "1001",
"categories": [
{ "cat_name": "audio" },
{ "cat_name": "electronics" }
]
}
POST /products/_doc
{
"name": "fridge",
"article_id": "1002",
"categories": [
{ "cat_name": "appliances" },
{ "cat_name": "electronics" }
]
}
I would like to get something like this back from Elasticsearch:
{
"name": "appliances",
"products": [
{
"name": "fridge",
"article_id": "1002"
}
]
},
{
"name": "audio",
"products": [
{
"name": "radio",
"article_id": "1001"
}
]
},
{
"name": "electronics",
"products": [
{
"name": "fridge",
"article_id": "1002"
},
{
"name": "radio",
"article_id": "1001"
}
]
}
which would eventually be put into an index such as:
PUT /categories
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"products": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"article_id": {
"type": "keyword"
}
}
}
}
}
}
I cannot figure out how to do this without loading and grouping all products programmatically.
Here's what I have tried:
Bucket aggregation on field categories.cat_name
This gives me the document count per category but not the product documents. Using top_hits sub-aggregation seems to be limited to 100 documents.
Group using collapse field with expansion
Collapsing is only possible on a single-valued field.
I'm using Elasticsearch 8.1.
The query you need is this one:
POST products/_search
{
"size": 0,
"aggs": {
"cats": {
"nested": {
"path": "categories"
},
"aggs": {
"categories": {
"terms": {
"field": "categories.cat_name",
"size": 10
},
"aggs": {
"root": {
"reverse_nested": {},
"aggs": {
"products": {
"terms": {
"field": "name",
"size": 10
}
}
}
}
}
}
}
}
}
}
Which produces exactly what you need (less the article id, but that's easy):
"buckets" : [
{
"key" : "electronics",
"doc_count" : 2,
"root" : {
"doc_count" : 2,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "fridge",
"doc_count" : 1
},
{
"key" : "radio",
"doc_count" : 1
}
]
}
}
},
{
"key" : "appliances",
"doc_count" : 1,
"root" : {
"doc_count" : 1,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "fridge",
"doc_count" : 1
}
]
}
}
},
{
"key" : "audio",
"doc_count" : 1,
"root" : {
"doc_count" : 1,
"products" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "radio",
"doc_count" : 1
}
]
}
}
}
]

Sort aggregation in Elasticsearch?

I have a use case where I need to get all unique user ids from Elasticsearch, sorted by timestamp.
What I'm currently using is a composite terms aggregation with a sub-aggregation that returns the latest timestamp.
(I can't sort on the client side, as that slows down the script.)
Sample data in Elasticsearch:
{
"_index": "logstash-2020.10.29",
"_type": "doc",
"_id": "L0Urc3UBttS_uoEtubDk",
"_version": 1,
"_score": null,
"_source": {
"#version": "1",
"#timestamp": "2020-10-29T06:56:00.000Z",
"timestamp_string": "1603954560",
"search_query": "example 3",
"user_uuid": "asdfrghcwehf",
"browsing_url": "https://www.google.com/search?q=example+3",
},
"fields": {
"#timestamp": [
"2020-10-29T06:56:00.000Z"
]
},
"sort": [
1603954560000
]
}
Expected Output:
[
{
"key" : "bjvexyducsls",
"doc_count" : 846,
"1" : {
"value" : 1.603948557E12,
"value_as_string" : "2020-10-29T05:15:57.000Z"
}
},
{
"key" : "lhmsbq2osski",
"doc_count" : 420,
"1" : {
"value" : 1.6039476E12,
"value_as_string" : "2020-10-29T05:00:00.000Z"
}
},
{
"key" : "m2wiaufcbvvi",
"doc_count" : 1,
"1" : {
"value" : 1.603893635E12,
"value_as_string" : "2020-10-28T14:00:35.000Z"
}
},
{
"key" : "rrm3vd5ovqwg",
"doc_count" : 1,
"1" : {
"value" : 1.60389362E12,
"value_as_string" : "2020-10-28T14:00:20.000Z"
}
},
{
"key" : "x42lk4t3frfc",
"doc_count" : 72,
"1" : {
"value" : 1.60389318E12,
"value_as_string" : "2020-10-28T13:53:00.000Z"
}
}
]
Adding a working example with index data, mapping, search query, and search result
Index Mapping:
{
"mappings":{
"properties":{
"user":{
"type":"keyword"
},
"date":{
"type":"date"
}
}
}
}
Index Data:
{
"date": "2015-01-01",
"user": "user1"
}
{
"date": "2014-01-01",
"user": "user2"
}
{
"date": "2015-01-11",
"user": "user3"
}
Search Query:
{
"size": 0,
"aggs": {
"user_id": {
"terms": {
"field": "user",
"order": {
"sort_user": "asc"
}
},
"aggs": {
"sort_user": {
"min": {
"field": "date"
}
}
}
}
}
}
Search Result:
"aggregations": {
"user_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user2",
"doc_count": 1,
"sort_user": {
"value": 1.3885344E12,
"value_as_string": "2014-01-01T00:00:00.000Z"
}
},
{
"key": "user1",
"doc_count": 1,
"sort_user": {
"value": 1.4200704E12,
"value_as_string": "2015-01-01T00:00:00.000Z"
}
},
{
"key": "user3",
"doc_count": 1,
"sort_user": {
"value": 1.4209344E12,
"value_as_string": "2015-01-11T00:00:00.000Z"
}
}
]
}

Elasticsearch aggregation by nested object

I'm trying to build a product search with facet filtering for an eCommerce app. For the product brand I have the following structure:
"brand": {
"type": "nested",
"properties": {
"name": {
"type": "text"
},
"id": {
"type": "integer"
}
}
}
I want to make an aggregation by brand id and return the whole object and the count of the documents. Something like this:
"brands" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : {
"name": "Apple",
"id": 1
},
"doc_count" : 34
},
{
"key" : {
"name": "Samsung",
"id": 2
},
"doc_count" : 23
}
]
}
Currently I'm writing the aggregation like this:
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
}
}
}
},
}
and the result looks like this:
"aggregations" : {
"brands" : {
"doc_count" : 15,
"brandIds" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 4
},
{
"key" : 2,
"doc_count" : 2
}
]
}
}
}
You can use a Terms Aggregation within a Terms Aggregation like this:
GET {index_name}/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
},
"aggs": {
"by name": {
"terms": {
"field": "brand.name.keyword",
"size": 10
}
}
}
}
}
}
}
}
This would result in something like this:
"aggregations": {
"brands": {
"doc_count": 68,
"brandIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 46,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Apple",
"doc_count": 46
}
]
}
},
{
"key": 2,
"doc_count": 22,
"ny id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Samsung",
"doc_count": 22
}
]
}
}
]
}
}
}
Hope this helps!!

How to aggregate sub buckets of each bucket on nested documents

Full sample code:
https://gist.github.com/anonymous/329eaaf5654096c529da
I have a simple, standard product/options mapping like this for a standard ecommerce site:
"mappings": {
"product": {
"properties" : {
"name":
{
"type": "string",
"fields": {
"raw": { "type": "string", "analyzer": "lowercase" }
},
"analyzer": "default"
},
"options" : {
"type": "nested",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"values": {"type": "nested"}
}
},
"price":{"type": "integer"},
"createdAt": {
"type": "date",
"format": "basic_date_time"
}
}
}
}
Please note that one product has multiple options, and each option can have multiple values (e.g. a Shirt with option Color including blue, red; and option Size including M, XL).
Currently, after the query to search for products using multiple conditions, I aggregate the result to get a list of all options and options values in the result set:
"aggregations": {
"options": {
"nested": {
"path": "options"
},
"aggs": {
"options_ids": {
"terms": {
"field": "id"
}
},
"aggs": {
"nested": {
"path": "options.values"
},
"aggs": {
"options_values_ids": {
"terms": {
"field": "options.values.id"
}
}
}
}
}
}
}
Everything works well, except that I get something like this:
"aggregations": {
"options": {
"doc_count": 4,
"options_ids": {
"buckets": [
{
"key": 1,
"doc_count": 2
},
{
"key": 2,
"doc_count": 2
}
]
},
"aggs": {
"doc_count": 7,
"options_values_ids": {
"buckets": [
{
"key": 1,
"doc_count": 2
},
{
"key": 5,
"doc_count": 2
},
{
"key": 2,
"doc_count": 1
},
{
"key": 3,
"doc_count": 1
},
{
"key": 6,
"doc_count": 1
}
]
}
}
}
}
As you can see, there is no way for me to know which option values belong to which options from the result. It will be much better if the available options values can be listed under each option. Is that possible at all?
You would need to nest your aggregations:
"aggregations": {
"options" : {
"aggs" : {
"options_ids" : {
"aggs" : {
"aggs" : {
"options_values_ids" : {
"terms" : {
"field" : "options.values.id"
}
}
},
"nested" : {
"path" : "options.values"
}
},
"terms" : {
"field" : "id"
}
}
},
"nested" : {
"path" : "options"
}
}
}

Resources