Sort Aggregated Buckets From Nested Object Array By Specific Field - elasticsearch

I have indexed documents such as
// doc 1
{
...,
"list": [{
"value": "a",
"order": 1
}, {
"value": "b",
"order": 2
}]
,...
}
// doc 2
{
...,
"list": [{
"value": "b",
"order": 2
}, {
"value": "c",
"order": 3
}]
,...
}
If I use the aggregation on the list.value:
{
"aggs": {
"values": {
"terms": {
"field": "list.value.keyword"
}
}
}
}
I get buckets in order b, a, c:
"buckets" : [
{
"key" : "b",
"doc_count" : 2
},
{
"key" : "a",
"doc_count" : 1
},
{
"key" : "c",
"doc_count" : 1
}
]
since the keys are sorted by _count in descending order by default.
If I use the aggregation on the list.value with sub-aggregation for sorting in form of max(list.order):
{
"aggs": {
"values": {
"terms": {
"field": "list.value.keyword",
"order": { "max_order": "desc" }
},
"aggs": {
"max_order": { "max": { "field": "list.order" } }
}
}
}
}
I get buckets in order b, c, a
"buckets" : [
{
"key" : "b",
"doc_count" : 2,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "c",
"doc_count" : 1,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "a",
"doc_count" : 1,
"max_order" : {
"value" : 2.0
}
}
]
since both b and c occur in a document whose list also contains order 3 — without a nested mapping, the object fields are flattened per document, so max(list.order) is computed over the whole document rather than the matching list entry.
However, I want to write a query to get buckets in order c, b, a as their order is 3, 2, 1 respectively. How to achieve that?

You need to use a nested aggregation to get the buckets in the order c, b, a.
Adding a working example with index data, mapping, search query and search result
Index Mapping
PUT testidx1
{
"mappings":{
"properties": {
"list":{
"type": "nested"
}
}
}
}
Index Data:
POST testidx1/_doc/1
{
"list": [
{
"value": "a",
"order": 1
},
{
"value": "b",
"order": 2
}
]
}
POST testidx1/_doc/2
{
"list": [
{
"value": "b",
"order": 2
},
{
"value": "c",
"order": 3
}
]
}
Search Query:
POST testidx1/_search
{
"size": 0,
"aggs": {
"resellers": {
"nested": {
"path": "list"
},
"aggs": {
"unique_values": {
"terms": {
"field": "list.value.keyword",
"order": {
"max_order": "desc"
}
},
"aggs": {
"max_order": {
"max": {
"field": "list.order"
}
}
}
}
}
}
}
}
Search Response:
"aggregations" : {
"resellers" : {
"doc_count" : 4,
"unique_values" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "c",
"doc_count" : 1,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "b",
"doc_count" : 2,
"max_order" : {
"value" : 2.0
}
},
{
"key" : "a",
"doc_count" : 1,
"max_order" : {
"value" : 1.0
}
}
]
}
}
}
}

Related

bucket aggregation/bucket_script computation

How can I apply a computation using bucket fields via bucket_script? Moreover, I would like to understand how to aggregate on distinct results.
For example, below is a sample query, and the response.
What I am looking for is to aggregate the following into two fields:
sum of all buckets dist.value from e.g. response (1+2=3)
sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
Query
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"field": "value"
}
}
]
}
},
"aggs":{
"sales_summary":{
"terms":{
"field":"qty",
"size":"100"
},
"aggs":{
"dist":{
"cardinality":{
"field":"somekey.keyword"
}
}
}
}
}
}
Query Result:
{
"aggregations": {
"sales_summary": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 10,
"doc_count": 100,
"dist": {
"value": 1
}
},
{
"key": 20,
"doc_count": 200,
"dist": {
"value": 2
}
}
]
}
}
}
You need to use a sum_bucket aggregation, which is a pipeline aggregation that sums the cardinality aggregation's value across all the buckets.
Search Query for sum of all buckets dist.value from e.g. response (1+2=3):
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>dist"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
}
}
]
},
"sum_buckets" : {
"value" : 5.0
}
}
For the second requirement, you first need to compute a modified value in each bucket using a bucket_script aggregation, and then run a sum_bucket aggregation over those modified values.
Search Query for sum of all buckets (dist.value x key) from e.g., response (1x10)+(2x20)=50
POST idxtest1/_search
{
"size": 0,
"aggs": {
"sales_summary": {
"terms": {
"field": "qty",
"size": "100"
},
"aggs": {
"dist": {
"cardinality": {
"field": "pageview"
}
},
"format-value-agg": {
"bucket_script": {
"buckets_path": {
"newValue": "dist"
},
"script": "params.newValue * 10"
}
}
}
},
"sum_buckets": {
"sum_bucket": {
"buckets_path": "sales_summary>format-value-agg"
}
}
}
}
Search Response :
"aggregations" : {
"sales_summary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 3,
"dist" : {
"value" : 2
},
"format-value-agg" : {
"value" : 20.0
}
},
{
"key" : 20,
"doc_count" : 3,
"dist" : {
"value" : 3
},
"format-value-agg" : {
"value" : 30.0
}
}
]
},
"sum_buckets" : {
"value" : 50.0
}
}

Count number of inner elements of array property (Including repeated values)

Given I have the following records.
[
{
"profile": "123",
"inner": [
{
"name": "John"
}
]
},
{
"profile": "456",
"inner": [
{
"name": "John"
},
{
"name": "John"
},
{
"name": "James"
}
]
}
]
I want to get something like:
"aggregations": {
"name": {
"buckets": [
{
"key": "John",
"doc_count": 3
},
{
"key": "James",
"doc_count": 1
}
]
}
}
I'm a beginner using Elasticsearch, and this seems to be a pretty simple operation to do, but I can't find how to achieve this.
If I try a simple terms aggregation, it returns a doc_count of 2 for John, instead of 3.
Example request I'm trying:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
}
}
}
}
How can I possibly achieve this?
Additional Info: It will be used on Kibana later.
I can change mapping to whatever I want, but AFAIK Kibana doesn't like the "Nested" type. :(
You need to do a value_count aggregation, by default terms only does a doc_count, but the value_count aggregation will count the number of times a given field exists.
So, for your purposes:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
},
"aggs": {
"total": {
"value_count": {
"field": "inner.name"
}
}
}
}
}
}
Which returns:
"aggregations" : {
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "John",
"doc_count" : 2,
"total" : {
"value" : 3
}
},
{
"key" : "James",
"doc_count" : 1,
"total" : {
"value" : 2
}
}
]
}
}

ElasticSearch: Query to find max of count of objects based on field value

For the example document below in the index, I want to find the maximum count of actions per component name across all documents in the index. Could you please help me find a way to do this?
Expected result assuming only one document present in the Index:
comp1 -> action1 -> max 2 times
comp1 -> action2 -> max 1 time
comp2 -> action2 -> max 1 time
comp2 -> action3 -> max 1 time
Sample Document:
{
"id": "AC103902:A13A_AC140008:01BB_5FA2E8FA_1C08:0007",
"tokens": [
{
"name": "comp1",
"items": [
{
"action": "action1",
"attr": "value"
},
{
"action": "action1",
"attr": "value"
},
{
"action": "action2",
"attr": "value"
}
]
},
{
"name": "comp2",
"items": [
{
"action": "action2",
"attr": "value"
},
{
"action": "action3",
"attr": "value"
}
]
}
]
}
ElasticSearch Version: 7.9
I can loop through each document and calculate this at the client side, but I am curious to know if there is already an ES query which can produce this kind of summary from the documents in the index.
You'll need to define both the tokens array and the tokens.items array as nested in order to get the correct stats.
Then, assuming your mapping looks something along the lines of
{
"mappings": {
"properties": {
"tokens": {
"type": "nested",
"properties": {
"items": {
"type": "nested"
}
}
}
}
}
}
the following query can be executed:
GET index_name/_search
{
"size": 0,
"aggs": {
"by_token_name": {
"nested": {
"path": "tokens"
},
"aggs": {
"token_name": {
"terms": {
"field": "tokens.name.keyword"
},
"aggs": {
"by_max_actions": {
"nested": {
"path": "tokens.items"
},
"aggs": {
"max_actions": {
"terms": {
"field": "tokens.items.action.keyword"
}
}
}
}
}
}
}
}
}
}
yielding these buckets:
[
{
"key" : "comp1", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 3,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action1", <--
"doc_count" : 2
},
{
"key" : "action2", <--
"doc_count" : 1
}
]
}
}
},
{
"key" : "comp2", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 2,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action2", <--
"doc_count" : 1
},
{
"key" : "action3", <--
"doc_count" : 1
}
]
}
}
}
]
which can be easily post-processed at client side.

ElasticSearch: find multiple unique values in array with complex objects

Suppose there is an index with documents following a structure like:
{
"array": [
{
"field1": 1,
"field2": 2
},
{
"field1": 3,
"field2": 2
},
{
"field1": 3,
"field2": 2
},
...
]
}
Is it possible to define a query that returns documents having multiple unique values for a field?
For the example above, the query searching on field2 would not return the document because all have the same value, but searching on field1 would return it because it has values 1 and 3.
The only thing I can think of is to store the unique values in the parent object and then query for its length, but, as it seems trivial, I'd hope to solve it without having to change the structure to something like:
{
"arrayField1Values" : [1, 3],
"arrayField2Values" : [2]
"array": [
{
"field1": 1,
"field2": 2
},
{
"field1": 3,
"field2": 2
},
{
"field1": 3,
"field2": 2
},
...
]
}
Thanks for anybody that can help!
My hunch was to go with a nested datatype but then I realized you could do a simple distinct count on the array-values of fields 1 and 2 using query scripts and top_hits:
PUT array
POST array/_doc
{
"array": [
{
"field1": 1,
"field2": 2
},
{
"field1": 3,
"field2": 2
},
{
"field1": 3,
"field2": 2
}
]
}
GET array/_search
{
"size": 0,
"aggs": {
"field1_is_unique": {
"filter": {
"script": {
"script": {
"source": "def uniques = doc['array.field1'].stream().distinct().sorted().collect(Collectors.toList()); return uniques.length > 1 ;",
"lang": "painless"
}
}
},
"aggs": {
"top_hits_field1": {
"top_hits": {}
}
}
},
"field2_is_unique": {
"filter": {
"script": {
"script": {
"source": "def uniques = doc['array.field2'].stream().distinct().sorted().collect(Collectors.toList()); return uniques.length > 1 ;",
"lang": "painless"
}
}
},
"aggs": {
"top_hits_field2": {
"top_hits": {}
}
}
}
}
}
yielding separate aggregations for whether field1 or field2 included unique value counts > 1:
"aggregations" : {
"field1_is_unique" : {
"doc_count" : 1,
"top_hits_field1" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "array",
"_type" : "_doc",
"_id" : "WbJhgnEBVBaNYdXKNktL",
"_score" : 1.0,
"_source" : {
"array" : [
{
"field1" : 1,
"field2" : 2
},
{
"field1" : 3,
"field2" : 2
},
{
"field1" : 3,
"field2" : 2
}
]
}
}
]
}
}
},
"field2_is_unique" : {
"doc_count" : 0,
"top_hits_field2" : {
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
}
}
Hope it helps.

ElasticSearch: Aggregate Over a Collected Set of Results

Let's say I have a set of... burgers...
For each burger, I have a set of images relating to each component of the burger.
Unfortunately, there isn't any consistency in the structure of these components (I didn't write it).
Here is an example of two documents:
{
"bunsResource": {
"image": {
"url": "./buns_1.png",
"who": "Sam"
},
"buns": [
{
"image": {
"url": "./top-bun_1.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_1.png",
"who": "Sarah"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
},
{
"bunsResource": {
"image": {
"url": "./buns_2.png",
"who": "Jim"
},
"buns": [
{
"image": {
"url": "./top-bun_2.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_2.png",
"who": "Kathy"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
}
What I need is a set of photographer / image count.
{
"who": "Sam",
"count": 1
},
{
"who": "Jim",
"count": 3
},
{
"who": "Sarah",
"count": 2
},
{
"who": "Kathy",
"count": 2
}
That is a UNIQUE image count, mind you!
I haven't been able to figure out how to achieve this...
I assume that I need to first resolve each burger to a unique set of url / who, then aggregate from there, but I can't figure out how to get the flattened list of url / who per burger.
It depends on whether the patties and buns arrays are mapped as nested or not. If they are not, then it's easy: you can simply run a terms aggregation using a script that gathers all the who fields from everywhere in the document:
POST not-nested/_search
{
"size": 0,
"aggs": {
"script": {
"terms": {
"script": {
"source": """
def list = new ArrayList();
list.addAll(doc['pattyResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.buns.image.who.keyword'].values);
list.addAll(doc['pattyResource.patties.image.who.keyword'].values);
return list;
"""
}
}
}
}
}
That will return this:
"aggregations" : {
"script" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 2
},
{
"key" : "Sam",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
}
However, if it's nested, things get more complicated as you'll need some client-side work to figure out the final counts, but we can simplify that client-side work with a few aggregations:
POST nested/_search
{
"size": 0,
"aggs": {
"bunsWho": {
"terms": {
"field": "bunsResource.image.who.keyword"
}
},
"bunsWhoNested": {
"nested": {
"path": "bunsResource.buns"
},
"aggs": {
"who": {
"terms": {
"field": "bunsResource.buns.image.who.keyword"
}
}
}
},
"pattiesWho": {
"terms": {
"field": "pattyResource.image.who.keyword"
}
},
"pattiesWhoNested": {
"nested": {
"path": "pattyResource.patties"
},
"aggs": {
"who": {
"terms": {
"field": "pattyResource.patties.image.who.keyword"
}
}
}
}
}
}
That will return this:
"aggregations" : {
"pattiesWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
},
"bunsWhoNested" : {
"doc_count" : 4,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
},
"pattiesWhoNested" : {
"doc_count" : 2,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
}
},
"bunsWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 1
},
{
"key" : "Sam",
"doc_count" : 1
}
]
}
}
And then you can simply create some client-side logic (here some sample code in Node.js) that adds the numbers up:
var whos = {};
var recordWho = function(who, count) {
whos[who] = (whos[who] || 0) + count;
};
resp.aggregations.pattiesWho.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.pattiesWhoNested.who.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.bunsWho.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.bunsWhoNested.who.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
console.log(whos);
=>
{ Kathy: 5, Jim: 3, Sam: 1, Sarah: 1 }

Resources