es cumulative_sum cannot support the number of returned docs - elasticsearch

I am confused about how to limit the number of buckets returned by a cumulative_sum aggregation. This is my search request:
{
"query": {"match_all": {}},
"size": 0,
"aggs": {
"group_by_date": {
"date_histogram": {
"field": "timestamp",
"interval": "day"
},
"aggs": {
"cumulative_docs": {
"cumulative_sum": {"buckets_path": "_count"}
}
}
}
}
}
and it returns the maximum number of buckets:
"aggregations" : {
"group_by_date" : {
"buckets" : [
{
"key_as_string" : "2022-09-03T00:00:00.000Z",
"key" : 1662163200000,
"doc_count" : 19,
"cumulative_docs" : {
"value" : 19.0
}
},
{
"key_as_string" : "2022-09-04T00:00:00.000Z",
"key" : 1662249600000,
"doc_count" : 0,
"cumulative_docs" : {
"value" : 19.0
}
},
{
"key_as_string" : "2022-09-05T00:00:00.000Z",
"key" : 1662336000000,
"doc_count" : 0,
"cumulative_docs" : {
"value" : 19.0
}
},
{
"key_as_string" : "2022-09-06T00:00:00.000Z",
"key" : 1662422400000,
"doc_count" : 0,
"cumulative_docs" : {
"value" : 19.0
}
},
{
"key_as_string" : "2022-09-07T00:00:00.000Z",
"key" : 1662508800000,
"doc_count" : 0,
"cumulative_docs" : {
"value" : 19.0
}
},
{
"key_as_string" : "2022-09-08T00:00:00.000Z",
"key" : 1662595200000,
"doc_count" : 0,
"cumulative_docs" : {
"value" : 19.0
}
},
...
I tried to use bucket_selector to keep only the top 10 (or top N) buckets of the cumulative_sum, but it returns an error along the lines of "cumulative_sum cannot have sub-aggregations". I also tried the size parameter, but it is not supported.
If I want to return only ten buckets, or some other number that I specify myself, how should I revise my query?

Related

How to get word count in docs as a aggregate over time in elastic search?

I am trying to get word count trends in docs as an aggregate result. Using the following approach I am able to get the doc count aggregation result, but I am not able to find any resources showing how to get the word count for the months of jan, feb and mar.
PUT test/_doc/1
{
"description" : "one two three four",
"month" : "jan"
}
PUT test/_doc/2
{
"description" : "one one test test test",
"month" : "feb"
}
PUT test/_doc/3
{
"description" : "one one one test",
"month" : "mar"
}
GET test/_search
{
"size": 0,
"query": {
"match": {
"description": {
"query": "one"
}
}
},
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
}
}
}
}
OUTPUT
{
"took" : 706,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1
},
{
"key" : "jan",
"doc_count" : 1
},
{
"key" : "mar",
"doc_count" : 1
}
]
}
}
}
EXPECTED WORD COUNT OVER MONTH
"aggregations" : {
"monthly_count" : {
"buckets" : [
{
"key" : "feb",
"word_count" : 2
},
{
"key" : "jan",
"word_count" : 1
},
{
"key" : "mar",
"word_count" : 3
}
]
}
}
Maybe this query can help you:
GET test/_search
{
"size": 0,
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
},
"aggs": {
"count_word_one": {
"terms": {
"script": {
"source": """
def str = doc['description.keyword'].value;
def array = str.splitOnToken(' ');
int i = 0;
for (item in array) {
if(item == 'one'){
i++
}
}
return i;
"""
},
"size": 10
}
}
}
}
}
}
Response:
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "2",
"doc_count" : 1
}
]
}
},
{
"key" : "jan",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1
}
]
}
},
{
"key" : "mar",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 1
}
]
}
}
]
}
}

Aggregating all fields for an object in a search query, without manually specifying the fields

I have an index products which has an internal object attributes which looks like:
{
properties: {
id: {...},
name: {...},
colors: {...},
// remaining fields
}
}
I'm trying to produce a search query with this form and I need to figure out how to write the aggs object.
{ query: {...}, aggs: {...} }
I can write this out manually for two fields to get the desired result, however the object contains 50+ fields so I need it to be able to handle it automatically
"aggs": {
"attributes.color_group.id": {
"terms": {
"field": "attributes.color_group.id.keyword"
}
},
"attributes.product_type.id": {
"terms": {
"field": "attributes.product_type.id.keyword"
}
}
}
Gives me the result:
"aggregations" : {
"attributes.product_type.id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 34,
"buckets" : [
{
"key" : "374",
"doc_count" : 203
},
{
"key" : "439",
"doc_count" : 79
},
{
"key" : "460",
"doc_count" : 28
},
{
"key" : "451",
"doc_count" : 24
},
{
"key" : "558",
"doc_count" : 18
},
{
"key" : "500",
"doc_count" : 10
},
{
"key" : "1559",
"doc_count" : 9
},
{
"key" : "1560",
"doc_count" : 9
},
{
"key" : "455",
"doc_count" : 7
},
{
"key" : "501",
"doc_count" : 6
}
]
},
"attributes.color_group.id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 35,
"buckets" : [
{
"key" : "12",
"doc_count" : 98
},
{
"key" : "54",
"doc_count" : 48
},
{
"key" : "118",
"doc_count" : 43
},
{
"key" : "110",
"doc_count" : 41
},
{
"key" : "111",
"doc_count" : 35
},
{
"key" : "71",
"doc_count" : 35
},
{
"key" : "119",
"doc_count" : 24
},
{
"key" : "62",
"doc_count" : 21
},
{
"key" : "115",
"doc_count" : 20
},
{
"key" : "113",
"doc_count" : 15
}
]
}
}
Which is exactly what I want. After some research I found that you can use query_string which would allow me to find everything starting with attributes., however it does not seem to work inside aggregations.
As far as I know, what you are asking for is not possible with the built-in functionality of Elasticsearch, but there are some workarounds you can use:
Use Search Template:
Below is an example of a search template: you provide the list of fields as an array, and it creates a terms aggregation for each provided field. You can store the search template using the Script API and then reference its id when calling the search request.
POST dyagg/_search/template
{
"source": """{
"query": {
"match_all": {}
},
"aggs": {
{{#filter}}
"{{.}}": {
"terms": {
"field": "{{.}}",
"size": 10
}
}, {{/filter}}
"name": {
"terms": {
"field": "name",
"size": 10
}
}
}
}""",
"params": {
"filter":["lastname","firstname","city","country"]
}
}
Response:
"aggregations" : {
"country" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "India",
"doc_count" : 4
}
]
},
"firstname" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Rajan",
"doc_count" : 1
},
{
"key" : "Sagar",
"doc_count" : 1
},
{
"key" : "Sajan",
"doc_count" : 1
},
{
"key" : "Sunny",
"doc_count" : 1
}
]
},
"city" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Mumbai",
"doc_count" : 2
},
{
"key" : "Pune",
"doc_count" : 2
}
]
},
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Rajan Desai",
"doc_count" : 1
},
{
"key" : "Sagar Patel",
"doc_count" : 1
},
{
"key" : "Sajan Patel",
"doc_count" : 1
},
{
"key" : "Sunny Desai",
"doc_count" : 1
}
]
},
"lastname" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Desai",
"doc_count" : 2
},
{
"key" : "Patel",
"doc_count" : 2
}
]
}
}
The second way is to build the aggregations programmatically. Please check this Stack Overflow answer, which shows how to do it in PHP; you can follow the same approach in other languages.
NOTE:
If you look at the search template, you will notice that I added one static aggregation for the name field. The reason is to avoid a trailing comma after the last iteration of the loop; if you do not add it, you will get a json_parse_exception.

Aggregate by custom defined buckets, according to field value

I'm interested in aggregating my data into buckets, but I want to put two distinct values to the same bucket.
This is what I mean:
Say I have this query:
GET _search
{
"size": 0,
"aggs": {
"my-agg-name": {
"terms": {
"field": "ecs.version"
}
}
}
}
it returns this response:
"aggregations" : {
"my-agg-name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.12.0",
"doc_count" : 642826144
},
{
"key" : "8.0.0",
"doc_count" : 204064845
},
{
"key" : "1.1.0",
"doc_count" : 16508253
},
{
"key" : "1.0.0",
"doc_count" : 9162928
},
{
"key" : "1.6.0",
"doc_count" : 1111542
},
{
"key" : "1.5.0",
"doc_count" : 10445
}
]
}
}
Every distinct value of the field ecs.version is in its own bucket.
But say I wanted to define my buckets such that:
bucket1: [1.12.0, 8.0.0]
bucket2: [1.6.0, 8.4.0]
bucket3: [1.0.0, 8.8.0]
Is this possible in any way?
I know I can just return all the buckets and do the sum programmatically, but this list can be very long, I don't think it would be efficient. Am I wrong?
You can use a runtime mapping to generate a runtime field, and that field will be used for the aggregation. I tested the example below on ES 7.16.
I have indexed some sample documents, and below is the aggregation output without merging multiple values:
"aggregations" : {
"version" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.12.0",
"doc_count" : 3
},
{
"key" : "1.6.0",
"doc_count" : 3
},
{
"key" : "8.4.0",
"doc_count" : 3
},
{
"key" : "8.0.0",
"doc_count" : 2
}
]
}
}
You can use the query below with a runtime mapping, but you need to add multiple if conditions for your version mappings:
{
"size": 0,
"runtime_mappings": {
"normalized_version": {
"type": "keyword",
"script": """
String version = doc['version.keyword'].value;
if (version.equals('1.12.0') || version.equals('8.0.0')) {
emit('1.12.0, 8.0.0');
} else if (version.equals('1.6.0') || version.equals('8.4.0')){
emit('1.6.0, 8.4.0');
}else {
emit(version);
}
"""
}
},
"aggs": {
"genres": {
"terms": {
"field": "normalized_version"
}
}
}
}
Below is output of above aggregation query:
"aggregations" : {
"genres" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.6.0, 8.4.0",
"doc_count" : 6
},
{
"key" : "1.12.0, 8.0.0",
"doc_count" : 5
}
]
}
}

ElasticSearch - Sort on the subaggregation

I am quite new to Elasticsearch and I am trying to sort on a sub-aggregation; that is, my results should be sorted based on the sub-aggregation first. I have tried a lot of things to enable this sort, but it isn't working. Can anyone help with this?
{
"aggs": {
"distinct_part": {
"terms": {
"field": "part",
"size": 1000
}
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer",
"size": 1000
}
}
}
}
I am trying to sort on the manufacturer; my entire result should be sorted on that. Can someone point me to how I can achieve that?
I tried to do a test locally with your query. I did a small correction if I understood your issue well. I ingested the following data in the index "subsorting":
"part": "car",
"manufacturer": "brandA"
"part": "car",
"manufacturer": "brandB"
"part": "car",
"manufacturer": "brandC"
"part": "motor",
"manufacturer": "brandA"
"part": "motor",
"manufacturer": "brandB"
"part": "motor",
"manufacturer": "brandC"
Note: Both part and manufacturer are mapped as text.
GET subsorting/_search
{
"size": 0,
"aggs": {
"distinct_part": {
"terms": {
"field": "part.keyword",
"size": 1000
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer.keyword",
"order": {
"_key": "asc"
},
"size": 1000
}
}
}
}
}
}
If both fields "part" and "manufacturer" are mapped as keyword, remove the ".keyword" suffix from the query.
The response from the above query is as follows if sorted as ascending order:
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 2
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
}
]
}
}
If you need the result as descending order, here is the response where "_key": "desc":
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 2
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 1
}
]
}
}
]
}
}
Links:
https://www.elastic.co/guide/en/elasticsearch/reference/7.9/search-aggregations-bucket-terms-aggregation.html

Elasticsearch order aggregations bucket based on a field (can be text/string)

My document has a category id.
This is my aggregation query:
"aggs": {
"categories": {
"filter": {
"bool": {
"must": [
{
"exists": {
"field": "price"
}
}
]
}
},
"aggs": {
"categories": {
"terms": {
"field": "category_id",
"order": {
"_count": "desc"
},
"size": 15
}
}
}
}
It produces the following results:
"categories" : {
"doc_count" : 92485,
"categories" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 4780,
"buckets" : [ {
"key" : 5053,
"doc_count" : 21827
}, {
"key" : 5413,
"doc_count" : 15760
}, {
"key" : 5057,
"doc_count" : 12473
}, {
"key" : 77978,
"doc_count" : 11388
}, {
"key" : 5030,
"doc_count" : 9898
}, {
"key" : 5055,
"doc_count" : 2492
}, {
"key" : 8543,
"doc_count" : 2461
}, {
"key" : 5684,
"doc_count" : 2106
}, {
"key" : 5050,
"doc_count" : 2001
}, {
"key" : 8544,
"doc_count" : 1803
}, {
"key" : 5049,
"doc_count" : 1635
}, {
"key" : 5054,
"doc_count" : 1284
}, {
"key" : 5035,
"doc_count" : 977
}, {
"key" : 8731,
"doc_count" : 817
}, {
"key" : 8732,
"doc_count" : 783
} ]
}
}
Is it possible to get the response such that the buckets are ordered by category_id, or by any other field, after bucketing? I still want to select only the 15 buckets with the maximum doc_count.
Also, if possible, is there a way to do this based on a field which is text/string?
I tried sub-aggregation but couldn't figure it out.

Resources