Access nested variable from sub-aggregation in Elasticsearch

I have an index with documents that look like:
{
"id": 1,
"timeline": [{
"amount": {
"mpe": 30,
"drawn": 20
},
"interval": {
"gte": "2020-03-01",
"lte": "2020-04-01"
}
}, {
"amount": {
"mpe": 40,
"drawn": 10
},
"interval": {
"gte": "2020-04-01",
"lte": "2020-06-01"
}
}]
}
Then I have the following query, which produces a time-bucketed sum of the values from the original intervals:
{
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval",
"calendar_interval": "day"
},
"aggs": {
"sum_mpe": {
"sum": {
"field": "timeline.amount.mpe"
}
},
"sum_drawn": {
"sum": {
"field": "timeline.amount.drawn"
}
}
}
}
}
}
}
}
The above works like a charm, yielding the correct sum for each day. Now I want to improve it so I can dynamically multiply the values by a given number that may vary between query executions, although for simplicity I will just use a fixed number, 2. I've tried the following:
{
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval",
"calendar_interval": "day"
},
"aggs": {
"sum_mpe": {
"sum": {
"script": "timeline.amount.mpe * 2"
}
},
"sum_drawn": {
"sum": {
"script": "timeline.amount.drawn * 2"
}
}
}
}
}
}
}
}
But I get the following error:
{
"reason": {
"type": "script_exception",
"reason": "compile error",
"script_stack": [
"timeline.amount.mpe * 2",
"^---- HERE"
],
"script": "timeline.amount.mpe * 2",
"lang": "painless",
"position": {
"offset": 0,
"start": 0,
"end": 23
},
"caused_by": {
"type": "illegal_argument_exception",
"reason": "Variable [timeline] is not defined."
}
}
}
Is there a way to make the nested variable declared above available in the script?

This link explains how to access field values from a script via doc values. Note that you can only use this for fields which are not analyzed, i.e. not of text type.
The below should help:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval.gte",
"calendar_interval": "day",
"min_doc_count": 1 <---- Note this
},
"aggs": {
"sum_mpe": {
"sum": {
"script": "doc['timeline.amount.mpe'].value * 2" <---- Note this
}
},
"sum_drawn": {
"sum": {
"script": "doc['timeline.amount.drawn'].value * 2" <---- Note this
}
}
}
}
}
}
}
}
Also note that I've made use of min_doc_count so that your histogram only shows the valid dates.
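Since the question asks for a number that varies between query executions, a cleaner variant (just a sketch building on the answer above; factor is an example parameter name) is to pass the multiplier as a script parameter instead of hardcoding it:
"sum_mpe": {
  "sum": {
    "script": {
      "lang": "painless",
      "source": "doc['timeline.amount.mpe'].value * params.factor",
      "params": { "factor": 2 }
    }
  }
}
The same shape works for sum_drawn; the client then only changes the factor value between executions, and Elasticsearch can reuse the compiled script because its source never changes.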

Related

Bucket_selector on nested agg bucket doesn't work

I'm trying to make an aggregation on a sibling's child aggregation to filter buckets based on a requested quantity condition, so here is my query:
GET _search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"terms": {
"product.id": [20,21,22,23,24]
}
}
]
}
},
"aggs": {
"carts": {
"terms": {
"field": "item.cart_key"
},
"aggs": {
"unique_product": {
"terms": {
"field": "product.id"
},
"aggs": {
"quantity": {
"sum": {
"field": "item.quantity"
}
}
}
},
"filtered_product_quantity": {
"bucket_selector": {
"buckets_path": {
"productId": "unique_product.key",
"productQuantity": "unique_product>quantity"
},
"script": {
"params": {
"requiredQuantities": {
"20": null,
"21": null,
"22": null,
"23": 3,
"24": null
}
},
"lang": "painless",
"source": "params.requiredQuantities[params.productId] <= params.productQuantity"
}
}
}
}
}
}
}
And the error:
{
"error": {
"root_cause": [],
"type": "search_phase_execution_exception",
"reason": "",
"phase": "fetch",
"grouped": true,
"failed_shards": [],
"caused_by": {
"type": "aggregation_execution_exception",
"reason": "buckets_path must reference either a number value or a single value numeric metric aggregation, got: [Object[]] at aggregation [unique_product]"
}
},
"status": 500
}
Here is a sample document set:
[
{
product.id: 12,
item.cart_key: abc_123,
item.quantity: 2
},
{
product.id: 11,
item.cart_key: abc_123,
item.quantity: 1
},
{
product.id: 23,
item.cart_key: def_456,
item.quantity: 1
}
]
Am I using the appropriate aggregation?
Put another way, I would like to:
Aggregate my documents by cart_key.
Per product.id, sum the quantity.
Filter the buckets, keeping those whose quantity is higher than the minimum given in a Record object {[product.id]: minimum_quantity} (this is the requiredQuantities param).
I don't know whether the source script will work, since Elasticsearch never gets that far.
I don't think you're handling the problem quite right, but here is a hacky working solution. A bucket_selector's buckets_path can only reference single-value numeric metrics, not a terms bucket key, which is why the hackProductId max sub-aggregation below re-exposes product.id as a metric the script can read:
{
"size": 0,
"aggs": {
"carts": {
"terms": {
"field": "item.cart_key"
},
"aggs": {
"unique_product": {
"terms": {
"field": "product.id"
},
"aggs": {
"quantity": {
"sum": {
"field": "item.quantity"
}
},
"hackProductId": {
"max": {
"field": "product.id"
}
},
"filtered_product_quantity": {
"bucket_selector": {
"buckets_path": {
"productQuantity": "quantity",
"productId": "hackProductId"
},
"script": {
"params": {
"requiredQuantities": {
"12": 0,
"11": 0,
"22": 0,
"23": 0,
"24": 0
}
},
"lang": "painless",
"source": "params.requiredQuantities[((int)params.productId).toString()] <= params.productQuantity"
}
}
}
}
}
}
}
}
}

Subtract numeric fields between two documents with different timestamps

Let's say I have these data samples:
{
"date": "2019-06-16",
"rank": 150,
"name": "doc 1"
}
{
"date": "2019-07-16",
"rank": 100,
"name": "doc 1"
}
{
"date": "2019-06-16",
"rank": 50,
"name": "doc 2"
}
{
"date": "2019-07-16",
"rank": 80,
"name": "doc 2"
}
The expected result is obtained by subtracting the rank field of the two documents that share the same name but have different dates (the value at the old date minus the value at the new date):
{
"name": "doc 1",
"diff_rank": 50
}
{
"name": "doc 2",
"diff_rank": -30
}
And sort by diff_rank if possible, otherwise I will just sort manually after getting the result.
What I have tried is using date_histogram and serial_diff, but some results are somehow missing the diff_rank value even though I am sure the data exists:
{
"aggs" : {
"group_by_name": {
"terms": {
"field": "name"
},
"aggs": {
"days": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"the_rank": {
"sum": {
"field": "rank"
}
},
"diff_rank": {
"serial_diff": {
"buckets_path": "the_rank",
"lag" : 30 // 1 month or 30 days in this case
}
}
}
}
}
}
}
}
Any help solving my issue above will be much appreciated!
Finally, I found a method in the official docs using the Filter, Bucket Script and Bucket Sort aggregations, the last of which sorts the result. Here is the final snippet:
{
"size": 0,
"aggs" : {
"group_by_name": {
"terms": {
"field": "name",
"size": 50,
"shard_size": 10000
},
"aggs": {
"last_month_rank": {
"filter": {
"term": {"date": "2019-06-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"latest_rank": {
"filter": {
"term": {"date": "2019-07-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"diff_rank": {
"bucket_script": {
"buckets_path": {
"lastMonthRank": "last_month_rank>rank",
"latestRank": "latest_rank>rank"
},
"script": "params.lastMonthRank - params.latestRank"
}
},
"rank_bucket_sort": {
"bucket_sort": {
"sort": [
{"diff_rank": {"order": "desc"}}
],
"size": 50
}
}
}
}
}
}
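Assuming the two filter dates are adjusted to match the sample documents (2019-06-16 and 2019-07-16), the relevant part of the response should look roughly like this sketch, with doc 1 first because of the descending bucket_sort:
"group_by_name": {
  "buckets": [
    { "key": "doc 1", "diff_rank": { "value": 50 }, ... },
    { "key": "doc 2", "diff_rank": { "value": -30 }, ... }
  ]
}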

Sum and count aggregations over Elasticsearch fields

I am new to Elasticsearch and I am looking to perform certain aggregations over fields of an Elasticsearch 5.x index. I have an index that contains documents with the fields langs (which has a nested structure) and docLang. These are dynamically mapped fields. Following are example documents:
DOC 1:
{
"_index":"A",
"_type":"document",
"_id":"1",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1,
"es":2,
"zh":3
},
"Y":{
"en":4,
"es":5,
"zh":6
}
},
"docLang": "en"
}
}
DOC 2:
{
"_index":"A",
"_type":"document",
"_id":"2",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1,
"es":2
},
"Y":{
"en":3,
"es":4
}
},
"docLang": "es"
}
}
DOC 3:
{
"_index":"A",
"_type":"document",
"_id":"2",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1
},
"Y":{
"en":2
}
},
"docLang": "en"
}
}
I want to perform a sum aggregation over the langs field such that, for each key (X/Y) and for each language, I get the sum across all documents in the index. I also want to produce the document counts for each language in the docLang field.
For example, for the above 3 documents, the sum aggregation over the langs field would look like this:
"langs":{
"X":{
"en":3,
"es":4,
"zh":3
},
"Y":{
"en":9,
"es":9,
"zh":6
}
}
And the docLang count would look like this:
"docLang":{
"en" : 2,
"es" : 1
}
Also, because of some production environment restrictions, I cannot use scripts in Elasticsearch. So I was wondering whether it is possible to use just field-based aggregations for the above fields?
{
"size": 0,
"aggs": {
"X": {
"nested": {
"path": "langs.X"
},
"aggs": {
"X_sum_en": {
"sum": {
"field": "langs.X.en"
}
},
"X_sum_es": {
"sum": {
"field": "langs.X.es"
}
},
"X_sum_zh": {
"sum": {
"field": "langs.X.zh"
}
}
}
},
"Y": {
"nested": {
"path": "langs.Y"
},
"aggs": {
"Y_sum_en": {
"sum": {
"field": "langs.Y.en"
}
},
"Y_sum_es": {
"sum": {
"field": "langs.Y.es"
}
},
"Y_sum_zh": {
"sum": {
"field": "langs.Y.zh"
}
}
}
},
"sum_docLang": {
"terms": {
"field": "docLang.keyword",
"size": 10
}
}
}
}
Since you didn't mention the mapping, and I think it's important: I made X and Y nested fields:
"langs": {
"properties": {
"X": {
"type": "nested",
"properties": {
"en": {
"type": "long"
},
"es": {
"type": "long"
},
"zh": {
"type": "long"
}
}
},
"Y": {
"type": "nested",
"properties": {
"en": {
"type": "long"
},
"es": {
"type": "long"
},
"zh": {
"type": "long"
}
}
}
}
}
But if your fields are not nested at all (and here I mean the actual nested field type in Elasticsearch), a simple aggregation like this one should be enough:
{
"size": 0,
"aggs": {
"X_sum_en": {
"sum": {
"field": "langs.X.en"
}
},
"X_sum_es": {
"sum": {
"field": "langs.X.es"
}
},
"X_sum_zh": {
"sum": {
"field": "langs.X.zh"
}
},
"Y_sum_en": {
"sum": {
"field": "langs.Y.en"
}
},
"Y_sum_es": {
"sum": {
"field": "langs.Y.es"
}
},
"Y_sum_zh": {
"sum": {
"field": "langs.Y.zh"
}
},
"sum_docLang": {
"terms": {
"field": "docLang.keyword",
"size": 10
}
}
}
}
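For the docLang counts asked about in the question, the sum_docLang terms aggregation returns one bucket per language with a doc_count; for the three sample documents the relevant part of the response would look roughly like this (a sketch of the standard terms response):
"sum_docLang": {
  "buckets": [
    { "key": "en", "doc_count": 2 },
    { "key": "es", "doc_count": 1 }
  ]
}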

How to display only the key from the bucket

I have an index with millions of documents. Suppose each of my documents has some code, and I need to find the list of codes matching some criteria. The only way I found to do that is using a whole lot of aggregations, so I created an ugly query which does exactly what I want:
POST my-index/_search
{
"query": {
"range": {
"timestamp": {
"gte": "2017-08-01T00:00:00.000",
"lt": "2017-08-08T00:00:00.000"
}
}
},
"size": 0,
"aggs": {
"codes": {
"terms": {
"field": "code",
"size": 10000
},
"aggs": {
"days": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"format": "dd"
},
"aggs": {
"hours": {
"date_histogram": {
"field": "timestamp",
"interval": "hour",
"format": "yyyy-MM-dd:HH"
},
"aggs": {
"hour_income": {
"sum": {
"field": "price"
}
}
}
},
"max_income": {
"max_bucket": {
"buckets_path": "hours>hour_income"
}
},
"day_income": {
"sum_bucket": {
"buckets_path": "hours.hour_income"
}
},
"more_than_sixty_percent": {
"bucket_script": {
"buckets_path": {
"dayIncome": "day_income",
"maxIncome": "max_income"
},
"script": "params.maxIncome - params.dayIncome * 60 / 100 > 0 ? 1 : 0"
}
}
}
},
"amount_of_days": {
"sum_bucket": {
"buckets_path": "days.more_than_sixty_percent"
}
},
"bucket_filter": {
"bucket_selector": {
"buckets_path": {
"amountOfDays": "amount_of_days"
},
"script": "params.amountOfDays >= 3"
}
}
}
}
}
}
The response I get is a few million lines of JSON, consisting of buckets. Each bucket has more than 700 lines (and buckets of its own), but all I need is its key, so that I have my list of codes. I guess it's not good having a response a few thousand times larger than necessary, and there might be problems with parsing. So I wanted to ask: is there any way to hide the other info in the buckets and get only the keys?
Thanks.
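One possible way to return only the keys (a sketch, assuming your Elasticsearch version supports response filtering via the filter_path query parameter) is to keep the request body exactly as above and let filter_path strip everything else out of the response:
POST my-index/_search?filter_path=aggregations.codes.buckets.key
{
  ... same query and aggs body as above ...
}
Note that filter_path only trims what is sent back; the aggregations are still computed in full on the cluster, so it addresses the response-size and parsing problem but not the query cost.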

Is it possible to perform elasticsearch nested stats aggregation on a field defined by the parent aggregation?

I'm trying to do a query to generate a plot. My data index looks like this:
"mappings": {
"mydata": {
"properties": {
"type": { "type": "string", "index": "not_analyzed" },
"stamp": { "type": "date", "format": "date_hour_minute_second_millis" },
"data": { "type": "object" }
}
}
}
Depending on the type, the data field will contain different objects, e.g.,
temperature_data = {
"type": "temperature",
"stamp": "2015-11-01T15:25:19.123",
"data": {"temperature": 23.4, "variance": 0.0}
}
humidity_data = {
"type": "humidity",
"stamp": "2015-11-01T15:26:21.063",
"data": {"humidity": 75.1, "variance": 0.0}
}
I'm trying to aggregate the data into buckets depending on their type, and then perform a date histogram to get the stats of each reading (temperature, humidity). My problem is how to set the field on the stats agg, since it changes with the type (for "type": "temperature" the field is data.temperature, for example):
query = {
"size": 0,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{"range" : {
"stamp" : {
"gt" : start_stamp,
"lt" : end_stamp
}
}}
]
}
}
}
},
"aggs": {
"pathes": {
"terms": {
"field": "type"
},
"aggs": {
"points": {
"date_histogram": {
"field": "stamp",
"interval": interval
},
"aggs": {
"point_stats": {
"stats": {
"field": "data."+field???
}
}
}
}
}
}
}
}
* UPDATE *
As suggested, I added a data-type.groovy file to config/scripts/; the file contains the following:
return doc['data.temperature'].value
Elasticsearch is able to compile the script:
[2015-11-02 19:50:32,651][INFO ][script] [Atum] compiling script file [/home/user/elasticsearch-1.7.0/config/scripts/data-type.groovy]
I updated the query to load the script file:
query = {
"size": 0,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{"range" : {
"stamp" : {
"gt" : start_stamp,
"lt" : end_stamp
}
}}
]
}
}
}
},
"aggs": {
"pathes": {
"terms": {
"field": "type"
},
"aggs": {
"points": {
"date_histogram": {
"field": "stamp",
"interval": interval
},
"aggs": {
"point_stats": {
"stats": {
"script": {"file": "data-type"}
}
}
}
}
}
}
}
}
When I run the query I get the following output:
{u'status': 400, u'error': u'SearchPhaseExecutionException[Failed to execute phase [query], ... Parse Failure [Unexpected token START_OBJECT in [point_stats].]]; }]'}
There's only temperature data in the database; if I change "script": {"file": "data-type"} to "field": "data.temperature", the query works.
One option is to rename the humidity and temperature fields to something identical, like value, so you can simply aggregate on that field and you're good. You'd already know what kind of value it is since you know it from the type field.
"aggs": {
"pathes": {
"terms": {
"field": "type"
},
"aggs": {
"points": {
"date_histogram": {
"field": "stamp",
"interval": interval
},
"aggs": {
"point_stats": {
"stats": {
"field": "data.value"
}
}
}
}
}
}
}
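For illustration, option 1 assumes the documents are reindexed with the reading stored under one common name (value, as suggested above), for example:
temperature_data = {
  "type": "temperature",
  "stamp": "2015-11-01T15:25:19.123",
  "data": {"value": 23.4, "variance": 0.0}
}
humidity_data = {
  "type": "humidity",
  "stamp": "2015-11-01T15:26:21.063",
  "data": {"value": 75.1, "variance": 0.0}
}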
The second option is to use a script, but that'd be less performant and less scalable if you were to add more types of data (pressure, etc.):
"aggs": {
"pathes": {
"terms": {
"field": "type"
},
"aggs": {
"points": {
"date_histogram": {
"field": "stamp",
"interval": interval
},
"aggs": {
"point_stats": {
"stats": {
"script": "return doc.type.value == 'temperature' ? doc['data.temperature'].value : doc['data.humidity'].value"
}
}
}
}
}
}
}
Note that for this second option you need to enable dynamic scripting.
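On an Elasticsearch 1.x node (the script path above suggests 1.7.0), dynamic scripting is enabled in elasticsearch.yml and requires a node restart; the exact setting depends on the 1.x version, so treat this as a sketch:
# elasticsearch.yml - use whichever form your version accepts
script.disable_dynamic: false    # pre-1.6 style
script.inline: on                # fine-grained style from 1.6 onwards
Keep in mind that enabling dynamic Groovy scripting on these versions has security implications if the cluster is reachable by untrusted clients.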
