Subtract numeric fields between two documents with different timestamp - elasticsearch

Lets say I have these data samples:
{
"date": "2019-06-16",
"rank": 150
"name": "doc 1"
}
{
"date": "2019-07-16",
"rank": 100
"name": "doc 1"
}
{
"date": "2019-06-16",
"rank": 50
"name": "doc 2"
}
{
"date": "2019-07-16",
"rank": 80
"name": "doc 2"
}
The expected result is by subtracting the rank field from two same name of docs with different date (old date - new date):
{
"name": "doc 1",
"diff_rank": 50
}
{
"name": "doc 2",
"diff_rank": -30
}
And sort by diff_rank if possible, otherwise I will just sort manually after getting the result.
What I have tried is by using date_histogram and serial_diff but some results are missing the diff_rank value in somehow which I am sure the data exist:
{
"aggs" : {
"group_by_name": {
"terms": {
"field": "name"
},
"aggs": {
"days": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"the_rank": {
"sum": {
"field": "rank"
}
},
"diff_rank": {
"serial_diff": {
"buckets_path": "the_rank",
"lag" : 30 // 1 month or 30 days in this case
}
}
}
}
}
}
}
}
The help will be much appreciated to solve my issue above!

Finally, I found a method from official doc using Filter, Bucket Script aggregation and Bucket Sort to sort the result. Here is the final snippet code:
{
"size": 0,
"aggs" : {
"group_by_name": {
"terms": {
"field": "name",
"size": 50,
"shard_size": 10000
},
"aggs": {
"last_month_rank": {
"filter": {
"term": {"date": "2019-06-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"latest_rank": {
"filter": {
"term": {"date": "2019-07-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"diff_rank": {
"bucket_script": {
"buckets_path": {
"lastMonthRank": "last_month_rank>rank",
"latestRank": "latest_rank>rank"
},
"script": "params.lastMonthRank - params.latestRank"
}
},
"rank_bucket_sort": {
"bucket_sort": {
"sort": [
{"diff_rank": {"order": "desc"}}
],
"size": 50
}
}
}
}
}
}

Related

Elasticsearch aggregation with unqiue counting

My documents consist of a history of orders and their state, here a minimal example:
{
"orderNumber" : "xyz",
"state" : "shipping",
"day" : "2022-07-20",
"timestamp" : "2022-07-20T15:06:44.290Z",
}
the state can be strings like shipping, processing, redo,...
For every possible state, I need to count the number of orders that had this state at some point during a day, without counting a state twice for the same orderNumber that day (which can happen if there is a problem and it needs to start from the beginning that same day).
My aggregation looks like this:
GET order-history/_search
{
"aggs": {
"countDays": {
"terms": {
"field": "day",
"order": {
"_key": "desc"
},
"size": 20
},
"aggs": {
"countStates": {
"terms": {
"field": "state.keyword",
"size": 10
}
}
}
}
}
, "size": 1
}
However, this will count a state for a given orderNumber twice if it reappears that same day. How would I prevent it from counting a state twice for each orderNumber, if it is on the same day?
Tldr;
I don't think there is a flexible and simple solution.
But if you know in advance the number of state that exists. Maybe through another aggregation query, to get all type of state.
You could do the following
POST /_bulk
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"shipping","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"redo","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"shipping","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"bbb","state":"processing","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"bbb","state":"shipping","day":"2022-07-20"}
GET 73138766/_search
{
"size": 0,
"aggs": {
"per_day": {
"date_histogram": {
"field": "day",
"calendar_interval": "day"
},
"aggs": {
"shipping": {
"filter": { "term": { "state.keyword": "shipping" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
},
"processing": {
"filter": { "term": { "state.keyword": "processing" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
},
"redo": {
"filter": { "term": { "state.keyword": "redo" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
}
}
}
}
}
You will obtain the following results
{
"aggregations": {
"per_day": {
"buckets": [
{
"key_as_string": "2022-07-20T00:00:00.000Z",
"key": 1658275200000,
"doc_count": 5,
"shipping": {
"doc_count": 3,
"orders": {
"value": 2
}
},
"processing": {
"doc_count": 1,
"orders": {
"value": 1
}
},
"redo": {
"doc_count": 1,
"orders": {
"value": 1
}
}
}
]
}
}
}

Perform multi-field / multi-dimensional aggregations with nested fields in Elastic Search

I am tracking attendance of few students. I am storing their details in the index like the below.
Each doc in "entries" have few other fields. The following data shows that a student has attended 6 classes on "Monday".
"entries" is of type "nested"
{
reg_id: 1111,
"entires" : [
{
id: "123"
day: 'Monday'
},
{
id: "1234",
attendance: true
},
{
id: "12345",
classes_attended: 6
}
],
}
I want the count of each classes_attended of students for each day.
For Example "72 entries of students found for "Monday", who has attended 6 classes"
Sample desired output - This is just a sample I am completely fine if the output schema is changed.
[
{
"day" : "monday",
"classes_attended": 6,
count: 4
},
{
"day" : "monday",
"classes_attended": 1,
count: 5
},
{
"day" : "tuesday",
"classes_attended": 5,
count: 2
},
{
"day" : "tuesday",
"classes_attended": 6,
count: 1
}
]
Not sure How to start with the aggregations query:
I tried with the following query but I know its not the correct solution
"aggs": {
"attendance_aggs": {
"nested": {
"path": "entries"
},
"aggs": {
"days": {
"terms": {
"field": "entries.day"
},
"aggs": {
"attended": {
"reverse_nested": {},
"aggs":{
"class_attended_day": {
"terms": {
"field": "entries.classes_attended"
},
"aggs": {
"class_attended_days_count": {
"reverse_nested": {},
"aggs": {
"classes_attended_final": {
"cardinality": {
"field": "entries.class_attended"
}
}
}
}
}
}
}
}
}
}
}
}
}
It's unclear what the top-level object is so I'm going to assume it's a "student attendance entry per day". I'm also unsure what the entries.ids represent but I'll assume you'll be needing them at some later point so I'll keep them untouched.
Now, since all that your entries objects have in common is the id, they can be decoupled. Meaning that you should be using nested if any only if you share some attributes across all objects which need their attribute connections preserved. Since I don't see entries.id anywhere in your aggs, I'd recommend the following adjustments to your mapping:
PUT students
{
"mappings": {
"properties": {
"day": { ------------
"type": "keyword" |
}, |
"attendance": { |
"type": "boolean" | <--
}, |
"classes_attended": { |
"type": "integer" |
}, ------------
"entries": {
"type": "nested",
"properties": {
"day": {
"type": "keyword",
"copy_to": "day" <--
},
"attendance": {
"type": "boolean",
"copy_to": "attendance" <--
},
"classes_attended": {
"type": "integer",
"copy_to": "classes_attended" <--
}
}
}
}
}
}
and here's your query:
GET students/_search
{
"size": 0,
"aggs": {
"days": {
"terms": {
"field": "day"
},
"aggs": {
"classes_attended": {
"terms": {
"field": "classes_attended"
},
"aggs": {
"student_count": {
"cardinality": {
"field": "_id"
}
}
}
}
}
}
}
}
The response can then be post-processed into whatever you prefer.
EDIT
You could hijack reverse_nested but will need to come back to it as you're referencing other nested entries:
GET students/_search
{
"size": 0,
"aggs": {
"attendance_aggs": {
"nested": {
"path": "entries"
},
"aggs": {
"days": {
"terms": {
"field": "entries.day"
},
"aggs": {
"attended": {
"reverse_nested": {},
"aggs": {
"class_attended_day": {
"nested": {
"path": "entries"
},
"aggs": {
"class_attended_day": {
"terms": {
"field": "entries.classes_attended"
},
"aggs": {
"classes_attended_final": {
"cardinality": {
"field": "entries.classes_attended"
}
}
}
}
}
}
}
}
}
}
}
}
}
}

access nested variable from sub-aggregation on elasticsearch

I have an index with documents that look like:
{
"id": 1,
"timeline": [{
"amount": {
"mpe": 30,
"drawn": 20
},
"interval": {
"gte": "2020-03-01",
"lte": "2020-04-01"
}
}, {
"amount": {
"mpe": 40,
"drawn": 10
},
"interval": {
"gte": "2020-04-01",
"lte": "2020-06-01"
}
}]
}
Then I have the following query that produces a time bucketed sum of the values from the original intervals:
{
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval",
"calendar_interval": "day"
},
"aggs": {
"sum_mpe": {
"sum": {
"field": "timeline.amount.mpe"
}
},
"sum_drawn": {
"sum": {
"field": "timeline.amount.drawn"
}
}
}
}
}
}
}
}
The above works like a charm yielding the correct sum for each day. Now I want to improve it so I can dynamically multiply the values by a given number that may vary between query executions, although for simplicity I will just use a fixed number 2. I've tried the following:
{
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval",
"calendar_interval": "day"
},
"aggs": {
"sum_mpe": {
"sum": {
"script": "timeline.amount.mpe * 2"
}
},
"sum_drawn": {
"sum": {
"script": "timeline.amount.drawn * 2"
}
}
}
}
}
}
}
}
But I get the following error:
{
"reason": {
"type": "script_exception",
"reason": "compile error",
"script_stack": [
"timeline.amount.mpe * 2",
"^---- HERE"
],
"script": "timeline.amount.mpe * 2",
"lang": "painless",
"position": {
"offset": 0,
"start": 0,
"end": 23
},
"caused_by": {
"type": "illegal_argument_exception",
"reason": "Variable [timeline] is not defined."
}
}
}
Is there a way to make the nested variable declared above available in the script?
This link states as how to access the fields via script. Note that you can only use this for fields which are analyzed i.e. text type.
The below should help:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"cp-timeline": {
"nested": {
"path": "timeline"
},
"aggs": {
"mpes": {
"date_histogram": {
"field": "timeline.interval.gte",
"calendar_interval": "day",
"min_doc_count": 1 <---- Note this
},
"aggs": {
"sum_mpe": {
"sum": {
"script": "doc['timeline.amount.mpe'].value * 2" <---- Note this
}
},
"sum_drawn": {
"sum": {
"script": "doc['timeline.amount.drawn'].value * 2" <---- Note this
}
}
}
}
}
}
}
}
Also note that I've made use of min_doc_count so that your histogram would only show you the valid dates.

Elasticsearch Query 30Day Price Difference

I currently have an elasticsearch indexs for a product that spans a year each index separated by month (i think, could be by year if i dont have as much data as i think i do). Each day a process grabs all the prices of these products and puts them into elasticsearch. I am trying to build a query that can give me the percent change within the last 30days of each product.
Example...
{
"timestamp": "2019-09-18T02:38:51.417Z",
"productId": 1,
"marketPrice": 5.00,
"lowPrice": 4.30
},
{
"timestamp": "2019-08-30T02:38:51.417Z", (THIS SHOULD BE IGNORED)**
"productId": 1,
"marketPrice": 100.00,
"lowPrice": 200.15
},
{
"timestamp": "2019-08-18T02:38:51.417Z",
"productId": 1,
"marketPrice": 10.00,
"lowPrice": 2.15
},
{
"timestamp": "2019-09-18T02:38:51.417Z",
"productId": 2,
"marketPrice": 2.00,
"lowPrice": 1.00
},
{
"timestamp": "2019-08-18T02:38:51.417Z",
"productId": 2,
"marketPrice": 3.00,
"lowPrice": 2.00
}
Result Example
{
"productId": 1,
"marketPriceChangeWithin30Days": 200%,
"lowPriceChangeWithin30Days": 200%
},
{
"productId": 2,
"marketPriceChangeWithin30Days": 150%,
"lowPriceChangeWithin30Days": 200%
}
** The (THIS SHOULD BE IGNORED) is because the only two values that should be compared are the latest timestamp and the closest timestamp that is around 30days in the past.
The query would then return the product id 1 and 2 with the percent changed in the result as shown in the example response.
You can leverage the derivative pipeline aggregation to achieve exactly what you expect:
POST /sales/_search
{
"size": 0,
"aggs": {
"sales_per_month": {
"date_histogram": {
"field": "timestamp",
"interval": "month"
},
"aggs": {
"marketPrice": {
"sum": {
"field": "marketPrice"
}
},
"lowPrice": {
"sum": {
"field": "lowPrice"
}
},
"marketPriceDiff": {
"derivative": {
"buckets_path": "marketPrice"
}
},
"lowPriceDiff": {
"derivative": {
"buckets_path": "lowPrice"
}
}
}
}
}
}
UPDATE:
Given your updated requirements, I'd suggest using the serial_diff pipeline aggregation with a lag of 30 days:
POST /sales/_search
{
"size": 0,
"query": {
"range": {
"timestamp": {
"gte": "now-31d",
"lte": "now"
}
}
},
"aggs": {
"products": {
"terms": {
"field": "productId",
"size": 10
},
"aggs": {
"histo": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"min_doc_count": 0
},
"aggs": {
"marketPrice": {
"avg": {
"field": "marketPrice"
}
},
"lowPrice": {
"avg": {
"field": "lowPrice"
}
},
"30d_diff_marketPrice": {
"serial_diff": {
"buckets_path": "marketPrice",
"lag": 30
}
},
"30d_diff_lowPrice": {
"serial_diff": {
"buckets_path": "lowPrice",
"lag": 30
}
}
}
}
}
}
}
}

Unable to create nested date aggregation query

I am trying to create an ElasticSearch aggregation query which can generate sum or average of value in all my ingested documents.
The documents are of the format -
{
"weather":"cold",
"date_1":"2017/07/05",
"feedback":[
{
"date_2":"2017/08/07",
"value":28,
"comment":"not cold"
},{
"date_2":"2017/08/09",
"value":48,
"comment":"a bit chilly"
},{
"date_2":"2017/09/07",
"value":18,
"comment":"very cold"
}, ...
]
}
I am able to create a sum aggregation of all "feedback.value" using "date_1" by using the following request -
GET _search
{
"query": {
"query_string": {
"query": "cold"
}
},
"size": 0,
"aggs": {
"temperature": {
"date_histogram":{
"field" : "date_1",
"interval" : "month"
},
"aggs":{
"temperature_agg":{
"terms": {
"field": "feedback.value"
}
}
}
}
}
}
However, I need to generate the same query across all documents aggregate based on "feedback.date_2". I am not sure if ElasticSearch can resolve such aggregation or how to approach it. Any guidance would be helpful
[EDIT]
Mapping file( I only define the nested items, ES identifes other fields on its own)
{
"mappings": {
"catalog_item": {
"properties": {
"feedback":{
"type":"nested",
"properties":{
"date_2":{
"type": "date",
"format":"YYYY-MM-DD"
},
"value": {
"type": "float"
},
"comment": {
"type": "text"
}
}
}
}
}
}
}
You would need to make use of nested documents and sum aggregation.
Here's a working example:
Sample Mapping:
PUT test
{
"mappings": {
"doc": {
"properties": {
"feedback": {
"type": "nested"
}
}
}
}
}
Add Sample document:
PUT test/doc/1
{
"date_1": "2017/08/07",
"feedback": [
{
"date_2": "2017/08/07",
"value": 28,
"comment": "not cold"
},
{
"date_2": "2017/08/09",
"value": 48,
"comment": "a bit chilly"
},
{
"date_2": "2017/09/07",
"value": 18,
"comment": "very cold"
}
]
}
Calculate both the sum and average based on date_2.
GET test/_search
{
"size": 0,
"aggs": {
"temperature_aggregation": {
"nested": {
"path": "feedback"
},
"aggs": {
"temperature": {
"date_histogram": {
"field": "feedback.date_2",
"interval": "month"
},
"aggs": {
"sum": {
"sum": {
"field": "feedback.value"
}
},
"avg": {
"avg": {
"field": "feedback.value"
}
}
}
}
}
}
}
}

Resources