How to select the last bucket in a date_histogram selector in Elasticsearch

I have a date_histogram and I can use max_bucket to get the bucket with the greatest value, but I want to select the last bucket (i.e. the bucket with the highest timestamp).
Using max_bucket to get the greatest value works OK, but I don't know what to put in the buckets_path to get the last bucket.
My mapping:
{
"ee-2020-02-28" : {
"mappings" : {
"dynamic" : "strict",
"properties" : {
"date" : {
"type" : "date"
},
"frequency" : {
"type" : "long"
},
"keyword" : {
"type" : "keyword"
},
"text" : {
"type" : "text"
}
}
}
}
}
My working query, which returns the bucket for the day with the highest frequency (it's named last_day because this is a WIP query toward my goal):
{
"query": {
"range": {
"date": { /* Start away from the begining of data, so the rolling avg is full */
"gte": "2019-02-18"/*,
"lte": "2020-12-14"*/
}
}
},
"aggs": {
"palabrejas": {
"terms": {
"field": "keyword",
"size": 100
},
"aggs": {
"nnndiario": {
"date_histogram": {
"field": "date",
"calendar_interval": "day"
},
"aggs": {
"dailyfreq": {
"sum": {
"field": "frequency"
}
}
}
},
"ventanuco": {
"avg_bucket": {
"buckets_path": "nnndiario>dailyfreq",
"gap_policy": "insert_zeros"
}
},
"last_day": {
"max_bucket": {
"buckets_path": "nnndiario>dailyfreq"
}
}
}
}
}
}
Its output (notice I replaced long parts with [...]):
{
"aggregations" : {
"palabrejas" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "rama0",
"doc_count" : 20400,
"nnndiario" : {
"buckets" : [
{
"key_as_string" : "2020-01-01T00:00:00.000Z",
"key" : 1577836800000,
"doc_count" : 600,
"dailyfreq" : {
"value" : 3000.0
}
},
{
"key_as_string" : "2020-01-02T00:00:00.000Z",
"key" : 1577923200000,
"doc_count" : 600,
"dailyfreq" : {
"value" : 3000.0
}
},
{
"key_as_string" : "2020-01-03T00:00:00.000Z",
"key" : 1578009600000,
"doc_count" : 600,
"dailyfreq" : {
"value" : 3000.0
}
},
[...]
{
"key_as_string" : "2020-01-31T00:00:00.000Z",
"key" : 1580428800000,
"doc_count" : 600,
"dailyfreq" : {
"value" : 3000.0
}
}
]
},
"ventanuco" : {
"value" : 3290.3225806451615
},
"last_day" : {
"value" : 12000.0,
"keys" : [
"2020-01-13T00:00:00.000Z"
]
}
},
{
"key" : "rama1",
"doc_count" : 20400,
"nnndiario" : {
"buckets" : [
{
"key_as_string" : "2020-01-01T00:00:00.000Z",
"key" : 1577836800000,
"doc_count" : 600,
"dailyfreq" : {
"value" : 3000.0
}
},
[...]
]
},
"ventanuco" : {
"value" : 3290.3225806451615
},
"last_day" : {
"value" : 12000.0,
"keys" : [
"2020-01-13T00:00:00.000Z"
]
}
},
[...]
}
]
}
}
}
I don't know what to put in last_day's buckets_path to obtain the last bucket.

You might consider using a terms aggregation instead of a date_histogram aggregation:
"max_date_bucket_agg": {
"terms": {
"field": "date",
"size": 1,
"order": {"_key": "desc"}
}
}
An issue might be the granularity of your data; you may consider storing the date value at the expected granularity (e.g. day) in a separate field and using that field in the terms aggregation. A sketch of how this could look embedded in your query follows.
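For instance (a sketch reusing the field names from the question, not tested against the original index), replacing the max_bucket pipeline with a terms aggregation ordered by key descending:
{
"aggs": {
"palabrejas": {
"terms": {
"field": "keyword",
"size": 100
},
"aggs": {
"last_day": {
"terms": {
"field": "date",
"size": 1,
"order": { "_key": "desc" }
},
"aggs": {
"dailyfreq": {
"sum": { "field": "frequency" }
}
}
}
}
}
}
}
Each palabrejas bucket then contains a single last_day bucket keyed by the latest timestamp, with its summed frequency.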

Related

Elasticsearch Aggregation not giving desirable output

I have an object which contains the mils (millilitres) of drugs given to a patient.
More than one drug can be administered to a patient.
I am trying to sum the total individual mils of drugs administered to a patient within a specified time.
Here is a sample of my Object.
{
"_uid" : "953a4af9901847c3b206dac7cee5b298",
"_fullName" : "Test Patient",
"_created": "2021-12-18 22:48:45",
"_treatment" : {
"_created" : "2021-12-18 22:48:45",
"_drugs" : [
{
"_name" : "Another Tablet",
"_uid" : "5a09f6a9c415465a84a8661f35ac621d",
"_mils" : "500"
},
{
"_name" : "Test Drug",
"_uid" : "36c7fcf048c743078ca4c80d187d86c9",
"_mils" : "300"
}
]
}
}
In Kibana, I did the following:
{
"query": {
"bool": {
"filter": {
"range": {
"_created": {
"gte": "2021-01-01 00:00:00",
"lte": "2021-12-31 00:00:00"
}
}
}
}
},
"size": 0,
"aggs" : {
"men" : {
"terms": {
"field": "_treatment._drugs._name.keyword"
},
"aggs": {
"milsUsed": { "sum": { "field": "_treatment._drugs._mils" } }
}
}
}
}
Presently Kibana is adding all the mils together instead of separating them per drug. Below is the response from Kibana.
"aggregations" : {
"men" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Another Tablet",
"doc_count" : 2,
"milsUsed" : {
"value" : 1100.0
}
},
{
"key" : "Test Drug",
"doc_count" : 2,
"milsUsed" : {
"value" : 1100.0
}
}
]
}
}
Expected response I am looking to get:
"aggregations" : {
"men" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Another Tablet",
"doc_count" : 1,
"milsUsed" : {
"value" : 500.0
}
},
{
"key" : "Test Drug",
"doc_count" : 1,
"milsUsed" : {
"value" : 300.0
}
}
]
}
}
Index mapping
{
"patients" : {
"mappings" : {
"properties" : {
"_fullName" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"_treatment" : {
"properties": {
"_drugs": {
"properties": {
"_mils" : {
"type" : "long"
},
"_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"_uid" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
}
}
TLDR;
Have you heard about nested fields in Elasticsearch?
Internally, Elasticsearch flattens nested objects in your documents.
So if you have
{
"group" : "fans",
"user" : [
{
"first" : "John",
"last" : "Smith"
},
{
"first" : "Alice",
"last" : "White"
}
]
}
The internal representation of the JSON document in the index will be:
{
"group" : "fans",
"user.first" : [ "alice", "john" ],
"user.last" : [ "smith", "white" ]
}
In your case, when you perform the aggregation, the same thing happens: because of the flattening operation, you lose the "relationship" between _drugs._name and _drugs._mils.
Below is a small example that solves your use case.
Example
Set Up
PUT /so_agg_sum_drugs/
{
"mappings": {
"properties": {
"_fullName": {
"type": "keyword"
},
"_treatment": {
"properties": {
"_drugs": {
"type": "nested", <- nested field type !!
"properties": {
"_mils": {
"type": "long"
},
"_name": {
"type": "keyword"
},
"_uid": {
"type": "keyword"
}
}
}
}
}
}
}
}
POST /so_agg_sum_drugs/_doc
{
"_fullName" : "Test Patient",
"_treatment" : {
"_drugs" : [
{
"_name" : "Another Tablet",
"_uid" : "5a09f6a9c415465a84a8661f35ac621d",
"_mils" : "500"
},
{
"_name" : "Test Drug",
"_uid" : "36c7fcf048c743078ca4c80d187d86c9",
"_mils" : "300"
}
]
}
}
POST /so_agg_sum_drugs/_doc
{
"_fullName" : "Test Patient 2",
"_treatment" : {
"_drugs" : [
{
"_name" : "Another Tablet",
"_uid" : "5a09f6a9c415465a84a8661f35ac621d",
"_mils" : "500"
},
{
"_name" : "Test Drug",
"_uid" : "36c7fcf048c743078ca4c80d187d86c9",
"_mils" : "400"
},
{
"_name" : "Test Drug",
"_uid" : "36c7fcf048c743078ca4c80d187d86c9",
"_mils" : "300"
}
]
}
}
Solution
Your aggregation was mostly right, except for the nested field type. You can find documentation about aggregations on nested fields in the Elasticsearch reference.
GET /so_agg_sum_drugs/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"patients": {
"terms": {
"field": "_fullName"
},
"aggs": {
"drugs": {
"nested": {
"path": "_treatment._drugs". <- wrap you agg on the drugs objects in a nested type agg.
},
"aggs": {
"per_drug": {
"terms": {
"field": "_treatment._drugs._name"
},
"aggs": {
"quantity": {
"sum": {
"field": "_treatment._drugs._mils"
}
}
}
}
}
}
}
}
}
}
Response:
{
"took" : 350,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"patients" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Test Patient",
"doc_count" : 1,
"drugs" : {
"doc_count" : 2,
"per_drug" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Another Tablet",
"doc_count" : 1,
"quantity" : {
"value" : 500.0
}
},
{
"key" : "Test Drug",
"doc_count" : 1,
"quantity" : {
"value" : 300.0
}
}
]
}
}
},
{
"key" : "Test Patient 2",
"doc_count" : 1,
"drugs" : {
"doc_count" : 3,
"per_drug" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Test Drug",
"doc_count" : 2,
"quantity" : {
"value" : 700.0
}
},
{
"key" : "Another Tablet",
"doc_count" : 1,
"quantity" : {
"value" : 500.0
}
}
]
}
}
}
]
}
}
}

Elastic: how to use the aggregation buckets to update the documents

I'm new to Elastic/Painless and need some assistance.
I have this query:
GET index1/_search/
{
"size": 0,
"aggs": {
"attrs_root": {
"nested": {
"path": "business_index_jd_list_agg"
},
"aggs": {
"attrs": {
"terms": {
"field": "jdl_id"
},
"aggs": {
"sumOfQuantity" : {
"sum" : {
"field" : "value"
}
}
}
}
}
}
}
}
and these results from that query:
[...]
aggregations" : {
"attrs_root" : {
"doc_count" : 5,
"attrs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : -666,
"doc_count" : 1,
"sumOfQuantity" : {
"value" : 55.0
}
},
{
"key" : 93,
"doc_count" : 1,
"sumOfQuantity" : {
"value" : 25.0
},
[...]
]
}
}
}
}
How can I use that query and navigate through those results with a Painless script to update each document in the index with that aggregated info? Something like this:
{
"jdl_id" : -666,
"value" : 55.0
},
{
"jdl_id" : 93,
"value" : 25.0
},
[...]
Thank you.
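No answer was posted for this one. A common pattern (a hedged sketch, not from the original thread; it assumes jdl_id and value are addressable at the document root) is to run the aggregation client-side and then issue one _update_by_query per bucket:
POST index1/_update_by_query
{
"query": {
"term": { "jdl_id": -666 }
},
"script": {
"lang": "painless",
"source": "ctx._source.value = params.sum",
"params": { "sum": 55.0 }
}
}
Since the aggregation above runs on the nested path business_index_jd_list_agg, the term query would likely need to be wrapped in a nested query, and the script would have to walk the nested list instead of setting a root field.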

How to return the hit terms in ES?

I am trying to return only the terms that were actually hit instead of the whole document, but I don't know how to achieve the desired effect.
"es_episode" : {
"aliases" : { },
"mappings" : {
"properties" : {
"endTime" : {
"type" : "long"
},
"episodeId" : {
"type" : "long"
},
"startTime" : {
"type" : "long"
},
"studentIds" : {
"type" : "long"
}
}
}
}
This is an example:
{
"episodeId":124,
"startTime":10,
"endTime":20,
"studentIds":[200,300]
}
My query:
GET /es_episode/_search
{
"_source": ["studentIds"],
"query": {
"terms": {
"studentIds": [300,400]
}
}
}
The result is:
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "es_episode",
"_type" : "episode",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"studentIds" : [
200,
300
]
}
}
]
}
But in fact I only want to know which terms hit. For example, the result I want should be studentIds=[300] instead of the full studentIds=[200,300] of the returned document. It seems that some additional operations are required, but I don't know how.
I tried to achieve my goal with the following query:
GET /es_episode/_search
{
"_source": ["studentIds"],
"query": {
"terms": {
"studentIds": [300,400]
}
},
"aggs": {
"student_id": {
"terms": {
"field": "studentIds",
"size": 10
},
"aggs": {
"id": {
"terms": {
"field": "episodeId"
}
},
"id_select":{
"bucket_selector": {
"buckets_path": {
"key" : "_key"
},
"script": "params.key==300 || params.key==400"
}
}
}
}
}
}
The result for this is:
"aggregations" : {
"student_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 300,
"doc_count" : 1,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 124,
"doc_count" : 1
}
]
}
}
]
}
}
It seems that I successfully filtered out the terms I don't want, but this doesn't look pretty, and I need to repeat my parameters in the script.
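A hedged alternative, not from the original thread: the terms aggregation has a built-in include parameter that accepts an array of exact values, which avoids repeating the values in a bucket_selector script:
GET /es_episode/_search
{
"_source": ["studentIds"],
"query": {
"terms": {
"studentIds": [300, 400]
}
},
"aggs": {
"student_id": {
"terms": {
"field": "studentIds",
"size": 10,
"include": [300, 400]
}
}
}
}
The student_id buckets are then exactly the matched terms, though the values still appear twice (once in the query, once in the aggregation).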

Elasticsearch get average

I'm trying to compute averages over aggregated data in Elasticsearch. This is the structure of my data:
document 1
{
"groupId":"TEST_01",
"lag":10,
"detectionDate":"2021-02-26T21:42:30.010Z",
"tipo":"uno",
"topics":[
{
"name":"topic_01",
"valore":2
},
{
"name":"topic_02",
"valore":4
}
]
}
document 2
{
"groupId":"TEST_01",
"lag":10,
"detectionDate":"2021-02-26T21:42:30.010Z",
"tipo":"uno",
"topics":[
{
"name":"topic_01",
"valore":4
},
{
"name":"topic_02",
"valore":8
}
]
}
I have to create an aggregation by groupId and by topic name, and on this aggregation calculate the average of the valore field. But with the source code below, the computed average is wrong.
With the above data of documents one and two, the expected result should be:
groupId | topicName | average
TEST_01 | topic_01 | 3
TEST_01 | topic_02 | 6
My aggregation code:
TermsAggregationBuilder aggregation = AggregationBuilders
.terms("groupId")
.field("groupId.keyword")
.subAggregation(AggregationBuilders
.terms("topicName")
.field("topics.name.keyword").subAggregation(AggregationBuilders
.avg("avg").field("topics.valore")));
First of all, make sure your topics field is of type "nested": if it is "object", the topic names and valore values will be flattened. This means you will end up with a set of valore values and topic names without any relation between them.
Mappings
{
"test_ynsanity" : {
"mappings" : {
"properties" : {
"detectionDate" : {
"type" : "date"
},
"groupId" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"lag" : {
"type" : "long"
},
"tipo" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"topics" : {
"type" : "nested",
"properties" : {
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"valore" : {
"type" : "long"
}
}
}
}
}
}
}
Ingesting data
POST test_ynsanity/_doc
{
"groupId":"TEST_01",
"lag":10,
"detectionDate":"2021-02-26T21:42:30.010Z",
"tipo":"uno",
"topics":[
{
"name":"topic_01",
"valore":2
},
{
"name":"topic_02",
"valore":4
}
]
}
POST test_ynsanity/_doc
{
"groupId":"TEST_01",
"lag":10,
"detectionDate":"2021-02-26T21:42:30.010Z",
"tipo":"uno",
"topics":[
{
"name":"topic_01",
"valore":4
},
{
"name":"topic_02",
"valore":8
}
]
}
Query
POST test_ynsanity/_search
{
"size": 0,
"aggs": {
"groups": {
"terms": {
"field": "groupId.keyword",
"size": 10
},
"aggs": {
"topics": {
"nested": {
"path": "topics"
},
"aggs": {
"topic_names": {
"terms": {
"field": "topics.name.keyword"
},
"aggs": {
"topic_avg": {
"avg": {
"field": "topics.valore"
}
}
}
}
}
}
}
}
}
}
Response
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"groups" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "TEST_01",
"doc_count" : 2,
"topics" : {
"doc_count" : 4,
"topic_names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "topic_01",
"doc_count" : 2,
"NAME" : {
"value" : 3.0
}
},
{
"key" : "topic_02",
"doc_count" : 2,
"NAME" : {
"value" : 6.0
}
}
]
}
}
}
]
}
}
}
I have no access to the Java DSL right now, but the query should look something like this:
TermsAggregationBuilder aggregation = AggregationBuilders
.terms("groups")
.field("groupId.keyword")
.subAggregation(AggregationBuilders
// nested() makes the sub-aggregations run in the "topics" nested context
.nested("topics", "topics")
.subAggregation(AggregationBuilders
.terms("topic_names")
.field("topics.name.keyword")
.subAggregation(AggregationBuilders
.avg("topic_avg")
.field("topics.valore"))));

ElasticSearch: How to make an aggregation pipeline?

Imagine the following use case:
We work at Stark Airlines and our marketing team wants to segment our passengers in order to give them discounts or gift cards. They decide that they want two sets of passengers:
Passengers that fly at least 3 times per week
Passengers who have flown at least once but have not flown for two weeks
With this they can make different marketing campaigns for our passengers!
So, in elastic search we have a trip index that represents a ticket bought by a passenger:
{
"_index" : "trip",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"total_amount" : 300,
"trip_date" : "2020/03/24 13:30:00",
"status" : "completed",
"passenger" : {
"id" : 11,
"name" : "Thiago nunes"
}
}
}
The trip index contains a status field that may have other values, like pending, open, or canceled.
This means that we can only take into account trips that have the completed status (meaning the passenger did travel).
So, with all this in mind... how would I get those two sets of passengers with Elasticsearch?
I have been trying for a while but with no success.
What I have done until now:
I have built a query that gets all valid trips (trips with status completed):
GET /trip/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"status": {
"value": "completed"
}
}
}
]
}
},
"aggs": {
"status_viagem": {
"terms": {
"field": "status.keyword"
}
}
}
}
This query returns the following:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 200,
"relation" : "eq"
},
"max_score" : 0.18232156,
"hits" : [...]
},
"aggregations" : {
"status_viagem" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "completed",
"doc_count" : 200
}
]
}
}
}
But I am stuck and can't figure out the next step. I know that the next thing to do should be to create buckets of passengers and then filter them into two buckets representing our desired data sets. But I don't know how.
Can someone help?
PS:
I don't exactly need this to be one single query; just a hint about how to build a query like this would be very helpful.
THE OUTPUT SHOULD BE AN ARRAY of passenger ids.
Note: I have shortened the trip index for the sake of simplicity.
As per my understanding of your issue:
I have used a date_histogram with a week interval to bucket passengers by week. Only those passengers that have three documents in a week are kept, which gives you all passengers that have traveled thrice in a week.
In another aggregation, I have used a terms aggregation to get passengers and their last travel date. Using a bucket selector, I have kept only passengers whose last travel is not beyond a certain cut-off date.
Mapping
{
"index87" : {
"mappings" : {
"properties" : {
"passengerid" : {
"type" : "long"
},
"passengername" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"status" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"total_amount" : {
"type" : "long"
},
"trip_date" : {
"type" : "date"
}
}
}
}
}
Query
{
"query": {
"bool": {
"must": [
{
"term": {
"status": {
"value": "completed"
}
}
}
]
}
},
"aggs": {
"travel_thrice_week": {
"date_histogram": {
"field": "trip_date",
"interval": "week"
},
"aggs": {
"passenger": {
"terms": {
"field": "passengername.keyword",
"min_doc_count": 3,
"size": 10
}
},
"select_bucket_with_user": {-->to keep weeks which have a pasenger with thrice
--> a day travel
"bucket_selector": {
"buckets_path": {
"passenger": "passenger._bucket_count"
},
"script": "if(params['passenger']>=1) {return true;} else{ return false;} "
}
}
}
},
"not_flown_last_two_week": {
"terms": {
"field": "passengername.keyword",
"size": 10
},
"aggs": {
"last_travel": {
"max": {
"field": "trip_date" --> most recent travel
}
},
"last_travel_before_two_week": {
"bucket_selector": {
"buckets_path": {
"traveldate": "last_travel"
},
"script":{
"source": "if(params['traveldate']< params['date_epoch']) return true; else return false;",
"params": {
"date_epoch":1586408336000 --> unix epoc of cutt off date
}
}
}
}
}
}
}
}
Result:
"aggregations" : {
"not_flown_last_two_week" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Thiago nunes",
"doc_count" : 3,
"last_travel" : {
"value" : 1.5851808E12,
"value_as_string" : "2020-03-26T00:00:00.000Z"
}
},
{
"key" : "john doe",
"doc_count" : 1,
"last_travel" : {
"value" : 1.5799968E12,
"value_as_string" : "2020-01-26T00:00:00.000Z"
}
}
]
},
"travel_thrice_week" : {
"buckets" : [
{
"key_as_string" : "2020-03-23T00:00:00.000Z",
"key" : 1584921600000,
"doc_count" : 3,
"passenger" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Thiago nunes",
"doc_count" : 3
}
]
}
}
]
}
}
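The asker wanted an array of passenger ids, while the buckets above are keyed by passengername. A sketch (my assumption, not part of the original answer) that keys the second aggregation by the passengerid field from the mapping instead, so that each bucket key is directly an id:
"not_flown_last_two_week": {
"terms": {
"field": "passengerid",
"size": 10
},
"aggs": {
"last_travel": {
"max": { "field": "trip_date" }
},
"last_travel_before_two_week": {
"bucket_selector": {
"buckets_path": { "traveldate": "last_travel" },
"script": {
"source": "params['traveldate'] < params['date_epoch']",
"params": { "date_epoch": 1586408336000 }
}
}
}
}
}
Collecting the key of each returned bucket client-side then yields the requested array of ids.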
