is there a way of showing documents after a sum aggregation? - elasticsearch

I've been trying lately to retrieve information about sales on Kibana DSL.
I've been told to show vendors information PLUS their monthly sales.
(I'll use the "Kibana_sample_data_ecommerce" for this example)
I already did this aggregation in order to group all clients by their 'customer_id':
#Aggregations (group by)
GET kibana_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"by user_id": {
"terms": {
"field": "customer_id"
},
"aggs": {
"add_field_to_bucket": {
"top_hits": {"size": 1, "_source": {"includes": ["customer_full_name"]}}
}
}
}
}
}
in which i've included customer_full_name in the result:
"aggregations" : {
"by user_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 2970,
"buckets" : [
{
"key" : "27",
"doc_count" : 348,
"add_field_to_bucket" : {
"hits" : {
"total" : 348,
"max_score" : 1.0,
"hits" : [
{
"_index" : "kibana_sample_data_ecommerce",
"_type" : "_doc",
"_id" : "fhwUR3sBpfDKGuVlpu8r",
"_score" : 1.0,
"_source" : {
"customer_full_name" : "Elyssa Underwood"
}
}
]
}
}
}
So, in this result i know that 'Elyssa Underwood' with 'customer_id' '27' has 348 hits (or documents related).
Also i require to know the total spent by 'Elyssa' on those products, using the field 'products.taxful_price'.
The thing is that i cannot perform a subaggregation on top_hits (as far as i know); Also I've tried to do a sum_aggregation, but it ends on the same result (i got my sum, but i cannot access top_hits sub aggregation at that point).
At the end of the day i want to have a result like this:
"hits" : [
{
"_index" : "kibana_sample_data_ecommerce",
"_type" : "_doc",
"_id" : "fhwUR3sBpfDKGuVlpu8r",
"_score" : 1.0,
"_source" : {
"customer_full_name" : "Elyssa Underwood",
"total_spent": 1234.5678
}
}
]
Is there something I can do to achieve it?
PS: I'm using ElasticSearch 5.x and also I have access to NEST client, if there's a solution I can reach through it.
Thanks In Advance.

I have used below as sample data.
Data:
{
"customer_id":2,
"client-name":"b",
"purchase": 2001
}
Query:
GET index/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "customer_id",
"size": 10
},
"aggs": {
"total_sales": {
"sum": {
"field": "purchase"
}
},
"documents":{
"top_hits": {
"size": 10
}
}
}
}
}
}
Result:
{
"key" : 2,
"doc_count" : 1,
"documents" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index1",
"_type" : "_doc",
"_id" : "0HPzcHsBjw4ziwrzGzrq",
"_score" : 1.0,
"_source" : {
"customer_id" : 2,
"client-name" : "b",
"purchase" : 2001
}
}
]
}
},
"total_sales" : {
"value" : 2001.0
}
}

Related

Elasticsearch - How to order buckets using keyword field

I encountered a problem, as I need to sort my buckets using a keyword field for this I have tried two approaches.
I have been trying to sort the result of my aggregation (buckets) from the top hit aggregation. My top_hits contains one element which is the username
"user_data": {
"top_hits": {
"_source": {
"includes": ["username"]
},
"size": 1
}
},
To sort the buckets i'm trying with a bucket sort, the bucket sort is something like this
"sorting": {
"bucket_sort": {
"sort": [
{
"user_data>username": { ----> This is the error
"order": "desc"
}
}
],
"from": 0,
"size": 25
}
}
But I received a syntax error basically the bucket path is wrong.
Another approach that I used to accomplish the sort was to add another aggregation over the username to obtain the max. Something like this
"to_sort" : {
"max": {
"field": "username"
}
}
And use the following bucket_sort
"sorting": {
"bucket_sort": {
"sort": [
{
"to_sort": {
"order": "desc"
}
}
],
"from": 0,
"size": 25
}
}
But basically I can't use a keyword field with the max aggregation.
Is there a way to sort my buckets using the username, the username is a keyword field?
The parent of my aggregation is
"aggs": {
"CountryId": {
"terms": {
"field": "countryId",
"size": 10000
}
The value of the username is different between each bucket
The result of the buckets is something like this
"buckets" : [
{
"key" : "11111",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "cccccc"
}
}
]
}
}
},
{
"key" : "33333",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "bbbbb"
}
}
]
}
}
},
{
"key" : "22222",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "aaaaa"
}
}
]
}
}
}
]
And the following buckets result is I would like to have
"buckets" : [
{
"key" : "22222",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "aaaaa"
}
}
]
}
}
},
{
"key" : "33333",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "bbbbb"
}
}
]
}
}
},
{
"key" : "11111",
"doc_count" : 17,
"user_data" : {
"hits" : {
"total" : 10,
"max_score" : 11,
"hits" : [
{
"_index" : "index_name",
"_type" : "index_name",
"_id" : "101010",
"_score" : 0.0,
"_source" : {
"username" : "ccccc"
}
}
]
}
}
}
]
As you can see, the buckets were ordered by username.
I had a problem similar to this and didn't find any answer on the internet. So I tried to build my own, which took me almost a week :/. It won't always work because of the limit on the ordered hashcode generation for strings, so you will have to play with your own charset and the length of the first chars on the string you deem enough to sort (6 for me); do some tests because you only want to use the positive interval of the long type or it will not work at all (due to my charset length I could go up to 13). Basically, I built my metric for the bucket_sort using a scripted_metric based on finding the top_hits manually from here and adapted it to compute an ordered hashcode of my wanted keyword.
Below is my query where I sort the user's last session top hits by sso.name keyword, it should be more or less easy for you to adapt it to your problem.
{
"size": 0,
"timeout": "60s",
"query": {
"bool": {
"must": [
{
"exists": {
"field": "user_id"
}
}
]
}
},
"aggregations": {
"by_user": {
"terms": {
"field": "user_id",
"size": 10000,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"my_top_hits_sso_ordered_hash": {
"scripted_metric": {
"init_script": "state.timestamp_latest = 0L; state.last_sso_ordered_hash = 0L",
"map_script": """
def current_date = doc['login_timestamp'].getValue().toInstant().toEpochMilli();
if (current_date > state.timestamp_latest) {
state.timestamp_latest = current_date;
state.last_sso_ordered_hash = 0L;
if(doc['sso.name'].size()>0) {
String charset = "abcdefghijklmnopqrstuvwxyz";
String ssoName = doc['sso.name'].value;
int length = charset.length();
for(int i = 0; i<Math.min(ssoName.length(), 6); i++) {
state.last_sso_ordered_hash = state.last_sso_ordered_hash*length + charset.indexOf(String.valueOf(ssoName.charAt(i))) + 1;
}
}
}
""",
"combine_script":"return state",
"reduce_script": """
def last_sso_ordered_hash = '';
def timestamp_latest = 0L;
for (s in states) {
if (s.timestamp_latest > (timestamp_latest)) {
timestamp_latest = s.timestamp_latest; last_sso_ordered_hash = s.last_sso_ordered_hash;
}
}
return last_sso_ordered_hash;
"""
}
},
"user_last_session": {
"top_hits": {
"from": 0,
"size": 1,
"sort": [
{
"login_timestamp": {
"order": "desc"
}
}
]
}
},
"pagination": {
"bucket_sort": {
"sort": [
{
"my_top_hits_sso_ordered_hash.value": {
"order": "desc"
}
}
],
"from": 0,
"size": 100
}
}
}
}
}
}

Search documents with highest fields

I'm trying to get all the documents with highest field value (+ conditional term filter)
Given the Employees mapping
Name Department Salary
----------------------------
Tomcat Dev 100
Bobcat QA 90
Beast QA 100
Tom Dev 100
Bob Dev 90
In SQL it would look like
select * from Employees where Salary = select max(salary) from Employees
expected output
Name Department Salary
----------------------------
Tomcat Dev 100
Beast QA 100
Tom Dev 100
and
select * from Employees where Salary = (select max(salary) from Employees where Department ='Dev' )
expected output
Name Department Salary
----------------------------
Tomcat Dev 100
Tom Dev 100
Is it possible with Elasticsearch ?
The below should help:
Looking at your data, note that I've come up with the below mapping:
Mapping:
PUT my-salary-index
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"department":{
"type": "keyword"
},
"salary":{
"type": "float"
}
}
}
}
Sample Documents:
POST my-salary-index/_doc/1
{
"name": "Tomcat",
"department": "Dev",
"salary": 100
}
POST my-salary-index/_doc/2
{
"name": "Bobcast",
"department": "QA",
"salary": 90
}
POST my-salary-index/_doc/3
{
"name": "Beast",
"department": "QA",
"salary": 100
}
POST my-salary-index/_doc/4
{
"name": "Tom",
"department": "Dev",
"salary": 100
}
POST my-salary-index/_doc/5
{
"name": "Bob",
"department": "Dev",
"salary": 90
}
Solutions:
Scenario 1: Return all employees with max salary
POST my-salary-index/_search
{
"size": 0,
"aggs": {
"my_employees_salary":{
"terms": {
"field": "salary",
"size": 1, <--- Note this
"order": {
"_key": "desc"
}
},
"aggs": {
"my_employees": {
"top_hits": { <--- Note this. Top hits aggregation
"size": 10
}
}
}
}
}
}
Note that I've made use of Terms Aggregation with Top Hits aggregation chained to it. I'd suggest to go through the links to understand both the aggregations.
So basically you just need to retrieve the first element in the Terms Aggregation, which is why I've mentioned the size: 1. Also note the order, in case your requirement is to retrieve the lowest instead.
Scenario 1 Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"my_employees" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 2,
"buckets" : [
{
"key" : 100.0,
"doc_count" : 3,
"employees" : {
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "my-salary-index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"name" : "Tomcat",
"department" : "Dev",
"salary" : 100
}
},
{
"_index" : "my-salary-index",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"name" : "Beast",
"department" : "QA",
"salary" : 100
}
},
{
"_index" : "my-salary-index",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"name" : "Tom",
"department" : "Dev",
"salary" : 100
}
}
]
}
}
}
]
}
}
}
Scenario 2: Return all employee with max salary from particular department
POST my-salary-index/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"term": {
"department": "Dev"
}
}
]
}
},
"aggs": {
"my_employees_salary":{
"terms": {
"field": "salary",
"size": 1,
"order": {
"_key": "desc"
}
},
"aggs": {
"my_employees": {
"top_hits": {
"size": 10
}
}
}
}
}
}
For this, there are many ways to do this, but the idea is that you basically filter the documents before you apply aggregation on top of it. That way it would be more efficient.
Note that I've just added a bool condition to the aggregation query mentioned in the solution for Scenario 1.
Scenario 2 Response
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"my_employees_salary" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 1,
"buckets" : [
{
"key" : 100.0,
"doc_count" : 2,
"my_employees" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.53899646,
"hits" : [
{
"_index" : "my-salary-index",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.53899646,
"_source" : {
"name" : "Tomcat",
"department" : "Dev",
"salary" : 100
}
},
{
"_index" : "my-salary-index",
"_type" : "_doc",
"_id" : "4",
"_score" : 0.53899646,
"_source" : {
"name" : "Tom",
"department" : "Dev",
"salary" : 100
}
}
]
}
}
}
]
}
}
}
You can also think of making use of SQL Access if you have complete xpack or rather licensed version of x-pack.
Hope this helps.

How to get last and first document ids by given criteria

I have some documents indexed on Elasticsearch, looking like these samples:
{"id":"1","isMigrated":true}
{"id":"2","isMigrated":true}
{"id":"3","isMigrated":false}
{"id":"4","isMigrated":false}
how can i get in one query the last migrated id and first not migrated id?
Any ideas?
Filter aggregation and top_hits aggregation can be used to get last migrated and first not migrated
{
"size": 0,
"aggs": {
"migrated": {
"filter": { --> filter where isMigrated:true
"term": {
"isMigrated": true
}
},
"aggs": {
"last_migrated": { --> get first documents sorted on id in descending order
"top_hits": {
"size": 1,
"sort": [{"id.keyword":"desc"}]
}
}
}
},
"not_migrated": {
"filter": {
"term": {
"isMigrated": false
}
},
"aggs": {
"first_not_migrated": {
"top_hits": {
"size": 1,
"sort": [{"id.keyword":"asc"}] -->any keyword field can be used to sort
}
}
}
}
}
}
Result:
"aggregations" : {
"not_migrated" : {
"doc_count" : 2,
"first_not_migrated" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index86",
"_type" : "_doc",
"_id" : "TxuKUHIB8mx5yKbJ_rGH",
"_score" : null,
"_source" : {
"id" : "3",
"isMigrated" : false
},
"sort" : [
"3"
]
}
]
}
}
},
"migrated" : {
"doc_count" : 2,
"last_migrated" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index86",
"_type" : "_doc",
"_id" : "ThuKUHIB8mx5yKbJ87HF",
"_score" : null,
"_source" : {
"id" : "2",
"isMigrated" : true
},
"sort" : [
"2"
]
}
]
}
}
}
}
You can store the timestamp information with each document and query based on the latest timestamp and isMigrated: true condition.
As per the comment, https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html can be used to combine multiple boolean conditions.

How can i extend an elastic search date range histogram aggregation query?

Hi I have an elastic search index named mep-report.
Each document has a status field. The possible values for status fields are "ENROUTE", "SUBMITTED", "DELIVERED", "FAILED" . Below is the sample elastic search index with 6 documents.
{
"took" : 10,
"timed_out" : false,
"_shards" : {
"total" : 13,
"successful" : 13,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1094313,
"max_score" : 1.0,
"hits" : [
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837500",
"_score" : 1.0,
"_source" : {
"status" : "ENROUTE",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837501",
"_score" : 1.0,
"_source" : {
"status" : "ENROUTE",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837502",
"_score" : 1.0,
"_source" : {
"status" : "SUBMITTED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837503",
"_score" : 1.0,
"_source" : {
"status" : "DELIVERED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837504",
"_score" : 1.0,
"_source" : {
"status" : "FAILED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837504",
"_score" : 1.0,
"_source" : {
"status" : "FAILED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
}
}
I would like to find an aggregation histogram distribution, something like the following, to get messages_processed, message_delivered, messages_failed.
messages_processed : 3 ( 2 documents in status ENROUTE + 1 Document with status SUBMITTED )
message_delivered 1 ( 1 document with status DELIVERED )
messages_failed : 2 ( 2 documents with status FAILED )
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 13,
"successful" : 13,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 21300,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"performance_over_time" : {
"buckets" : [
{
"key_as_string" : "2020-02-21",
"key" : 1582243200000,
"doc_count" : 6,
"message_processed": 3,
"message_delivered": 1,
"message_failed": 2
}
]
}
}
}
So the following is my current query and i would like to modify it to get some additional statistics such as message_processed , message_delivered, message_failed. kindly let me know .
{ "size": 0, "query": { "bool": { "must": [ { "range": { "#timestamp": { "from": "2020-02-21T00:00Z", "to": "2020-02-21T23:59:59.999Z", "include_lower": true, "include_upper": true, "format": "yyyy-MM-dd'T'HH:mm:ss.SSSZ ||yyyy-MM-dd'T'HH:mmZ", "boost": 1.0 } } } ], "adjust_pure_negative": true, "boost": 1.0 } }, "aggregations": { "performance_over_time": { "date_histogram": { "field": "#timestamp", "format": "yyyy-MM-dd", "interval": "1d", "offset": 0, "order": { "_key": "asc" }, "keyed": false, "min_doc_count": 0 } } } }
You are almost there with the query, you just need to add Terms Aggregation and looking at your request, I've come up with a Scripted Terms Aggregation.
I've also modified the date histogram aggregation field interval to calendar_interval so that you get the values as per the calendar date.
Query Request:
POST <your_index_name>/_search
{
"size": 0,
"query":{
"bool":{
"must":[
{
"range":{
"#timestamp":{
"from":"2019-09-10",
"to":"2019-09-12",
"include_lower":true,
"include_upper":true,
"boost":1.0
}
}
}
],
"adjust_pure_negative":true,
"boost":1.0
}
},
"aggs":{
"message_processed":{
"date_histogram": {
"field": "#timestamp",
"calendar_interval": "1d" <----- Note this
},
"aggs": {
"my_messages": {
"terms": {
"script": { <----- Core Logic of Terms Agg
"source": """
if(doc['status'].value=="ENROUTE" || doc['status'].value == "SUBMITTED"){
return "message_processed";
}else if(doc['status'].value=="DELIVERED"){
return "message_delivered"
}else {
return "message_failed"
}
""",
"lang": "painless"
},
"size": 10
}
}
}
}
}
}
Note that the core logic what you are looking for is inside the scripted terms aggregation. Logic is self explainable if you go through it. Feel free to modify the logic that fits you.
For the sample date you've shared, you would get the result in the below format:
Response:
{
"took" : 144,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"message_processed" : {
"buckets" : [
{
"key_as_string" : "2019-09-11T00:00:00.000Z",
"key" : 1568160000000,
"doc_count" : 6,
"my_messages" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "message_processed",
"doc_count" : 3
},
{
"key" : "message_failed",
"doc_count" : 2
},
{
"key" : "message_delivered",
"doc_count" : 1
}
]
}
}
]
}
}
}

elastic query to get events where corresponding pair is missing

I have records of transaction which follow following lifecycle.
Event when transaction is received [RCVD]
Event when transaction gets pending for execution [PNDG] (OPTIONAL step)
Event when it gets executed [SENT]
Following are the 7 sample events in the index:
{trxID: 1, status:RCVD}
{trxID: 2, status:RCVD}
{trxID: 3, status:RCVD}
{trxID: 2, status:PNDG}
{trxID: 3, status:PNDG}
{trxID: 1, status:SENT}
{trxID: 2, status:SENT}
I need to find all the transactions which went to pending state but have not been executed yet. In other words, there should be a PNDG status for the transaction but not SENT.
I am trying not to do it at java layer.
I did an aggregation on trxID, and then I did sub aggregation on status.
Then I cannot figure out how to get those records where bucket has only PNDG in sub-aggregation. I am not sure if I am thinking in right direction.
The result I am expecting is trxID 3 because for this transaction, we got PNDG status but did not get SENT yet. On the other hand, trxID 1 should not be reported as it never went to PNDG (pending) state, irrespective of whether SENT status is reported or not.
You can use count of status under a transaction id.
GET index24/_search
{
"size": 0,
"aggs": {
"transactionId": {
"terms": {
"field": "trxID",
"size": 10
},
"aggs": {
"status": {
"terms": {
"field": "status.keyword",
"size": 10
}
},
"count": {
"cardinality": {
"field": "status.keyword"
}
},
"my_bucketselector": {
"bucket_selector": {
"buckets_path": {
"statusCount": "count"
},
"script": "params.statusCount==1"
}
}
}
}
}
}
Response:
"aggregations" : {
"transactionId" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 4,
"doc_count" : 1,
"count" : {
"value" : 1
},
"status" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "PNDG",
"doc_count" : 1
}
]
}
}
]
}
}
EDIT 1:
I have tried with below :-
Get max date for a transaction id and then get date under pending . If both dates are same then pending is the last status
Data:
[
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "aYCs0m0BD5PlkoxXxO36",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "aoCs0m0BD5PlkoxX7e35",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "PNDG",
"date" : "2019-10-15T12:01:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "a4Ct0m0BD5PlkoxXCO06",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "SENT",
"date" : "2019-10-15T12:02:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "bICt0m0BD5PlkoxXQe0Y",
"_score" : 1.0,
"_source" : {
"trxID" : 2,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "bYCt0m0BD5PlkoxXZO2x",
"_score" : 1.0,
"_source" : {
"trxID" : 2,
"status" : "PNDG",
"date" : "2019-10-15T12:01:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "boCt0m0BD5PlkoxXju1H",
"_score" : 1.0,
"_source" : {
"trxID" : 3,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "b4Ct0m0BD5PlkoxXou0-",
"_score" : 1.0,
"_source" : {
"trxID" : 3,
"status" : "SENT",
"date" : "2019-10-15T12:01:00"
}
}
]
Query:
GET index24/_search
{
"size": 0,
"aggs": {
"transactionId": {
"terms": {
"field": "trxID",
"size": 10000
},
"aggs": {
"maxDate": {
"max": {
"field": "date" ---> get max date under transactions
}
},
"pending_status": {
"filter": {
"term": {
"status.keyword": "PNDG" ---> filter for pending
}
},
"aggs": {
"filtered_maxdate": {
"max": {
"field": "date" --> get date under pending
}
}
}
},
"buckets_latest_status_pending": { -->filter if max date==pending date
"bucket_selector": {
"buckets_path": {
"filtereddate": "pending_status>filtered_maxdate",
"maxDate": "maxDate"
},
"script": "params.filtereddate==params.maxDate"
}
}
}
}
}
}
Response:
{
"transactionId" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, --> only transaction id 2 is returned
"doc_count" : 2,
"pending_status" : {
"doc_count" : 1,
"filtered_maxdate" : {
"value" : 1.57114086E12,
"value_as_string" : "2019-10-15T12:01:00.000Z"
}
},
"maxDate" : {
"value" : 1.57114086E12,
"value_as_string" : "2019-10-15T12:01:00.000Z"
}
}
]
}
}
I did an aggregation on trxID, and then I did sub aggregation on status.
That's a great start !!!
Now, you can leverage the bucket_selector pipeline aggregation in order to surface only the transactions which have only 1 or 2 documents, i.e. the script condition params.eventCount < 3 makes sure to catch all buckets that have RCVD and/or PNDG documents but no SENT documents:
POST events/_search
{
"size": 0,
"aggs": {
"trx": {
"terms": {
"field": "trxID",
"size": 1000
},
"aggs": {
"count": {
"cardinality": {
"field": "status.keyword"
}
},
"not_sent": {
"bucket_selector": {
"buckets_path": {
"eventCount": "count"
},
"script": "params.eventCount < 3"
}
}
}
}
}
}
In your case, this would yield this, i.e. only event with trxID = 3:
"aggregations" : {
"trx" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 3,
"doc_count" : 2,
"count" : {
"value" : 2
}
}
]
}
}

Resources