Elastic search (Kibana) - intersect between boolean results - elasticsearch

I am facing a problem in Kibana with how to correctly filter data. Basically my aim is to filter PASSED or FAILED tests from the following data structure.
{
"_index":"qa-reporting-2020-04",
"_type":"qa-reporting",
"_id":"456.0",
"_version":1,
"_score":null,
"_source":{
"TestId":"CXXX01",
"TestStatus":0,
"Issues":[
],
"MetaData":{
"TestName":"Test1",
"LastException":null,
"DurationMs":1980.5899000000002,
"Platform":{
"BrowserName":"chrome",
"OS":"windows",
"OsVersion":"10"
},
"Categories":[
"Cat1",
"Cat2",
"CXXX01"
],
"Priority":null,
"TestStatusStr":"Passed",
"JobName":"My-Demo/dev/ServerJob1",
"Logs":"PASS - Passed - CXXX01",
"SuiteName":"Tests.ServerTests",
"LastMessage":"PASS - Passed - CXXX01: \n",
"Environment":"dev"
}
}
}
The problem is that during a time (day), the logs will have several entries (e.g. the test failed and later in the same day the test passed). I have an aggregation query that gives me both results which is not desired. I want to have a result returning the intersection so the report will contain either failed/passed tests.
Here is my query (I am a beginner) which gives me an aggregation for a specific test that failed and passed.
GET qa-reporting-*/_search
{"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"MetaData.Environment": "dev"
}
},
{
"match": {
"TestId": "CXXX01"
}
},
{
"range": {
"Created": {
"gte": "now-0d/d"
}
}
}
]
}
},
"aggs": {
"tests": {
"terms": {"field": "MetaData.TestStatusStr.keyword"}
}
}
}
It returns the following (excerpt from the entire object)
"aggregations": {
"tests": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Passed",
"doc_count": 10
},
{
"key": "Failed",
"doc_count": 1
}
]
}
}
As mentioned above, how can I get the intersection between failed/passed tests during a specific date-time range? Is it possible to do it in Kibana, or does it need to be solved at a script level outside of it?
Currently my report looks like this:
DateTime | TestId | Status
Apr 24,12:00 | CXXX01 | Failed
Apr 24,15:00 | CXXX01 | Passed
Wanted report will be only the intersection of the above status
Apr 24 | CXXX01 | Failed
or
Apr 24 | CXXX01 | Passed
So the latest result will have more weight I think.

I have taken data for two days. Two entries for each day. Last entry for day 1 is "Failed" and latest entry for day 2 is "Passed"
Data:
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QKlosXEBuBOc-UQkKecO",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-23T01:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Passed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QalosXEBuBOc-UQkieeR",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-23T10:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Failed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QqlosXEBuBOc-UQkoue4",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-24T10:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Failed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
},
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "Q6losXEBuBOc-UQkwecl",
"_score" : 1.0,
"_source" : {
"TestId" : "CXXX01",
"TestStatus" : 0,
"Issues" : [ ],
"Created" : "2020-04-24T11:00:00",
"MetaData" : {
"TestName" : "Test1",
"LastException" : null,
"DurationMs" : 1980.5899000000002,
"Platform" : {
"BrowserName" : "chrome",
"OS" : "windows",
"OsVersion" : "10"
},
"Categories" : [
"Cat1",
"Cat2",
"CXXX01"
],
"Priority" : null,
"TestStatusStr" : "Passed",
"JobName" : "My-Demo/dev/ServerJob1",
"Logs" : "PASS - Passed - CXXX01",
"SuiteName" : "Tests.ServerTests",
"LastMessage" : "PASS - Passed - CXXX01: \n",
"Environment" : "dev"
}
}
}
]
Query: I have used date_histogram to create bucket of each day and top_hits
aggregation to get last document of that day
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"MetaData.Environment": "dev"
}
},
{
"match": {
"TestId": "CXXX01"
}
},
{
"range": {
"Created": {
"gte": "now-3d/d"
}
}
}
]
}
},
"aggs": {
"daily":{
"date_histogram": {
"field": "Created",
"interval": "day"
},
"aggs": {
"last_result": {
"top_hits": {
"size": 1,
"_source": ["MetaData.TestStatusStr"],
"sort": [
{"Created":"desc"}]
}
}
}
}
}
}
Result:
"aggregations" : {
"daily" : {
"buckets" : [
{
"key_as_string" : "2020-04-23T00:00:00.000Z",
"key" : 1587600000000,
"doc_count" : 2,
"last_result" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "QalosXEBuBOc-UQkieeR",
"_score" : null,
"_source" : {
"MetaData" : {
"TestStatusStr" : "Failed"
}
},
"sort" : [
1587636000000
]
}
]
}
}
},
{
"key_as_string" : "2020-04-24T00:00:00.000Z",
"key" : 1587686400000,
"doc_count" : 2,
"last_result" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index19",
"_type" : "_doc",
"_id" : "Q6losXEBuBOc-UQkwecl",
"_score" : null,
"_source" : {
"MetaData" : {
"TestStatusStr" : "Passed"
}
},
"sort" : [
1587726000000
]
}
]
}
}
}
]
}
}

Related

Elasticsearch DSL query

{
"name" : "Danny",
"id" : "123",
"lastProfileUpdateTime" : "2021-06-26T20:08:25.089Z"
},
{
"name" : "Harry",
"id" : "124",
"lastProfileUpdateTime" : "2021-04-12T20:08:25.089Z"
},
{
"name" : "Danny Brown",
"id" : "123",
"lastProfileUpdateTime" : "2021-07-26T20:08:25.089Z"
},
{
"name" : "Danny Smith",
"id" : "123",
"lastProfileUpdateTime" : "2021-08-26T20:08:25.089Z"
}
I have a usecase where i need to find if a particular id has been updated or not and filter out latest profile. In above case since id:123 has updated profile. so expected outcome should be:
{
"name" : "Danny Smith",
"id" : "123",
"lastProfileUpdateTime" : "2021-08-26T20:08:25.089Z"
}
If a id has more than one entry, pick the one which has latest lastProfileUpdateTime.
This can be done using
Terms aggregation
: to group by "id"
Bucket selector: to get ids where count > 1
Top_hits: To get latest document for given id
Query
{
"size": 0,
"aggs": {
"Updated_docs": {
"terms": {
"field": "id.keyword",
"size": 10
},
"aggs": {
"filter_count_more_than_one": {
"bucket_selector": {
"buckets_path": {
"count": "_count"
},
"script": "params.count>1"
}
},
"latest_document": {
"top_hits": {
"size": 1,
"sort": [
{
"lastProfileUpdateTime": "desc"
}
]
}
}
}
}
}
}
Result
"aggregations" : {
"Updated_docs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "123",
"doc_count" : 3,
"latest_document" : {
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "huE1i3sBZ5_aIkj8cvpR",
"_score" : null,
"_source" : {
"name" : "Danny Smith",
"id" : "123",
"lastProfileUpdateTime" : "2021-08-26T20:08:25.089Z"
},
"sort" : [
1630008505089
]
}
]
}
}
}
]
}
}

Aggregating multiple values of single key into a single bucket elasticsearch

I have a elastic search index with following mapping
{
"probe_alert" : {
"mappings" : {
"alert" : {
"properties" : {
"id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"probeChannelId" : {
"type" : "long"
},
"severity" : {
"type" : "integer"
},
}
}
}
}
}
Sample indexed data : For each channel index has a severity value
[
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 15,
"severity" : 2,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 17,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 18,
"severity" : 10,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 19,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" :20,
"severity" : 10,
}
}
]
I have done terms aggregation for fetching max severity value for a single probeChannelId but now I want to aggregate on multiple values of probeChannelId and get max value of severity.
Expected Result :
"aggregations" : {
"aggs_by_channels" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : [15,17],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 5.0
}
},
{
"key" : [18,19,20],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 10.0
}
}
]
}
}
In response i want group of values probeChannelId to have highest severity value
If you want to get the highest severity value, among a set of documents, then you can try out the below query using the Adjacency matrix aggregation
Search Query:
{
"size": 0,
"aggs": {
"interactions": {
"adjacency_matrix": {
"filters": {
"[15,17]": {
"terms": {
"probeChannelId": [
15,
17
]
}
},
"[18,19,20]": {
"terms": {
"probeChannelId": [
18,
19,
20
]
}
}
}
},
"aggs": {
"max_severity": {
"max": {
"field": "severity"
}
}
}
}
}
}
Search Result:
"aggregations": {
"interactions": {
"buckets": [
{
"key": "[15,17]",
"doc_count": 2,
"max_severity": {
"value": 5.0 // note this
}
},
{
"key": "[18,19,20]",
"doc_count": 3,
"max_severity": {
"value": 10.0 // note this
}
}
]
}

ElasticSearch - Filter Buckets

My elasticSearch query is like:
{
"size": 0,
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
It works fine, I have a result of this form:
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
I'm trying to add a filter, so when "OK" the result must be like this:
aggregations
=========>group_by_id
==============>buckets
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
and for "KO" :
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
Fields "Startime" & "Status" are at the same level "Infos.InstanceInfo.[...]"
Any idea?
EDIT
Sample docs:
{
"took" : 794,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"group_by_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 143846,
"buckets" : [
{
"key" : "1000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "vHFvoXYBVWrYChNi7hB7",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609026236011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xHFvoXYBVWrYChNi7xAB",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.145+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236145
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xXFvoXYBVWrYChNi7xAC",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.147+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236147
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "x3FvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.188+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236188
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "yHFvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.19+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236190
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "ynFvoXYBVWrYChNi7xBd",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.236+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236236
]
}
]
}
}
},
{
"key" : "2000",
"doc_count" : 2,
"tops" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7HL_onYBVWrYChNij4Is",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9HL_onYBVWrYChNij4Kz",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.146+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426146
]
}
]
}
}
},
{
"key" : "3000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7nNRpHYBVWrYChNiiruh",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.015+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609074576015
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9nNRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "93NRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-XNRpHYBVWrYChNii7ti",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.209+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576209
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-nNRpHYBVWrYChNii7ts",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.219+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576219
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "_HNRpHYBVWrYChNii7ud",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.269+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576269
]
}
]
}
}
}
]
}
}
}
Assuming the status field is under Infos.InstanceInfo and it's of the keyword mapping, you can utilize the filter aggregation:
{
"size": 0,
"aggs": {
"status_KO_only": {
"filter": { <--
"term": {
"Infos.InstanceInfo.Status": "KO"
}
},
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
}
}
In this particular case you could've applied the same term query in the query part of the search request without having to use a filter aggregation.
If you want to get both OK and KO in the same request, you can copy/paste the whole status_KO_only aggregation, rename the 2nd one, and voila -- you now have both groups in one request. You can of course have as many differently named (top-level) filter aggs as you like.
Now, when you indeed need multiple filter aggs at once, there's a more elegant way that does not require copy-pasting -- enter the filters aggregation:
{
"size": 0,
"aggs": {
"by_statuses": {
"filters": { <--
"filters": {
"status_KO": {
"term": {
"Infos.InstanceInfo.Status": "KO"
}
},
"status_OK": {
"term": {
"Infos.InstanceInfo.Status": "OK"
}
}
}
},
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
}
}
Any of the child sub-aggregations will automatically be the buckets of the explicitly declared term filters.
I personally find the copy/paste approach more readable, esp. when constructing such requests dynamically (based on UI dropdowns and such.)

ElasticSearch join data within the same index

I am quite new with ElasticSearch and I am collecting some application logs within the same index which have this format
{
"_index" : "app_logs",
"_type" : "_doc",
"_id" : "JVMYi20B0a2qSId4rt12",
"_source" : {
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
I can have different event types. In this case I am interested in STARTED and FINISHED. I would like to query ES in order to get all the app that started in a certain day and enrich them with their end time. Basically I want to create couples of start/end (an end might also be missing, but that's fine).
I have realized join relations in sql cannot be used in ES and I was wondering if I can exploit some other feature in order to get this result in one query.
Edit: these are the details of the index mapping
{
"app_logs" : {
"mappings" : {
"_doc" : {
"properties" : {
"event_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"app_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"ts" : {
"type" : "date"
},
"event_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}}}}
What I understood is that you would want to collate list of documents having same app_id along with the status as either STARTED or FINISHED.
Elasticsearch is not really meant to perform JOIN operations. I mean you can, but then you have to design your documents as mentioned in this link.
What you would need is an Aggregation query.
Below is the sample mapping, documents, the aggregation query and the response as how it appears, which would actually help you get the desired result.
Mapping:
PUT mystatusindex
{
"mappings": {
"properties": {
"username":{
"type": "keyword"
},
"app_id":{
"type": "keyword"
},
"event_type":{
"type":"keyword"
},
"ts":{
"type": "date"
}
}
}
}
Sample Documents
POST mystatusindex/_doc/1
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
POST mystatusindex/_doc/2
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "FINISHED",
"ts" : "2019-10-02T08:12:53Z"
}
POST mystatusindex/_doc/3
{
"username" : "mapred",
"app_id" : "application_1569623930006_490201",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:30:53Z"
}
POST mystatusindex/_doc/4
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/5
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "FINISHED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/6
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "STARTED",
"ts" : "2019-10-03T09:30:53Z"
}
POST mystatusindex/_doc/7
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "FINISHED",
"ts" : "2019-10-03T09:45:53Z"
}
Query:
POST mystatusindex/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"ts": {
"gte": "2019-10-02T00:00:00Z",
"lte": "2019-10-02T23:59:59Z"
}
}
}
],
"should": [
{
"match": {
"event_type": "STARTED"
}
},
{
"match": {
"event_type": "FINISHED"
}
}
]
}
},
"aggs": {
"application_IDs": {
"terms": {
"field": "app_id"
},
"aggs": {
"ids": {
"top_hits": {
"size": 10,
"_source": ["event_type", "app_id"],
"sort": [
{ "event_type": { "order": "desc"}}
]
}
}
}
}
}
}
Notice that for filtering I've made use of Range Query as you only want to filter documents for that date and also added a bool should logic to filter based on STARTED and FINISHED.
Once I have the documents, I've made use of Terms Aggregation and Top Hits Aggregation to get the desired result.
Result
{
"took" : 12,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"application_IDs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "application_1569623930006_490200", <----- APP ID
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "1", <--- Document with STARTED status
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "2", <--- Document with FINISHED status
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490202",
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490201",
"doc_count" : 1,
"ids" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490201"
},
"sort" : [
"STARTED"
]
}
]
}
}
}
]
}
}
}
Note that the last document with only STARTED appears in the aggregation result as well.
Updated Answer
{
"size":0,
"query":{
"bool":{
"must":[
{
"range":{
"ts":{
"gte":"2019-10-02T00:00:00Z",
"lte":"2019-10-02T23:59:59Z"
}
}
}
],
"should":[
{
"term":{
"event_type.keyword":"STARTED" <----- Changed this
}
},
{
"term":{
"event_type.keyword":"FINISHED" <----- Changed this
}
}
]
}
},
"aggs":{
"application_IDs":{
"terms":{
"field":"app_id.keyword" <----- Changed this
},
"aggs":{
"ids":{
"top_hits":{
"size":10,
"_source":[
"event_type",
"app_id"
],
"sort":[
{
"event_type.keyword":{ <----- Changed this
"order":"desc"
}
}
]
}
}
}
}
}
}
Note the changes I've made. Whenever you would need exact matches or want to make use of aggregation, you would need to make use of keyword type.
In the mapping you've shared, there is no username field but two event_type fields. I'm assuming it's just a human error and that one of those fields should be username.
Now if you notice carefully, the field event_type has a text type and a sibling keyword field. I've just modified the query to make use of the keyword field, and when doing so, I'm using a Term Query.
Try this out and let me know if it helps!

Search in nested object

I'm having trouble making a query on elasticsearch 7.3
I create an index as this:
PUT myindex
{
"mappings": {
"properties": {
"files": {
"type": "nested"
}
}
}
}
After I create three documents:
PUT myindex/_doc/1
{
"SHA256" : "94ee059335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22cc2",
"files" : [
{
"filename" : "firstfilename.exe",
"datetime" : 111111111
},
{
"filename" : "secondfilename.exe",
"datetime" : 111111144
}
]
}
PUT myindex/_doc/2
{
"SHA256" : "87ee059335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22c5a",
"files" : [
{
"filename" : "thirdfilename.exe",
"datetime" : 111111133
},
{
"filename" : "fourthfilename.exe",
"datetime" : 111111122
}
]
}
PUT myindex/_doc/3
{
"SHA256" : "565e049335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22c5a",
"files" : [
{
"filename" : "fifthfilename.exe",
"datetime" : 111111155
}
]
}
How can I get the last two files based on the datetime (ids: 1 and 3)?
I would like the SHA256 values of the last two documents by datetime, ordered descending.
I did dozens of tests but none went well...
I don't write the code I tried because I'm really on the high seas ...
I would like a result like this or similar:
{
"SHA256": [
"94ee05933....a2af6a22cc2",
"565e04933....a2af6a22c5a"
]
}
Query:
GET myindex/_search
{
"_source":"SHA256",
"sort": [
{
"files.datetime": {
"mode":"max",
"order": "desc",
"nested_path": "files"
}
}
],
"size": 2
}
Result:
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"SHA256" : "565e049335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22c5a"
},
"sort" : [
111111155
]
},
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"SHA256" : "94ee059335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22cc2"
},
"sort" : [
111111144
]
}
]
In the sort field you will get the max datetime value. So if you need to get the file names too, you can add them in _source and use the sort field to pick the appropriate file name.
A bit more complicated query this will give you exactly two values.
GET myindex/_search
{
"_source": "SHA256",
"query": {
"bool": {
"must": [
{
"nested": {
"path": "files",
"query": {
"match_all": {}
},
"inner_hits": {
"size":1,
"sort": [
{
"files.datetime": "desc"
}
]
}
}
}
]
}
},
"sort": [
{
"files.datetime": {
"mode": "max",
"order": "desc",
"nested_path": "files"
}
}
],
"size": 2
}
Result:
[
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"SHA256" : "565e049335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22c5a"
},
"sort" : [
111111155
],
"inner_hits" : {
"files" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "3",
"_nested" : {
"field" : "files",
"offset" : 0
},
"_score" : null,
"_source" : {
"filename" : "fifthfilename.exe",
"datetime" : 111111155
},
"sort" : [
111111155
]
}
]
}
}
}
},
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"SHA256" : "94ee059335e587e501cc4bf90613e0814f00a7b08bc7c648fd865a2af6a22cc2"
},
"sort" : [
111111144
],
"inner_hits" : {
"files" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "myindex",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "files",
"offset" : 1
},
"_score" : null,
"_source" : {
"filename" : "secondfilename.exe",
"datetime" : 111111144
},
"sort" : [
111111144
]
}
]
}
}
}
}
]

Resources