Fetching unique data in Elasticsearch - elasticsearch

I have following data
ID: 1, fldname: pawan
ID: 1, fldname: pawan1
ID: 1, fldname: pawan2
ID: 2, fldname: pawan3
ID: 3, fldname: pawan4
ID: 4, fldname: pawan5
I am trying to get unique data based on ID field, similar to what we get in MySQL while firing group by queries like:
select * from table_name where fldname like 'pawan%' group by ID
This will return unique values. Same works in sphinx search when we use group by function.
Is there any way to get unique values in elasticsearch..?
Below is my sample mapping:
"mappings": {
"my_type": {
"properties": {
"docid": {
"type": "keyword"
},
"flgname": {
"type": "text"
}
}
}
}

I suggest that you slightly modify your mapping:
{
"record" : {
"dynamic" : "false",
"_all" : {
"enabled" : false
},
"properties" : {
"docid" : {
"type" : "long"
},
"flgname" : {
"type" : "text"
}
}
}
}
so that docid is a long
Then you could try fuzzy queries for filtering, together with aggregations, like this one here which retrieves the minimum, maximum, average and count of docid's:
{
"from" : 0,
"size" : 10,
"_source" : true,
"query" : {
"bool" : {
"must" : [ {
"match" : {
"flgname" : {
"query" : "pawan",
"operator" : "OR",
"fuzziness" : "1",
"prefix_length" : 1,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1.0
}
}
} ]
}
},
"aggs" : {
"my_cardinality" : {
"cardinality" : {
"field" : "docid"
}
},
"my_avg" : {
"avg" : {
"field" : "docid"
}
},
"my_min" : {
"min" : {
"field" : "docid"
}
},
"my_max" : {
"max" : {
"field" : "docid"
}
}
}
}
By the way this is the result of the above query on the data you proposed:
{
"took" : 47,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.9808292,
"hits" : [ {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "40b5eac0-743b-4a6a-a06d-3ae4d56f4aca",
"_score" : 0.9808292,
"_source" : {
"docid" : "1",
"flgname" : "pawan"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "27821c39-e722-4361-bc07-0dcd5181a1ad",
"_score" : 0.7846634,
"_source" : {
"docid" : "2",
"flgname" : "pawan3"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "86fcd9c1-a688-4a6a-9c45-e91791a8b902",
"_score" : 0.7846634,
"_source" : {
"docid" : "4",
"flgname" : "pawan5"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fb00a3cc-f1b8-4073-8808-f2ddbc4979e2",
"_score" : 0.55451775,
"_source" : {
"docid" : "1",
"flgname" : "pawan1"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "18e5e20d-17a7-4d59-b2f1-7bf325a4c4df",
"_score" : 0.55451775,
"_source" : {
"docid" : "3",
"flgname" : "pawan4"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fbf49af6-f574-4ad2-8686-cbbedc5e70c4",
"_score" : 0.23014566,
"_source" : {
"docid" : "1",
"flgname" : "pawan2"
}
} ]
},
"aggregations" : {
"my_cardinality" : {
"value" : 4
},
"my_max" : {
"value" : 4.0
},
"my_avg" : {
"value" : 2.0
},
"my_min" : {
"value" : 1.0
}
}
}

If you make flgname also a keyword, then you can use sub-aggregation to aggregate over docID and subaggregate over flgname. Result will be similar to the SQL query you mentioned.
Query would look like:
{ "size": 0,
"query": {
"regexp":{
"flgname": "pawa.*"
}
},
"aggs" : {
"docids": {
"terms": {"field": "docid"},
"aggs": { "flgnam": { "terms": {"field": "flgname"}}}}
}
}

Related

On Elasticsearch, how to aggregate based on the number of items in a field?

On Elasticsearch I have a field named Itinerary that can contain multiple values (from 1 up to 6), for example in the picture below there's 2 items in the field.
"Itinerary": [
{
"Carrier": "LH",
"Departure": "2021-07-04T06:55:00Z",
"Number": "1493",
"Arrival": "2021-07-04T08:40:00Z",
},
{
"Carrier": "LH",
"Departure": "2021-07-04T13:30:00Z",
"Number": "422",
"Arrival": "2021-07-04T16:05:00Z",
}
}
]
Is there a way I can aggregate based on the number of items in the field? Having something like:
1 item : 2
2 item : 4
...
Itinerary type needs to be define as nested type
"Itinerary":
{
"type": "nested"
}
Terms aggregation to group on a field. You can use script to get count of array or better introduce a field which has count of array
Top hits aggregation to get documents under that group
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"script": {
"source": "doc['Itinerary.Carrier.keyword'].length"
}
},
"aggs": {
"NAME": {
"top_hits": {
"size": 10
}
}
}
}
}
}
Result:
"aggregations" : {
"NAME" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2,
"doc_count" : 2,
"NAME" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index8",
"_type" : "_doc",
"_id" : "8OW1lnsBRh1xpgSkIOlq",
"_score" : 1.0,
"_source" : {
"Itinerary" : [
{
"Carrier" : "LH",
"Departure" : "2021-07-04T06:55:00Z",
"Number" : "1493",
"Arrival" : "2021-07-04T08:40:00Z"
},
{
"Carrier" : "LH",
"Departure" : "2021-07-04T13:30:00Z",
"Number" : "422",
"Arrival" : "2021-07-04T16:05:00Z"
}
]
}
},
{
"_index" : "index8",
"_type" : "_doc",
"_id" : "8uW6lnsBRh1xpgSkAun1",
"_score" : 1.0,
"_source" : {
"Itinerary" : [
{
"Carrier" : "LH2",
"Departure" : "2021-07-04T06:55:00Z",
"Number" : "14931",
"Arrival" : "2021-07-04T08:40:00Z"
},
{
"Carrier" : "LH2",
"Departure" : "2021-07-04T13:30:00Z",
"Number" : "4221",
"Arrival" : "2021-07-04T16:05:00Z"
}
]
}
}
]
}
}
},
{
"key" : 3,
"doc_count" : 1,
"NAME" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index8",
"_type" : "_doc",
"_id" : "8eW1lnsBRh1xpgSkdukQ",
"_score" : 1.0,
"_source" : {
"Itinerary" : [
{
"Carrier" : "LH1",
"Departure" : "2021-07-04T06:55:00Z",
"Number" : "14931",
"Arrival" : "2021-07-04T08:40:00Z"
},
{
"Carrier" : "LH1",
"Departure" : "2021-07-04T13:30:00Z",
"Number" : "4221",
"Arrival" : "2021-07-04T16:05:00Z"
},
{
"Carrier" : "LH1",
"Departure" : "2021-07-04T13:30:00Z",
"Number" : "3221",
"Arrival" : "2021-07-04T16:05:00Z"
}
]
}
}
]
}
}
}
]
}
}

Aggregating multiple values of single key into a single bucket elasticsearch

I have a elastic search index with following mapping
{
"probe_alert" : {
"mappings" : {
"alert" : {
"properties" : {
"id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"probeChannelId" : {
"type" : "long"
},
"severity" : {
"type" : "integer"
},
}
}
}
}
}
Sample indexed data : For each channel index has a severity value
[
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 15,
"severity" : 2,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 17,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 18,
"severity" : 10,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 19,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" :20,
"severity" : 10,
}
}
]
I have done terms aggregation for fetching max severity value for a single probeChannelId but now I want to aggregate on multiple values of probeChannelId and get max value of severity.
Expected Result :
"aggregations" : {
"aggs_by_channels" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : [15,17],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 5.0
}
},
{
"key" : [18,19,20],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 10.0
}
}
]
}
}
In response i want group of values probeChannelId to have highest severity value
If you want to get the highest severity value, among a set of documents, then you can try out the below query using the Adjacency matrix aggregation
Search Query:
{
"size": 0,
"aggs": {
"interactions": {
"adjacency_matrix": {
"filters": {
"[15,17]": {
"terms": {
"probeChannelId": [
15,
17
]
}
},
"[18,19,20]": {
"terms": {
"probeChannelId": [
18,
19,
20
]
}
}
}
},
"aggs": {
"max_severity": {
"max": {
"field": "severity"
}
}
}
}
}
}
Search Result:
"aggregations": {
"interactions": {
"buckets": [
{
"key": "[15,17]",
"doc_count": 2,
"max_severity": {
"value": 5.0 // note this
}
},
{
"key": "[18,19,20]",
"doc_count": 3,
"max_severity": {
"value": 10.0 // note this
}
}
]
}

ElasticSearch - Filter Buckets

My elasticSearch query is like:
{
"size": 0,
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
It works fine, I have a result of this form:
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
I'm trying to add a filter, so when "OK" the result must be like this:
aggregations
=========>group_by_id
==============>buckets
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
and for "KO" :
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
Fields "Startime" & "Status" are at the same level "Infos.InstanceInfo.[...]"
Any idea?
EDIT
Sample docs:
{
"took" : 794,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"group_by_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 143846,
"buckets" : [
{
"key" : "1000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "vHFvoXYBVWrYChNi7hB7",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609026236011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xHFvoXYBVWrYChNi7xAB",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.145+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236145
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xXFvoXYBVWrYChNi7xAC",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.147+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236147
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "x3FvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.188+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236188
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "yHFvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.19+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236190
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "ynFvoXYBVWrYChNi7xBd",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.236+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236236
]
}
]
}
}
},
{
"key" : "2000",
"doc_count" : 2,
"tops" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7HL_onYBVWrYChNij4Is",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9HL_onYBVWrYChNij4Kz",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.146+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426146
]
}
]
}
}
},
{
"key" : "3000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7nNRpHYBVWrYChNiiruh",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.015+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609074576015
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9nNRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "93NRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-XNRpHYBVWrYChNii7ti",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.209+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576209
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-nNRpHYBVWrYChNii7ts",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.219+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576219
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "_HNRpHYBVWrYChNii7ud",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.269+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576269
]
}
]
}
}
}
]
}
}
}
Assuming the status field is under Infos.InstanceInfo and it's of the keyword mapping, you can utilize the filter aggregation:
{
"size": 0,
"aggs": {
"status_KO_only": {
"filter": { <--
"term": {
"Infos.InstanceInfo.Status": "KO"
}
},
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
}
}
In this particular case you could've applied the same term query in the query part of the search request without having to use a filter aggregation.
If you want to get both OK and KO in the same request, you can copy/paste the whole status_KO_only aggregation, rename the 2nd one, and voila -- you now have both groups in one request. You can of course have as many differently named (top-level) filter aggs as you like.
Now, when you indeed need multiple filter aggs at once, there's a more elegant way that does not require copy-pasting -- enter the filters aggregation:
{
"size": 0,
"aggs": {
"by_statuses": {
"filters": { <--
"filters": {
"status_KO": {
"term": {
"Infos.InstanceInfo.Status": "KO"
}
},
"status_OK": {
"term": {
"Infos.InstanceInfo.Status": "OK"
}
}
}
},
"aggs": {
"group_by_id": {
"terms": {
"field": "Infos.InstanceInfo.ID.keyword",
"size": 1000
},
"aggs": {
"tops": {
"top_hits": {
"size": 100,
"sort": {
"Infos.InstanceInfo.StartTime": "asc"
}
}
}
}
}
}
}
}
}
Any of the child sub-aggregations will automatically be the buckets of the explicitly declared term filters.
I personally find the copy/paste approach more readable, esp. when constructing such requests dynamically (based on UI dropdowns and such.)

ElasticSearch join data within the same index

I am quite new with ElasticSearch and I am collecting some application logs within the same index which have this format
{
"_index" : "app_logs",
"_type" : "_doc",
"_id" : "JVMYi20B0a2qSId4rt12",
"_source" : {
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
I can have different event types. In this case I am interested in STARTED and FINISHED. I would like to query ES in order to get all the app that started in a certain day and enrich them with their end time. Basically I want to create couples of start/end (an end might also be missing, but that's fine).
I have realized join relations in sql cannot be used in ES and I was wondering if I can exploit some other feature in order to get this result in one query.
Edit: these are the details of the index mapping
{
“app_logs" : {
"mappings" : {
"_doc" : {
"properties" : {
"event_type" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
“app_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"ts" : {
"type" : "date"
},
“event_type” : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}}}}
What I understood is that you would want to collate list of documents having same app_id along with the status as either STARTED or FINISHED.
I do not think Elasticsearch is not meant to perform JOIN operations. I mean you can but then you have to design your documents as mentioned in this link.
What you would need is an Aggregation query.
Below is the sample mapping, documents, the aggregation query and the response as how it appears, which would actually help you get the desired result.
Mapping:
PUT mystatusindex
{
"mappings": {
"properties": {
"username":{
"type": "keyword"
},
"app_id":{
"type": "keyword"
},
"event_type":{
"type":"keyword"
},
"ts":{
"type": "date"
}
}
}
}
Sample Documents
POST mystatusindex/_doc/1
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "STARTED",
"ts" : "2019-10-02T08:11:53Z"
}
POST mystatusindex/_doc/2
{
"username" : "mapred",
"app_id" : "application_1569623930006_490200",
"event_type" : "FINISHED",
"ts" : "2019-10-02T08:12:53Z"
}
POST mystatusindex/_doc/3
{
"username" : "mapred",
"app_id" : "application_1569623930006_490201",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:30:53Z"
}
POST mystatusindex/_doc/4
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "STARTED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/5
{
"username" : "mapred",
"app_id" : "application_1569623930006_490202",
"event_type" : "FINISHED",
"ts" : "2019-10-02T09:45:53Z"
}
POST mystatusindex/_doc/6
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "STARTED",
"ts" : "2019-10-03T09:30:53Z"
}
POST mystatusindex/_doc/7
{
"username" : "mapred",
"app_id" : "application_1569623930006_490203",
"event_type" : "FINISHED",
"ts" : "2019-10-03T09:45:53Z"
}
Query:
POST mystatusindex/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"ts": {
"gte": "2019-10-02T00:00:00Z",
"lte": "2019-10-02T23:59:59Z"
}
}
}
],
"should": [
{
"match": {
"event_type": "STARTED"
}
},
{
"match": {
"event_type": "FINISHED"
}
}
]
}
},
"aggs": {
"application_IDs": {
"terms": {
"field": "app_id"
},
"aggs": {
"ids": {
"top_hits": {
"size": 10,
"_source": ["event_type", "app_id"],
"sort": [
{ "event_type": { "order": "desc"}}
]
}
}
}
}
}
}
Notice that for filtering I've made use of Range Query as you only want to filter documents for that date and also added a bool should logic to filter based on STARTED and FINISHED.
Once I have the documents, I've made use of Terms Aggregation and Top Hits Aggregation to get the desired result.
Result
{
"took" : 12,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"application_IDs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "application_1569623930006_490200", <----- APP ID
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "1", <--- Document with STARTED status
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "2", <--- Document with FINISHED status
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490200"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490202",
"doc_count" : 2,
"ids" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"STARTED"
]
},
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"event_type" : "FINISHED",
"app_id" : "application_1569623930006_490202"
},
"sort" : [
"FINISHED"
]
}
]
}
}
},
{
"key" : "application_1569623930006_490201",
"doc_count" : 1,
"ids" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "mystatusindex",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"event_type" : "STARTED",
"app_id" : "application_1569623930006_490201"
},
"sort" : [
"STARTED"
]
}
]
}
}
}
]
}
}
}
Note that the last document with only STARTED appears in the aggregation result as well.
Updated Answer
{
"size":0,
"query":{
"bool":{
"must":[
{
"range":{
"ts":{
"gte":"2019-10-02T00:00:00Z",
"lte":"2019-10-02T23:59:59Z"
}
}
}
],
"should":[
{
"term":{
"event_type.keyword":"STARTED" <----- Changed this
}
},
{
"term":{
"event_type.keyword":"FINISHED" <----- Changed this
}
}
]
}
},
"aggs":{
"application_IDs":{
"terms":{
"field":"app_id.keyword" <----- Changed this
},
"aggs":{
"ids":{
"top_hits":{
"size":10,
"_source":[
"event_type",
"app_id"
],
"sort":[
{
"event_type.keyword":{ <----- Changed this
"order":"desc"
}
}
]
}
}
}
}
}
}
Note the changes I've made. Whenever you would need exact matches or want to make use of aggregation, you would need to make use of keyword type.
In the mapping you've shared, there is no username field but two event_type fields. I'm assuming its just a human err and that one of the field should be username.
Now if you notice carefully, the field event_type has a text and its sibling keyword field. I've just modified the query to make use of the keyword field and when I am doing that, I'm use Term Query.
Try this out and let me know if it helps!

Elasticsearch wildcard query with spaces

I'm trying to do a wildcard query with spaces. It easily matches the words on term basis but not on field basis.
I've read the documentation which says that I need to have the field as not_analyzed but with this type set, it returns nothing.
This is the mapping with which it works on term basis:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
This is the mapping with which the exact same query returns nothing:
{
"denshop" : {
"mappings" : {
"products" : {
"properties" : {
"code" : {
"type" : "string"
},
"id" : {
"type" : "long"
},
"name" : {
"type" : "string",
"index" : "not_analyzed"
},
"price" : {
"type" : "long"
},
"url" : {
"type" : "string"
}
}
}
}
}
}
The query is here:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*test*"}}}'
Response with the not_analyzed property:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
Response without not_analyzed:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
...
EDIT: Adding requested info
Here is the list of documents:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 5,
"max_score" : 1.0,
"hits" : [ {
"_index" : "denshop",
"_type" : "products",
"_id" : "3L1",
"_score" : 1.0,
"_source" : {
"id" : 3,
"name" : "Testovací produkt 2",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-2/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "4L1",
"_score" : 1.0,
"_source" : {
"id" : 4,
"name" : "Testovací produkt 3",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-3/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "2L1",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "Testovací produkt",
"code" : "",
"price" : 500,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "5L1",
"_score" : 1.0,
"_source" : {
"id" : 5,
"name" : "Testovací produkt 4",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/damske-obleceni/testovaci-produkt-4/"
}
}, {
"_index" : "denshop",
"_type" : "products",
"_id" : "6L1",
"_score" : 1.0,
"_source" : {
"id" : 6,
"name" : "Testovací produkt 5",
"code" : "",
"price" : 666,
"url" : "http://www.denshop.lh/tricka-tilka-tuniky/testovaci-produkt-5/"
}
} ]
}
}
Without the not_analyzed it returns with this:
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací*"}}}'
But not with this (notice the space before asterisk):
curl -XPOST http://127.0.0.1:9200/denshop/products/_search?pretty -d '{"query":{"wildcard":{"name":"*testovací *"}}}'
When I add the not_analyzed to mapping, it returns no hits no matter what I put in the wildcard query.
Add a custom analyzer that should lowercase the text. Then in your search query, before passing the text to it have it lowercased in your client application.
To, also, keep the original analysis chain, I've added a sub-field to your name field that will use the custom analyzer.
PUT /denshop
{
"settings": {
"analysis": {
"analyzer": {
"keyword_lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"products": {
"properties": {
"name": {
"type": "string",
"fields": {
"lowercase": {
"type": "string",
"analyzer": "keyword_lowercase"
}
}
}
}
}
}
}
And the query will work on the sub-field:
GET /denshop/products/_search
{
"query": {
"wildcard": {
"name.lowercase": "*testovací *"
}
}
}

Resources