Get top hit per ID when the latest value is 0 (Kibana) - elasticsearch

This is my first post; I spent the weekend looking for an answer without success.
I will try to explain my issue. I have this index:
ST ID
0 1
1 1
0 2
1 2
0 2
1 3
0 3
I need to show the last record for each ID when its ST is 0. In this index, only ID 1 and ID 2 should be shown, because for those IDs the most recent record has ST = 0.
Could someone help me with this issue?
BR

Mapping:
PUT index34
{
  "mappings": {
    "properties": {
      "ST": {
        "type": "integer"
      },
      "ID": {
        "type": "integer"
      },
      "Date": {
        "type": "date"
      }
    }
  }
}
Data:
[
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "LO7Z7W0B_-hMjUaqtwHw",
"_score" : 1.0,
"_source" : {
"ST" : 1,
"ID" : 1,
"Date" : "2019-10-21T12:00:00Z"
}
},
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "Le7Z7W0B_-hMjUaq0QEz",
"_score" : 1.0,
"_source" : {
"ST" : 0,
"ID" : 1,
"Date" : "2019-10-21T12:01:00Z"
}
},
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "Lu7a7W0B_-hMjUaqAwE0",
"_score" : 1.0,
"_source" : {
"ST" : 1,
"ID" : 2,
"Date" : "2019-10-21T12:02:00Z"
}
},
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "L-7a7W0B_-hMjUaqGAEr",
"_score" : 1.0,
"_source" : {
"ST" : 0,
"ID" : 2,
"Date" : "2019-10-21T12:04:00Z"
}
},
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "MO7a7W0B_-hMjUaqNAGA",
"_score" : 1.0,
"_source" : {
"ST" : 0,
"ID" : 3,
"Date" : "2019-10-21T12:04:00Z"
}
},
{
"_index" : "index34",
"_type" : "_doc",
"_id" : "Me7a7W0B_-hMjUaqTQFP",
"_score" : 1.0,
"_source" : {
"ST" : 1,
"ID" : 3,
"Date" : "2019-10-21T12:06:00Z"
}
}
]
Query: I get the max date for each term, and then the max date where ST was zero. If the two match (which means a 0 document was the latest), I keep the bucket:
GET index34/_search
{
  "size": 0,
  "aggs": {
    "ID": {
      "terms": {
        "field": "ID",
        "size": 10000
      },
      "aggs": {
        "maxDate": {
          "max": {
            "field": "Date"
          }
        },
        "pending_status": {
          "filter": {
            "term": {
              "ST": 0
            }
          },
          "aggs": {
            "filtered_maxdate": {
              "max": {
                "field": "Date"
              }
            }
          }
        },
        "buckets_latest_status_pending": {
          "bucket_selector": {
            "buckets_path": {
              "filtereddate": "pending_status>filtered_maxdate",
              "maxDate": "maxDate"
            },
            "script": "params.filtereddate==params.maxDate"
          }
        }
      }
    }
  }
}
Response:
"aggregations" : {
"ID" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 2,
"pending_status" : {
"doc_count" : 1,
"filtered_maxdate" : {
"value" : 1.57165926E12,
"value_as_string" : "2019-10-21T12:01:00.000Z"
}
},
"maxDate" : {
"value" : 1.57165926E12,
"value_as_string" : "2019-10-21T12:01:00.000Z"
}
},
{
"key" : 2,
"doc_count" : 2,
"pending_status" : {
"doc_count" : 1,
"filtered_maxdate" : {
"value" : 1.57165944E12,
"value_as_string" : "2019-10-21T12:04:00.000Z"
}
},
"maxDate" : {
"value" : 1.57165944E12,
"value_as_string" : "2019-10-21T12:04:00.000Z"
}
}
]
}
}

Related

Aggregating multiple values of a single key into a single bucket - elasticsearch

I have an Elasticsearch index with the following mapping:
{
  "probe_alert" : {
    "mappings" : {
      "alert" : {
        "properties" : {
          "id" : {
            "type" : "text",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "probeChannelId" : {
            "type" : "long"
          },
          "severity" : {
            "type" : "integer"
          }
        }
      }
    }
  }
}
Sample indexed data: for each channel, the index has a severity value:
[
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 15,
"severity" : 2,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 17,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 18,
"severity" : 10,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" : 19,
"severity" : 5,
}
},
{
"_index" : "probe_alert",
"_type" : "alert",
"_id" : "b_cu0nYB8EMvknGcmMxk",
"_score" : 0.0,
"_source" : {
"id" : "b_cu0nYB8EMvknGcmMxk",
"probeChannelId" :20,
"severity" : 10,
}
}
]
I have done a terms aggregation to fetch the max severity value for a single probeChannelId (roughly sketched below), but now I want to aggregate over groups of probeChannelId values and get the max severity for each group.
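What I have so far is roughly this (a sketch; the aggregation names are illustrative):
{
  "size": 0,
  "aggs": {
    "aggs_by_channels": {
      "terms": {
        "field": "probeChannelId"
      },
      "aggs": {
        "aggs_by_severity": {
          "max": {
            "field": "severity"
          }
        }
      }
    }
  }
}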
Expected Result :
"aggregations" : {
"aggs_by_channels" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : [15,17],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 5.0
}
},
{
"key" : [18,19,20],
"doc_count" : 1,
"aggs_by_severity" : {
"value" : 10.0
}
}
]
}
}
In the response, I want each group of probeChannelId values to have its highest severity value.
If you want to get the highest severity value among a set of documents, you can try the query below, using the adjacency matrix aggregation.
Search Query:
{
  "size": 0,
  "aggs": {
    "interactions": {
      "adjacency_matrix": {
        "filters": {
          "[15,17]": {
            "terms": {
              "probeChannelId": [15, 17]
            }
          },
          "[18,19,20]": {
            "terms": {
              "probeChannelId": [18, 19, 20]
            }
          }
        }
      },
      "aggs": {
        "max_severity": {
          "max": {
            "field": "severity"
          }
        }
      }
    }
  }
}
Search Result:
"aggregations": {
"interactions": {
"buckets": [
{
"key": "[15,17]",
"doc_count": 2,
"max_severity": {
"value": 5.0 // note this
}
},
{
"key": "[18,19,20]",
"doc_count": 3,
"max_severity": {
"value": 10.0 // note this
}
}
]
}
}

ElasticSearch - Filter Buckets

My Elasticsearch query is:
{
  "size": 0,
  "aggs": {
    "group_by_id": {
      "terms": {
        "field": "Infos.InstanceInfo.ID.keyword",
        "size": 1000
      },
      "aggs": {
        "tops": {
          "top_hits": {
            "size": 100,
            "sort": {
              "Infos.InstanceInfo.StartTime": "asc"
            }
          }
        }
      }
    }
  }
}
It works fine, I have a result of this form:
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
I'm trying to add a filter, so that for "OK" the result would be like this:
aggregations
=========>group_by_id
==============>buckets
{key:id2}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
{key:id3}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"OK"}
and for "KO" :
aggregations
=========>group_by_id
==============>buckets
{key:id1}
===============>docs
{doc1.Status:"KO"}
{doc2.Status:"KO"}
Fields "Startime" & "Status" are at the same level "Infos.InstanceInfo.[...]"
Any idea?
EDIT
Sample docs:
{
"took" : 794,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"group_by_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 143846,
"buckets" : [
{
"key" : "1000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "vHFvoXYBVWrYChNi7hB7",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609026236011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xHFvoXYBVWrYChNi7xAB",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.145+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236145
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "xXFvoXYBVWrYChNi7xAC",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.147+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236147
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "x3FvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.188+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236188
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "yHFvoXYBVWrYChNi7xAs",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.19+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236190
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "ynFvoXYBVWrYChNi7xBd",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "1000",
"StartTime" : "2020-12-27T00:43:56.236+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609026236236
]
}
]
}
}
},
{
"key" : "2000",
"doc_count" : 2,
"tops" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7HL_onYBVWrYChNij4Is",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.011+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426011
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9HL_onYBVWrYChNij4Kz",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "2000",
"StartTime" : "2020-12-27T08:00:26.146+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609052426146
]
}
]
}
}
},
{
"key" : "3000",
"doc_count" : 6,
"tops" : {
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "7nNRpHYBVWrYChNiiruh",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.015+01:00",
"status" : "KO"
}
}
},
"sort" : [
1609074576015
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "9nNRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "93NRpHYBVWrYChNii7s5",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.166+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576166
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-XNRpHYBVWrYChNii7ti",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.209+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576209
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "-nNRpHYBVWrYChNii7ts",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.219+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576219
]
},
{
"_index" : "azerty",
"_type" : "_doc",
"_id" : "_HNRpHYBVWrYChNii7ud",
"_score" : null,
"_source" : {
"Infos" : {
"InstanceInfo" : {
"ID" : "3000",
"StartTime" : "2020-12-27T14:09:36.269+01:00",
"status" : "OK"
}
}
},
"sort" : [
1609074576269
]
}
]
}
}
}
]
}
}
}
Assuming the status field is under Infos.InstanceInfo and is mapped as a keyword, you can use the filter aggregation:
{
  "size": 0,
  "aggs": {
    "status_KO_only": {
      "filter": {                              <--
        "term": {
          "Infos.InstanceInfo.Status": "KO"
        }
      },
      "aggs": {
        "group_by_id": {
          "terms": {
            "field": "Infos.InstanceInfo.ID.keyword",
            "size": 1000
          },
          "aggs": {
            "tops": {
              "top_hits": {
                "size": 100,
                "sort": {
                  "Infos.InstanceInfo.StartTime": "asc"
                }
              }
            }
          }
        }
      }
    }
  }
}
In this particular case you could've applied the same term query in the query part of the search request without having to use a filter aggregation.
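For illustration, a minimal sketch of that variant, with the term query simply moved into the query section (same field names as above):
{
  "size": 0,
  "query": {
    "term": {
      "Infos.InstanceInfo.Status": "KO"
    }
  },
  "aggs": {
    "group_by_id": {
      "terms": {
        "field": "Infos.InstanceInfo.ID.keyword",
        "size": 1000
      },
      "aggs": {
        "tops": {
          "top_hits": {
            "size": 100,
            "sort": {
              "Infos.InstanceInfo.StartTime": "asc"
            }
          }
        }
      }
    }
  }
}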
If you want to get both OK and KO in the same request, you can copy/paste the whole status_KO_only aggregation and rename the second one, as sketched below; you then have both groups in one request. You can of course have as many differently named (top-level) filter aggs as you like.
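Sketched out, the copy/paste version would look something like this (the status_OK_only name is mine):
{
  "size": 0,
  "aggs": {
    "status_KO_only": {
      "filter": { "term": { "Infos.InstanceInfo.Status": "KO" } },
      "aggs": {
        "group_by_id": {
          "terms": { "field": "Infos.InstanceInfo.ID.keyword", "size": 1000 },
          "aggs": {
            "tops": { "top_hits": { "size": 100, "sort": { "Infos.InstanceInfo.StartTime": "asc" } } }
          }
        }
      }
    },
    "status_OK_only": {
      "filter": { "term": { "Infos.InstanceInfo.Status": "OK" } },
      "aggs": {
        "group_by_id": {
          "terms": { "field": "Infos.InstanceInfo.ID.keyword", "size": 1000 },
          "aggs": {
            "tops": { "top_hits": { "size": 100, "sort": { "Infos.InstanceInfo.StartTime": "asc" } } }
          }
        }
      }
    }
  }
}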
Now, when you indeed need multiple filter aggs at once, there's a more elegant way that does not require copy-pasting -- enter the filters aggregation:
{
  "size": 0,
  "aggs": {
    "by_statuses": {
      "filters": {                             <--
        "filters": {
          "status_KO": {
            "term": {
              "Infos.InstanceInfo.Status": "KO"
            }
          },
          "status_OK": {
            "term": {
              "Infos.InstanceInfo.Status": "OK"
            }
          }
        }
      },
      "aggs": {
        "group_by_id": {
          "terms": {
            "field": "Infos.InstanceInfo.ID.keyword",
            "size": 1000
          },
          "aggs": {
            "tops": {
              "top_hits": {
                "size": 100,
                "sort": {
                  "Infos.InstanceInfo.StartTime": "asc"
                }
              }
            }
          }
        }
      }
    }
  }
}
The child sub-aggregations then run automatically inside each of the explicitly declared filter buckets.
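For orientation, the filters aggregation keys its buckets by the filter names, so the response has roughly this shape (doc counts illustrative, inner buckets elided):
"aggregations" : {
  "by_statuses" : {
    "buckets" : {
      "status_KO" : {
        "doc_count" : 4,
        "group_by_id" : { "buckets" : [ ] }
      },
      "status_OK" : {
        "doc_count" : 10,
        "group_by_id" : { "buckets" : [ ] }
      }
    }
  }
}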
I personally find the copy/paste approach more readable, especially when constructing such requests dynamically (based on UI dropdowns and such).

Elastic query to get events where the corresponding pair is missing

I have records of transactions which follow this lifecycle:
Event when transaction is received [RCVD]
Event when transaction gets pending for execution [PNDG] (OPTIONAL step)
Event when it gets executed [SENT]
Following are the 7 sample events in the index:
{trxID: 1, status:RCVD}
{trxID: 2, status:RCVD}
{trxID: 3, status:RCVD}
{trxID: 2, status:PNDG}
{trxID: 3, status:PNDG}
{trxID: 1, status:SENT}
{trxID: 2, status:SENT}
I need to find all the transactions which went to the pending state but have not been executed yet. In other words, there should be a PNDG status for the transaction but no SENT.
I am trying to avoid doing this at the Java layer.
I did an aggregation on trxID, and then a sub-aggregation on status.
But I cannot figure out how to get those records where the bucket has only PNDG in the sub-aggregation. I am not sure if I am thinking in the right direction.
The result I am expecting is trxID 3, because for this transaction we got a PNDG status but no SENT yet. On the other hand, trxID 1 should not be reported, as it never went to the PNDG (pending) state, irrespective of whether a SENT status was reported or not.
You can use the count of distinct statuses under a transaction ID:
GET index24/_search
{
  "size": 0,
  "aggs": {
    "transactionId": {
      "terms": {
        "field": "trxID",
        "size": 10
      },
      "aggs": {
        "status": {
          "terms": {
            "field": "status.keyword",
            "size": 10
          }
        },
        "count": {
          "cardinality": {
            "field": "status.keyword"
          }
        },
        "my_bucketselector": {
          "bucket_selector": {
            "buckets_path": {
              "statusCount": "count"
            },
            "script": "params.statusCount==1"
          }
        }
      }
    }
  }
}
Response:
"aggregations" : {
"transactionId" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 4,
"doc_count" : 1,
"count" : {
"value" : 1
},
"status" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "PNDG",
"doc_count" : 1
}
]
}
}
]
}
}
EDIT 1:
I have tried the following: get the max date for a transaction ID, and then the date under pending. If both dates are the same, then pending is the last status.
Data:
[
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "aYCs0m0BD5PlkoxXxO36",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "aoCs0m0BD5PlkoxX7e35",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "PNDG",
"date" : "2019-10-15T12:01:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "a4Ct0m0BD5PlkoxXCO06",
"_score" : 1.0,
"_source" : {
"trxID" : 1,
"status" : "SENT",
"date" : "2019-10-15T12:02:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "bICt0m0BD5PlkoxXQe0Y",
"_score" : 1.0,
"_source" : {
"trxID" : 2,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "bYCt0m0BD5PlkoxXZO2x",
"_score" : 1.0,
"_source" : {
"trxID" : 2,
"status" : "PNDG",
"date" : "2019-10-15T12:01:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "boCt0m0BD5PlkoxXju1H",
"_score" : 1.0,
"_source" : {
"trxID" : 3,
"status" : "RCVD",
"date" : "2019-10-15T12:00:00"
}
},
{
"_index" : "index24",
"_type" : "_doc",
"_id" : "b4Ct0m0BD5PlkoxXou0-",
"_score" : 1.0,
"_source" : {
"trxID" : 3,
"status" : "SENT",
"date" : "2019-10-15T12:01:00"
}
}
]
Query:
GET index24/_search
{
  "size": 0,
  "aggs": {
    "transactionId": {
      "terms": {
        "field": "trxID",
        "size": 10000
      },
      "aggs": {
        "maxDate": {
          "max": {
            "field": "date"                  ---> get max date under transactions
          }
        },
        "pending_status": {
          "filter": {
            "term": {
              "status.keyword": "PNDG"       ---> filter for pending
            }
          },
          "aggs": {
            "filtered_maxdate": {
              "max": {
                "field": "date"              --> get date under pending
              }
            }
          }
        },
        "buckets_latest_status_pending": {   --> filter if max date == pending date
          "bucket_selector": {
            "buckets_path": {
              "filtereddate": "pending_status>filtered_maxdate",
              "maxDate": "maxDate"
            },
            "script": "params.filtereddate==params.maxDate"
          }
        }
      }
    }
  }
}
Response:
{
"transactionId" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, --> only transaction id 2 is returned
"doc_count" : 2,
"pending_status" : {
"doc_count" : 1,
"filtered_maxdate" : {
"value" : 1.57114086E12,
"value_as_string" : "2019-10-15T12:01:00.000Z"
}
},
"maxDate" : {
"value" : 1.57114086E12,
"value_as_string" : "2019-10-15T12:01:00.000Z"
}
}
]
}
}
I did an aggregation on trxID, and then a sub-aggregation on status.
That's a great start!
Now, you can leverage the bucket_selector pipeline aggregation to surface only the transactions with fewer than three distinct statuses, i.e. the script condition params.eventCount < 3 keeps the buckets that never completed the full RCVD/PNDG/SENT lifecycle:
POST events/_search
{
  "size": 0,
  "aggs": {
    "trx": {
      "terms": {
        "field": "trxID",
        "size": 1000
      },
      "aggs": {
        "count": {
          "cardinality": {
            "field": "status.keyword"
          }
        },
        "not_sent": {
          "bucket_selector": {
            "buckets_path": {
              "eventCount": "count"
            },
            "script": "params.eventCount < 3"
          }
        }
      }
    }
  }
}
In your case, this would yield the following, i.e. only the event with trxID = 3:
"aggregations" : {
"trx" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 3,
"doc_count" : 2,
"count" : {
"value" : 2
}
}
]
}
}
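If you need the strict "PNDG present but SENT absent" condition no matter which other statuses exist, one possible variant (my sketch, not part of the original answer) counts the two statuses separately with filter sub-aggregations and compares their doc counts:
POST events/_search
{
  "size": 0,
  "aggs": {
    "trx": {
      "terms": {
        "field": "trxID",
        "size": 1000
      },
      "aggs": {
        "pndg": {
          "filter": { "term": { "status.keyword": "PNDG" } }
        },
        "sent": {
          "filter": { "term": { "status.keyword": "SENT" } }
        },
        "pending_not_sent": {
          "bucket_selector": {
            "buckets_path": {
              "pndgCount": "pndg._count",
              "sentCount": "sent._count"
            },
            "script": "params.pndgCount > 0 && params.sentCount == 0"
          }
        }
      }
    }
  }
}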

Simple way to find who is in the same company as me?

There is an index with fields like below; it stores who is in which company, and in which position:
{
  "createtime" : 1562844632272,
  "post" : "director",
  "personId" : 30007346088,
  "comId" : 20010774891
}
Now I want to find the partners of someone, i.e. which persons are in the same companies. My current implementation is:
first, find the person's related companies (at most 500):
{
  "query": { "term": { "personId": 30007346088 } },
  "sort": [ { "createtime": "desc" } ],
  "_source": ["comId"],
  "size": 500
}
then find these companies' related persons, excluding the current person and removing duplicate partners (similarly, at most 500 partners):
{
  "query": {
    "bool": {
      "must": [ { "terms": { "comId": [20010774891, ...] } } ],
      "must_not": [ { "term": { "personId": 30007346088 } } ]
    }
  },
  "aggs": {
    "personId": {
      "terms": {
        "field": "personId",
        "size": 500
      }
    }
  },
  "size": 0
}
Obviously this is a little complicated; is there a simpler way to implement it?
This can work if the data is stored in the format below: a unique document for each person, with the document ID the same as the person ID, and companies stored as an array.
POST indexperson/_doc/1
{
  "createtime": 1562844632272,
  "personId": 1,
  "company": [
    {
      "id": 100,
      "post": "director"
    },
    {
      "id": 101,
      "post": "director"
    }
  ]
}
Data:
[
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 1,
"company" : [
{
"id" : 100,
"post" : "director"
},
{
"id" : 101,
"post" : "director"
}
]
}
},
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 2,
"company" : [
{
"id" : 101,
"post" : "director"
}
]
}
},
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 3,
"company" : [
{
"id" : 100,
"post" : "director"
}
]
}
},
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 4,
"company" : [
{
"id" : 104,
"post" : "director"
}
]
}
}
]
Query:
Use a [terms lookup](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html). A terms lookup takes a document ID as a parameter:
GET indexperson/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "terms": {
            "company.id": {
              "index": "indexperson",
              "id": "1",               --> get all docs in indexperson which match the company ids of doc 1
              "path": "company.id"
            }
          }
        }
      ],
      "must_not": [
        {
          "term": {
            "personId": {
              "value": 2
            }
          }
        }
      ]
    }
  }
}
Result:
"hits" : [
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 1,
"company" : [
{
"id" : 100,
"post" : "director"
},
{
"id" : 101,
"post" : "director"
}
]
}
},
{
"_index" : "indexperson",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"createtime" : 1562844632272,
"personId" : 3,
"company" : [
{
"id" : 100,
"post" : "director"
}
]
}
}
]

Fetching unique data in Elasticsearch

I have the following data:
ID: 1, fldname: pawan
ID: 1, fldname: pawan1
ID: 1, fldname: pawan2
ID: 2, fldname: pawan3
ID: 3, fldname: pawan4
ID: 4, fldname: pawan5
I am trying to get unique data based on the ID field, similar to what we get in MySQL when firing GROUP BY queries like:
select * from table_name where fldname like 'pawan%' group by ID
This returns unique values. The same works in Sphinx search when we use its group-by function.
Is there any way to get unique values in Elasticsearch?
Below is my sample mapping:
"mappings": {
"my_type": {
"properties": {
"docid": {
"type": "keyword"
},
"flgname": {
"type": "text"
}
}
}
}
I suggest that you slightly modify your mapping:
{
  "record" : {
    "dynamic" : "false",
    "_all" : {
      "enabled" : false
    },
    "properties" : {
      "docid" : {
        "type" : "long"
      },
      "flgname" : {
        "type" : "text"
      }
    }
  }
}
so that docid is a long
Then you could try fuzzy queries for filtering, together with aggregations, like this one, which retrieves the minimum, maximum, average and distinct count of docids:
{
  "from" : 0,
  "size" : 10,
  "_source" : true,
  "query" : {
    "bool" : {
      "must" : [ {
        "match" : {
          "flgname" : {
            "query" : "pawan",
            "operator" : "OR",
            "fuzziness" : "1",
            "prefix_length" : 1,
            "max_expansions" : 50,
            "fuzzy_transpositions" : true,
            "lenient" : false,
            "zero_terms_query" : "NONE",
            "boost" : 1.0
          }
        }
      } ]
    }
  },
  "aggs" : {
    "my_cardinality" : {
      "cardinality" : {
        "field" : "docid"
      }
    },
    "my_avg" : {
      "avg" : {
        "field" : "docid"
      }
    },
    "my_min" : {
      "min" : {
        "field" : "docid"
      }
    },
    "my_max" : {
      "max" : {
        "field" : "docid"
      }
    }
  }
}
By the way, this is the result of the above query on the data you proposed:
{
"took" : 47,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.9808292,
"hits" : [ {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "40b5eac0-743b-4a6a-a06d-3ae4d56f4aca",
"_score" : 0.9808292,
"_source" : {
"docid" : "1",
"flgname" : "pawan"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "27821c39-e722-4361-bc07-0dcd5181a1ad",
"_score" : 0.7846634,
"_source" : {
"docid" : "2",
"flgname" : "pawan3"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "86fcd9c1-a688-4a6a-9c45-e91791a8b902",
"_score" : 0.7846634,
"_source" : {
"docid" : "4",
"flgname" : "pawan5"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fb00a3cc-f1b8-4073-8808-f2ddbc4979e2",
"_score" : 0.55451775,
"_source" : {
"docid" : "1",
"flgname" : "pawan1"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "18e5e20d-17a7-4d59-b2f1-7bf325a4c4df",
"_score" : 0.55451775,
"_source" : {
"docid" : "3",
"flgname" : "pawan4"
}
}, {
"_index" : "stack_overflow1",
"_type" : "record",
"_id" : "fbf49af6-f574-4ad2-8686-cbbedc5e70c4",
"_score" : 0.23014566,
"_source" : {
"docid" : "1",
"flgname" : "pawan2"
}
} ]
},
"aggregations" : {
"my_cardinality" : {
"value" : 4
},
"my_max" : {
"value" : 4.0
},
"my_avg" : {
"value" : 2.0
},
"my_min" : {
"value" : 1.0
}
}
}
If you make flgname also a keyword, then you can use a sub-aggregation: aggregate over docid and sub-aggregate over flgname. The result will be similar to the SQL query you mentioned.
The query would look like:
{ "size": 0,
"query": {
"regexp":{
"flgname": "pawa.*"
}
},
"aggs" : {
"docids": {
"terms": {"field": "docid"},
"aggs": { "flgnam": { "terms": {"field": "flgname"}}}}
}
}
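On the sample data above, the response would then group each docid with its distinct flgname values, roughly like this (doc counts illustrative):
"aggregations" : {
  "docids" : {
    "buckets" : [
      {
        "key" : "1",
        "doc_count" : 3,
        "flgnam" : {
          "buckets" : [
            { "key" : "pawan", "doc_count" : 1 },
            { "key" : "pawan1", "doc_count" : 1 },
            { "key" : "pawan2", "doc_count" : 1 }
          ]
        }
      },
      {
        "key" : "2",
        "doc_count" : 1,
        "flgnam" : {
          "buckets" : [
            { "key" : "pawan3", "doc_count" : 1 }
          ]
        }
      }
    ]
  }
}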
