ElasticSearch get last n distinct records - elasticsearch

I am trying to implement a search query over records stored in elasticsearch.
The record structure looks something like this.
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "pWjQLWkBIJk0ORjd0X2P",
"_score" : null,
"_source" : {
"transactionID" : "60ab66cf24c9924f562bf1a2b5d92305d0a6",
"boxNumber" : "Box3",
"createDate" : "2013-09-17T00:00:00",
"itemNumber" : "Item1",
"address" : "Sample Address"
}
}
one box can contain multiple items. For example Box3 can have Item1, Item2 and Item3. So in elasticsearch i will have 3 different documents. Also at the same time, same box and same item can also exist but with different address. The transactionID may or maynot be the same for these documents.
My requirement is to fetch last n recent and distinct transactionIDs, along with their records.
I tried following query to fetch last 7 distinct transactionIDs
GET /box_info_store/boxes/_search?size=7
{
"query": {
"bool": {
"must": [
{"match":{"boxNumber":"Box3"}},
{"match":{"itemNumber":"Item1"}}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": { "field": "transactionID"}
}
}
}
This fetched me last 7 documents where boxNumber is Box3 and itemNumber is Item1, but not 7 distinct transactionIDs, two out of these seven documents have the same transactionID(both having separate address though).
But my requirement is to get 7 distinct transactionIds, no matter how many document it returns.
Hope i was able to explain myself.
Appreciate any kind of help here
Thanks
------Edited #gaurav9620, i ran the first query and got count as 32, then i ran the second query with distinct count as 3 i got the following result
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 32,
"max_score" : null,
"hits" : [
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "RWjRLWkBIJk0ORjdEX-L",
"_score" : null,
"_source" : {
"transactionID" : "3087e106244f6247a5290fb21ce64254529c",
"boxNumber" : "Box3",
"createDate" : "2017-11-15T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress12",
},
"sort" : [
1510704000000
]
},
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "MGjQLWkBIJk0ORjdwX0M",
"_score" : null,
"_source" : {
"transactionID" : "60ab66cf24c9924f562bf1a2b5d92305d0a6",
"boxNumber" : "Box3",
"createDate" : "2016-04-03T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress321",
},
"sort" : [
1459641600000
]
},
..........
..........
..........
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "AGjRLWkBIJk0ORjdK4CJ",
"_score" : null,
"_source" : {
"transactionID" : "3087e106244f6247a5290fb21ce64254529c",
"boxNumber" : "Box3",
"createDate" : "1996-02-16T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress4324",
},
"sort" : [
824428800000
]
}
]
},
"aggregations" : {
"unique_transactions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 16,
"buckets" : [
{
"key" : "3087e106244f6247a5290fb21ce64254529c",
"doc_count" : 6
},
{
"key" : "27c5f3422f4482495d29e7b2c15c0e311743",
"doc_count" : 5
},
{
"key" : "c40e53212e74e24bf02a5bd2b134cf92bffb",
"doc_count" : 5
}
]
}
}
}

The size which you have used : represents number of raw documents that are retrieved.
If your case what you need to do is :
Mention size as 0 -> which will return you no raw documents
Include a size parameter in aggregation which will return you unique 7 ids.
GET /box_info_store/boxes/_search?size=7
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": {
"field": "transactionID",
"size": 7
}
}
}
}
EDIT-------------------------------------
First fire this query
GET /box_info_store/boxes/_search?size=0
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
}
}
Here you will find total number of documents matching your query which you can set as n
After this fire your query as below
GET /box_info_store/boxes/_search?size=**n**
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": {
"field": "transactionID",
"size": NUMBER_OF_UNIQUE_TRANSACTION_IDS_TO_BE_FETCHED
}
}
}
}

Related

ElasticSearch aggregation for filtering users who had both events

I've written a query which perfectly return me 6000+ events my users had:
GET /<app_logs-2022.11.23*>/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"context.identity.type": "login"
}
},
{
"term": {
"context.identity.type": "login_error"
}
}
],
"minimum_should_match": 1
}
},
"_source": [
"context.identity.user_id",
"context.identity.type"
],
"size": 3
}
And i get such set of data
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 15,
"successful" : 15,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6001,
"max_score" : 10.722837,
"hits" : [
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "bb469377-0618-49a6-a643-1201dc84c829",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "8f4e82a0-f333-4096-bfb6-767fed924093",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login_error"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "7090be5a-8b53-4723-a1ac-223476a000f1",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "75bcb301-1cee-4b3b-aa1b-adbe4c011388",
"type" : "login_error"
}
}
}
}
]
}
}
But i can't figure out how to get a number of users who had both login and login_error events, i've tried cardinality aggregation, terms and several more but all of them just split types into buckets showing sum but doesn't group by user, and i want to find how many users had problems first but then managed to login in the end.
The best i've managed to achieve is to get buckets by user_id and output cardinality for each of them by type
"aggs": {
"results": {
"terms": {
"field": "context.identity.user_id",
"size": 300
},
"aggs": {
"events": {
"cardinality": {
"field": "context.identity.type"
}
}
}
}
}
I created an example based on the sentence I want to find how many users had problems first but then managed to login in at the end.
It works like this:
Make aggs based on user_id
Make sub-aggs by type
Ignore the ones that don't contain login_error
#put mapping
PUT test_stack_login
{
"mappings": {
"properties": {
"context.identity.user_id": {
"type": "keyword"
},
"context.identity.type": {
"type": "keyword"
}
}
}
}
#put example docs
POST test_stack_login/_bulk?refresh&pretty
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":2,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":3,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login"}
#Run the query
GET test_stack_login/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "context.identity.user_id",
"size": 1000
},
"aggs": {
"context_identity_type": {
"terms": {
"field": "context.identity.type",
"size": 10
}
},
"login_error_exist": {
"bucket_selector": {
"buckets_path": {
"var1": "context_identity_type['login_error']>_count"
},
"script": "params.var1 != null"
}
}
}
}
}
}
#the result will be like in the ss
You will get user_id containing both login and login_error information in the context.identity.type field. The keys in the response will give you the user_id of which have logged in at least once unsuccessfully and once successfully logged in.
"buckets" : [
{"key" : "1" ...},
{"key" : "4" ...}
]

How to return hit term in ES ?

I try to return only the terms that were successfully hit instead of the document itself, but I don’t know how to achieve the desired effect。
"es_episode" : {
"aliases" : { },
"mappings" : {
"properties" : {
"endTime" : {
"type" : "long"
},
"episodeId" : {
"type" : "long"
},
"startTime" : {
"type" : "long"
},
"studentIds" : {
"type" : "long"
}
}
}
This is an example:
{
"episodeId":124,
"startTime":10,
"endTime":20,
"studentIds":[200,300]
}
My query:
GET /es_episode/_search
{
"_source": ["studentIds"],
"query": {
"terms": {
"studentIds": [300,400]
}
}
}
The result is
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "es_episode",
"_type" : "episode",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"studentIds" : [
200,
300
]
}
}
]
}
But in fact I only want to know which term hits. For example, the result I want should be studentIds=[300] instead of all studentIds=[200,300] of the returned document. It seems that some additional operations are required, but I don’t know
how.
I try to achieve my goal with the following query
GET /es_episode/_search
{
"_source": ["studentIds"],
"query": {
"terms": {
"studentIds": [300,400]
}
},
"aggs": {
"student_id": {
"terms": {
"field": "studentIds",
"size": 10
},
"aggs": {
"id": {
"terms": {
"field": "episodeId"
}
},
"id_select":{
"bucket_selector": {
"buckets_path": {
"key" : "_key"
},
"script": "params.key==300 || params.key==400"
}
}
}
}
}
}
the result for this is
"aggregations" : {
"student_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 300,
"doc_count" : 1,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 124,
"doc_count" : 1
}
]
}
}
]
}
}
It seems that I successfully filtered out the terms I don’t want, but this doesn’t look pretty, and I need to set my parameters repeatedly in the script

global sorting across different buckets after aggregation in elasticsearch

a sample in my document is as shown below.
{"rackName" : "rack005", "roomName" : "roomB", "power" : 132, "timestamp" : 1594540106208}
the thing I wanna do is get the latest data of each rack in a given room then sort them by power.
with the code below I did something to get close to my target.losing mind with the last step which seems like soring my data cross different buckets by field 'power'.
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"timestamp": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
-----------------------------------result-------------------------------------------------------
"aggregations" : {
"rk_ag" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "rack003",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "0FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack003",
"roomName" : "roomB",
"power" : 115,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack004",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "1FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack004",
"roomName" : "roomB",
"power" : 108,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack005",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "2FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack005",
"roomName" : "roomB",
"power" : 118,
"timestamp" : 1594540114492
},
"sort" : [
1594540114492
]
}
]
}
}
}
]
}
}
You're sorting by timestamp instead of power. Try this instead:
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
You can sort by multiple fields too.
Adding to #Joe's answer. As he mentioned, you can use multiple fields in the sort.
Below query would give you what you are looking for:
POST my_rack_index/_search
{
"size": 0,
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [ <---- Note this part
{
"timestamp": {
"order": "desc"
}
},
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
So now if for every rack you have two documents having same rackName with exact same power, the one with the latest timestamp would be showing up in the response.
The way sort would work is, first it would sort based on the timestamp, then it would do the sorting based on power by keeping the sort based on timestamp intact.

Elasticsearch, terms aggs according to sibling nested fields

Elasticsearch v7.5
Hello and good day!
We have 2 indices named socialmedia and influencers
Sample contents:
socialmedia:
{
'_id' : 1001,
'title' : "Title 1",
'smp_id' : 1,
"latest" : [
{
"soc_mm_score" : "5",
}
]
},
{
'_id' : 1002,
'title' : "Title 2",
'smp_id' : 2,
"latest" : [
{
"soc_mm_score" : "10",
}
]
},
{
'_id' : 1003,
'title' : "Title 3",
'smp_id' : 3,
"latest" : [
{
"soc_mm_score" : "35",
}
]
},
{
'_id' : 1004,
'title' : "Title 4",
'smp_id' : 2,
"latest" : [
{
"soc_mm_score" : "30",
}
]
}
//omitted some other fields
influencers:
{
'_id' : 1,
'name' : "John",
'smp_id' : 1
},
{
'_id' : 2,
'name' : "Peter",
'smp_id' : 2
},
{
'_id' : 3,
'name' : "Mark",
'smp_id' : 3
}
Now I have this simple query that determines which documents in the socialmedia index has the most latest.soc_mm_score value, and also displaying their corresponding influencers determined by the smp_id
GET socialmedia/_search
{
"size": 0,
"_source": "latest",
"query": {
"match_all": {}
},
"aggs": {
"LATEST": {
"nested": {
"path": "latest"
},
"aggs": {
"MM_SCORE": {
"terms": {
"field": "latest.soc_mm_score",
"order": {
"_key": "desc"
},
"size": 3
},
"aggs": {
"REVERSE": {
"reverse_nested": {},
"aggs": {
"SMP_ID": {
"top_hits": {
"_source": ["smp_id"],
"size": 1
}
}
}
}
}
}
}
}
}
}
SAMPLE OUTPUT:
"aggregations" : {
"LATEST" : {
"doc_count" : //omitted,
"MM_SCORE" : {
"doc_count_error_upper_bound" : //omitted,
"sum_other_doc_count" : //omitted,
"buckets" : [
{
"key" : 35,
"doc_count" : 1,
"REVERSE" : {
"doc_count" : 1,
"SMP_ID" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "socialmedia",
"_type" : "index",
"_id" : "1003",
"_score" : 1.0,
"_source" : {
"smp_id" : "3"
}
}
]
}
}
}
},
{
"key" : 30,
"doc_count" : 1,
"REVERSE" : {
"doc_count" : 1,
"SMP_ID" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "socialmedia",
"_type" : "index",
"_id" : "1004",
"_score" : 1.0,
"_source" : {
"smp_id" : "2"
}
}
]
}
}
}
},
{
"key" : 10,
"doc_count" : 1,
"REVERSE" : {
"doc_count" : 1,
"SMP_ID" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "socialmedia",
"_type" : "index",
"_id" : "1002",
"_score" : 1.0,
"_source" : {
"smp_id" : "2"
}
}
]
}
}
}
}
]
}
}
}
with the query above, I was able to successfully display which documents have the highest latest.soc_mm_score values
The sample output above only displays DOCUMENTS, telling that the influencers (a.k.a smp_id) related to them are the TOP INFLUENCERS according to latest.soc_mm_score
Ideally just by using this aggs query,
"terms" : {
"field" : "smp_id"
}
portrays the concept of which influencers are the top according to the doc_count
Now, displaying the terms query according to latest.soc_mm_score displays TOP DOCUMENTS
"terms" : {
"field" : "latest.soc_mm_score"
}
REAL OBJECTIVE:
I want to display the TOP INFLUENCERS according to the latest.soc_mm_count in the socialmedia index. If Elasticsearch can count all the documents where according to unique smp_id, is there a way for ES to sum all latest.soc_mm_score values and use it as terms?
My objective above should output these:
smp_id 2 as the Top Influencer because he has 2 posts (with soc_mm_score of 30 and 10), adding them gets him 40 soc_mm_score
smp_id 3 as the 2nd Top Influencer, he has 1 post with 35 soc_mm_score
smp_id 1 as the 3rd Top Influencer, he has 1 post with 5 soc_mm_score
Is there a proper query to meet this objective?
FINALLY! FOUND AN ANSWER!!!
"aggs": {
"INFS": {
"terms": {
"field": "smp_id.keyword",
"order": {
"LATEST > SUM_SVALUE": "desc"
}
},
"aggs": {
"LATEST": {
"nested": {
"path": "latest"
},
"aggs": {
"SUM_SVALUE": {
"sum" : {
"field": "latest.soc_mm_score"
}
}
}
}
}
}
}
Displays the following sample:

elasticsearch groupby and filter by regex condition

It's a bit hard for me to define the question as I'm not very experienced with Elasticsearch. I'm focusing the question on my specific problem:
Assuming I have the following records:
{
id: 1
name: bla1_1.aaa
},
{
id: 1
name: bla1_2.bbb
},
{
id: 2
name: bla2_1.aaa
},
{
id: 2
name: bla2_2.aaa
}
What I want is to GET all the ids that have all of their names ending with aaa.
I was thinking about group by id and then do a regex query like so: *\.aaa so that all the name must satisfy the regex query.
On this particular example I would get id: 2 back.
How do I do it?
Let me know if there's anything I need to add to clarify the question.
RegexExp can be used.
Wildcard .* matches any character any number of times including zero
Terms aggregation will give you unique "ids" and number of docs under them.
Mapping :
PUT regex
{
"mappings": {
"properties": {
"id":{
"type":"integer"
},
"name":{
"type":"text",
"fields": {
"keyword":{
"type":"keyword"
}
}
}
}
}
}
Data:
"hits" : [
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "olQXjW0BywGFQhV7k84P",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "o1QXjW0BywGFQhV7us6B",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_2.bbb"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pFQXjW0BywGFQhV77c6J",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pVQYjW0BywGFQhV7Dc6F",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_2.aaa"
}
}
]
Query:
GET regex/_search
{
"size":0,
"query": {
"regexp": {
"name.keyword": {
"value": ".*.aaa" ---> name ending with .aaa
}
}
},
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
}
}
}
}
Result:
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_ids" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, ---> 2 doc under id 2
"doc_count" : 2
},
{
"key" : 1, ----> 1 doc under id 1
"doc_count" : 1
}
]
}
}
Edit:
Using bucket selector to keep buckets where total count of docs in Id matches with docs selected in regex
GET regex/_search
{
"size": 0,
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"totalCount": { ---> to get total count of id(all docs)
"value_count": {
"field": "id"
}
},
"filter_agg": {
"filter": {
"bool": {
"must": [
{
"regexp": {
"name.keyword": ".*.aaa"
}
}
]
}
},
"aggs": {
"finalCount": { -->total count of docs matching regex
"value_count": {
"field": "id"
}
}
}
},
"mybucket_selector": { ---> include buckets where totalcount==finalcount
"bucket_selector": {
"buckets_path": {
"FinalCount": "filter_agg>finalCount",
"TotalCount": "totalCount"
},
"script": "params.FinalCount==params.TotalCount"
}
}
}
}
}
}

Resources