Getting only the most recent records from ElasticSearch - elasticsearch

I have a user index of ElasticSearch where each user has a name and multiple other user related information and also an indexedAt field which specify when the user information is being indexed. When any information of user changes I create a new record of the user and store it. Therefore each user can have many multiple records in the index.
Now Simply I want to get only the most up to date information of the queried users.
For example if I run the following query, it will return all of the records of John and Smith. But I want only the most recent record for each of the users.
{
"size": 10000,
"query": {
"bool": {
"should": [
{
"match_phrase": {
"name": "John"
}
},
{
"match_phrase": {
"name": "Smith"
}
}
]
}
},
"sort": [
{
"indexedAt": {
"order": "desc"
}
}
]
}

You can use inner_hits to get your answer
GET /temp_index/_search
{
"size": 10,
"query": {
"bool": {
"should": [
{
"match_phrase": {
"name": "John"
}
},
{
"match_phrase": {
"name": "Smith"
}
}
]
}
},
"collapse": {
"field": "name.keyword",
"inner_hits": {
"name": "most_recent",
"size": 1,
"sort": [{"indexedAt": "desc"}]
}
}
}
This will get you a result similar to below
{
"_index" : "temp_index",
"_type" : "_doc",
"_id" : "KSHBjnMBPr3VGlJjXe3d",
"_score" : 0.8266786,
"_source" : {
"name" : "John",
"indexedAt" : 1015
},
"fields" : {
"name.keyword" : [
"John"
]
},
"inner_hits" : {
"most_recent" : {
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "temp_index",
"_type" : "_doc",
"_id" : "LyHBjnMBPr3VGlJji-24",
"_score" : null,
"_source" : {
"name" : "John",
"indexedAt" : 1050
},
"sort" : [
1050
]
}
]
}
}
}
},
You can access the inner_hits portion to get the document which was most recently indexed (i.e. with the largest indexedAt value)

Related

ElasticSearch aggregation for filtering users who had both events

I've written a query which perfectly return me 6000+ events my users had:
GET /<app_logs-2022.11.23*>/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"context.identity.type": "login"
}
},
{
"term": {
"context.identity.type": "login_error"
}
}
],
"minimum_should_match": 1
}
},
"_source": [
"context.identity.user_id",
"context.identity.type"
],
"size": 3
}
And i get such set of data
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 15,
"successful" : 15,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6001,
"max_score" : 10.722837,
"hits" : [
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "bb469377-0618-49a6-a643-1201dc84c829",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "8f4e82a0-f333-4096-bfb6-767fed924093",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login_error"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "7090be5a-8b53-4723-a1ac-223476a000f1",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "75bcb301-1cee-4b3b-aa1b-adbe4c011388",
"type" : "login_error"
}
}
}
}
]
}
}
But i can't figure out how to get a number of users who had both login and login_error events, i've tried cardinality aggregation, terms and several more but all of them just split types into buckets showing sum but doesn't group by user, and i want to find how many users had problems first but then managed to login in the end.
The best i've managed to achieve is to get buckets by user_id and output cardinality for each of them by type
"aggs": {
"results": {
"terms": {
"field": "context.identity.user_id",
"size": 300
},
"aggs": {
"events": {
"cardinality": {
"field": "context.identity.type"
}
}
}
}
}
I created an example based on the sentence I want to find how many users had problems first but then managed to login in at the end.
It works like this:
Make aggs based on user_id
Make sub-aggs by type
Ignore the ones that don't contain login_error
#put mapping
PUT test_stack_login
{
"mappings": {
"properties": {
"context.identity.user_id": {
"type": "keyword"
},
"context.identity.type": {
"type": "keyword"
}
}
}
}
#put example docs
POST test_stack_login/_bulk?refresh&pretty
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":2,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":3,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login"}
#Run the query
GET test_stack_login/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "context.identity.user_id",
"size": 1000
},
"aggs": {
"context_identity_type": {
"terms": {
"field": "context.identity.type",
"size": 10
}
},
"login_error_exist": {
"bucket_selector": {
"buckets_path": {
"var1": "context_identity_type['login_error']>_count"
},
"script": "params.var1 != null"
}
}
}
}
}
}
#the result will be like in the ss
You will get user_id containing both login and login_error information in the context.identity.type field. The keys in the response will give you the user_id of which have logged in at least once unsuccessfully and once successfully logged in.
"buckets" : [
{"key" : "1" ...},
{"key" : "4" ...}
]

Elasticsearhc filter sub object before search

Let's say I have index like this:
{
"id": 6,
"name": "some name",
"users": [
{
"id": 1,
"name": "User1",
"isEnabled": false,
},
{
"id": 2,
"name": "User2",
"isEnabled": false,
},
{
"id": 3,
"name": "User3,
"isEnabled": true,
},
]
}
what I need is to return that index while user searching for the name some name, but also I want to filter out all not enabled users, and if there is not enabled users omit that index.
I tried to use filters like this:
{
"query": {
"bool": {
"must": {
"match": {
"name": "some name"
}
},
"filter": {
"term": {
"users.isEnabled": true
}
}
}
}
}
but in such a case I see index with all users no matter if user is enabled or not. I'm a bit new but is there a way to do so??? I can filter out all that in code after getting data from elasticsearch but in such a case it can break pagination if I remove some index without enabled users from result set.
I'm a bit new to elasticsearch, but as far I can't find how to do it. Thank you in advice!
Elasticsearch will return whole document if there is any match. If you update your mapping and make users array nested, you can achieve this by using inner hits. This is a basic example mapping that works:
{
"mappings": {
"properties": {
"name": {
"type": "text"
},
"users": {
"type": "nested"
}
}
}
}
And if you send a query like following, response will contain id and name from the parent document, and it will contain inner_hits that match to your user's isEnabled query.
{
"_source": ["id", "name"],
"query": {
"bool": {
"must": [
{
"match": {
"name": "some name"
}
},
{
"nested": {
"path": "users",
"query": {
"term": {
"users.isEnabled": {
"value": true
}
}
},
"inner_hits": {}
}
}
]
}
}
}
This is an example response
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9375811,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.9375811,
"_source" : {
"name" : "some name",
"id" : 6
},
"inner_hits" : {
"users" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.540445,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "users",
"offset" : 2
},
"_score" : 1.540445,
"_source" : {
"id" : 3,
"name" : "User3",
"isEnabled" : true
}
}
]
}
}
}
}
]
}
}
Then you can do the mapping in the application.

elasticsearch groupby and filter by regex condition

It's a bit hard for me to define the question as I'm not very experienced with Elasticsearch. I'm focusing the question on my specific problem:
Assuming I have the following records:
{
id: 1
name: bla1_1.aaa
},
{
id: 1
name: bla1_2.bbb
},
{
id: 2
name: bla2_1.aaa
},
{
id: 2
name: bla2_2.aaa
}
What I want is to GET all the ids that have all of their names ending with aaa.
I was thinking about group by id and then do a regex query like so: *\.aaa so that all the name must satisfy the regex query.
On this particular example I would get id: 2 back.
How do I do it?
Let me know if there's anything I need to add to clarify the question.
RegexExp can be used.
Wildcard .* matches any character any number of times including zero
Terms aggregation will give you unique "ids" and number of docs under them.
Mapping :
PUT regex
{
"mappings": {
"properties": {
"id":{
"type":"integer"
},
"name":{
"type":"text",
"fields": {
"keyword":{
"type":"keyword"
}
}
}
}
}
}
Data:
"hits" : [
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "olQXjW0BywGFQhV7k84P",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "o1QXjW0BywGFQhV7us6B",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_2.bbb"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pFQXjW0BywGFQhV77c6J",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pVQYjW0BywGFQhV7Dc6F",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_2.aaa"
}
}
]
Query:
GET regex/_search
{
"size":0,
"query": {
"regexp": {
"name.keyword": {
"value": ".*.aaa" ---> name ending with .aaa
}
}
},
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
}
}
}
}
Result:
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_ids" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, ---> 2 doc under id 2
"doc_count" : 2
},
{
"key" : 1, ----> 1 doc under id 1
"doc_count" : 1
}
]
}
}
Edit:
Using bucket selector to keep buckets where total count of docs in Id matches with docs selected in regex
GET regex/_search
{
"size": 0,
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"totalCount": { ---> to get total count of id(all docs)
"value_count": {
"field": "id"
}
},
"filter_agg": {
"filter": {
"bool": {
"must": [
{
"regexp": {
"name.keyword": ".*.aaa"
}
}
]
}
},
"aggs": {
"finalCount": { -->total count of docs matching regex
"value_count": {
"field": "id"
}
}
}
},
"mybucket_selector": { ---> include buckets where totalcount==finalcount
"bucket_selector": {
"buckets_path": {
"FinalCount": "filter_agg>finalCount",
"TotalCount": "totalCount"
},
"script": "params.FinalCount==params.TotalCount"
}
}
}
}
}
}

Filter Full Text Search based on User ID

GET _search
{
"query": {
"match": {
"content": "this test"
}
}
}
This gave me below result:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 6,
"successful" : 6,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : 0.5753642,
"hits" : [
{
"_index" : "inbox",
"_type" : "mailbox",
"_id" : "6bb174ab-a4ce-4409-a626-c9a42c98b89e",
"_score" : 0.5753642,
"_source" : {
"user_id" : 13,
"content" : "This is a test"
}
},
{
"_index" : "inbox",
"_type" : "mailbox",
"_id" : "1304cf2e-a1d4-40ca-9876-9abb08c4474d",
"_score" : 0.36464313,
"_source" : {
"user_id" : 10,
"content" : "This is a test"
}
},
{
"_index" : "inbox",
"_type" : "mailbox",
"_id" : "623c093c-4408-445e-abb1-460d2c5004cd",
"_score" : 0.36464313,
"_source" : {
"user_id" : 15,
"content" : "This is a test"
}
}
]
}
}
Which is good. However, I need to filter them by user_id. I mean I need to score only specific user and their content.
GET _search
{
"query": {
"match": {
"content": "this test",
"user_id": 10
}
}
}
When I add user_id i get this error:
{
"error": {
"root_cause": [
{
"type": "parsing_exception",
"reason": "[match] query doesn't support multiple fields, found [content] and [user_id]",
"line": 5,
"col": 18
}
],
"type": "parsing_exception",
"reason": "[match] query doesn't support multiple fields, found [content] and [user_id]",
"line": 5,
"col": 18
},
"status": 400
}
Why? And How to properly filter based on user_id?
You can use term query to filter the result by user_id.
Update your query as below:
{
"query": {
"bool": {
"must": [
{
"match": {
"content": "this test"
}
}
],
"filter": [
{
"term": {
"user_id": 10
}
}
]
}
}
}
The query should be like this:
{
"query": {
"bool": {
"must": [
{
"match": {
"content": "this test"
}
},
{
"match": {
"user_id": 10
}
}
]
}
}
}
Use bool query to combine filters
{
"query": {
"bool": {
"must": [
{
"match": {
"content": "this is content"
}
},
{
"term": {
"user_id": {
"value": 47545
}
}
}
]
}
}
}

ElasticSearch get last n distinct records

I am trying to implement a search query over records stored in elasticsearch.
The record structure looks something like this.
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "pWjQLWkBIJk0ORjd0X2P",
"_score" : null,
"_source" : {
"transactionID" : "60ab66cf24c9924f562bf1a2b5d92305d0a6",
"boxNumber" : "Box3",
"createDate" : "2013-09-17T00:00:00",
"itemNumber" : "Item1",
"address" : "Sample Address"
}
}
one box can contain multiple items. For example Box3 can have Item1, Item2 and Item3. So in elasticsearch i will have 3 different documents. Also at the same time, same box and same item can also exist but with different address. The transactionID may or maynot be the same for these documents.
My requirement is to fetch last n recent and distinct transactionIDs, along with their records.
I tried following query to fetch last 7 distinct transactionIDs
GET /box_info_store/boxes/_search?size=7
{
"query": {
"bool": {
"must": [
{"match":{"boxNumber":"Box3"}},
{"match":{"itemNumber":"Item1"}}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": { "field": "transactionID"}
}
}
}
This fetched me last 7 documents where boxNumber is Box3 and itemNumber is Item1, but not 7 distinct transactionIDs, two out of these seven documents have the same transactionID(both having separate address though).
But my requirement is to get 7 distinct transactionIds, no matter how many document it returns.
Hope i was able to explain myself.
Appreciate any kind of help here
Thanks
------Edited #gaurav9620, i ran the first query and got count as 32, then i ran the second query with distinct count as 3 i got the following result
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 32,
"max_score" : null,
"hits" : [
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "RWjRLWkBIJk0ORjdEX-L",
"_score" : null,
"_source" : {
"transactionID" : "3087e106244f6247a5290fb21ce64254529c",
"boxNumber" : "Box3",
"createDate" : "2017-11-15T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress12",
},
"sort" : [
1510704000000
]
},
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "MGjQLWkBIJk0ORjdwX0M",
"_score" : null,
"_source" : {
"transactionID" : "60ab66cf24c9924f562bf1a2b5d92305d0a6",
"boxNumber" : "Box3",
"createDate" : "2016-04-03T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress321",
},
"sort" : [
1459641600000
]
},
..........
..........
..........
{
"_index" : "box_info_store",
"_type" : "boxes",
"_id" : "AGjRLWkBIJk0ORjdK4CJ",
"_score" : null,
"_source" : {
"transactionID" : "3087e106244f6247a5290fb21ce64254529c",
"boxNumber" : "Box3",
"createDate" : "1996-02-16T00:00:00",
"itemNumber" : "Item1",
"address" : "sampleAddress4324",
},
"sort" : [
824428800000
]
}
]
},
"aggregations" : {
"unique_transactions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 16,
"buckets" : [
{
"key" : "3087e106244f6247a5290fb21ce64254529c",
"doc_count" : 6
},
{
"key" : "27c5f3422f4482495d29e7b2c15c0e311743",
"doc_count" : 5
},
{
"key" : "c40e53212e74e24bf02a5bd2b134cf92bffb",
"doc_count" : 5
}
]
}
}
}
The size which you have used : represents number of raw documents that are retrieved.
If your case what you need to do is :
Mention size as 0 -> which will return you no raw documents
Include a size parameter in aggregation which will return you unique 7 ids.
GET /box_info_store/boxes/_search?size=7
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": {
"field": "transactionID",
"size": 7
}
}
}
}
EDIT-------------------------------------
First fire this query
GET /box_info_store/boxes/_search?size=0
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
}
}
Here you will find total number of documents matching your query which you can set as n
After this fire your query as below
GET /box_info_store/boxes/_search?size=**n**
{
"query": {
"bool": {
"must": [
{
"match": {
"boxNumber": "Box3"
}
},
{
"match": {
"itemNumber": "Item1"
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"aggs": {
"distinct_transactions": {
"terms": {
"field": "transactionID",
"size": NUMBER_OF_UNIQUE_TRANSACTION_IDS_TO_BE_FETCHED
}
}
}
}

Resources