Is there a way to get related documents if a match occurs after the query? - elasticsearch

I am currently doing a fuzzy name search on some documents. These documents can be related to each other (for example name field of one document may contain the name and another may contain the alias for the same person). I will give these documents the same unique identifier. My question is, can I get the documents with same unique identifier if a match occurs in any of them?
Suppose that there are 4 documents like this.
{
{
"name": "Bob"
"uid": "1"
},
{
"name": "Bilbo"
"uid": "1"
},
{
"name": "Jack"
"uid": "2"
},
{
"name": "Mary"
"uid" : "3"
}
}
When I query name "Bob", I expect to get both documents with "uid" = "1"
{
{
"name": "Bob"
"uid": "1"
},
{
"name": "Bilbo"
"uid": "1"
}
}

Elasticsearch doesn't have the concept of JOINs, so documents cannot be fetched by joining on "uid". There are two ways to work around this:
1. Using two queries
i. Get documents with name "Bob"
{
"query": {
"term": {
"name.keyword": {
"value": "Bob"
}
}
}
}
ii. Fetch all documents whose "uid" is one of the uids returned by the query above.
2. Using terms and bucket selector aggregation
Mapping:
{
"<mapping_name>" : {
"mappings" : {
"properties" : {
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"uid" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
Query:
1. Create a bucket(collection) of uid.
2. Create a sub-bucket of name which includes only "Bob", so uid 1 will have a bucket with key "Bob" and uid 2 will have no name buckets.
3. Use a bucket_selector aggregation to keep only the uid buckets where the count of name sub-buckets is greater than or equal to 1. This will remove uid 2.
4. Use top_hits aggregation to get documents.
{
"size": 0,
"aggs": {
"uid": {
"terms": {
"field": "uid.keyword",
"size": 10
},
"aggs": {
"documents":{
"top_hits": { --> to get documents under parent term
"size": 10
}
},
"name": {
"terms": {
"field": "name.keyword", --> terms need non_analyzed field so keyword
"include":"Bob", --> get terms with name bob
"size": 10
}
},
"my_bucket":{
"bucket_selector": { --> select buckets which have atleast one name
"buckets_path": {"count":"name._bucket_count"},
"script": "if(params.count>=1) return true;"
}
}
}
}
}
}
Result: All documents with uid 1 (the same uid as "Bob") are returned
"aggregations" : {
"uid" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 2,
"documents" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index61",
"_type" : "_doc",
"_id" : "uCP1-nAB_Wo5RvhlZM6k",
"_score" : 1.0,
"_source" : {
"name" : "Bob",
"uid" : "1"
}
},
{
"_index" : "index61",
"_type" : "_doc",
"_id" : "uSP1-nAB_Wo5Rvhlbc4S",
"_score" : 1.0,
"_source" : {
"name" : "Bilbo",
"uid" : "1"
}
}
]
}
},
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Bob",
"doc_count" : 1
}
]
}
}
]
}
}

Related

Elasticsearch aggregation on different search in same query

I want to make a query whose aggregations are based only on the match query, no matter what other filters (terms, term, etc.) are used.
To be more specific, I have an online shop where I use multiple filters (color, size, etc.). If I check a field, for example color: red, the other colors are no longer aggregated.
A solution that I am using is to make 2 separate queries (one for the search, where the filters are applied, and the other for the aggregations). Any idea how I can combine the 2 separate queries?
You can take advantage of post_filter which will not apply to your aggregations but will only filter the to-be-returned hits. For example:
Create a shop
PUT online_shop
{
"mappings": {
"properties": {
"color": {
"type": "keyword"
},
"size": {
"type": "integer"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
}
}
}
}
Populate it w/ a few products
POST online_shop/_doc
{"color":"red","size":35,"name":"Louboutin High heels abc"}
POST online_shop/_doc
{"color":"black","size":34,"name":"Louboutin Boots abc"}
POST online_shop/_doc
{"color":"yellow","size":36,"name":"XYZ abc"}
Apply a shared query to the hits as well as aggregations and use post_filter to ... post-filter the hits:
GET online_shop/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "abc"
}
}
]
}
},
"aggs": {
"by_color": {
"terms": {
"field": "color"
}
},
"by_size": {
"terms": {
"field": "size"
}
}
},
"post_filter": {
"bool": {
"must": [
{
"term": {
"color": {
"value": "red"
}
}
}
]
}
}
}
Expected result
{
...
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.11750763,
"hits" : [
{
"_index" : "online_shop",
"_type" : "_doc",
"_id" : "cehma3IBG_KW3EFn1QYa",
"_score" : 0.11750763,
"_source" : {
"color" : "red",
"size" : 35,
"name" : "Louboutin High heels abc"
}
}
]
},
"aggregations" : {
"by_color" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "black",
"doc_count" : 1
},
{
"key" : "red",
"doc_count" : 1
},
{
"key" : "yellow",
"doc_count" : 1
}
]
},
"by_size" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 34,
"doc_count" : 1
},
{
"key" : 35,
"doc_count" : 1
},
{
"key" : 36,
"doc_count" : 1
}
]
}
}
}

How to search Parent documents along with count of associated child documents

I am looking for the best way to search parent documents along with the count of their associated child documents. Example:
We have Organization documents and User documents. There could be thousands of users belonging to one particular organization.
Organization document :
{
"id" : "001"
"name" : "orgname1"
}
{
"id" : "002"
"name" : "orgname2"
}
Users documents :
{
"id" : "testusr1"
"name" : "xyz1"
"orgId" : "001"
},
{
"id" : "testusr2"
"name" : "xyz2"
"orgId" : "001"
}
{
"id" : "testusr3"
"name" : "xyz3"
"orgId" : "001"
}
{
"id" : "testusr4"
"name" : "xyz4"
"orgId" : "001"
}
{
"id" : "testusr5"
"name" : "xyz5"
"orgId" : "002"
}
{
"id" : "testusr6"
"name" : "xyz6"
"orgId" : "002"
}
In above example, we have 4 users associated with organization with 001 and 2 users associated with 002. So on front end, admin will search for organization and as a result, I want to give response along with users count for that organization.
You can solve your issue in three ways. Each has its own advantages and disadvantages.
1. Index Parent and child separately
This will require two queries. First you need to query the organization (parent) index to get the organization id, and then query the user (child) index to get the count of users with that orgId.
Advantage:
A change in one index doesn't affect the other index.
Disadvantage:
You need to use two queries.
2. Nested Documents
Mapping:
PUT index9
{
"mappings": {
"properties": {
"id":{
"type": "integer"
},
"name":{
"type": "text",
"fields": {
"keyword":{
"type":"keyword"
}
}
},
"user":{
"type": "nested",
"properties": {
"id":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword"
}
}
},
"name":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword"
}
}
}
}
}
}
}
}
POST index9/_doc
{
"id" : 1,
"name" : "orgname1",
"user":[
{
"id":"testuser1",
"name":"xyz1"
},
{
"id":"testuser2",
"name":"xyz2"
}
]
}
Query:
GET index9/_search
{
"query": {
"match_all": {}
},
"aggs": {
"organization": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"user": {
"nested": {
"path": "user"
},
"aggs": {
"count": {
"value_count": {
"field": "user.id.keyword"
}
}
}
}
}
}
}
}
Result:
"aggregations" : {
"organization" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 1,
"user" : {
"doc_count" : 2,
"count" : {
"value" : 2
}
}
}
]
}
}
Nested documents are faster to query than parent/child.
However, nested docs require reindexing the parent with all its children, while parent/child allows you to reindex / add / delete specific children.
3. Parent Child Relationship
Mapping
{
"my_index" : {
"mappings" : {
"properties" : {
"id" : {
"type" : "keyword"
},
"my_join_field" : {
"type" : "join",
"eager_global_ordinals" : true,
"relations" : {
"organization" : "user"
}
},
"name" : {
"type" : "text"
},
"orgId" : {
"type" : "long"
}
}
}
}
Data:
POST my_index/_doc/1
{
"id": 1,
"name" : "orgname1",
"my_join_field": "organization"
}
POST my_index/_doc/2
{
"id" : 2,
"name" : "orgname2",
"my_join_field": "organization"
}
POST my_index/_doc/3?routing=1
{
"id": "testusr1",
"name": "xyz1",
"orgId": 1,
"my_join_field": {
"name": "user",
"parent": 1
}
}
POST my_index/_doc/4?routing=2
{
"id" : "testusr5",
"name" : "xyz5",
"orgId" : 1,
"my_join_field": {
"name": "user",
"parent": 2
}
}
POST my_index/_doc/5?routing=2
{
"id" : "testusr6",
"name" : "xyz6",
"orgId" : 2,
"my_join_field": {
"name": "user",
"parent": 2
}
}
Query:
{
"query": {
"has_child": {
"type": "user",
"query": { "match_all": {} }
}
},
"aggs": {
"organization": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"user": {
"children": {
"type": "user"
},
"aggs": {
"count": {
"value_count": {
"field": "id"
}
}
}
}
}
}
}
}
Result:
"hits" : [
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "orgname1",
"my_join_field" : "organization"
}
},
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "orgname2",
"my_join_field" : "organization"
}
}
]
},
"aggregations" : {
"organization" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"user" : {
"doc_count" : 1,
"count" : {
"value" : 1
}
}
},
{
"key" : "2",
"doc_count" : 1,
"user" : {
"doc_count" : 2,
"count" : {
"value" : 2
}
}
}
]
}
Benefits:
1. Parent document and children are separate documents
Parent and child can be updated separately without re-indexing the other
It is useful when child documents are large in number and need to be added or
changed frequently.
Child documents can be returned as the results of a search request.

Filter nested objects in ElasticSearch 6.8.1

I couldn't find any answers on how to do a simple thing in ElasticSearch 6.8: I need to filter nested objects.
Index
{
"settings": {
"index": {
"number_of_shards": "5",
"number_of_replicas": "1"
}
},
"mappings": {
"human": {
"properties": {
"cats": {
"type": "nested",
"properties": {
"name": {
"type": "text"
},
"breed": {
"type": "text"
},
"colors": {
"type": "integer"
}
}
},
"name": {
"type": "text"
}
}
}
}
}
Data
{
"name": "iridakos",
"cats": [
{
"colors": 1,
"name": "Irida",
"breed": "European Shorthair"
},
{
"colors": 2,
"name": "Phoebe",
"breed": "european"
},
{
"colors": 3,
"name": "Nino",
"breed": "Aegean"
}
]
}
select human with name="iridakos" and cats with breed contains 'European' (ignore case).
Only two cats should be returned.
Million thanks for helping.
For nested datatypes, you would need to make use of nested queries.
Elasticsearch always returns the entire document as a response. Note that the nested datatype means that every item in the list is treated as an entire document in itself.
Hence, in addition to returning the entire document, if you also want to know the exact hits, you need to make use of the inner_hits feature.
Below query should help you.
POST <your_index_name>/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "iridakos"
}
},
{
"nested": {
"path": "cats",
"query": {
"match": {
"cats.breed": "european"
}
},
"inner_hits": {}
}
}
]
}
}
}
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.74455214,
"hits" : [
{
"_index" : "my_cat_index",
"_type" : "_doc",
"_id" : "1", <--- The document that hit
"_score" : 0.74455214,
"_source" : {
"name" : "iridakos",
"cats" : [
{
"colors" : 1,
"name" : "Irida",
"breed" : "European Shorthair"
},
{
"colors" : 2,
"name" : "Phoebe",
"breed" : "european"
},
{
"colors" : 3,
"name" : "Nino",
"breed" : "Aegean"
}
]
},
"inner_hits" : { <---- Note this
"cats" : {
"hits" : {
"total" : {
"value" : 2, <---- Count of nested doc hits
"relation" : "eq"
},
"max_score" : 0.52354836,
"hits" : [
{
"_index" : "my_cat_index",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "cats",
"offset" : 1
},
"_score" : 0.52354836,
"_source" : { <---- First Nested Document
"breed" : "european"
}
},
{
"_index" : "my_cat_index",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "cats",
"offset" : 0
},
"_score" : 0.39019167,
"_source" : { <---- Second Document
"breed" : "European Shorthair"
}
}
]
}
}
}
}
]
}
}
Note in your response how the inner_hits section would appear where you would find the exact hits.
Hope this helps!
You could use something like this:
{
"query": {
"bool": {
"must": [
{ "match": { "name": "iridakos" }},
{ "match": { "cats.breed": "European" }}
]
}
}
}
To search on a cat's breed, you can use the dot-notation.

combine output of a first filter as input of a second filter

We have an elasticsearch instance with entries with two tagged fields.
sessionid
message
In a first filter, I find all entries where the message contains a certain substring. Each of those entries contains a sessionid.
In a second filter, I want to find all messages, where the sessionid matches one of the sessionids returned by the first filter. This filter should go through all entries a second time.
Example, in the log below (sessionid;message)
1234;miss 1
2456;miss 2
1234;match
When filtering for the string "match" in the message part, I would get as output of the combined query:
1234;miss 1
1234;match
We are using KQL.
Background: We want an easy way to follow complete flows with an error-string in a message, in a multithreaded environment.
I understand why you'd want to do that in one go but it's not possible in ElasticSearch. You cannot "revisit" documents which you've already ruled out by a different query -- searching for "match" would disqualify all the "miss" entries.
It's unfortunate you have the log message combined with the ID but you can try this:
Find all that match match (pun intended) -- I'm assuming you do have a keyword field available
GET your_index/_search
{
"query": {
"regexp": {
"separated_msg.keyword": ".*\\;match.*"
}
}
}
Post-process the hits and extract the session IDs
Run session ID matching:
GET your_index/_search
{
"query": {
"regexp": {
"separated_msg.keyword": "1234;.*"
}
}
}
or on multiple IDs using a bool should:
GET your_index/_search
{
"query": {
"bool": {
"should": [
{
"regexp": {
"separated_msg.keyword": "1234;.*"
}
},
{
"regexp": {
"separated_msg.keyword": "4567;.*"
}
}
]
}
}
}
If a unique numeric value can be assigned to each message ex 1 for "match", 2 for "miss 1" then bucket selector and top_hits can be used.
{
"size": 0,
"aggs": {
"sessionid": {
"terms": {
"field": "sessionid", --> first get all unique sessionids
"size": 10
},
"aggs": {
"documents":{
"top_hits": {
"size": 10
}
},
"messageid": {
"terms": {
"field": "messageid", ---> get unique messageids
"size": 10
},
"aggs": {
"matching_messageid": { ---> select a bucket with key(message Id) as 2
"bucket_selector": {
"buckets_path": {
"key": "_key"
},
"script": "params.key==2"
}
}
}
},
"my_bucket": {
"bucket_selector": {
"buckets_path": {
"hits": "messageid._bucket_count"
},
"script": "params.hits>0"--> if bucket not empty then consider that sessionid
}
}
}
}
}
}
Result
"aggregations" : {
"sessionid" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1234,
"doc_count" : 2,
"documents" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index31",
"_type" : "_doc",
"_id" : "MTAYpnABheSAx2q_eNEF",
"_score" : 1.0,
"_source" : {
"sessionid" : 1234,
"message" : "miss 1",
"messageid" : 1
}
},
{
"_index" : "index31",
"_type" : "_doc",
"_id" : "MjAYpnABheSAx2q_n9FW",
"_score" : 1.0,
"_source" : {
"sessionid" : 1234,
"message" : "match",
"messageid" : 2
}
}
]
}
},
"messageid" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2,
"doc_count" : 1
}
]
}
}
]
}
}
If a given message has timestamp(max/min) then max_path can be used to select buckets with given messages.
The best approach to above problem will be to use nested documents
{
"sessionid":1234,
"messages":[
{
"message":"match"
},
{
"message":"miss 1"
}
]
}
Then the problem can be resolved with a nested query. If Logstash is used, the above structure can be generated while indexing.

elasticsearch groupby and filter by regex condition

It's a bit hard for me to define the question as I'm not very experienced with Elasticsearch. I'm focusing the question on my specific problem:
Assuming I have the following records:
{
id: 1
name: bla1_1.aaa
},
{
id: 1
name: bla1_2.bbb
},
{
id: 2
name: bla2_1.aaa
},
{
id: 2
name: bla2_2.aaa
}
What I want is to GET all the ids that have all of their names ending with aaa.
I was thinking about group by id and then do a regex query like so: *\.aaa so that all the name must satisfy the regex query.
On this particular example I would get id: 2 back.
How do I do it?
Let me know if there's anything I need to add to clarify the question.
A regexp query can be used.
Wildcard .* matches any character any number of times including zero
Terms aggregation will give you unique "ids" and number of docs under them.
Mapping :
PUT regex
{
"mappings": {
"properties": {
"id":{
"type":"integer"
},
"name":{
"type":"text",
"fields": {
"keyword":{
"type":"keyword"
}
}
}
}
}
}
Data:
"hits" : [
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "olQXjW0BywGFQhV7k84P",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "o1QXjW0BywGFQhV7us6B",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_2.bbb"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pFQXjW0BywGFQhV77c6J",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pVQYjW0BywGFQhV7Dc6F",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_2.aaa"
}
}
]
Query:
GET regex/_search
{
"size":0,
"query": {
"regexp": {
"name.keyword": {
"value": ".*.aaa" ---> name ending with .aaa
}
}
},
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
}
}
}
}
Result:
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_ids" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, ---> 2 doc under id 2
"doc_count" : 2
},
{
"key" : 1, ----> 1 doc under id 1
"doc_count" : 1
}
]
}
}
Edit:
Use a bucket_selector to keep only the buckets where the total count of docs for an id equals the count of docs matching the regex:
GET regex/_search
{
"size": 0,
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"totalCount": { ---> to get total count of id(all docs)
"value_count": {
"field": "id"
}
},
"filter_agg": {
"filter": {
"bool": {
"must": [
{
"regexp": {
"name.keyword": ".*.aaa"
}
}
]
}
},
"aggs": {
"finalCount": { -->total count of docs matching regex
"value_count": {
"field": "id"
}
}
}
},
"mybucket_selector": { ---> include buckets where totalcount==finalcount
"bucket_selector": {
"buckets_path": {
"FinalCount": "filter_agg>finalCount",
"TotalCount": "totalCount"
},
"script": "params.FinalCount==params.TotalCount"
}
}
}
}
}
}

Resources