elastic - query multiple levels on nested object in inner_hits - elasticsearch

i have a huge nested object which has lots of levels
i want to create a query which will return only the leaf / some object in the middle,
and the query is supposed to query multiple levels in the tree.
for example:
my DB is saving the whole company structure.
company -> wards -> employees -> working hours
i want to make a query that will return only the working hours of the employees in ward 2 which started later than 3pm this month
i tried to use inner_hits - but to no use.
as requested, sample document and expected result:
company:[{
properties:{companyId: 112}
ward:[{
properties: {wardId: 223}
employee:{
properties: {employeeId: 334},
workingHours: [
{ date: "1.1.2021", numOfHours: 4},
{ date: "1.2.2021", numOfHours: 7}
]
}]
}]
}]
the query:
I need to return the working hours of date "1.2.21" , of employee 334, of ward 223. and only the working hours, not the whole tree.
expected result:
4 or { date: "1.1.2021", numOfHours: 4} , whatever is simpler
hope its clear now

You need to add inner_hits to all nested queries
You can either parse entire result to get matched working hours(from inner hits) o can use response filtering to remove additional data
Mapping
PUT index123
{
"mappings": {
"properties": {
"company": {
"type": "nested",
"properties": {
"ward": {
"type": "nested",
"properties": {
"employee": {
"type": "nested",
"properties": {
"workingHours": {
"type": "nested",
"properties": {
"date": {
"type": "date"
}
}
}
}
}
}
}
}
}
}
}
}
Data
"_index" : "index123",
"_type" : "_doc",
"_id" : "9gGYI3oBt-MOenya6BcN",
"_score" : 1.0,
"_source" : {
"company" : [
{
"companyId" : 112,
"ward" : [
{
"wardId" : 223,
"employee" : {
"employeeId" : 334,
"workingHours" : [
{
"date" : "2021-01-01",
"numOfHours" : 4
},
{
"date" : "2021-01-02",
"numOfHours" : 7
}
]
}
}
]
}
]
}
}
Query
GET index123/_search?filter_path=hits.hits.inner_hits.ward.hits.hits.inner_hits.employee.hits.hits.inner_hits.workingHours.hits.hits._source
{
"query": {
"nested": {
"inner_hits": {
"name":"ward"
},
"path": "company.ward",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.wardId": {
"value": 223
}
}
},
{
"nested": {
"inner_hits": {
"name":"employee"
},
"path": "company.ward.employee",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.employee.employeeId": {
"value":334
}
}
},
{
"nested": {
"inner_hits": {
"name":"workingHours"
},
"path": "company.ward.employee.workingHours",
"query": {
"range": {
"company.ward.employee.workingHours.date": {
"gte": "2021-01-01",
"lte": "2021-01-01"
}
}
}
}
}
]
}
}
}
}
]
}
}
}
}
}
Result
{
"hits" : {
"hits" : [
{
"inner_hits" : {
"ward" : {
"hits" : {
"hits" : [
{
"inner_hits" : {
"employee" : {
"hits" : {
"hits" : [
{
"inner_hits" : {
"workingHours" : {
"hits" : {
"hits" : [
{
"_source" : {
"date" : "2021-01-01",
"numOfHours" : 4
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
Update:
Query with company ID
GET index123/_search?filter_path=hits.hits.inner_hits.company.hits.hits.inner_hits.ward.hits.hits.inner_hits.employee.hits.hits.inner_hits.workingHours.hits.hits._source
{
"query": {
"nested": {
"path": "company",
"inner_hits": {
"name": "company"
},
"query": {
"bool": {
"must": [
{
"term": {
"company.companyId": {
"value": 112
}
}
},
{
"nested": {
"inner_hits": {
"name": "ward"
},
"path": "company.ward",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.wardId": {
"value": 223
}
}
},
{
"nested": {
"inner_hits": {
"name": "employee"
},
"path": "company.ward.employee",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.employee.employeeId": {
"value": 334
}
}
},
{
"nested": {
"inner_hits": {
"name": "workingHours"
},
"path": "company.ward.employee.workingHours",
"query": {
"range": {
"company.ward.employee.workingHours.date": {
"gte": "2021-01-01",
"lte": "2021-01-01"
}
}
}
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
}
}
}

Related

Elastic search combine must and must_not

I have a document that holds data for a product the mapping is as follow:
"mappings" : {
"properties" : {
"view_score" : {
"positive_score_impact" : true,
"type" : "rank_feature"
},
"recipients" : {
"dynamic" : false,
"type" : "nested",
"enabled" : true,
"properties" : {
"type" : {
"similarity" : "boolean",
"type" : "keyword"
},
"title" : {
"type" : "text",
"fields" : {
"key" : {
"type" : "keyword"
}
}
}
}
}
}
}
And I have 2 documents with the following data:
{
"view_score": 10,
"recipients": [{"type":"gender", "title":"male"}, {"type":"gender", "title":"female"}]
}
{
"view_score": 10,
"recipients": [{"type":"gender", "title":"female"}]
}
When a user searches for a product she can say "I prefer products for females" so The products which specifies gender as just female should come before products that specifies gender as male and female both.
I have the following query which gives more score to products with just female gender:
GET _search
{
"sort": [
"_score"
],
"query": {
"script_score": {
"query": {
"bool": {
"should": [
{
"nested": {
"path": "recipients",
"ignore_unmapped": true,
"query": {
"bool": {
"boost": 10,
"must": [
{
"term": {
"recipients.type": "gender"
}
},
{
"match": {
"recipients.title": "female"
}
}
],
"must_not": {
"bool": {
"filter": [
{
"term": {
"recipients.type": "gender"
}
},
{
"match": {
"recipients.title": "male"
}
}
]
}
}
}
}
}
}
]
}
},
"script": {
"source": "return _score;"
}
}
}
}
But if I add another query to should query it won't behave the same and gives the same score to products with one or two genders in their specifications.
here is my final query which wont work as expected:
GET _search
{
"sort": [
"_score"
],
"query": {
"script_score": {
"query": {
"bool": {
"should": [
{
"rank_feature": {
"field": "view_score",
"linear": {}
}
},
{
"nested": {
"path": "recipients",
"ignore_unmapped": true,
"query": {
"bool": {
"boost": 10,
"must": [
{
"term": {
"recipients.type": "gender"
}
},
{
"match": {
"recipients.title": "female"
}
}
],
"must_not": {
"bool": {
"filter": [
{
"term": {
"recipients.type": "gender"
}
},
{
"match": {
"recipients.title": "male"
}
}
]
}
}
}
}
}
}
]
}
},
"script": {
"source": "return _score;"
}
}
}
}
So my problem is how to combine these should clause together to give more weight to the products that specify only one gender.

In ElasticSearch break down hits per filter?

Given the following query, how can I get the number of hits independently for each range and term query and what are the performance implications for this? As of yet, I can't find anything in the documentation that indicates how to do this. Where can I find the docs for such a feature?
{
"query": {
"bool" : {
"must" : {
"term" : { "user.id" : "kimchy" }
},
"filter": {
"term" : { "tags" : "production" }
},
"must_not" : {
"range" : {
"age" : { "gte" : 10, "lte" : 20 }
}
},
You can use filter aggregation for getting document count per query clause. As you are providing query as well, you need to use global aggregation with filter aggregation. If you dont use global aggregation then it will return count based on top level query and you will not able to get total document for specific query clause.
Below is sample query with aggregation:
{
"query": {
"bool": {
"must": {
"term": {
"user.id": "kimchy"
}
},
"filter": {
"term": {
"tags": "production"
}
},
"must_not": {
"range": {
"age": {
"gte": 10,
"lte": 20
}
}
}
}
},
"aggs": {
"Total": {
"global": {},
"aggs": {
"user_term": {
"filter": {
"term": {
"user.id": "kimchy"
}
}
},
"tag_term": {
"filter": {
"term": {
"tags": "production"
}
}
},
"age_range_not": {
"filter": {
"bool": {
"must_not": {
"range": {
"age": {
"gte": 10,
"lte": 20
}
}
}
}
}
},
"age_range": {
"filter": {
"range": {
"age": {
"gte": 10,
"lte": 20
}
}
}
}
}
}
}
}
You will get below response:
"aggregations" : {
"Total" : {
"doc_count" : 3,
"age_range" : {
"doc_count" : 2
},
"age_range_not" : {
"doc_count" : 1
},
"tag_term" : {
"doc_count" : 3
},
"user_term" : {
"doc_count" : 2
}
}
}

ElasticSearch Aggregation Filter (not nested) Array

I have mapping like that:
PUT myindex1/_mapping
{
"properties": {
"program":{
"properties":{
"rounds" : {
"properties" : {
"id" : {
"type" : "keyword"
},
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
And my example docs:
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000000", "name":"Test1"},
{"id":"00000000-0000-0000-0000-000000000001", "name":"Fact2"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000002", "name":"Test3"},
{"id":"00000000-0000-0000-0000-000000000003", "name":"Fact4"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000004", "name":"Test5"},
{"id":"00000000-0000-0000-0000-000000000005", "name":"Fact6"}
]
}
}
Purpose: get only names of rounds that filtered as wildcard by user.
Aggregation query:
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must":[
{
"wildcard": {
"program.rounds.name": "*test*"
}
}
]
}
}
}
},
"size": 0
}
This aggregation returns all 6 names, but I need only Test1,Test3,Test5. Also tried include": "/tes.*/i" regex pattern for terms, but ignore case does not work.
Note: I'm note sure abount nested type, because I don't interested in association between Id and Name (at least for now).
ElasticSearch version: 7.7.0
If you want to only aggregate specific rounds based on a condition on the name field, then you need to make rounds nested, otherwise all name values end up in the same field.
Your mapping needs to be changed to this:
PUT myindex1/
{
"mappings": {
"properties": {
"program": {
"properties": {
"rounds": {
"type": "nested", <--- add this
"properties": {
"id": {
"type": "keyword"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
And then your query needs to change to this:
GET myindex1/_search
{
"size": 0,
"query": {
"nested": {
"path": "program.rounds",
"query": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"aggs": {
"rounds": {
"nested": {
"path": "program.rounds"
},
"aggs": {
"name_filter": {
"filter": {
"wildcard": {
"program.rounds.name": "*Test*"
}
},
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
}
}
}
}
}
}
And the result will be:
"aggregations" : {
"rounds" : {
"doc_count" : 6,
"name_filter" : {
"doc_count" : 3,
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Test1",
"doc_count" : 1
},
{
"key" : "Test3",
"doc_count" : 1
},
{
"key" : "Test5",
"doc_count" : 1
}
]
}
}
}
}
UPDATE:
Actually, you can achieve what you want without introducing nested types with the following query. You were close, but the include pattern was wrong
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"include": "[Tt]est.*",
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"size": 0
}

ElasticSearch filter by nested boolean type fields

I need to query on multiple nested fields on boolean types.
Structure of mapping:
"mappings" : {
"properties" : {
"leaders" : {
"type" : "nested",
"properties" : {
"except_1" : {
"type" : "boolean"
},
"except_2" : {
"type" : "boolean"
},
"counter" : {
"type" : "integer"
}
}
}
}
}
I am trying to use query both except1 and except2 only to False.
Below my try, unfortunately it returns True and False for both fields and I cannot fix it.
"query": {
"nested": {
"path": "leaders",
"query": {
"bool": {
"must": [
{
"term": {
"leaders.except_1": False
}
},
{
"term": {
"leaders.except_2": False
}
}
]
}
}
}
}
What you're probably looking for is the inner_hits option -- showing only the matched nested subdocuments.
PUT leaders
{"mappings":{"properties":{"leaders":{"type":"nested","properties":{"except_1":{"type":"boolean"},"except_2":{"type":"boolean"},"counter":{"type":"integer"}}}}}}
POST leaders/_doc
{
"leaders": [
{
"except_1": true,
"except_2": false
},
{
"except_1": false,
"except_2": false
}
]
}
GET leaders/_search
{
"query": {
"nested": {
"path": "leaders",
"inner_hits": {},
"query": {
"bool": {
"must": [
{
"term": {
"leaders.except_1": false
}
},
{
"term": {
"leaders.except_2": false
}
}
]
}
}
}
}
}
then
GET leaders/_search
{
"query": {
"nested": {
"path": "leaders",
"inner_hits": {},
"query": {
"bool": {
"must": [
{
"term": {
"leaders.except_1": false
}
},
{
"term": {
"leaders.except_2": false
}
}
]
}
}
}
}
}
yielding
{
"hits":[
{
"_index":"leaders",
"_type":"_doc",
"_id":"u-he8HEBG_KW3EFn-gMz",
"_score":0.87546873,
"_source":{ <-- default behavior
"leaders":[
{
"except_1":true,
"except_2":false
},
{
"except_1":false,
"except_2":false
}
]
},
"inner_hits":{
"leaders":{
"hits":{
"total":{
"value":1,
"relation":"eq"
},
"max_score":0.87546873,
"hits":[ <------- only the matching nested subdocument
{
"_index":"leaders",
"_type":"_doc",
"_id":"u-he8HEBG_KW3EFn-gMz",
"_nested":{
"field":"leaders",
"offset":1
},
"_score":0.87546873,
"_source":{
"except_1":false,
"except_2":false
}
}
]
}
}
}
}
]
}
Furthermore, you can force the system to only return the inner_hits by saying "_source": "inner_hits" on the top-level of your search query.

Elasticsearch: aggregation and select docs only having max value of field

I am using elastic search 6.5.
Basically, based on my query my index can return multiple documents, I need only those documents which has the max value for a particular field.
E.g.
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"size" : 10000
}
The above query will return lots of documents/messages as per the query. The sample data returned is:
"header" : {
"id" : "Text_20190702101200123_111",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T10:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702151200123_121",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex2",
"timestamp": "2019-07-02T15:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702081200133_124",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T08:12:00.133Z",
}
}
So based on the above 3 documents, I only want the max timestamp one to be shown i.e. "timestamp": "2019-07-02T15:12:00.123Z"
I only want one document in above example.
I tried doing it as below:
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "header.id",
"order": { "group_docs" : "desc" }
},
"aggs" : {
"group_docs": { "max" : { "field": "body.meta.tiemstamp" } }
}
}
},
"size": "10000"
}
Executing the above, I am still getting all the 3 documents, instead of only one.
I do get the buckets though, but I need only one of them and not all the buckets.
The output in addition to all the records,
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Text_20190702151200123_121",
"doc_count": 29,
"group_docs": {
"value": 1564551683867,
"value_as_string": "2019-07-02T15:12:00.123Z"
}
},
{
"key": "Text_20190702101200123_111",
"doc_count": 29,
"group_docs": {
"value": 1564551633912,
"value_as_string": "2019-07-02T10:12:00.123Z"
}
},
{
"key": "Text_20190702081200133_124",
"doc_count": 29,
"group_docs": {
"value": 1564510566971,
"value_as_string": "2019-07-02T08:12:00.133Z"
}
}
]
}
}
What am I missing here?
Please note that I can have more than one messages for same timestamp. So I want them all i.e. all the messages/documents belonging to the max time stamp.
In above example there are 29 messages for same timestamp (It can go to any number). So there are 29 * 3 messages being retrieved by my query after using the above aggregation.
Basically I am able to group correctly, I am looking for something like HAVING in SQl?

Resources