Combining missing and term query in nested document in Elasticsearch - elasticsearch

I have these 3 documents, where fields is of type nested:
{
"fields": [
{"field_id": 23, "value": "John Doe"},
{"field_id": 92, "value": null}
]
}
{
"fields": [
{"field_id": 23, "value": "Ada Lovelace"},
]
}
{
"fields": [
{"field_id": 23, "value": "Jack Daniels"},
{"field_id": 92, "value": "jack#example.com"}
]
}
I need to search for documents where:
(`field_id` = `92` AND `value` is `null`) OR (`field_id` `92` is missing.)
Combining a terms and missing query leads to only the document with the null value being returned:
...
"nested": {
"path": "fields",
"filter": {
"bool": {
"bool": {
"must": [
{
"missing": {
"field": "fields.value"
}
},
{
"terms": {
"fields.field_id": [92]
}
}
]
}
}
}
}
...
How can I do this?

You already have a query for the first condition; let's call it A. For the second condition, check for fields.field_id: 92 in the nested documents; let's call this B. But your condition is that fields.field_id: 92 should not exist, so wrap B in must_not to get B'.
What is required is A OR B'
So the final query will be:
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "fields",
"query": {
"bool": {
"must": [
{
"term": {
"fields.field_id": 92
}
}
],
"must_not": [
{
"exists": {
"field": "fields.value"
}
}
]
}
}
}
},
{
"bool": {
"must_not": [
{
"nested": {
"path": "fields",
"query": {
"term": {
"fields.field_id": 92
}
}
}
}
]
}
}
]
}
}
}

Related

ElasticSearch: Query nested array for empty and specific value in single query

Documents structure -
{
"hits": [
{
"_type": "_doc",
"_id": "ef0a2c44179a513476b080cc2a585d95",
"_source": {
"DIVISION_NUMBER": 44,
"MATCHES": [
{
"MATCH_STATUS": "APPROVED",
"UPDATED_ON": 1599171303000
}
]
}
},
{
"_type": "_doc",
"_id": "ef0a2c44179a513476b080cc2a585d95",
"_source": {
"DIVISION_NUMBER": 44,
"MATCHES": [ ]
}
}
]
}
Question - MATCHES is a nested array; inside it there is a text field MATCH_STATUS that can have any value, say "APPROVED" or "REJECTED".
I am looking to search for ALL documents that contain MATCH_STATUS with values such as "APPROVED" or "RECOMMENDED", as well as documents where there is no data in MATCHES (empty array "MATCHES": [ ]). Please note I want this in a single query.
I am able to do this in two separate queries like this -
GET all matches with status = RECOMMENDED, APPROVED
"must": [
{
"nested": {
"path": "MATCHES",
"query": {
"terms": {
"MATCHES.MATCH_STATUS.keyword": [
"APPROVED",
"RECOMMENDED"
]
}
}
}
}
]
GET all matches having empty array "MATCHES" : [ ]
{
"size": 5000,
"query": {
"bool": {
"filter": [],
"must_not": [
{
"nested": {
"path": "MATCHES",
"query": {
"exists": {
"field": "MATCHES"
}
}
}
}
]
}
},
"from": 0
}
You can combine both queries using should clause.
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"nested": {
"path": "MATCHES",
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"terms": {
"MATCHES.MATCH_STATUS.keyword": [
"APPROVED",
"RECOMMENDED"
]
}
}
]
}
}
}
},
{
"bool": {
"must_not": [
{
"nested": {
"path": "MATCHES",
"query": {
"bool": {
"filter": {
"exists": {
"field": "MATCHES"
}
}
}
}
}
}
]
}
}
]
}
}
}
Update: To answer your comment.
The missing aggregation does not support nested fields for now; there is an open issue for this.
To get count of empty matches, you can use a filter aggregation with the nested query wrapped into the must_not clause of the bool query.
{
"aggs": {
"missing_matches_agg": {
"filter": {
"bool": {
"must_not": {
"nested": {
"query": {
"match_all": {}
},
"path": "MATCHES"
}
}
}
}
}
}
}

Elasticsearch multiple fields wildcard bool query

I am currently using a bool query that searches the field "Name" for a combination of both input words or for either one of the input words. How can I search on multiple fields using wildcards?
POST inventory_dev/_search
{"from":0,"query":{"bool":{"must":[{"bool":{"should":[{"term":{"Name":{"value":"dove"}}},{"term":{"Name":{"value":"3.75oz"}}},{"bool":{"must":[{"wildcard":{"Name":{"value":"*dove*"}}},{"wildcard":{"Name":{"value":"*3.75oz*"}}}]}}]}}]}},"size":10,"sort":[{"_score":{"order":"desc"}}]}
You can use a query_string query in place of the wildcard query to search on multiple fields:
{
"from": 0,
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"term": {
"Name": {
"value": "dove"
}
}
},
{
"term": {
"Name": {
"value": "3.75oz"
}
}
},
{
"bool": {
"must": [
{
"query_string": {
"query": "*dove*",
"fields": [
"field1",
"Name"
]
}
},
{
"query_string": {
"query": "*3.75oz*",
"fields": [
"field1",
"Name"
]
}
}
]
}
}
]
}
}
]
}
},
"size": 10,
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}

Filter query by length of nested objects. ie. min_child

I'm trying to filter my query by the number of nested objects found. The Elasticsearch documentation mentions that using a script is an expensive task, so I've set out to do it with a score, though I can't seem to get the script to work either.
Here's my mappings:
"mappings": {
"properties": {
"dates" : {
"type" : "nested",
"properties" : {
"rooms" : {
"type" : "integer"
},
"timestamp" : {
"type" : "long"
}
}
},
"doc_id" : {
"type" : "text"
},
"distance" : {
"type" : "integer"
}
...
}
}
Here's some example data:
PUT /test/_doc/1
{
"doc_id": "1",
"distance": 1,
"dates": [
{
"rooms": 1,
"timestamp": 1
},
{
"rooms": 1,
"timestamp": 2
},
...
]
}
I'm filtering by the parent's distance field, among others, and filtering the nested dates by their timestamps and rooms. I need to filter all results to an exact number of nested dates found.
I tried to borrow from here.
This is my search query:
GET /test/_search
{
"query" : {
"function_score": {
"min_score": 20,
"boost": 1,
"functions": [
{
"script_score": {
"script": {
"source": "if (_score > 20) { return - 1; } return _score;"
}
}
}
],
"query": {
"bool" : {
"filter": [
{ "range": { "distance": { "lt": 5 }}},
{
"nested": {
"score_mode": "sum",
"boost": 10,
"path": "dates",
"query": {
"bool": {
"filter": [
{ "range": { "dates.rooms": { "gte": 1 } } },
{ "range": { "dates.timestamp": { "lte": 2 }}},
{ "range": { "dates.timestamp": { "gte": 1 }}}
]
}
}
}
}
]
}
}
}
}
}
This returns all the results that match, yet they all have a score of 0.0 and aren't getting filtered by the number of nested objects found.
If this is the right solution, how can I get this working? If not, how can I get a script to do it within this search?
Thanks!
Before getting started, keep in mind that the scoring function has changed between Elastic 6 and 7. You can find the updated code samples in this gist.
Your question didn't outline the specifics of your search. Reading the code, it seems like you want to retrieve all documents where the distance is less than five, and the number of matching rooms is precisely 2. If this is correct, the code you submitted does not achieve this.
Reasons: your function score contains your primary condition and your condition on the number of matching rooms (it is quite tricky to mix both, though not impossible). To make things simpler, isolate them for the function score to be only applicable to the number of rooms.
Supposing you are using elastic 7+, this might work:
{
"_source": {
"includes": ["*"],
"excludes": ["dates"]
},
"query": {
"bool": {
"must": [
{"range": {"distance": {"lt": 5}}},
{
"function_score": {
"min_score": 20,
"boost": 1,
"score_mode": "multiply",
"boost_mode": "replace",
"functions": [
{
"script_score": {
"script": {
"source": "if (_score > 20) { return 0; } return _score;"
}
}
}
],
"query": {
"nested": {
"path": "date",
"boost": 10,
"score_mode": "sum",
"query": {
"constant_score": {
"boost": 1,
"filter": {
"bool": {
"should": [
{
"bool": {
"must": [
{"term": {"dates.timestamp": 1}},
{"range": {"dates.rooms": {"lt": 5}}}
],
"should": [
{"term": {"dates.other_prop": 1}},
{"term": {"dates.other_prop": 4}}
]
}
},
{
"bool": {
"must": [
{"term": {"dates.timestamp": 2}},
{"range": {"dates.rooms": {"lt": 5}}}
],
"should": [
{"term": {"dates.other_prop": 1}},
{"term": {"dates.other_prop": 3}}
]
}
}
]
}
}
}
}
}
}
}
}
]
}
}
}
I managed to get it all working with scoring, as filtering doesn't allow scoring. Using GET /test/_explain/[id] helped to understand exactly what was happening.
GET /test/_search
{
// Don't return the nested fields, they are returned in the inner_hits
"_source": {
"includes": [ "*" ],
"excludes": [ "dates" ]
},
"query": {
"function_score": {
// Score is calculated with 1 point for each matched inner property and outer property.
// 7 is the exact score to allow
"min_score": 7,
"boost": 1,
"score_mode": "sum",
"boost_mode": "multiply",
"functions": [
{
"script_score": {
"script": {
// Ignore any results that don't match exactly
"source": "if (_score == 7) { return 1; } return 0;",
"lang": "painless"
}
}
}
],
"query": {
"bool" : {
"must" : [
{ "range" : { "distance" : { "lt": 10 }}},
{
"nested": {
"inner_hits" : {},
"path": "dates",
"score_mode": "sum",
"query": {
"bool": {
// Match each required nested object individually, then verify with the score if we got 1 match for each should
"should": [
{
"bool": {
"must": [
{ "term": { "dates.timestamp": 1 }},
{ "range": { "dates.rooms": { "lt": 5 } } }
],
"should": [
{ "term": { "dates.other_prop": 1 }},
{ "term": { "dates.other_prop": 4 }}
]
}
},
{
"bool": {
"must": [
{ "term": { "dates.timestamp": 2 }},
{ "range": { "dates.rooms": { "lt": 5 } } }
],
"should": [
{ "term": { "dates.other_prop": 1 }},
{ "term": { "dates.other_prop": 3 }}
]
}
}
]
}
}
}
}
]
}
}
}
}
}

ElasticSearch should with nested and bool must_not exists

With the following mapping:
"categories": {
"type": "nested",
"properties": {
"category": {
"type": "integer"
},
"score": {
"type": "float"
}
}
},
I want to use the categories field to return documents that either:
have a score above a threshold in a given category, or
do not have the categories field
This is my query:
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"terms": {
"categories.category": [
<id>
]
}
},
{
"range": {
"categories.score": {
"gte": 0.5
}
}
}
]
}
}
}
},
{
"bool": {
"must_not": [
{
"exists": {
"field": "categories"
}
}
]
}
}
],
"minimum_should_match": 1
}
}
}
It correctly returns documents both with and without the categories field, and orders the results so the ones I want are first, but it doesn't filter the results having score below the 0.5 threshold.
Great question.
That is because categories is not exactly a field from the Elasticsearch point of view (a field on which an inverted index is created and used for querying/searching), but categories.category and categories.score are.
As a result, categories is not found as a field in any document (which is actually true for all the documents), so the must_not exists clause matches every document, and that is why you observe the result you see.
Modify the query to the below and you'd see your use-case working correctly.
POST <your_index_name>/_search
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"terms": {
"categories.category": [
"100"
]
}
},
{
"range": {
"categories.score": {
"gte": 0.5
}
}
}
]
}
}
}
},
{
"bool": {
"must_not": [ <----- Note this
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"exists": {
"field": "categories.category"
}
},
{
"exists": {
"field": "categories.score"
}
}
]
}
}
}
}
]
}
}
],
"minimum_should_match": 1
}
}
}

Elasticsearch must_not filter not works with a big bunch of values

I have the following query that includes some filters:
{
"from": 0,
"query": {
"function_score": {
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"idpais": [
115
]
}
},
{
"term": {
"tipo": [
1
]
}
}
],
"must_not": [
{
"term": {
"idregistro": [
5912471,
3433876,
9814443,
11703069,
6333176,
8288242,
9924922,
6677850,
11852501,
12530205,
4703469,
12776479,
12287659,
11823679,
12456304,
12777457,
10977614,
...
]
}
}
]
}
},
"query": {
"bool": {
"should": [
{
"match_phrase": {
"area": "Coordinator"
}
},
{
"match_phrase": {
"company": {
"boost": 5,
"query": "IBM"
}
}
},
{
"match_phrase": {
"topic": "IT and internet stuff"
}
},
{
"match_phrase": {
"institution": {
"boost": 5,
"query": "University of my city"
}
}
}
]
}
}
}
},
"script_score": {
"params": {
"idpais": 115,
"idprovincia": 0,
"relationships": []
},
"script_id": "ScoreUsuarios"
}
}
},
"size": 24,
"sort": [
{
"_script": {
"order": "desc",
"script_id": "SortUsuarios",
"type": "number"
}
}
]
}
The must_not filter has a large number of values to exclude (around 200 values), but it looks like Elasticsearch ignores those values and includes them in the result set. If I set only a few values (10 to 20 values), then Elasticsearch applies the must_not filter.
Is there some restriction on the number of values in the filters? Is there some way to remove a large number of results from the query?
The terms query, not the term query, is used for passing a list of values. You have to use it as shown below in your filter clauses.
{
"query": {
"terms": {
"field_name": [
"VALUE1",
"VALUE2"
]
}
}
}

Resources