Filter query by length of nested objects. ie. min_child - elasticsearch

I'm trying to filter my query by the number of nested objects found. The Elastic Search documentation mentions that using a script is an expensive task, so I've set out to do it with a score, though I can't seem to get the script to work either.
Here's my mappings:
"mappings": {
"properties": {
"dates" : {
"type" : "nested",
"properties" : {
"rooms" : {
"type" : "integer"
},
"timestamp" : {
"type" : "long"
}
}
},
"doc_id" : {
"type" : "text"
},
"distance" : {
"type" : "integer"
}
...
}
}
Here's some example data:
PUT /test/_doc/1
{
"doc_id": "1",
"distance": 1,
"dates": [
{
"rooms": 1,
"timestamp": 1
},
{
"rooms": 1,
"timestamp": 2
},
...
]
}
I'm filtering by the parents distance field, among others, and filtering the nested dates by their timestamps, and rooms. I need to filter all results to an exact number of nest dates found.
I tried to borrow from here.
This is my search query:
GET /test/_search
{
"query" : {
"function_score": {
"min_score": 20,
"boost": 1,
"functions": [
{
"script_score": {
"script": {
"source": "if (_score > 20) { return - 1; } return _score;"
}
}
}
],
"query": {
"bool" : {
"filter": [
{ "range": { "distance": { "lt": 5 }}},
{
"nested": {
"score_mode": "sum",
"boost": 10,
"path": "dates",
"query": {
"bool": {
"filter": [
{ "range": { "dates.rooms": { "gte": 1 } } },
{ "range": { "dates.timestamp": { "lte": 2 }}},
{ "range": { "dates.timestamp": { "gte": 1 }}}
]
}
}
}
}
]
}
}
}
}
}
This returns all the results that match, yet they all have a score of 0.0 and aren't getting filtered by the number of nested objects found.
If this is the right solution, how can I get this working? If not, how can I get a script to do it within this search?
Thanks!

Before getting started, keep in mind that the scoring function has changed between Elastic 6 and 7. You can find the updated code samples on this this gist.
Your question didn't outline the specifics of your search. Reading the code, it seems like you want to retrieve all documents where the distance is less than five, and the number of matching rooms is precisely 2. If this is correct, the code you submitted does not achieve this.
Reasons: your function score contains your primary condition and your condition on the number of matching rooms (it is quite tricky to mix both, though not impossible). To make things simpler, isolate them for the function score to be only applicable to the number of rooms.
Supposing you are using elastic 7+, this might work:
{
"_source": {
"includes": ["*"],
"excludes": ["dates"]
},
"query": {
"bool": {
"must": [
{"range": {"distance": {"lt": 5}}},
{
"function_score": {
"min_score": 20,
"boost": 1,
"score_mode": "multiply",
"boost_mode": "replace",
"functions": [
{
"script_score": {
"script": {
"source": "if (_score > 20) { return 0; } return _score;"
}
}
}
],
"query": {
"nested": {
"path": "date",
"boost": 10,
"score_mode": "sum",
"query": {
"constant_score": {
"boost": 1,
"filter": {
"bool": {
"should": [
{
"bool": {
"must": [
{"term": {"dates.timestamp": 1}},
{"range": {"dates.rooms": {"lt": 5}}}
],
"should": [
{"term": {"dates.other_prop": 1}},
{"term": {"dates.other_prop": 4}}
]
}
},
{
"bool": {
"must": [
{"term": {"dates.timestamp": 2}},
{"range": {"dates.rooms": {"lt": 5}}}
],
"should": [
{"term": {"dates.other_prop": 1}},
{"term": {"dates.other_prop": 3}}
]
}
}
]
}
}
}
}
}
}
}
}
]
}
}
}

I managed to get it all working with scoring as filtering doesn't allow scoring. Using GET /test/_explain/[id] helped to understand exactly what was happening
GET /test/_search
{
// Don't return the nested fields, they are returned in the inner_hits
"_source": {
"includes": [ "*" ],
"excludes": [ "dates" ]
},
"query": {
"function_score": {
// Score is calculated with 1 point for each matched inner property and outer property.
// 7 is the exact score to allow
"min_score": 7,
"boost": 1,
"score_mode": "sum",
"boost_mode": "multiply",
"functions": [
{
"script_score": {
"script": {
// Ignore any results that don't match exactly
"source": "if (_score == 7) { return 1; } return 0;",
"lang": "painless"
}
}
}
],
"query": {
"bool" : {
"must" : [
{ "range" : { "distance" : { "lt": 10 }}},
{
"nested": {
"inner_hits" : {},
"path": "dates",
"score_mode": "sum",
"query": {
"bool": {
// Match each required nested object individually, then verify with the score if we got 1 match for each should
"should": [
{
"bool": {
"must": [
{ "term": { "dates.timestamp": 1 }},
{ "range": { "dates.rooms": { "lt": 5 } } }
],
"should": [
{ "term": { "dates.other_prop": 1 }},
{ "term": { "dates.other_prop": 4 }}
]
}
},
{
"bool": {
"must": [
{ "term": { "dates.timestamp": 2 }},
{ "range": { "dates.rooms": { "lt": 5 } } }
],
"should": [
{ "term": { "dates.other_prop": 1 }},
{ "term": { "dates.other_prop": 3 }}
]
}
}
]
}
}
}
}
]
}
}
}
}
}

Related

Limit the size per index when searching multiple index in Elastic

I have been following the guidelines from this post. I can get the desired output but in the same DSL how can I limit the size of results for each index ?
Full text Search with Multiple index in Elastic Search using NEST C#
POST http://localhost:9200/componenttypeindex%2Cprojecttypeindex/Componenttype%2CProjecttype/_search?pretty=true&typed_keys=true
{
"query": {
"bool": {
"should": [
{
"bool": {
"filter": [
{
"term": {
"_index": {
"value": "componenttypeindex"
}
}
}
],
"must": [
{
"multi_match": {
"fields": [
"Componentname",
"Summary^1.1"
],
"operator": "or",
"query": "test"
}
}
]
}
},
{
"bool": {
"filter": [
{
"term": {
"_index": {
"value": "projecttypeindex"
}
}
}
],
"must": [
{
"multi_match": {
"fields": [
"Projectname",
"Summary^0.3"
],
"operator": "or",
"query": "test"
}
}
]
}
}
]
}
}
}
With your given query, you could use aggregations to group and limit number of hits per index (in this case, limiting to 5):
{
"size": 0,
"query": {
... Same query as above ...
},
"aggs": {
"index_agg": {
"terms": {
"field": "_index",
"size": 20
},
"aggs": {
"hits_per_index": {
"top_hits": {
"size": 5
}
}
}
}
}
}

ElasticSearch should with nested and bool must_not exists

With the following mapping:
"categories": {
"type": "nested",
"properties": {
"category": {
"type": "integer"
},
"score": {
"type": "float"
}
}
},
I want to use the categories field to return documents that either:
have a score above a threshold in a given category, or
do not have the categories field
This is my query:
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"terms": {
"categories.category": [
<id>
]
}
},
{
"range": {
"categories.score": {
"gte": 0.5
}
}
}
]
}
}
}
},
{
"bool": {
"must_not": [
{
"exists": {
"field": "categories"
}
}
]
}
}
],
"minimum_should_match": 1
}
}
}
It correctly returns documents both with and without the categories field, and orders the results so the ones I want are first, but it doesn't filter the results having score below the 0.5 threshold.
Great question.
That is because categories is not exactly a field from the elasticsearch point of view[a field on which inverted index is created and used for querying/searching] but categories.category and categories.score is.
As a result categories being not found in any document, which is actually true for all the documents, you observe the result what you see.
Modify the query to the below and you'd see your use-case working correctly.
POST <your_index_name>/_search
{
"query": {
"bool": {
"should": [
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"terms": {
"categories.category": [
"100"
]
}
},
{
"range": {
"categories.score": {
"gte": 0.5
}
}
}
]
}
}
}
},
{
"bool": {
"must_not": [ <----- Note this
{
"nested": {
"path": "categories",
"query": {
"bool": {
"must": [
{
"exists": {
"field": "categories.category"
}
},
{
"exists": {
"field": "categories.score"
}
}
]
}
}
}
}
]
}
}
],
"minimum_should_match": 1
}
}
}

Using multiple Should queries

I want to get docs that are similar to multiple "groups" but separately. Each group has it's own rules (terms).
When I try to use more than one Should query inside a "bool" I get items that are a mix of both Should's terms.
I want to use 1 query total and not msearch for example.
Can someone please help me with that?
{
"explain": true,
"query": {
"filtered": {
"filter": {
"bool": {
"must_not": [
{
"term": {
"p_id": "123"
}
},
{
"term": {
"p_id": "124"
}
}
]
}
},
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"bool": {
"minimum_should_match": 1,
"should": [
{
"term": {
"cat": "1"
}
},
{
"term": {
"cat": "2"
}
},
{
"term": {
"keys": "a"
}
},
{
"term": {
"keys": "b"
}
}
]
}
},
{
"bool": {
"minimum_should_match": 1,
"should": [
{
"term": {
"cat": "6"
}
},
{
"term": {
"cat": "7"
}
},
{
"term": {
"keys": "r"
}
},
{
"term": {
"keys": "u"
}
}
]
}
}
]
}
}
}
},
"from": 0,
"size": 3
}
You can try using a terms aggregation on multiple fields with scripting and add a top hits aggregation as a sub-aggregation. Be warned this will be pretty slow. Add this after the query/filter and adjust the size parameter as needed
"aggs": {
"Cat_and_Keys": {
"terms": {
"script": "doc['cat'].values + doc['keys'].values"
},
"aggs":{ "separate_docs": {"top_hits":{"size":1 }} }
}
}

elasticsearch: Add weight for each match of array

I want to add a weight for each match (instead of adding a weight once if one of those matched):
Having docs like this:
[{
"username": "xyz",
"categories": [
{
"category.id": 1
},
{
"category.id": 2
}
]
}, {
"username": "xyz2",
"categories": [
{
"category.id": 1
}
]
}]
And currently, I have this query:
{
"query": {
"filtered": {
"query": {
"function_score": {
"query": {
"bool": {}
},
"score_mode": "sum",
"boost_mode": "sum",
"functions": [
{
"weight": 1.1,
"filter": {
"terms": {
"category.id": [
1,
2
]
}
}
}
]
}
},
"filter": {
"bool": {
"must_not": [
{
"terms": {
"_id": [
8
]
}
}
]
}
}
}
},
"from": 0,
"size": 30
}
With this query, both entries would receive a single weight of 1.1, but I want the first entry to get 2 * 1.1 because 2 categories are matched. How could I achieve that?
EDIT: Sorry, I missed to add elastic search version. It's 1.7.2.
This might be a bit cumbersome, since for multiple IDs that query will need to have multiple statements, but I don't think there is any other way. Also, notice that your field referencing is not complete - it should be categories.category.id to be correct. Also, be careful when upgrading with dots in field names. This changed in some releases over time.
{
"query": {
"filtered": {
"query": {
"function_score": {
"query": {
"match_all": {}
},
"score_mode": "sum",
"boost_mode": "sum",
"functions": [
{
"weight": 1.1,
"filter": {
"term": {
"categories.category.id": 1
}
}
},
{
"weight": 1.1,
"filter": {
"term": {
"categories.category.id": 2
}
}
}
]
}
},
"filter": {
"bool": {
"must_not": [
{
"terms": {
"_id": [
8
]
}
}
]
}
}
}
},
"from": 0,
"size": 30
}

Elasticsearch must_not filter not works with a big bunch of values

I have the next query that include some filters:
{
"from": 0,
"query": {
"function_score": {
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"idpais": [
115
]
}
},
{
"term": {
"tipo": [
1
]
}
}
],
"must_not": [
{
"term": {
"idregistro": [
5912471,
3433876,
9814443,
11703069,
6333176,
8288242,
9924922,
6677850,
11852501,
12530205,
4703469,
12776479,
12287659,
11823679,
12456304,
12777457,
10977614,
...
]
}
}
]
}
},
"query": {
"bool": {
"should": [
{
"match_phrase": {
"area": "Coordinator"
}
},
{
"match_phrase": {
"company": {
"boost": 5,
"query": "IBM"
}
}
},
{
"match_phrase": {
"topic": "IT and internet stuff"
}
},
{
"match_phrase": {
"institution": {
"boost": 5,
"query": "University of my city"
}
}
}
]
}
}
}
},
"script_score": {
"params": {
"idpais": 115,
"idprovincia": 0,
"relationships": []
},
"script_id": "ScoreUsuarios"
}
}
},
"size": 24,
"sort": [
{
"_script": {
"order": "desc",
"script_id": "SortUsuarios",
"type": "number"
}
}
]
}
The must_not filter has a big bunch of values to exclude (around 200 values), but it looks like elasticsearch ignores those values and it includes on the result set. If I try to set only a few values (10 to 20 values) then elasticsearch applies the must_not filter.
Exists some restriction a bout the amount of values in the filters? Exists some way to remove a big amount of results from the query?
terms query is used for passing a list of values not term query.You have to use it like below in your must filter.
{
"query": {
"terms": {
"field_name": [
"VALUE1",
"VALUE2"
]
}
}
}

Resources