Elasticsearch Sorting on Multiple Nested Fields with Nested Filtering - sorting

Given the following data with nested objects (members within teams), I need to sort objects on multiple nested fields, first by height, then by weight. This all needs to respect the filtering that is done on other nested fields (just position in this example).
Data
PUT sample
{
"mappings": {
"dynamic": "strict",
"properties": {
"teamId": { "type": "keyword", "index": true, "doc_values": true },
"members": {
"type": "nested",
"properties": {
"memberId": { "type": "keyword", "index": true, "doc_values": true },
"position": { "type": "keyword", "index": true, "doc_values": true},
"height": { "type": "integer", "index": true, "doc_values": true},
"weight": { "type": "integer", "index": true, "doc_values": true}
}
}
}
}
}
PUT sample/_doc/1
{
"teamId" : "A"
, "members" :
[
{ "memberId" : "A1_X" , "position": "X", "height": 70, "weight": 195}
, { "memberId" : "A2_Y" , "position": "Y", "height": 70, "weight": 170}
, { "memberId" : "A3_Z" , "position": "Z", "height": 75, "weight": 210}
]
}
PUT sample/_doc/2
{
"teamId" : "B"
, "members" :
[
{ "memberId" : "B1_Z" , "position": "Z", "height": 80, "weight": 220 }
, { "memberId" : "B2_X" , "position": "X", "height": 75, "weight": 190 }
, { "memberId" : "B3_X" , "position": "X", "height": 70, "weight": 200 }
, { "memberId" : "B4_Y" , "position": "Y", "height": 70, "weight": 170 }
]
}
PUT sample/_doc/3
{
"teamId" : "C"
, "members" :
[
{ "memberId" : "C1_Y" , "position": "Y", "height": 70, "weight": 190 }
, { "memberId" : "C2_X" , "position": "X", "height": 75, "weight": 180 }
, { "memberId" : "C3_Z" , "position": "Z", "height": 75, "weight": 225 }
]
}
Query
POST sample/_search?filter_path=hits.hits.inner_hits.members.hits.hits._source.height,hits.hits.inner_hits.members.hits.hits._source.weight,hits.hits.sort,hits.hits._source.teamId
{
"size" : 3
, "track_total_hits" : true
, "query" : { "bool" : { "filter" : [
{ "match_all" : { } }
, { "nested": { "path": "members" , "query": { "bool": { "must": [
//nested filters
{ "term" : { "members.position" : "X" } }
] } }
, "inner_hits" : { "size" : 3 }
} }
]}}
, "sort": [
{ "members.height": { "order": "asc" , "nested": { "path": "members", "filter": { "bool": { "must": [
//copy all nested filters below
{ "term" : { "members.position" : "X" } }
] } } } } }
, { "members.weight": { "order": "asc" , "nested": { "path": "members", "filter": { "bool": { "must": [
//copy all nested filters below
{ "term" : { "members.position" : "X" } }
] } } } } }
, { "teamId": { "order": "asc" } }
]
}
Results
{
"hits" : {
"hits" : [
{
"_source" : {
"teamId" : "B"
},
"sort" : [
70,
190,
"B"
],
"inner_hits" : {
"members" : {
"hits" : {
"hits" : [
{
"_source" : {
"weight" : 190,
"height" : 75
}
},
{
"_source" : {
"weight" : 200,
"height" : 70
}
}
]
}
}
}
},
{
"_source" : {
"teamId" : "A"
},
"sort" : [
70,
195,
"A"
],
"inner_hits" : {
"members" : {
"hits" : {
"hits" : [
{
"_source" : {
"weight" : 195,
"height" : 70
}
}
]
}
}
}
},
{
"_source" : {
"teamId" : "C"
},
"sort" : [
75,
180,
"C"
],
"inner_hits" : {
"members" : {
"hits" : {
"hits" : [
{
"_source" : {
"weight" : 180,
"height" : 75
}
}
]
}
}
}
}
]
}
}
So in the query above, I'm trying to
Find teams that have members with position X.
Sort the teams based on height then weight (both ascending) of such members, where the height and weight used must be from the same members.
The query is close, but has 2 problems I'd like to fix.
It requires that I copy all the nested filters for each sorting field. Is there a syntax that won't require me to do this?
The height it uses to sort by may not be from the same member as the weight that it uses to sort by. In the results above, you can see that it puts team B first because it uses the height of 70 from B3_X and the weight of 190 from B2_X, which is not what I want. I want it to use the height of 70 from B3_X and the weight of 200 from the same member.
(Note that a hack I want to avoid is concatenating the 2 sorting fields into a single string or integer. Although this would solve the problem as stated, it creates other problems in the real world scenario I'm basing this example on.)

Related

How to get per term statistics in Elasticsearch

I need to implement the following (on the backend): a user types a query and gets back hits as well as statistics for the hits. Below is a simplified example.
Suppose the query is Grif, then the user gets back (random words just for example)
Griffith
Griffin
Grif
Grift
Griffins
And frequency + number of documents a certain term occurs in, for example:
Griffith (freq 10, 3 docs)
Griffin (freq 17, 9 docs)
Grif (freq 6, 3 docs)
Grift (freq 9, 5 docs)
Griffins (freq 11, 4 docs)
I'm relatively new to Elasticsearch, so I'm not sure where to start to implement something like this. What type of query is the most suitable for this? What can I use to get that kind of statistics? Any other advice will be appreciated too.
There are multiple layers to this. You'd need:
n-gram / partial / search-as-you-type matching
a way to group the matched keywords by their original form
a mechanism to reversely look up the document & term frequencies.
I'm not aware of any way to achieve this in one go, but here's my take on it.
You could start off with a special, n-gram-powered analyzer, as explained in my other answer. There's the original content field, plus a multi-field mapping for the said analyzer, plus a keyword field to aggregate on down the line:
PUT my-index
{
"settings": {
"index": {
"max_ngram_diff": 20
},
"analysis": {
"tokenizer": {
"my_ngrams": {
"type": "ngram",
"min_gram": 3,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"my_ngrams_analyzer": {
"tokenizer": "my_ngrams",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"fields": {
"analyzed": {
"type": "text",
"analyzer": "my_ngrams_analyzer"
},
"keyword": {
"type": "keyword"
}
}
}
}
}
}
Next, bulk-insert some sample docs containing text inside the content field. Note that each doc has an _id too — you'll need those later on.
POST _bulk
{"index":{"_index":"my-index", "_id":1}}
{"content":"Griffith"}
{"index":{"_index":"my-index", "_id":2}}
{"content":"Griffin"}
{"index":{"_index":"my-index", "_id":3}}
{"content":"Grif"}
{"index":{"_index":"my-index", "_id":4}}
{"content":"Grift"}
{"index":{"_index":"my-index", "_id":5}}
{"content":"Griffins"}
{"index":{"_index":"my-index", "_id":6}}
{"content":"Griffith"}
{"index":{"_index":"my-index", "_id":7}}
{"content":"Griffins"}
Search for n-grams in the .analyzed field and group the matched documents by the original terms through the terms aggregation. At the same time, retrieve the _id of one of the bucketed documents through the top_hits aggregation. BTW — it doesn't matter which _id is returned in a given bucket — all will have contained the same bucketed term.
POST my-index/_search?filter_path=aggregations.*.buckets.key,aggregations.*.buckets.doc_count,aggregations.*.buckets.*.hits.hits._id
{
"size": 0,
"query": {
"term": {
"content.analyzed": "grif"
}
},
"aggs": {
"full_terms": {
"terms": {
"field": "content.keyword",
"size": 10
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1,
"_source": false
}
}
}
}
}
}
Observe the response. The filter_path URL parameter from the previous request reduces the response to just those attributes that we need — the untouched, original full_terms plus one of the underlying IDs:
{
"aggregations" : {
"full_terms" : {
"buckets" : [
{
"key" : "Griffins",
"doc_count" : 2,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "5"
}
]
}
}
},
{
"key" : "Griffith",
"doc_count" : 2,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "1"
}
]
}
}
},
{
"key" : "Grif",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "3"
}
]
}
}
},
{
"key" : "Griffin",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "2"
}
]
}
}
},
{
"key" : "Grift",
"doc_count" : 1,
"top_doc" : {
"hits" : {
"hits" : [
{
"_id" : "4"
}
]
}
}
}
]
}
}
}
Time for the fun part.
There's a specialized Elasticsearch API called Term Vectors which does exactly what you're after — it retrieves field & term stats from the whole index. In order for it to hand these stats over to you, it needs the document IDs — which you'll have obtained from the above aggregation!
Finally, since you've got multiple term vectors to work with, you can use the Multi term vectors API like so — again condensing the response thru filter_path:
POST /my-index/_mtermvectors?filter_path=docs.term_vectors.*.*.*.doc_freq,docs.term_vectors.*.*.*.term_freq
{
"docs": [
{
"_id": "5", <--- guaranteeing
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "1", <--- the response
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "3", <--- order
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "2",
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
},
{
"_id": "4",
"fields": [
"content.keyword"
],
"payloads": false,
"positions": false,
"offsets": false,
"field_statistics": false,
"term_statistics": true
}
]
}
The result can be post-processed in your backend to form your autocomplete response. You've got A) the full terms, B) the number of matching documents (doc_freq), and C), the term frequency:
{
"docs" : [
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffins" : { | term
"doc_freq" : 2, | <-- # of docs
"term_freq" : 1 | term frequency
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffith" : {
"doc_freq" : 2,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Grif" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Griffin" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
},
{
"term_vectors" : {
"content.keyword" : {
"terms" : {
"Grift" : {
"doc_freq" : 1,
"term_freq" : 1
}
}
}
}
}
]
}
Shameless plug: if you're new to Elasticsearch and, just like me, learn best from real-world examples, consider buying my Elasticsearch Handbook.

How to get multiple fields returned in elasticsearch query?

How to get multiple fields returned that are unique using elasticsearch query?
All of my documents have duplicate name and job fields. I would like to use an es query to get all the unique values which include the name and job in the same response, so they are tied together.
[
{
"name": "albert",
"job": "teacher",
"dob": "11/22/91"
},
{
"name": "albert",
"job": "teacher",
"dob": "11/22/91"
},
{
"name": "albert",
"job": "teacher",
"dob": "11/22/91"
},
{
"name": "justin",
"job": "engineer",
"dob": "1/2/93"
},
{
"name": "justin",
"job": "engineer",
"dob": "1/2/93"
},
{
"name": "luffy",
"job": "rubber man",
"dob": "1/2/99"
}
]
Expected result in any format -> I was trying to use aggs but I only get one field
[
{
"name": "albert",
"job": "teacher"
},
{
"name": "justin",
"job": "engineer"
},
{
"name": "luffy",
"job": "rubber man"
},
]
This is what I tried so far
GET name.test.index/_search
{
"size": 0,
"aggs" : {
"name" : {
"terms" : { "field" : "name.keyword" }
}
}
}
Using the above query gets me the following response, which is good in that the names are unique:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 95,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Justin",
"doc_count" : 56
},
{
"key" : "Luffy",
"doc_count" : 31
},
{
"key" : "Albert",
"doc_count" : 8
}
]
}
}
}
I tried doing nested aggregation but that did not work. Is there an alternative solution for getting multiple unique values or am I missing something?
That's a good start! There are a few ways to achieve what you want, each provides a different response format, so you can decide which one you prefer.
The first option is to leverage the top_hits sub-aggregation and return the two fields for each name bucket:
GET name.test.index/_search
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "name.keyword"
},
"aggs": {
"top": {
"top_hits": {
"_source": [
"name",
"job"
],
"size": 1
}
}
}
}
}
}
The second option is to use a script in your terms aggregation instead of a field to return a compound value:
GET name.test.index/_search
{
"size": 0,
"aggs": {
"name": {
"terms": {
"script": "doc['name'].value + ' - ' + doc['job'].value"
}
}
}
}
The third option is to use two levels of field collapsing:
GET name.test.index/_search
{
"collapse": {
"field": "name",
"inner_hits": {
"name": "by_job",
"collapse": {
"field": "job"
},
"size": 1
}
}
}

Elasticsearch: filter documents by array passed in request contains all document array elements

My documents stored in elasticsearch have following structure:
{
"id": 1,
"test": "name",
"rules": [
{
"id": 2,
"name": "rule1",
"ruleDetails": [
{
"id": 3,
"requiredAnswerId": 1
},
{
"id": 4,
"requiredAnswerId": 2
},
{
"id": 5,
"requiredAnswerId": 3
}
]
}
]
}
where, rules property has nested type.
I need to query documents by checking that array of requiredAnswerId passed in the search request (provided terms) contains all rules.ruleDetails.requiredAnswerId stored in the document.
Does anyone know which elasticsearch option I can use to build such specific query? Or maybe, it is better to fetch the whole document and perform filtering on the application level.
UPDATED
Adding mapping
{
"my_index": {
"mappings": {
"properties": {
"id": {
"type": "long"
},
"test": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"rules": {
"type": "nested",
"properties": {
"id": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"ruleDetails": {
"properties": {
"id": {
"type": "long"
},
"requiredAnswerId": {
"type": "long"
}
}
}
}
}
}
}
}
}
Mapping:
{
"index4" : {
"mappings" : {
"properties" : {
"id" : {
"type" : "integer"
},
"rules" : {
"type" : "nested",
"properties" : {
"id" : {
"type" : "integer"
},
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
},
"ruleDetails" : {
"properties" : {
"id" : {
"type" : "long"
},
"requiredAnswerId" : {
"type" : "long"
}
}
}
}
},
"test" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
}
}
}
}
}
Query: This requires a script query, which is not good from a performance perspective. The script loops over the requiredAnswerId values of each nested rule and checks that every one of them is present in the passed parameters.
{
"query": {
"nested": {
"path": "rules",
"query": {
"script": {
"script": {
"source": "for(a in doc['rules.ruleDetails.requiredAnswerId']){if(!params.Ids.contains((int)a)) return false; } return true;",
"params": {
"Ids": [
1,
2,
3
]
}
}
}
},
"inner_hits": {}
}
}
}
Result:
"hits" : [
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "TxOpvnEBf42mOjxvvLQB",
"_score" : 4.0,
"_source" : {
"id" : 1,
"test" : "name",
"rules" : [
{
"id" : 2,
"name" : "rule1",
"ruleDetails" : [
{
"id" : 3,
"requiredAnswerId" : 1
},
{
"id" : 4,
"requiredAnswerId" : 2
},
{
"id" : 5,
"requiredAnswerId" : 3
}
]
},
{
"id" : 3,
"name" : "rule3",
"ruleDetails" : [
{
"id" : 3,
"requiredAnswerId" : 1
},
{
"id" : 4,
"requiredAnswerId" : 2
}
]
}
]
},
"inner_hits" : {
"rules" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 4.0,
"hits" : [
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "TxOpvnEBf42mOjxvvLQB",
"_nested" : {
"field" : "rules",
"offset" : 0
},
"_score" : 4.0,
"_source" : {
"id" : 2,
"name" : "rule1",
"ruleDetails" : [
{
"id" : 3,
"requiredAnswerId" : 1
},
{
"id" : 4,
"requiredAnswerId" : 2
},
{
"id" : 5,
"requiredAnswerId" : 3
}
]
}
}
]
}
}
}
}
]
EDIT 1
The terms_set query can be used as an alternative. It will be faster than the script query. From the documentation:
"Returns documents that contain a minimum number of exact terms in a
provided field."
minimum_should_match_script: the size of the document's array can be used as the minimum number of passed terms that must match.
Query:
{
"query": {
"nested": {
"path": "rules",
"query": {
"bool": {
"filter": {
"terms_set": {
"rules.ruleDetails.requiredAnswerId": {
"terms": [
1,
2,
3
],
"minimum_should_match_script": {
"source": "doc['rules.ruleDetails.requiredAnswerId'].size()"
}
}
}
}
}
},
"inner_hits": {}
}
}
}
After some time playing with ES and reading its documentation, I found that you should keep in mind that the provided script has to be compiled and run against each document, so it is slower than necessary when you already know in advance how many elements must match.
Therefore, I created a separate field, requiredMatches, that stores the number of rules.ruleDetails.requiredAnswerId elements for every document and is calculated before the document is indexed. Then, instead of using minimum_should_match_script in my search query, I use minimum_should_match_field:
{
"query": {
"nested": {
"path": "rules",
"query": {
"bool": {
"filter": {
"terms_set": {
"rules.ruleDetails.requiredAnswerId": {
"terms": [
1,
2,
3
],
"minimum_should_match_field": "requiredMatches"
}
}
}
}
},
"inner_hits": {}
}
}
}
I used the following example as a reference.

How to use value of nested documents in script scoring

Schema looks like this:
"mappings": {
"_doc": {
"_all": {
"enabled": false
},
"properties": {
"category_boost": {
"type": "nested",
"properties" : {
"category": {
"type": "text",
"index": false
},
"boost": {
"type": "integer",
"index": false
}
}
}
}
}
}
The document in elastic does have data:
"category_boost": [
{
"category": "A",
"boost": 98
},
{
"category": "B",
"boost": 96
},
{
"category": "C",
"boost": 94
},
],
Inside scoring function:
for (int i=0; i<doc['"'category_boost.boost'"'].size(); ++i) {
if (doc['"'category_boost.category'"'][i].value.equals(params.category)) {
boost = doc['"'category_boost.boost'"'][i].value;
}
}
I also tried length to get the size of the array, but it did not help. Since the loop does not affect the results, I tried to divide by size() and it throws a division by zero error, so I conclude the size is 0.
Overall problem: have a map of category->boost which is dynamic and I cannot hardcode into schema. I tried type object with json object, but it turned out you cannot access those objects in scoring functions, therefore I went with arrays with defined types.
The nested datatype creates sub-documents to represent the items of your collections. Accessing their doc values in a script is therefore possible, but you need to be inside a nested query.
Here is one way of doing it, I hope it fulfills your requirements. This example only returns the document with a score depending on the chosen category.
NB: I used Elasticsearch 7 on my local machine, so you will have to modify the mapping to add your "_doc" entry, etc.
Here is the modified mapping, I removed the index: false in nested properties since we now use them in queries
PUT test-score_nested
{
"mappings": {
"properties": {
"category_boost": {
"type": "nested",
"properties": {
"category": {
"type": "keyword"
},
"boost": {
"type": "integer"
}
}
}
}
}
}
Then I add your sample data :
POST test-score_nested/_doc
{
"category_boost": [
{
"category": "A",
"boost": 98
},
{
"category": "B",
"boost": 96
},
{
"category": "C",
"boost": 94
}
]
}
And then the query.
We go one level deep in the nested collection
Inside the collection we use a function score query with the replace mode
Inside the function score, we use a filter query to "select" the good category and use its boost for the scoring
POST test-score_nested/_search
{
"query": {
"nested": {
"path": "category_boost",
"query": {
"function_score": {
"boost_mode": "replace",
"query": {
"term": {
"category_boost.category": {
"value": "A"
}
}
},
"functions": [
{
"field_value_factor": {
"field": "category_boost.boost"
}
}
]
}
}
}
}
}
returns
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 98.0,
"hits" : [
{
"_index" : "test-score_nested",
"_type" : "_doc",
"_id" : "v3Smqm0BZ7nyeX7PPevA",
"_score" : 98.0,
"_source" : {
"category_boost" : [
{
"category" : "A",
"boost" : 98
},
{
"category" : "B",
"boost" : 96
},
{
"category" : "C",
"boost" : 94
}
]
}
}
]
}
}
I hope it will help you!

elastic search query for aggregating on sub-fields of array

My Elasticsearch index contains data in the following format. The data field is an array of objects, each holding a date together with the visits, ranking and orders for that term on that date:
{
"term": "ふるさと納税",
"data": [
{
"date": "2018-01-25",
"visits": 17670,
"ranking": 1,
"orders": 154
},
{
"date": "2018-02-14",
"visits": 13758,
"ranking": 1,
"orders": 116
},
{
"date": "2017-12-24",
"visits": 142578,
"ranking": 1,
"orders": 2565
},
{
"date": "2018-03-08",
"visits": 21799,
"ranking": 1,
"orders": 312
}
]
},{
"term": "帯 中古 振袖",
"data": [
{
"date": "2018-01-30",
"ranking": 2966,
"orders": 0,
"visits": 345
}
]
}
I would like to sum all the visits and orders for each term within a defined date range.
I have created this query:
{
"_source": [],
"query": {
"bool": {
"filter": [
{"range": {"data.date": {"gte" : "2018-03-21"}}},
{"range": {"data.date": {"lte" : "2018-03-21"}}}
]
}
},
"aggs" : {
"by_term": {
"terms": {
"field": "term",
"order":{"sum_ranking":"desc"},
"size":100
},"aggs": {
"sum_ranking": {
"sum": {
"field" : "data.visits"
}
}
}
}
},
"from" : 0,
"size" : 0
}
It seems the filter is not working.
Can anyone help?
The mapping is:
{
"settings" : {
"number_of_shards" : 1
},
"mappings" : {
"keyword" : {
"properties" : {
"term" : { "type" : "keyword" }
}
}
}
}

Resources