Elasticsearch, ordering aggregations by geo distance and score - elasticsearch

My mapping is the following:
PUT places
{
"mappings": {
"test": {
"properties": {
"id_product": { "type": "keyword" },
"id_product_unique": { "type": "integer" },
"location": { "type": "geo_point" },
"suggest": {
"type": "text"
},
"active": {"type": "boolean"}
}
}
}
}
POST places/test
{
"id_product" : "A",
"id_product_unique": 1,
"location": {
"lat": 1.378446,
"lon": 103.763427
},
"suggest": ["coke","zero"],
"active": true
}
POST places/test
{
"id_product" : "A",
"id_product_unique": 2,
"location": {
"lat": 1.878446,
"lon": 108.763427
},
"suggest": ["coke","zero"],
"active": true
}
POST places/test
{
"id_product" : "B",
"id_product_unique": 3,
"location": {
"lat": 1.478446,
"lon": 104.763427
},
"suggest": ["coke"],
"active": true
}
POST places/test
{
"id_product" : "C",
"id_product_unique": 4,
"location": {
"lat": 1.218446,
"lon": 102.763427
},
"suggest": ["coke","light"],
"active": true
}
In my example there are 2 cans of coke zero ("id_product_unique" = 1 and 2), 1 can of coke ("id_product_unique" = 3) and one can of coke light ("id_product_unique" = 4).
All these cans are in different locations.
An "id_product" is not unique, as the exact same "can of coke" can be sold in two different locations (e.g. "id_product_unique" = 1 and 2).
Only "id_product_unique" and "location" change from one "can of coke" to another (two identical cans of coke have the same "suggest" and "id_product" fields but different "id_product_unique" and "location" values).
My goal is to search for a product from a given GPS location, and display a unique result by id_product (the closest one):
POST /places/_search?size=0
{
"aggs" : {
"group-by-type" : {
"terms" : { "field" : "id_product"},
"aggs": {
"min-distance": {
"top_hits": {
"sort": {
"_script": {
"type": "number",
"script": {
"source": "def x = doc['location'].lat; def y = doc['location'].lon; return Math.abs(x-1.178446) + Math.abs(y-101.763427)",
"lang": "painless"
},
"order": "asc"
}
},
"size" : 1
}
}
}
}
}
}
From this list of results I'd now like to apply a should query and re-order the results by computed score. I tried the following:
POST /places/_search?size=0
{
"query" : {
"bool": {
"filter": {"term" : { "active" : "true" }},
"should": [
{"match" : { "suggest" : "coke" }},
{"match" : { "suggest" : "light" }}
]
}
},
"aggs" : {
"group-by-type" : {
"terms" : { "field" : "id_product"},
"aggs": {
"min-distance": {
"top_hits": {
"sort": {
"_script": {
"type": "number",
"script": {
"source": "def x = doc['location'].lat; def y = doc['location'].lon; return Math.abs(x-1.178446) + Math.abs(y-101.763427)",
"lang": "painless"
},
"order": "asc"
}
},
"size" : 1
}
}
}
}
}
}
But I cannot figure out how to replace the distance sort score by the doc score.
Any help would be great.

I managed to do it by adding a new aggregation "max_score":
"max_score": {
"max": {
"script": {
"lang": "painless",
"source": "_score"
}
}
}
and by ordering by max_score.value desc:
"order": {"max_score.value": "desc"}
My final query is the following:
POST /places/_search?size=0
{
"query" : {
"bool": {
"filter": {"term" : { "active" : "true" }},
"should": [
{"match" : { "suggest" : "coke" }},
{"match" : { "suggest" : "light" }}
]
}
},
"aggs" : {
"group-by-type" : {
"terms" : {
"field" : "id_product",
"order": {"max_score.value": "desc"}
},
"aggs": {
"min-distance": {
"top_hits": {
"sort": {
"_script": {
"type": "number",
"script": {
"source": "def x = doc['location'].lat; def y = doc['location'].lon; return Math.abs(x-1.178446) + Math.abs(y-101.763427)",
"lang": "painless"
},
"order": "asc"
}
},
"size" : 1
}
},
"max_score": {
"max": {
"script": {
"lang": "painless",
"inline": "_score"
}
}
}
}
}
}
}
answer:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"group-by-type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "C",
"doc_count": 1,
"max_score": {
"value": 1.0300811529159546
},
"min-distance": {
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "places",
"_type": "test",
"_id": "VhJdOmIBKhzTB9xcDvfk",
"_score": null,
"_source": {
"id_product": "C",
"id_product_unique": 4,
"location": {
"lat": 1.218446,
"lon": 102.763427
},
"suggest": [
"coke",
"light"
],
"active": true
},
"sort": [
1.0399999646503995
]
}
]
}
}
},
{
"key": "A",
"doc_count": 2,
"max_score": {
"value": 0.28768208622932434
},
"min-distance": {
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "places",
"_type": "test",
"_id": "UhJcOmIBKhzTB9xc6ve-",
"_score": null,
"_source": {
"id_product": "A",
"id_product_unique": 1,
"location": {
"lat": 1.378446,
"lon": 103.763427
},
"suggest": [
"coke",
"zero"
],
"active": true
},
"sort": [
2.1999999592114756
]
}
]
}
}
},
{
"key": "B",
"doc_count": 1,
"max_score": {
"value": 0.1596570909023285
},
"min-distance": {
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "places",
"_type": "test",
"_id": "VRJcOmIBKhzTB9xc_vc0",
"_score": null,
"_source": {
"id_product": "B",
"id_product_unique": 3,
"location": {
"lat": 1.478446,
"lon": 104.763427
},
"suggest": [
"coke"
],
"active": true
},
"sort": [
3.2999999020282695
]
}
]
}
}
}
]
}
}
}

From what I gathered, your use case is where you want to factor in the value of a particular field in your document into the calculation of the relevance score.
This is typical in scenarios where you want to boost the relevance of a document based on the value of a field, like a price or, here, a query for a particular product.
If you are searching for product A, that is more important in this scenario than the distance of the products themselves. So if B is 2 miles away from the origin and A is 5 miles away, A is still the closest of the products you are searching for.
What you need is a Function Score Query using a decay_function based on the distance. I think you want a gauss type to reflect the rate of decay, which operates like a bell curve.
Here is an example using a decay function of the exp (exponential) type. It does the same thing but on a different field type (date) than
yours; the idea is the same.
Suppose that instead of wanting to boost incrementally by the value of
a field, you have an ideal value you want to target and you want the
boost factor to decay the further away you move from the value. This
is typically useful in boosts based on lat/long, numeric fields like
price, or dates. In our contrived example, we are searching for books
on “search engines” ideally published around June 2014.
POST /bookdb_index/book/_search
{
"query": {
"function_score": {
"query": {
"multi_match" : {
"query" : "search engine",
"fields": ["title", "summary"]
}
},
"functions": [
{
"exp": {
"publish_date" : {
"origin": "2014-06-15",
"offset": "7d",
"scale" : "30d"
}
}
}
],
"boost_mode" : "replace"
}
},
"_source": ["title", "summary", "publish_date", "num_reviews"]
}
Here are some useful references for this:
Elasticsearch 6.2 Function Score document
Elasticsearch Example Queries
The Closer the Better
This is an Elasticsearch 2x Decay Function example and even though it's a different version, I think it is very similar to your use case

Related

Elasticsearch - Nested field sorting

I have an index defined by the following :
{
"mappings": {
"properties": {
"firstName": {
"type": "keyword"
},
"lastName": {
"type": "keyword"
},
"affiliations": {
"type": "nested",
"properties": {
"organisation": {
"type": "keyword"
},
"team": {
"type": "keyword"
},
"dateBeginning": {
"type": "date",
"format": "yyyy-MM-dd"
},
"dateEnding": {
"type": "date",
"format": "yyyy-MM-dd"
},
"country": {
"type": "keyword"
}
}
}
}
}
}
Basically, for each researcher (researchers is how I named my index) I want to sort the affiliations by dateBeginning, in descending order. I've read about inner hits in the official Elasticsearch docs and, not being exactly sure how they work, I've tried this for the researcher with _id : 3 :
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
},
"inner_hits": {
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations",
"filter": {
"term": { "_id": 3 }
}
}
}
}
]
}
}
}
}
And it doesn't really work.
The researcher with _id : 3 has two affiliations, one with dateBeginning set to 2015-06-30 and the other to 2017-06-30. So I've also tried this:
{
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations"
}
}
}
],
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
}
}
}
}
And it doesn't sort the affiliations by dateBeginning.
I've also tried to do it with the SQL API (since I'm more familiar with SQL language), and still, I can't get the data I want.
So I'm quite new to ElasticSearch, I'm using version 7.10, and I don't know what else to do.
Any suggestions about what I'm doing wrong here ?
EDIT
here's an example of a document from that index:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [{
"_index": "researchers",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"firstName": "Kimmich",
"lastName": "Yoshua",
"affiliations": [{
"organisation": "University of Ottawa",
"team": "Neural Network Elite Team",
"dateBeginning": "2015-06-30",
"datEnding": "2017-01-31",
"country": "Canada"
},
{
"organisation": "University of Montréal",
"team": "Picture processing team",
"dateBeginning": "2017-06-30",
"dateEnding": null,
"country": "Canada"
}
]
}
}]
}
}
Once you're inside the nested query, the inner hits don't need the extra nested query. Remove it and the sort will work properly:
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": {
"_id": 3
}
},
"inner_hits": {
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc"
}
}
]
}
}
}
}
Note that this wouldn't sort the top-level hits -- only the inner hits.
But you can sort on the top level by the values of affiliations.dateBeginning like so:
POST researchers/_search
{
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc",
"nested_path": "affiliations"
}
}
]
}
but note that the syntax is now slightly different: instead of path we're saying nested_path.

Two filters (RANGE) in different fields in elasticsearch

I am a beginner in Elasticsearch and I want the query below to work with two filters, each a range on a different field, but only the first range is working.
This filter is working normally:
"range" : {"pgrk" : { "gte" : 1, "lte" : 10} }
Could someone tell me why this second filter below doesn't work?
"should" : {
"range" : {"url_length" : { "lte" : 100 } }
--------------------------Follow my query below with the two filters--------------------------
{
"from" : 0, "size" : 10,
"sort" : [
{ "pgrk" : {"order" : "desc"} },
{ "url_length" : {"order" : "asc"} }
],
"query": {
"bool": {
"must": {
"multi_match" : {
"query": "netflix",
"type": "cross_fields",
"fields": [ "titulo", "descricao", "url" ],
"operator": "and"
}
},
"filter": {
"range" : {"pgrk" : { "gte" : 1, "lte" : 10} }
},
"should" : {
"range" : {"url_length" : { "lte" : 100 } }
}
}
}
}
I'm not sure what your exact requirement is, as the index mapping and sample documents are not provided, but I created my own mapping and sample documents to show you how to use multiple range queries in filter context.
Please comment so that I can modify the answer if its results do not match your requirements.
Index Def
{
"mappings": {
"properties": {
"title": {
"type": "text"
},
"url": {
"type": "keyword"
},
"pgrk": {
"type": "integer"
},
"url_length": {
"type": "integer"
}
}
}
}
Index sample docs
{
"title": "netflix",
"url" : "www.netflix.com", --> this shouldn't match as `pgrk > 10`
"pgrk": 12,
"url_length" : 50
}
{
"title": "Netflix", --> this should match both filters
"url" : "www.netflix.com",
"pgrk": 8,
"url_length" : 50
}
{
"title": "Netflix", --> this should match both filters
"url" : "www.netflix",
"pgrk": 5,
"url_length" : 50
}
{
"title": "netflix",
"url" : "www.netflix",
"pgrk": 5,
"url_length" : 80 --> note pgrk has the same value (5) as the previous doc and url_length is different
}
Search query
{
"from": 0,
"size": 10,
"sort": [
{
"pgrk": {
"order": "desc"
}
},
{
"url_length": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": {
"multi_match": {
"query": "netflix",
"type": "cross_fields",
"fields": [
"title",
"url"
],
"operator": "and"
}
},
"filter": [ --> note filter array to have multiple range queries in filter context
{
"range": {
"pgrk": {
"gte": 1,
"lte" : 10
}
}
},
{
"range": {
"url_length": {
"lte": 100
}
}
}
]
}
}
}
And the search result, which returns only three docs (even though two have the same pgrk value):
"hits": [
{
"_index": "so_range",
"_type": "_doc",
"_id": "1",
"_score": null,
"_source": {
"title": "netflix",
"url": "www.netflix.com",
"pgrk": 8,
"url_length": 50
},
"sort": [
8,
50
]
},
{
"_index": "so_range",
"_type": "_doc",
"_id": "3",
"_score": null,
"_source": {
"title": "netflix",
"url": "www.netflix",
"pgrk": 5,
"url_length": 50
},
"sort": [
5,
50
]
},
{
"_index": "so_range",
"_type": "_doc",
"_id": "4",
"_score": null,
"_source": {
"title": "netflix",
"url": "www.netflix",
"pgrk": 5,
"url_length": 80
},
"sort": [
5,
80
]
}
]

ElasticSearch - Return unique result by field values

I have 3 "places" having each a type and a location:
PUT places
{
"mappings": {
"test": {
"properties": {
"type": { "type": "keyword" },
"location": { "type": "geo_point" }
}
}
}
}
POST places/test
{
"type" : "A",
"location": {
"lat": 1.378446,
"lon": 103.763427
}
}
POST places/test
{
"type" : "B",
"location": {
"lat": 1.478446,
"lon": 104.763427
}
}
POST places/test
{
"type" : "A",
"location": {
"lat": 1.278446,
"lon": 102.763427
}
}
I'd like to retrieve only one place per "type": the one closest to an arbitrary position, let's say "lat": 1.178446, "lon": 101.763427.
In my example the result should consist of exactly 2 elements (one for "type: A" and one for "type: B").
I'd also prefer to avoid "aggregations" as I will need the _source of each place.
Any help would be great.
Without an aggregation, such an operation seems impossible to perform in a single query.
This can be achieved with the top-hits-aggregation.
The following has been tested with elasticsearch 6:
POST /places/_search?size=0
{
"aggs" : {
"group-by-type" : {
"terms" : { "field" : "type" },
"aggs": {
"min-distance": {
"top_hits": {
"sort": {
"_script": {
"type": "number",
"script": {
"source": "def x = doc['location'].lat; def y = doc['location'].lon; return Math.abs(x-1.178446) + Math.abs(y-101.763427)",
"lang": "painless"
},
"order": "asc"
}
},
"_source": {
"includes": [ "type", "location" ]
},
"size" : 1
}
}
}
}
}
}
Note, I calculated the distance as: |location.x - givenPoint.x| + |location.y - givenPoint.y|
This is the response:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"group-by-type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "A",
"doc_count": 2,
"min-distance": {
"hits": {
"total": 2,
"max_score": null,
"hits": [{
"_index": "places",
"_type": "test",
"_id": "3",
"_score": null,
"_source": {
"location": {
"lon": 102.763427,
"lat": 1.278446
},
"type": "A"
},
"sort": [1.1000006934661934]
}]
}
}
}, {
"key": "B",
"doc_count": 1,
"min-distance": {
"hits": {
"total": 1,
"max_score": null,
"hits": [{
"_index": "places",
"_type": "test",
"_id": "2",
"_score": null,
"_source": {
"location": {
"lon": 104.763427,
"lat": 1.478446
},
"type": "B"
},
"sort": [3.3000007411499093]
}]
}
}
}]
}
}
}

How to filter nested objects on a should query?

I have my mappings as below and I am doing a bool should query on name and other properties as shown below, but what I need is to filter CustomerPrices by CustomerId in the response.
All products have the same CustomerIds, so for example:
product1 -CustomerPrice( CustomerId :1234 -Price:4)
CustomerPrice( CustomerId :567-Price:5)
.
.
Product2 - CustomerPrice(CustomerId :1234 -Price:8)
CustomerPrice(CustomerId :567-Price:10)
.
.
So according to that when I query Product1, response should have only customerPrice for customerId:1234
{
"Product": {
"properties": {
"CustomerPrices": {
"type": "nested",
"properties": {
"Price": {
"store": true,
"type": "float"
},
"CustomerId": {
"type": "integer"
}
}
},
"Name": {
"index": "not_analyzed",
"store": true,
"type": "string"
}
}
}
}
I tried following query but this is not filtering nested objects. I guess it filters product objects as it makes sense because all products have customerId:1234
"query":{
"bool":{
"should":[
{
"multi_match":{
"type":"best_fields",
"query":"product 1",
"fields":[
"Name^7"]
}
},
{
"multi_match":{
"type":"best_fields",
"query":"product 1",
"operator":"and",
"fields":[
"Code^10",
"ShortDescription^6"]
}
},
{
"nested":{
"query":{
"term":{
"CustomerPrices.CustomerId":{
"value":1234
}
}
},
"path":"CustomerPrices"
}
}]
}
},
I've spent some time on your question since it was interesting how this can be achieved and the only solution I found for now is relying on the inner_hits which gives the exact nested object the match was on. I've also deactivated the _source which isn't used anymore.
So given your mapping and having 2 products like:
PUT product/Product/product1
{
"CustomerPrices": [
{
"CustomerId": 1234,
"Price": 4
},
{
"CustomerId": 567,
"Price": 5
}
],
"Name": "John"
}
PUT product/Product/product2
{
"CustomerPrices": [
{
"CustomerId": 1234,
"Price": 8
},
{
"CustomerId": 567,
"Price": 10
}
],
"Name": "Bob"
}
When running the following query: (Used must just to see 1 result, works with should as well)
GET product/_search
{
"_source": false,
"query": {
"bool": {
"must": [
{ "match": { "Name": "Bob"}}
],
"filter": [
{
"nested" : {
"path" : "CustomerPrices",
"score_mode" : "avg",
"query" : {
"bool" : {
"should" : [
{ "match" : {"CustomerPrices.CustomerId" : 1234}}
]
}
},
"inner_hits": {}
}
}
]
}
}
}
I was able to get the result where only "Price" from customer with id 1234 was present:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "product",
"_type": "Product",
"_id": "product2",
"_score": 0.2876821,
"inner_hits": {
"CustomerPrices": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "product",
"_type": "Product",
"_id": "product2",
"_nested": {
"field": "CustomerPrices",
"offset": 0
},
"_score": 1,
"_source": {
"CustomerId": 1234,
"Price": 8
}
}
]
}
}
}
}
]
}
}
Couldn't find an official way of returning partial results of the document by only having the matched nested object. Maybe something that we need to inform elasticsearch guys about to consider for some next releases. Hope it helps you.

How to return only matched texts in Elasticsearch aggregation and in source too

My query :
POST /testqueryidx/testQuery/_search
{
"size" : 10,
"query" : {
"bool" : {
"must" : [ {
"multi_match": {
"query": "sales*",
"fields": ["skills"]
}
}, {
"query_string" : {
"query" : "jay12",
"fields" : [ "idNum" ]
}
} ]
}
},
"aggregations" : {
"aggs" : {
"terms" : {
"field" : "skills_sort",
"size" : 0,
"order" : {
"_term" : "asc"
}
}
}
}
}
Query Results :
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.9734945,
"hits": [
{
"_index": "testqueryidx",
"_type": "testQuery",
"_id": "56909fbdaecb813e8c64e1e8",
"_score": 0.9734945,
"_source": {
"skills": [
"Account Management",
"Sales force",
"Adobe Creative Suite"
],
"_id": "56909fbdaecb813e8c64e1e8",
"idNum": "jay12"
}
}
]
},
"aggregations": {
"aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Account Management",
"doc_count": 1
},
{
"key": "Adobe Creative Suite",
"doc_count": 1
},
{
"key": "Sales force",
"doc_count": 1
}
]
}
}
}
Here I searched for keyword Sales in field skills and I got matched documents. You can see one matched sample below:
"skills": [
"Account Management",
"Sales force",
"Adobe Creative Suite"
],
But I don't want "Account Management" and "Adobe Creative Suite" in the query results or in the aggregations. See the aggregation results below:
"buckets": [
{
"key": "Account Management",
"doc_count": 1
},
{
"key": "Adobe Creative Suite",
"doc_count": 1
},
{
"key": "Sales force",
"doc_count": 1
}
]
In the same way, I don't want the above "key": "Account Management" and "key": "Adobe Creative Suite" in the aggregation results, as I searched only for sales* .
I got the texts above because the skills field in my document has all three skills, but I am interested only in the searched keywords. Please help me if anyone has a solution for this.
I think this is achievable. You can use include in the terms aggregation, which will give you only sales*. As far as the query is concerned, you have to use highlight to get only the matching value of a field, and you can retrieve the other fields with source filtering. This is my setup:
POST only_index
{
"mappings": {
"my_type": {
"properties": {
"skills": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"idNum" : {
"type" : "string"
}
}
}
}
}
After indexing your document, I run the following query
GET only_index/_search
{
"size": 10,
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "sales*",
"fields": [
"skills"
]
}
},
{
"query_string": {
"query": "jay12",
"fields": [
"idNum"
]
}
}
]
}
},
"aggregations": {
"aggs": {
"terms": {
"field": "skills.raw",
"size": 0,
"include": "(?i)sales.*",
"order": {
"_term": "asc"
}
}
}
},
"highlight": {
"fields": {
"skills": {}
}
},
"_source": [
"idNum"
]
}
I have used (?i) flag for case insensitive match. This is what I get
"hits": {
"total": 1,
"max_score": 0.29834434,
"hits": [
{
"_index": "only_index",
"_type": "my_type",
"_id": "1",
"_score": 0.29834434,
"_source": {
"idNum": "jay12"
},
"highlight": {
"skills": [
"<em>Sales</em> force"
]
}
}
]
},
"aggregations": {
"aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Sales force",
"doc_count": 1
}
]
}
}
Hope this helps!!

Resources