Aggregate script ONLY on results of sorted query with filter, not full dataset

Aggregate script ONLY on results of sorted query with filter, not full dataset - elasticsearch

FYI - elasticsearch @v1.5; npm elasticsearch @4.0.2
For my specific use case, I need to find the five nearest points, around some other point, and calculate the max dist of those five results. For some reason, my query below is returning the max dist of all the filtered data, not the five nearest.
Here's my query thus far:
elasticsearchAPI = Meteor.npmRequire('elasticsearch');
esClient = new elasticsearchAPI.Client({
host: 'myHost'
});
var esQueryObject = {
"index": "ma_homes",
"size": 5,
"body": {
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"geo_distance": {
"LOCATION": {
"lat": 42.5125339,
"lon": -71.06748
},
"distance": "3mi",
"optimize_bbox": "memory"
}
}
}
},
"size": 5,
"sort": [{
"_geo_distance": {
"LOCATION": {
"lat": 42.5125339,
"lon": -71.06748
},
"order": "asc",
"unit": "mi",
"distance_type": "sloppy_arc"
}
}],
"fields": ["F1_V7_2_F1TOWN"],
"aggs": {
"max_dist": {
"max": {
"script": "doc[\u0027LOCATION\u0027].arcDistanceInMiles(lat,lon)",
"params" : {
"lat" : 42.5125339,
"lon" : -71.06748
}
}
}
}
}
}
try {
esClient.search(esQueryObject, function(err, res) {
if ( err ) console.log("err: ", err);
if ( res ) {
console.log("res: ", JSON.stringify(res, null, "\t"));
};
});
}
catch(error) {
console.log("search err: ", error);
};
My problem is this returns a max_dist of 2.99, but I can clearly see from the hits that it should only be 0.02268!
Lastly, is there a better way of calculating the max distance? I don't like having to use a script.
See the results, below:
I20160729-14:46:08.447(-7)? {
I20160729-14:46:08.447(-7)? "took": 119,
I20160729-14:46:08.447(-7)? "timed_out": false,
I20160729-14:46:08.447(-7)? "_shards": {
I20160729-14:46:08.447(-7)? "total": 5,
I20160729-14:46:08.448(-7)? "successful": 5,
I20160729-14:46:08.448(-7)? "failed": 0
I20160729-14:46:08.448(-7)? },
I20160729-14:46:08.448(-7)? "hits": {
I20160729-14:46:08.448(-7)? "total": 19428,
I20160729-14:46:08.448(-7)? "max_score": null,
I20160729-14:46:08.452(-7)? "hits": [
I20160729-14:46:08.452(-7)? {
I20160729-14:46:08.452(-7)? "_index": "ma_homes",
I20160729-14:46:08.452(-7)? "_type": "home",
I20160729-14:46:08.453(-7)? "_id": "AVY1KqHN5rKRAKXZHxQf",
I20160729-14:46:08.453(-7)? "_score": null,
I20160729-14:46:08.453(-7)? "fields": {
I20160729-14:46:08.453(-7)? "F1_V7_2_F1TOWN": [
I20160729-14:46:08.453(-7)? "7WHITECIRWAKEFIELDMA"
I20160729-14:46:08.454(-7)? ]
I20160729-14:46:08.454(-7)? },
I20160729-14:46:08.454(-7)? "sort": [
I20160729-14:46:08.454(-7)? 0.013847018573431258
I20160729-14:46:08.454(-7)? ]
I20160729-14:46:08.455(-7)? },
I20160729-14:46:08.455(-7)? {
I20160729-14:46:08.455(-7)? "_index": "ma_homes",
I20160729-14:46:08.455(-7)? "_type": "home",
I20160729-14:46:08.456(-7)? "_id": "AVY1Ewoc5rKRAKXZGhMp",
I20160729-14:46:08.456(-7)? "_score": null,
I20160729-14:46:08.456(-7)? "fields": {
I20160729-14:46:08.456(-7)? "F1_V7_2_F1TOWN": [
I20160729-14:46:08.456(-7)? "8WHITECIRWAKEFIELDMA"
I20160729-14:46:08.457(-7)? ]
I20160729-14:46:08.457(-7)? },
I20160729-14:46:08.457(-7)? "sort": [
I20160729-14:46:08.458(-7)? 0.01675513175670524
I20160729-14:46:08.458(-7)? ]
I20160729-14:46:08.458(-7)? },
I20160729-14:46:08.458(-7)? {
I20160729-14:46:08.458(-7)? "_index": "ma_homes",
I20160729-14:46:08.459(-7)? "_type": "home",
I20160729-14:46:08.459(-7)? "_id": "AVY1T0cn5rKRAKXZJwC8",
I20160729-14:46:08.459(-7)? "_score": null,
I20160729-14:46:08.459(-7)? "fields": {
I20160729-14:46:08.459(-7)? "F1_V7_2_F1TOWN": [
I20160729-14:46:08.460(-7)? "10WHITECIRWAKEFIELDMA"
I20160729-14:46:08.460(-7)? ]
I20160729-14:46:08.460(-7)? },
I20160729-14:46:08.460(-7)? "sort": [
I20160729-14:46:08.461(-7)? 0.018417500448048605
I20160729-14:46:08.461(-7)? ]
I20160729-14:46:08.463(-7)? },
I20160729-14:46:08.464(-7)? {
I20160729-14:46:08.464(-7)? "_index": "ma_homes",
I20160729-14:46:08.464(-7)? "_type": "home",
I20160729-14:46:08.464(-7)? "_id": "AVY1Xb2P5rKRAKXZKhUh",
I20160729-14:46:08.464(-7)? "_score": null,
I20160729-14:46:08.465(-7)? "fields": {
I20160729-14:46:08.465(-7)? "F1_V7_2_F1TOWN": [
I20160729-14:46:08.465(-7)? "11WHITECIRWAKEFIELDMA"
I20160729-14:46:08.465(-7)? ]
I20160729-14:46:08.466(-7)? },
I20160729-14:46:08.466(-7)? "sort": [
I20160729-14:46:08.466(-7)? 0.018816876925529115
I20160729-14:46:08.467(-7)? ]
I20160729-14:46:08.467(-7)? },
I20160729-14:46:08.467(-7)? {
I20160729-14:46:08.468(-7)? "_index": "ma_homes",
I20160729-14:46:08.468(-7)? "_type": "home",
I20160729-14:46:08.468(-7)? "_id": "AVY1TNJh5rKRAKXZJnx0",
I20160729-14:46:08.468(-7)? "_score": null,
I20160729-14:46:08.469(-7)? "fields": {
I20160729-14:46:08.469(-7)? "F1_V7_2_F1TOWN": [
I20160729-14:46:08.470(-7)? "6WHITECIRWAKEFIELDMA"
I20160729-14:46:08.470(-7)? ]
I20160729-14:46:08.470(-7)? },
I20160729-14:46:08.471(-7)? "sort": [
I20160729-14:46:08.471(-7)? 0.022680252269458714
I20160729-14:46:08.471(-7)? ]
I20160729-14:46:08.471(-7)? }
I20160729-14:46:08.471(-7)? ]
I20160729-14:46:08.472(-7)? },
I20160729-14:46:08.472(-7)? "aggregations": {
I20160729-14:46:08.472(-7)? "max_dist": {
I20160729-14:46:08.472(-7)? "value": 2.999906924854209,
I20160729-14:46:08.473(-7)? "value_as_string": "2.999906924854209"
I20160729-14:46:08.473(-7)? }
I20160729-14:46:08.473(-7)? }
I20160729-14:46:08.474(-7)? }

There are two things wrong here, with the second strongly related to the first:
You're assuming that the sorting order has any impact on the aggregation. It doesn't. You may want to have a look at Elasticsearch: The Definitive Guide on Scoping Aggregations.
The gist is that the total result of the query, including not-returned-hits are a part of the aggregation's scope. In your exact case, it noted that there were "total": 19428 documents that matched your search. You just got back the closest 5.
You're sorting by ascending order, which means it sorts from least to greatest. This means you're only getting the top 5 closest distances, which is what you want, but that doesn't mean that's all the aggregation saw as the true max.
To those points, you need to figure out how to limit the top 5, or just not aggregate at all, which I would suggest is the easiest thing to do here. Simply get the top 5, then grab the last value and you're done getting both answers that you want.
Sorting is constrained to what's within 3 miles because of the 3-mile geo_distance filter, which is good, but perhaps you can do something better depending on your needs by using a faster distance_type for the filter:
{
"size": 5,
"_source": "F1_V7_2_F1TOWN",
"query": {
"filtered": {
"filter": [
{
"geo_distance": {
"LOCATION": {
"lat": 42.5125339,
"lon": -71.06748
},
"distance": "3mi",
"distance_type": "plane"
}
}
]
}
},
"sort": [
{
"_geo_distance": {
"LOCATION": {
"lat": 42.5125339,
"lon": -71.06748
},
"order": "asc",
"unit": "mi",
"distance_type": "sloppy_arc"
}
}
]
}
Notice I don't aggregate, I use _source instead of fields (fields is meant for stored fields, not limiting the source document output), and I switched to using plane for the filter distance_type because it's faster for short distances away from the poles; I doubt too many homes are going to be located near the poles. For sorting, I left it as sloppy_arc because it can use a slightly more refined equation after being filtered.
I only get 5 documents back, and of those 5, the last one will be the furthest one away, with its distance returned as its sort value.
As a big side note, ES 2.2+ increased geo performance significantly.

Related

Elasticsearch OR query with nested objects returns inner_hits not matching the criteria

I'm getting weird results when querying nested objects. Imagine the following structure:
{ owner.name = "fred",
...,
pets [
{ name = "daisy", ... },
{ name = "flopsy", ... }
]
}
If I only have the document shown above, and I search pets matching this criteria:
pets.name = "daisy" OR
(owner.name = "julie" and pet.name = "flopsy")
I would expect to only get one result ("daisy"), but I'm getting both pet names.
This is one way to reproduce this:
# Create nested mapping
PUT pet-owners
{
"mappings": {
"animals": {
"properties": {
"owner": {"type": "text"},
"pets": {
"type": "nested",
"properties": {
"name": {"type": "text", "fielddata": true}
}
}
}
}
}
}
# Insert nested object
PUT pet-owners/animals/1?op_type=create
{
"owner" : "fred",
"pets" : [
{ "name" : "daisy"},
{ "name" : "flopsy"}
]
}
# Query
GET pet-owners/_search
{ "from": 0, "size": 50,
"query": {
"constant_score": {
"filter": { "bool": {"must": [
{"bool": {"should": [
{"nested": {"query":
{"term": {"pets.name": "daisy"}},
"path":"pets",
"inner_hits": {
"name": "pets_hits_1",
"size": 99,
"_source": false,
"docvalue_fields": ["pets.name"]
}
}},
{"bool": {"must": [
{"term": {"owner": "julie"}},
{"nested": {"query":
{"term": {"pets.name": "flopsy"}},
"path":"pets",
"inner_hits": {
"name": "pets_hits_2",
"size": 99,
"_source": false,
"docvalue_fields": ["pets.name"]
}
}}
]}}
]}}
]}}}},
"_source": false
}
The query returns both pets names (as opposed to the expected one).
Is this behavior normal? Am I doing something wrong, or is my reasoning about the nested structure or the query behavior flawed?
Any help or guidance will be much appreciated.
I'm running this query under ElasticSearch 6.3.x
EDIT: I'm adding the response received, to better illustrate the case
{
"took": 16,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_score": 1,
"inner_hits": {
"pets_hits_1": {
"hits": {
"total": 1,
"max_score": 0.6931472,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_nested": {
"field": "pets",
"offset": 0
},
"_score": 0.6931472,
"fields": {
"pets.name": [
"daisy"
]
}
}
]
}
},
"pets_hits_2": {
"hits": {
"total": 1,
"max_score": 0.6931472,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_nested": {
"field": "pets",
"offset": 1
},
"_score": 0.6931472,
"fields": {
"pets.name": [
"flopsy"
]
}
}
]
}
}
}
}
]
}
}
So we can see that it's not that the query matches and returns the whole existing document, but that it returns each of the pets independently, one inside each of the inner_hits. It's this result that's surprising to me.

(edited) - in summary this issue is around the context of the 'inner_hits':
It looks like the inner_hits 'pets_hits_2' is returning a match because it is belonging to the nested query that simply searches the pets field for 'flopsy'.
As an independent query on our single document, that is a valid hit.
However, because that query is within a list of bool/must queries, where other queries will not match on our document, you may well expect that the inner_hits should pick up on this and therefore not return a hit.
I haven't been able to find any docs to clarify whether this is intentional behaviour or not - might be worth raising with elastic ...

How to write query elastic search?

I have a review index with documents like this:
{
"_index": "zody_review",
"_type": "review",
"_id": "5b3c6c9e68cf860e1af5f7fd",
"_score": null,
"_source": {
"user": "571899623dc63af34d67a662",
"merchant": "56f8f80119a4c1ae791cf7bf",
"point": 3,
"score": 2,
"createdAt": "2018-07-04T13:43:42.331+07:00",
"location": {
"lat": 16.07054040054832,
"lon": 108.22062939405441
},
"feedback": "Phuc vu khong tot lam "
}
},
How can I query to get a list of nearby reviews, but limit the results to 5 reviews per group when grouping by the merchant field?
I've been stuck here too long!
Thank you!

You need to only return reviews that are near (say 100m) a given location, and then you need to aggregate by merchant terms and add a top_hits sub-aggregation. It goes like this:
{
"size": 0,
"query": {
"geo_distance": {
"distance": "100m",
"location": {
"lat": 16.07055,
"lon": 108.2207
}
}
},
"aggs": {
"by_merchant": {
"terms": {
"field": "merchant"
},
"aggs": {
"top_5": {
"top_hits": {
"_source": [
"feedback"
],
"size": 5
}
}
}
}
}
}
Simply replace the location by the one you want to search around and probably the distance if you need a larger or smaller distance.

Fuzzy search in the elasticsearch gives matches with incorrect order

I am trying to build an engine where we can match areas mentioned in the address with the list available in elasticsearch.
I am using this query to search areas similar to "iit".
My query is :
{
"query": {
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}
I am getting below results :
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 2.1290483,
"hits": [
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRiZ04pEQsZFpK6A_",
"_score": 2.1290483,
"_source": {
"locality": [
"emerald isle ii"
]
},
"highlight": {
"locality": [
"emerald isle <em>ii</em>"
]
}
},
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRfof4pEQsZFpK59H",
"_score": 1.877402,
"_source": {
"locality": [
"iit - bombay",
"iitb",
"indian institute of technology - bombay"
]
},
"highlight": {
"locality": [
"<em>iit</em> - bombay",
"<em>iitb</em>"
]
}
}
]
}
}
Because "iit" is directly available in the 2nd document, I was expecting that document to be returned as the best match with the highest score. What change should I make so that the 2nd document gets the highest score?
I am using ES 2.3.4 .

If you also are interested in exact matching to score better, I always suggest a bool with should statements and adding a match or term query in there. In this way the combined scores will favor the exact matching:
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
{
"match": {
"locality": "iit"
}
}
]
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}

Specifying total size of results to return for ElasticSearch query when using inner_hits

ElasticSearch allows inner_hits to specify 'from' and 'size' parameters, as can the outer request body of a search.
As an example, assume my index contains 25 books, each having less than 50 chapters. The below snippet would return all chapters across all books, because a 'size' of 100 books includes all of 25 books and a 'size' of 50 chapters includes all of "less than 50 chapters":
"index": 'books',
"type": 'book',
"body": {
"from" : 0, "size" : 100, // outer hits, or books
"query": {
"filtered": {
"filter": {
"nested": {
"inner_hits": {
"size": 50 // inner hits, or chapters
},
"path": "chapter",
"query": { "match_all": { } },
}
}
}
},
.
.
.
Now, I'd like to implement paging with a scenario like this. My question is, how?
In this case, do I have to return back the above max of 100 * 50 = 5000 documents from the search query and implement paging in the application level by displaying only the slice I am interested in? Or, is there a way to specify the total number of hits to return back in the search query itself, independent of the inner/outer size?
I am looking at the "response" as follows, and so would like this data to be able to be paginated:
response.hits.hits.forEach(function(book) {
chapters = book.inner_hits.chapters.hits.hits;
chapters.forEach(function(chapter) {
// ... this is one displayed result ...
});
});

I don't think this is possible with Elasticsearch and nested fields. The way you see the results is correct: ES paginates and returns books, and it doesn't see inside nested inner_hits. That is not how it works. You need to handle the pagination manually in your code.
There is another option, but you need a parent/child relationship instead of nested.
Then you are able to query the children (meaning, the chapters) and paginate the results (the chapters). You can use inner_hits and return back the parent (the book itself).
PUT /library
{
"mappings": {
"book": {
"properties": {
"name": {
"type": "string"
}
}
},
"chapter": {
"_parent": {
"type": "book"
},
"properties": {
"title": {
"type": "string"
}
}
}
}
}
The query:
GET /library/chapter/_search
{
"size": 5,
"query": {
"has_parent": {
"type": "book",
"query": {
"match_all": {}
},
"inner_hits" : {}
}
}
}
And a sample output (trimmed, complete example here):
"hits": [
{
"_index": "library",
"_type": "chapter",
"_id": "1",
"_score": 1,
"_source": {
"title": "chap1"
},
"inner_hits": {
"book": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "library",
"_type": "book",
"_id": "book1",
"_score": 1,
"_source": {
"name": "book1"
}
}
]
}
}
}
},
{
"_index": "library",
"_type": "chapter",
"_id": "2",
"_score": 1,
"_source": {
"title": "chap2"
},
"inner_hits": {
"book": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "library",
"_type": "book",
"_id": "book1",
"_score": 1,
"_source": {
"name": "book1"
}
}
]
}
}
}
}

The search api allows for the addition of certain standard parameters, listed in the docs at: https://www.elastic.co/guide/en/elasticsearch/client/javascript-api/current/api-reference-2-0.html#api-search-2-0
According to the doc:
size Number — Number of hits to return (default: 10)
Which would make your request something like:
"size": 5000,
"index": 'books',
"type": 'book',
"body": {

Elastic Search- Fetch Distinct Tags

I have document of following format:
{
_id :"1",
tags:["guava","apple","mango", "banana", "gulmohar"]
}
{
_id:"2",
tags: ["orange","guava", "mango shakes", "apple pie", "grammar"]
}
{
_id:"3",
tags: ["apple","grapes", "water", "gulmohar","water-melon", "green"]
}
Now, I want to fetch the unique tag values from the 'tags' field across all documents that start with the prefix g*, so that these unique tags can be displayed by a tag suggester (the Stack Overflow site is an example).
For example: Whenever user types, 'g':
"guava", "gulmohar", "grammar", "grapes" and "green" should be returned as a result.
ie. the query should returns distinct tags with prefix g*.
I looked everywhere, browsed the whole documentation, and searched the ES forum, but I didn't find any clue, much to my dismay.
I tried aggregations, but aggregations returns the distinct count for whole words/token in tags field. It does not return the unique list of tags starting with 'g'.
"query": {
"filtered": {
"query": {
"bool": {
"should": [
{
"query_string": {
"allow_leading_wildcard": false,
"fields": [
"tags"
],
"query": "g*",
"fuzziness":0
}
}
]
}
},
"filter": {
//some condition on other field...
}
}
},
"aggs": {
"distinct_tags": {
"terms": {
"field": "tags",
"size": 10
}
}
},
result of above: guava(w), apple(q), mango(1),...
Can someone please suggest me the correct way to fetch all the distinct tags with prefix input_prefix*?

It's a bit of a hack, but this seems to accomplish what you want.
I created an index and added your docs:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"tags":["guava","apple","mango", "banana", "gulmohar"]}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"tags": ["orange","guava", "mango shakes", "apple pie", "grammar"]}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"tags": ["guava","apple","grapes", "water", "grammar","gulmohar","water-melon", "green"]}
Then I used a combination of prefix query and highlighting as follows:
POST /test_index/_search
{
"query": {
"prefix": {
"tags": {
"value": "g"
}
}
},
"fields": [ ],
"highlight": {
"pre_tags": [""],
"post_tags": [""],
"fields": {
"tags": {}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"highlight": {
"tags": [
"guava",
"gulmohar"
]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"highlight": {
"tags": [
"guava",
"grammar"
]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"highlight": {
"tags": [
"guava",
"grapes",
"grammar",
"gulmohar",
"green"
]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/c14675ee8bd3934389a6cb0c85ff57621a17bf11
What you're trying to do amounts to autocomplete, of course, and there are perhaps better ways of going about that than what I posted above (though they are a bit more involved). Here are a couple of blog posts we did about ways to set up autocomplete:
http://blog.qbox.io/quick-and-dirty-autocomplete-with-elasticsearch-completion-suggest
http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams

As per @Sloan Ahrens' advice, I did the following:
Updated the mapping:
"tags": {
"type": "completion",
"context": {
"filter_color": {
"type": "category",
"default": "",
"path": "fruits.color"
},
"filter_type": {
"type": "category",
"default": "",
"path": "fruits.type"
}
}
}
Reference: ES API Guide
Inserted these indexes:
{
_id :"1",
tags:{input" :["guava","apple","mango", "banana", "gulmohar"]},
fruits:{color:'bar',type:'alice'}
}
{
_id:"2",
tags:{["orange","guava", "mango shakes", "apple pie", "grammar"]}
fruits:{color:'foo',type:'bob'}
}
{
_id:"3",
tags:{ ["apple","grapes", "water", "gulmohar","water-melon", "green"]}
fruits:{color:'foo',type:'alice'}
}
I didn't need to modify my original index much; I just added an input key wrapping the tags array.
POST rescu1/_suggest?pretty'
{
"suggest": {
"text": "g",
"completion": {
"field": "tags",
"size": 10,
"context": {
"filter_color": "bar",
"filter_type": "alice"
}
}
}
}
gave me the desired output.
I accepted @Sloan Ahrens' answer as his suggestions worked like a charm for me, and he showed me the right direction.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio