Elastic Search- Fetch Distinct Tags - elasticsearch

I have document of following format:
{
_id :"1",
tags:["guava","apple","mango", "banana", "gulmohar"]
}
{
_id:"2",
tags: ["orange","guava", "mango shakes", "apple pie", "grammar"]
}
{
_id:"3",
tags: ["apple","grapes", "water", "gulmohar","water-melon", "green"]
}
Now, I want to fetch unique tags value from whole document 'tags field' starting with prefix g*, so that these unique tags will be display by tag suggestors(Stackoverflow site is an example).
For example: Whenever user types, 'g':
"guava", "gulmohar", "grammar", "grapes" and "green" should be returned as a result.
ie. the query should returns distinct tags with prefix g*.
I tried everywhere, browse whole documentations, searched es forum, but I didn't find any clue, much to my dismay.
I tried aggregations, but aggregations returns the distinct count for whole words/token in tags field. It does not return the unique list of tags starting with 'g'.
"query": {
"filtered": {
"query": {
"bool": {
"should": [
{
"query_string": {
"allow_leading_wildcard": false,
"fields": [
"tags"
],
"query": "g*",
"fuzziness":0
}
}
]
}
},
"filter": {
//some condition on other field...
}
}
},
"aggs": {
"distinct_tags": {
"terms": {
"field": "tags",
"size": 10
}
}
},
result of above: guava(w), apple(q), mango(1),...
Can someone please suggest me the correct way to fetch all the distinct tags with prefix input_prefix*?

It's a bit of a hack, but this seems to accomplish what you want.
I created an index and added your docs:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"tags":["guava","apple","mango", "banana", "gulmohar"]}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"tags": ["orange","guava", "mango shakes", "apple pie", "grammar"]}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"tags": ["guava","apple","grapes", "water", "grammar","gulmohar","water-melon", "green"]}
Then I used a combination of prefix query and highlighting as follows:
POST /test_index/_search
{
"query": {
"prefix": {
"tags": {
"value": "g"
}
}
},
"fields": [ ],
"highlight": {
"pre_tags": [""],
"post_tags": [""],
"fields": {
"tags": {}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"highlight": {
"tags": [
"guava",
"gulmohar"
]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"highlight": {
"tags": [
"guava",
"grammar"
]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"highlight": {
"tags": [
"guava",
"grapes",
"grammar",
"gulmohar",
"green"
]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/c14675ee8bd3934389a6cb0c85ff57621a17bf11
What you're trying to do amounts to autocomplete, of course, and there are perhaps better ways of going about that than what I posted above (though they are a bit more involved). Here are a couple of blog posts we did about ways to set up autocomplete:
http://blog.qbox.io/quick-and-dirty-autocomplete-with-elasticsearch-completion-suggest
http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams

As per #Sloan Ahrens advice, I did following:
Updated the mapping:
"tags": {
"type": "completion",
"context": {
"filter_color": {
"type": "category",
"default": "",
"path": "fruits.color"
},
"filter_type": {
"type": "category",
"default": "",
"path": "fruits.type"
}
}
}
Reference: ES API Guide
Inserted these indexes:
{
_id :"1",
tags:{input" :["guava","apple","mango", "banana", "gulmohar"]},
fruits:{color:'bar',type:'alice'}
}
{
_id:"2",
tags:{["orange","guava", "mango shakes", "apple pie", "grammar"]}
fruits:{color:'foo',type:'bob'}
}
{
_id:"3",
tags:{ ["apple","grapes", "water", "gulmohar","water-melon", "green"]}
fruits:{color:'foo',type:'alice'}
}
I don't need to modify much, my original index. Just added input before tags array.
POST rescu1/_suggest?pretty'
{
"suggest": {
"text": "g",
"completion": {
"field": "tags",
"size": 10,
"context": {
"filter_color": "bar",
"filter_type": "alice"
}
}
}
}
gave me the desired output.
I accepted #Sloan Ahrens answer as his suggestions worked like a charm for me, and he showed me the right direction.

Related

Elasticsearch OR query with nested objects returns inner_hits not matching the criteria

I'm getting weird results when querying nested objects. Imagine the following structure:
{ owner.name = "fred",
...,
pets [
{ name = "daisy", ... },
{ name = "flopsy", ... }
]
}
If I only have the document shown above, and I search pets matching this criteria:
pets.name = "daisy" OR
(owner.name = "julie" and pet.name = "flopsy")
I would expect to only get one result ("daisy"), but I'm getting both pet names.
This is one way to reproduce this:
# Create nested mapping
PUT pet-owners
{
"mappings": {
"animals": {
"properties": {
"owner": {"type": "text"},
"pets": {
"type": "nested",
"properties": {
"name": {"type": "text", "fielddata": true}
}
}
}
}
}
}
# Insert nested object
PUT pet-owners/animals/1?op_type=create
{
"owner" : "fred",
"pets" : [
{ "name" : "daisy"},
{ "name" : "flopsy"}
]
}
# Query
GET pet-owners/_search
{ "from": 0, "size": 50,
"query": {
"constant_score": {
"filter": { "bool": {"must": [
{"bool": {"should": [
{"nested": {"query":
{"term": {"pets.name": "daisy"}},
"path":"pets",
"inner_hits": {
"name": "pets_hits_1",
"size": 99,
"_source": false,
"docvalue_fields": ["pets.name"]
}
}},
{"bool": {"must": [
{"term": {"owner": "julie"}},
{"nested": {"query":
{"term": {"pets.name": "flopsy"}},
"path":"pets",
"inner_hits": {
"name": "pets_hits_2",
"size": 99,
"_source": false,
"docvalue_fields": ["pets.name"]
}
}}
]}}
]}}
]}}}},
"_source": false
}
The query returns both pets names (as opposed to the expected one).
Is this behavior normal? Am I doing something wrong, or my reasoning about the nested structure or the query behavior is flawed?
Any help or guidance will be much appreciated.
I'm running this query under ElasticSearch 6.3.x
EDIT: I'm adding the response received, to better illustrate the case
{
"took": 16,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_score": 1,
"inner_hits": {
"pets_hits_1": {
"hits": {
"total": 1,
"max_score": 0.6931472,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_nested": {
"field": "pets",
"offset": 0
},
"_score": 0.6931472,
"fields": {
"pets.name": [
"daisy"
]
}
}
]
}
},
"pets_hits_2": {
"hits": {
"total": 1,
"max_score": 0.6931472,
"hits": [
{
"_index": "pet-owners",
"_type": "animals",
"_id": "1",
"_nested": {
"field": "pets",
"offset": 1
},
"_score": 0.6931472,
"fields": {
"pets.name": [
"flopsy"
]
}
}
]
}
}
}
}
]
}
}
So we can see that it's not that the query matches and returns the whole existing document, but that it returns each of the pets independently, one inside each of the inner_hits. It's this result that's surprising to me.
(edited) - in summary this issue is around the context of the 'inner_hits':
It looks like the inner_hits 'pets_hits_2' is returning a match because it is belonging to the nested query that simply searches the pets field for 'flopsy'.
As an independent query on our single document, that is a valid hit.
However, because that query is within a list of bool/must queries, where other queries will not match on our document, you may well expect that the inner_hits should pick up on this and therefore not return a hit.
I haven't been able to find any docs to clarify whether this is intentional behaviour or not - might be worth raising with elastic ...

How to perform elastic search _update_by_query using painless script - for complex condition

Can you suggest how to update documents (with a script - i guess painless) that based on condition fields?
its purpose is to add/or remove values from the document
so if I have those input documents:
doc //1st
{
"Tags":["foo"],
"flag":"true"
}
doc //2nd
{
"flag":"true"
}
doc //3rd
{
"Tags": ["goo"],
"flag":"false"
}
And I want to perform something like this:
Update all documents that have "flag=true" with:
Added tags: "me", "one"
Deleted tags: "goo","foo"
so expected result should be something like:
doc //1st
{
"Tags":["me","one"],
"flag":"true"
}
doc //2nd
{
"Tags":["me","one"],
"flag":"true"
}
doc //3rd
{
"Tags": ["goo"],
"flag":"false"
}
Create mapping:
PUT documents
{
"mappings": {
"document": {
"properties": {
"tags": {
"type": "keyword",
"index": "not_analyzed"
},
"flag": {
"type": "boolean"
}
}
}
}
}
Insert first doc:
PUT documents/document/1
{
"tags":["foo"],
"flag": true
}
Insert second doc (keep in mind that for empty tags I specified empty tags array because if you don't have field at all you will need to check in script does field exists):
PUT documents/document/2
{
"tags": [],
"flag": true
}
Add third doc:
PUT documents/document/3
{
"tags": ["goo"],
"flag": false
}
And then run _update_by_query which has two arrays as params one for elements to add and one for elements to remove:
POST documents/_update_by_query
{
"script": {
"inline": "for(int i = 0; i < params.add_tags.size(); i++) { if(!ctx._source.tags.contains(params.add_tags[i].value)) { ctx._source.tags.add(params.add_tags[i].value)}} for(int i = 0; i < params.remove_tags.size(); i++) { if(ctx._source.tags.contains(params.remove_tags[i].value)){ctx._source.tags.removeAll(Collections.singleton(params.remove_tags[i].value))}}",
"params": {
"add_tags": [
{"value": "me"},
{"value": "one"}
],
"remove_tags": [
{"value": "goo"},
{"value": "foo"}
]
}
},
"query": {
"bool": {
"must": [
{"term": {"flag": true}}
]
}
}
}
If you then do following search:
GET documents/_search
you will get following result (which I think is what you want):
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [{
"_index": "documents",
"_type": "document",
"_id": "2",
"_score": 1,
"_source": {
"flag": true,
"tags": [
"me",
"one"
]
}
},
{
"_index": "documents",
"_type": "document",
"_id": "1",
"_score": 1,
"_source": {
"flag": true,
"tags": [
"me",
"one"
]
}
},
{
"_index": "documents",
"_type": "document",
"_id": "3",
"_score": 1,
"_source": {
"tags": [
"goo"
],
"flag": false
}
}
]
}
}

Fuzzy search in the elasticsearch gives matches with incorrect order

I am trying to build an engine where we can match areas mentioned in the address with the list available in elasticsearch.
I am using this query to search areas similar to "iit".
My query is :
{
"query": {
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}
I am getting below results :
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 2.1290483,
"hits": [
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRiZ04pEQsZFpK6A_",
"_score": 2.1290483,
"_source": {
"locality": [
"emerald isle ii"
]
},
"highlight": {
"locality": [
"emerald isle <em>ii</em>"
]
}
},
{
"_index": "geocoding_1",
"_type": "localities",
"_id": "AVuzRfof4pEQsZFpK59H",
"_score": 1.877402,
"_source": {
"locality": [
"iit - bombay",
"iitb",
"indian institute of technology - bombay"
]
},
"highlight": {
"locality": [
"<em>iit</em> - bombay",
"<em>iitb</em>"
]
}
}
]
}
}
Because "iit" is directly available in the 2nd document and hence I was expecting that to be returned as best match with highest score. What is the change that I should make so that I get 2nd document with highest score.
I am using ES 2.3.4 .
If you also are interested in exact matching to score better, I always suggest a bool with should statements and adding a match or term query in there. In this way the combined scores will favor the exact matching:
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"locality": {
"value": "iit",
"fuzziness": 1
}
}
},
{
"match": {
"locality": "iit"
}
}
]
}
},
"highlight": {
"fields": {
"locality": {}
}
}
}

Specifying total size of results to return for ElasticSearch query when using inner_hits

ElasticSearch allows inner_hits to specify 'from' and 'size' parameters, as can the outer request body of a search.
As an example, assume my index contains 25 books, each having less than 50 chapters. The below snippet would return all chapters across all books, because a 'size' of 100 books includes all of 25 books and a 'size' of 50 chapters includes all of "less than 50 chapters":
"index": 'books',
"type": 'book',
"body": {
"from" : 0, "size" : 100, // outer hits, or books
"query": {
"filtered": {
"filter": {
"nested": {
"inner_hits": {
"size": 50 // inner hits, or chapters
},
"path": "chapter",
"query": { "match_all": { } },
}
}
}
},
.
.
.
Now, I'd like to implement paging with a scenario like this. My question is, how?
In this case, do I have to return back the above max of 100 * 50 = 5000 documents from the search query and implement paging in the application level by displaying only the slice I am interested in? Or, is there a way to specify the total number of hits to return back in the search query itself, independent of the inner/outer size?
I am looking at the "response" as follows, and so would like this data to be able to be paginated:
response.hits.hits.forEach(function(book) {
chapters = book.inner_hits.chapters.hits.hits;
chapters.forEach(function(chapter) {
// ... this is one displayed result ...
});
});
I don't think this is possible with Elasticsearch and nested fields. The way you see the results is correct: ES paginates and returns books and it doesn't see inside nested inner_hits. Is not how it works. You need to handle the pagination manually in your code.
There is another option, but you need a parent/child relationship instead of nested.
Then you are able to query the children (meaning, the chapters) and paginate the results (the chapters). You can use inner_hits and return back the parent (the book itself).
PUT /library
{
"mappings": {
"book": {
"properties": {
"name": {
"type": "string"
}
}
},
"chapter": {
"_parent": {
"type": "book"
},
"properties": {
"title": {
"type": "string"
}
}
}
}
}
The query:
GET /library/chapter/_search
{
"size": 5,
"query": {
"has_parent": {
"type": "book",
"query": {
"match_all": {}
},
"inner_hits" : {}
}
}
}
And a sample output (trimmed, complete example here):
"hits": [
{
"_index": "library",
"_type": "chapter",
"_id": "1",
"_score": 1,
"_source": {
"title": "chap1"
},
"inner_hits": {
"book": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "library",
"_type": "book",
"_id": "book1",
"_score": 1,
"_source": {
"name": "book1"
}
}
]
}
}
}
},
{
"_index": "library",
"_type": "chapter",
"_id": "2",
"_score": 1,
"_source": {
"title": "chap2"
},
"inner_hits": {
"book": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "library",
"_type": "book",
"_id": "book1",
"_score": 1,
"_source": {
"name": "book1"
}
}
]
}
}
}
}
The search api allows for the addition of certain standard parameters, listed in the docs at: https://www.elastic.co/guide/en/elasticsearch/client/javascript-api/current/api-reference-2-0.html#api-search-2-0
According to the doc:
size Number — Number of hits to return (default: 10)
Which would make your request something like:
"size": 5000,
"index": 'books',
"type": 'book',
"body": {

ElasticSearch - prefix with space and filtering

My ElasticSearch server contains documents of the following form:
{
"_index": "xindex",
"_type": "xtype",
"_id": "1100",
"_score": 3.00010,
"_source": {
"_id": "2333345",
"field1": "11111111111111",
"field2": "y",
"name": "hello world",
}
}
I need to get all the documents with name prefix "hello wo" and field2 "y".
Tried a lot of queries and none have worked. There are all kind of solutions for the prefix with space issue, but when adding the filtering/another query for field2, results get corrupted.
Thanks.
You can achieve this in 3 steps :
Change your mapping of field name to not_analyzed
Use a match_phrase_prefix query (documentation here)
Filter this query results by wrapping it in a filtered query and use a term filter on the field2 with value "y"
You can see it working with the following dataset :
PUT test/prefix/_mapping
{
"properties": {
"name":{
"type": "string",
"index": "not_analyzed"
}
}
}
//should match
PUT test/prefix/2333345
{
"field1": "11111111111111",
"field2": "y",
"name": "hello world"
}
//should match
PUT test/prefix/1112223
{
"field1": "22222222222222",
"field2": "y",
"name": "hello wombat"
}
//should not match (field2 value is different)
PUT test/prefix/4445556
{
"field1": "33333333333333",
"field2": "z",
"name": "hello world"
}
//should not match (second word not starting with wo)
PUT test/prefix/4445556
{
"field1": "33333333333333",
"field2": "y",
"name": "hello zombie"
}
Then, the query is :
GET test/prefix/_search
{
"query": {
"filtered": {
"query": {
"match_phrase_prefix" : {
"name" : "hello wo"
}
},
"filter": {
"term": {
"field2": "y"
}
}
}
}
}
which outputs the documents 1112223 and 2333345 as expected :
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.592944,
"hits": [
{
"_index": "test",
"_type": "prefix",
"_id": "2333345",
"_score": 1.592944,
"_source": {
"field1": "11111111111111",
"field2": "y",
"name": "hello world"
}
},
{
"_index": "test",
"_type": "prefix",
"_id": "1112223",
"_score": 1.592944,
"_source": {
"field1": "22222222222222",
"field2": "y",
"name": "hello wombat"
}
}
]
}
}
use simple_query_string, This approach solved my issue:
{
"query": {
"bool": {
"should": [
{
"simple_query_string": {
"fields": [
"name"
],
"default_operator": "and",
"query": "(hello world*)"
}
}
]
}
}
}

Resources