full text search with multiple tokens in elasticsearch - elasticsearch

Given that I have multiple documents contains a sentence such as "welcome to how are you doing today?" I applied a simple_query_string query to search the above sentence. When I first use welcome to how. It returns 0 hit. However, when I use how are you doing today it shows all the documents. Can someone tell me what causes this?
the query is like:
query: {
simple_query_string : {
query: '\ welcome to \',
fields : ['content'],
default_operator: 'AND' }
}
The settings for the analyzer are:
{
"number_of_shards": 2,
"refresh_interval": "30s",
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
}
},
"analyzer": {
"charSplit": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"char_filter": [
"my_char_filter"
],
"filter": [
"lowercase",
"autocomplete_filter"
]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "1"
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": "specialCharacters"
}
}
}
}

Related

Elasicsearch mixing NGram with Simple query string query

Currently, I am using Ngram tokenizer to-do partial matching of Employees.
I can match on FullName, Email address and Employee Number
My current setup looks as follow:
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
The problem that I am facing is that Employee Number can be 1 character long and because of the min_gram and max_gram, I can never match. I can't make the min_gram 1 either because the results do not look correct.
So I tried to mix the Ngram with a standard tokenizer and instead of doing in Multimatch search I am doing an simple_query_string.
This seems to also work partially.
My question is how can I partially match on all 3 fields bearing in mind that employee number can be 1 or 2 chars long. And exact match if I use semi quotes around a word or number
In the below example how can search for 11 and return documents 4 and 5?
Also, I would like document 2 to return if I had to search for 706 which is a partial match, but if I had to search with "7061" I would only return document 2
Full Code
PUT index
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"my_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "text",
"analyzer": "english",
"fields": {
"exact": {
"type": "text",
"analyzer": "english_exact"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "my_analyzer"
}
},
"analyzer": "standard"
}
}
}
}
PUT index/_doc/1
{
"number" : 1,
"fullName": "Brenda eaton"
}
PUT index/_doc/2
{
"number" : 7061,
"fullName": "Bruce wayne"
}
PUT index/_doc/3
{
"number" : 23,
"fullName": "Bruce Banner"
}
PUT index/_doc/4
{
"number" : 111,
"fullName": "Cat woman"
}
PUT index/_doc/5
{
"number" : 1112,
"fullName": "0723568521"
}
GET index/_search
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact"],
"query": "11"
}
}
}
You need to change the analyzer of the number.exact field and reduce the min_gram
count to 2. Modify the index mapping as shown below
Adding a working example
Index Mapping:
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"my_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "keyword", // note this
"fields": {
"exact": {
"type": "text",
"analyzer": "my_analyzer"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "my_analyzer"
}
},
"analyzer": "standard"
}
}
}
}
Search Query:
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact"],
"query": "11"
}
}
}
Search Result:
"hits": [
{
"_index": "66311552",
"_type": "_doc",
"_id": "4",
"_score": 0.9929736,
"_source": {
"number": 111,
"fullName": "Cat woman"
}
},
{
"_index": "66311552",
"_type": "_doc",
"_id": "5",
"_score": 0.8505551,
"_source": {
"number": 1112,
"fullName": "0723568521"
}
}
]
Update 1:
If you just need to search for 1, modify the data type of the number field from text type to keyword type, as shown in the index mapping above.
Search Query:
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact","number"],
"query": "1"
}
}
}
Search Result will be
"hits": [
{
"_index": "66311552",
"_type": "_doc",
"_id": "1",
"_score": 1.3862942,
"_source": {
"number": 1,
"fullName": "Brenda eaton"
}
}
]
Update 2:
You can use two separate analyzers with n-gram tokenizer for the fullName field and number field. Modify with the below index mapping:
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"name_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "name_tokenizer"
},
"number_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "number_tokenizer"
}
},
"tokenizer": {
"name_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
},
"number_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "keyword",
"fields": {
"exact": {
"type": "text",
"analyzer": "number_analyzer"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "name_analyzer"
}
},
"analyzer": "standard"
}
}
}
}

Search partial word in elasticsearch

I'm kind of new to Elasticsearch but I would like to search the partial in the word
For example if I search "helloworld" is it possible to type only "world"?
Right now it work perfectly for case "hello" the elasticsearch return the suggestion helloworld for me
Here is the code:
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
},
"mappings": {
"word": {
"properties": {
"text": {
"type": "string",
"analyzer": "autocomplete"
}
}
}
}
}
Can anyone give me any suggestion?

Why does my Elasticsearch multi-match query look only for prefixes?

I am trying to write an Elasticsearch multi-match query (with the Java API) to create a "search-as-you-type" program. The query is applied to two fields, title and description, which are analyzed as ngrams.
My problem is, it seems that Elasticsearch tries to find only words beginning like my query. For instance, if I search for "nut", then it matches with documents featuring "nut", "nuts", "Nutella", etc, but it does not match documents featuring "walnut", which should be matched.
Here are my settings :
{
"index": {
"analysis": {
"analyzer": {
"edgeNGramAnalyzer": {
"tokenizer": "edgeTokenizer",
"filter": [
"word_delimiter",
"lowercase",
"unique"
]
}
},
"tokenizer": {
"edgeTokenizer": {
"type": "edgeNGram",
"min_gram": "3",
"max_gram": "8",
"token_chars": [
"letter",
"digit"
]
}
}
}
}
}
Here is the relevant part of my mapping :
{
"content": {
"properties": {
"title": {
"type": "text",
"analyzer": "edgeNGramAnalyzer",
"fields": {
"sort": {
"type": "keyword"
}
}
},
"description": {
"type": "text",
"analyzer": "edgeNGramAnalyzer",
"fields": {
"sort": {
"type": "keyword"
}
}
}
}
}
}
And here is my query :
new MultiMatchQueryBuilder(query).field("title", 3).field("description", 1).fuzziness(0).tieBreaker(1).minimumShouldMatch("100%")
Do you have any idea what I could be doing wrong ?
That's because you're using an edgeNGram tokenizer instead of nGram one. The former only indexes prefixes, while the latter will index prefixes, suffixes and also sub-parts of your data.
Change your analyzer definition to this instead and it should work as expected:
{
"index": {
"analysis": {
"analyzer": {
"edgeNGramAnalyzer": {
"tokenizer": "edgeTokenizer",
"filter": [
"word_delimiter",
"lowercase",
"unique"
]
}
},
"tokenizer": {
"edgeTokenizer": {
"type": "nGram", <---- change this
"min_gram": "3",
"max_gram": "8",
"token_chars": [
"letter",
"digit"
]
}
}
}
}
}

How to declare more than one tokenizer in settings for elasticsearch

I want to create a search index with a property for which i want results in following order:
first all the results which starts with the search term
then all the results containing the search term
so for this, i want to use https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-edgengram-tokenizer.html
but i already have a tokenizer kuromoji_tokenizer in settings for my index.
So how can i add another tokenizer in settings (and later use it in analyzer), so that i can fulfill above scneario?
So for example in below json, can i add another child to tokenzier or tokenzier needs to be an array?
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 10,
"token_chars": [
"letter"
]
}
}
}
}
I believe you can, yes. Just add it next to the first one, don't create an array, just give it another name (in my example i called it "my_other_tokenizer"):
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 10,
"token_chars": [
"letter"
]
},
"my_other_tokenizer": {
"type": "kuromoji_tokenizer",
"mode": "extended",
"discard_punctuation": "false",
"user_dictionary": "userdict_ja.txt"
}
}
}
}
And then just use it in your analyzer setting, just as you did for the first tokenizer.

ElasticSearch indexing so query returns contains

I've been trying to create my own index for users, where the query is indexed on the "name" value.
This is my current index settings:
{
"users": {
"settings": {
"index": {
"analysis": {
"filter": {
"shingle_filter": {
"max_shingle_size": "2",
"min_shingle_size": "2",
"output_unigrams": "true",
"type": "shingle"
},
"edgeNGram_filter": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete_query_analyzer": {
"filter": [
"standard",
"asciifolding",
"lowercase"
],
"tokenizer": "standard"
},
"autocomplete_index_analyzer": {
"filter": [
"standard",
"asciifolding",
"lowercase",
"shingle_filter",
"edgeNGram_filter"
],
"tokenizer": "standard"
}
}
},
"number_of_shards": "1",
"number_of_replicas": "1"
}
}
}
}
and my mapping:
{
"users": {
"mappings": {
"data": {
"properties": {
"name": {
"type": "string",
"analyzer": "autocomplete_index_analyzer",
"search_analyzer": "autocomplete_query_analyzer"
}
}
}
}
}
}
Right now my problem is that search queries do not return results that contain the term. For example if I have a user "David", the search queries "Da", "Dav", "Davi", etc will return the value but search for "vid" or "avid" will not return any values.
Is this because of some value I'm missing in the settings?
You need to use nGram instead of edgeNGram. So simply change this
"edgeNGram_filter": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "20"
}
into this
"edgeNGram_filter": {
"type": "nGram", <--- change here
"min_gram": "1",
"max_gram": "20"
}
Note that you need to wipe your index, recreate it and the populate it again.

Resources