Elasticsearch indexing so queries return "contains" matches

I've been trying to create my own index for users, where documents are indexed on the "name" field.
This is my current index settings:
{
  "users": {
    "settings": {
      "index": {
        "analysis": {
          "filter": {
            "shingle_filter": {
              "max_shingle_size": "2",
              "min_shingle_size": "2",
              "output_unigrams": "true",
              "type": "shingle"
            },
            "edgeNGram_filter": {
              "type": "edgeNGram",
              "min_gram": "1",
              "max_gram": "20"
            }
          },
          "analyzer": {
            "autocomplete_query_analyzer": {
              "filter": [
                "standard",
                "asciifolding",
                "lowercase"
              ],
              "tokenizer": "standard"
            },
            "autocomplete_index_analyzer": {
              "filter": [
                "standard",
                "asciifolding",
                "lowercase",
                "shingle_filter",
                "edgeNGram_filter"
              ],
              "tokenizer": "standard"
            }
          }
        },
        "number_of_shards": "1",
        "number_of_replicas": "1"
      }
    }
  }
}
and my mapping:
{
  "users": {
    "mappings": {
      "data": {
        "properties": {
          "name": {
            "type": "string",
            "analyzer": "autocomplete_index_analyzer",
            "search_analyzer": "autocomplete_query_analyzer"
          }
        }
      }
    }
  }
}
Right now my problem is that search queries do not return results that contain the term. For example, if I have a user "David", the queries "Da", "Dav", "Davi", etc. return the document, but searches for "vid" or "avid" return nothing.
Is this because of some value I'm missing in the settings?

You need to use nGram instead of edgeNGram. So simply change this
"edgeNGram_filter": {
  "type": "edgeNGram",
  "min_gram": "1",
  "max_gram": "20"
}
into this
"edgeNGram_filter": {
  "type": "nGram", <--- change here
  "min_gram": "1",
  "max_gram": "20"
}
Note that you need to wipe your index, recreate it and then populate it again.
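You can sanity-check the new filter with the _analyze API (a quick sketch; it assumes you have recreated the users index with the modified settings):
POST users/_analyze
{
  "field": "name",
  "text": "David"
}
With the nGram filter in place, the returned tokens now include interior substrings such as "vid" and "avid", which is exactly what lets those searches match.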

Related

Elasticsearch mixing NGram with Simple query string query

Currently, I am using an ngram tokenizer to do partial matching of employees.
I can match on FullName, Email Address and Employee Number.
My current setup looks as follows:
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
The problem I am facing is that an Employee Number can be 1 character long, and because of the min_gram and max_gram, I can never match it. I can't set min_gram to 1 either, because then the results do not look correct.
So I tried to mix the ngram with a standard tokenizer, and instead of doing a multi_match search I am doing a simple_query_string query.
This also seems to work partially.
My question is how I can partially match on all 3 fields, bearing in mind that an employee number can be 1 or 2 characters long, and get an exact match when I put quotes around a word or number.
In the example below, how can I search for 11 and return documents 4 and 5?
Also, I would like document 2 to be returned if I search for 706 (a partial match), but if I search for "7061" only document 2 should be returned.
Full Code
PUT index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "my_analyzer": {
          "filter": [
            "lowercase",
            "asciifolding"
          ],
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      },
      "normalizer": {
        "lowersort": {
          "type": "custom",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "number": {
        "type": "text",
        "analyzer": "english",
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "english_exact"
          }
        }
      },
      "fullName": {
        "type": "text",
        "fields": {
          "ngram": {
            "type": "text",
            "analyzer": "my_analyzer"
          }
        },
        "analyzer": "standard"
      }
    }
  }
}

PUT index/_doc/1
{
  "number": 1,
  "fullName": "Brenda eaton"
}

PUT index/_doc/2
{
  "number": 7061,
  "fullName": "Bruce wayne"
}

PUT index/_doc/3
{
  "number": 23,
  "fullName": "Bruce Banner"
}

PUT index/_doc/4
{
  "number": 111,
  "fullName": "Cat woman"
}

PUT index/_doc/5
{
  "number": 1112,
  "fullName": "0723568521"
}

GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "fullName.ngram", "number.exact" ],
      "query": "11"
    }
  }
}
You need to change the analyzer of the number.exact field and reduce min_gram to 2. Modify the index mapping as shown below.
Adding a working example:
Index Mapping:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "my_analyzer": {
          "filter": [
            "lowercase",
            "asciifolding"
          ],
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      },
      "normalizer": {
        "lowersort": {
          "type": "custom",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "number": {
        "type": "keyword",      // note this
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "my_analyzer"
          }
        }
      },
      "fullName": {
        "type": "text",
        "fields": {
          "ngram": {
            "type": "text",
            "analyzer": "my_analyzer"
          }
        },
        "analyzer": "standard"
      }
    }
  }
}
Search Query:
{
  "query": {
    "simple_query_string": {
      "fields": [ "fullName.ngram", "number.exact" ],
      "query": "11"
    }
  }
}
Search Result:
"hits": [
{
"_index": "66311552",
"_type": "_doc",
"_id": "4",
"_score": 0.9929736,
"_source": {
"number": 111,
"fullName": "Cat woman"
}
},
{
"_index": "66311552",
"_type": "_doc",
"_id": "5",
"_score": 0.8505551,
"_source": {
"number": 1112,
"fullName": "0723568521"
}
}
]
Update 1:
If you just need to search for 1, modify the data type of the number field from text type to keyword type, as shown in the index mapping above.
Search Query:
{
  "query": {
    "simple_query_string": {
      "fields": [ "fullName.ngram", "number.exact", "number" ],
      "query": "1"
    }
  }
}
Search Result:
"hits": [
  {
    "_index": "66311552",
    "_type": "_doc",
    "_id": "1",
    "_score": 1.3862942,
    "_source": {
      "number": 1,
      "fullName": "Brenda eaton"
    }
  }
]
Update 2:
You can use two separate analyzers with n-gram tokenizer for the fullName field and number field. Modify with the below index mapping:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "name_analyzer": {
          "filter": [
            "lowercase",
            "asciifolding"
          ],
          "tokenizer": "name_tokenizer"
        },
        "number_analyzer": {
          "filter": [
            "lowercase",
            "asciifolding"
          ],
          "tokenizer": "number_tokenizer"
        }
      },
      "tokenizer": {
        "name_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        },
        "number_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      },
      "normalizer": {
        "lowersort": {
          "type": "custom",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "number": {
        "type": "keyword",
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "number_analyzer"
          }
        }
      },
      "fullName": {
        "type": "text",
        "fields": {
          "ngram": {
            "type": "text",
            "analyzer": "name_analyzer"
          }
        },
        "analyzer": "standard"
      }
    }
  }
}
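To double-check the partial-match behavior, you can run a number through the _analyze API (a sketch; 66311552 stands in for whatever your index is called):
POST 66311552/_analyze
{
  "analyzer": "number_analyzer",
  "text": "7061"
}
With min_gram 2 and max_gram 3 this produces the tokens 70, 706, 06, 061 and 61, so a search for 706 partially matches document 2, while the keyword-typed number field still supports exact matches on the full value.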

Elasticsearch NGram Analyser - Change the Order of the Results of a Query

The current query returns results on the title field in the following order:
1. Quick 123
2. Foxes Quick
3. Quick
4. Foxes Quick Quick
5. Quick Foxes
Shouldn't 3. Quick come first instead? Also, Foxes Quick Quick has two occurrences of Quick, so it should get some preference in the results, but it comes in 4th position.
Index Settings:
{
  "fundraisers": {
    "settings": {
      "index": {
        "number_of_shards": "5",
        "provided_name": "fundraisers",
        "creation_date": "1546515635025",
        "analysis": {
          "analyzer": {
            "my_analyzer": {
              "filter": [
                "lowercase"
              ],
              "tokenizer": "my_tokenizer"
            },
            "search_analyzer_search": {
              "filter": [
                "lowercase"
              ],
              "tokenizer": "search_tokenizer_search"
            }
          },
          "tokenizer": {
            "my_tokenizer": {
              "token_chars": [
                "letter",
                "digit"
              ],
              "min_gram": "3",
              "type": "edge_ngram",
              "max_gram": "50"
            },
            "search_tokenizer_search": {
              "token_chars": [
                "letter",
                "digit",
                "whitespace"
              ],
              "min_gram": "3",
              "type": "ngram",
              "max_gram": "50"
            }
          }
        },
        "number_of_replicas": "1",
        "uuid": "mVweO4_sT3Ww00MzdLyavw",
        "version": {
          "created": "6020399"
        }
      }
    }
  }
}
Query
GET fundraisers/_search?explain=true
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "qui",
        "analyzer": "my_analyzer"
      }
    }
  }
}
Mapping
{
  "fundraisers": {
    "mappings": {
      "fundraisers": {
        "properties": {
          "status": {
            "type": "text"
          },
          "suggest": {
            "type": "completion",
            "analyzer": "simple",
            "preserve_separators": true,
            "preserve_position_increments": true,
            "max_input_length": 50
          },
          "title": {
            "type": "text",
            "analyzer": "my_analyzer"
          },
          "twitterUrl": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          },
          "videoLinks": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          },
          "zipCode": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          }
        }
      }
    }
  }
}
Am I overcomplicating this by using match_phrase, a search analyzer and ngrams, or is there a simpler way to achieve the expected result?
Ref:
https://www.elastic.co/guide/en/elasticsearch/reference/6.5/query-dsl-match-query.html
Ok, first let's create a minimal and reproducible setup:
PUT test
{
  "settings": {
    "index": {
      "number_of_shards": "1",
      "number_of_replicas": "1",
      "analysis": {
        "analyzer": {
          "my_analyzer": {
            "filter": [
              "lowercase"
            ],
            "tokenizer": "my_tokenizer"
          },
          "search_analyzer_search": {
            "filter": [
              "lowercase"
            ],
            "tokenizer": "search_tokenizer_search"
          }
        },
        "tokenizer": {
          "my_tokenizer": {
            "token_chars": [
              "letter",
              "digit"
            ],
            "min_gram": "3",
            "type": "edge_ngram",
            "max_gram": "50"
          },
          "search_tokenizer_search": {
            "token_chars": [
              "letter",
              "digit",
              "whitespace"
            ],
            "min_gram": "3",
            "type": "ngram",
            "max_gram": "50"
          }
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "title": {
          "type": "text",
          "analyzer": "my_analyzer"
        }
      }
    }
  }
}

PUT test/_doc/1
{
  "title": "Quick 123"
}

PUT test/_doc/2
{
  "title": "Foxes Quick"
}

PUT test/_doc/3
{
  "title": "Quick"
}

PUT test/_doc/4
{
  "title": "Foxes Quick Quick"
}

PUT test/_doc/5
{
  "title": "Quick Foxes"
}
Then let's try the simplest query:
GET test/_search
{
  "query": {
    "match": {
      "title": {
        "query": "qui"
      }
    }
  }
}
And now your order is:
1. Quick
2. Foxes Quick Quick
3. Quick 123
4. Foxes Quick
5. Quick Foxes
That's pretty much what you were expecting, right? There might be other use cases that are not covered by this query, but IMO you'll have to use multi_match and search across different analyzers, because I'm not sure a phrase search on edge-grams makes much sense.
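As a starting point for such a combined approach, here is a minimal multi_match sketch; title.std is a hypothetical sub-field you would add to the mapping with the standard analyzer:
GET test/_search
{
  "query": {
    "multi_match": {
      "query": "qui",
      "fields": [ "title", "title.std^2" ]
    }
  }
}
The boost on the standard-analyzed sub-field means that once the user types a full word, exact word matches outrank pure edge-gram matches, while the edge_ngram field still catches partial input.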

Full text search with multiple tokens in Elasticsearch

Given that I have multiple documents containing a sentence such as "welcome to how are you doing today?", I applied a simple_query_string query to search for that sentence. When I first use welcome to how, it returns 0 hits. However, when I use how are you doing today, it returns all the documents. Can someone tell me what causes this?
the query is like:
query: {
  simple_query_string: {
    query: '\ welcome to \',
    fields: ['content'],
    default_operator: 'AND'
  }
}
The settings for the analyzer are:
{
  "number_of_shards": 2,
  "refresh_interval": "30s",
  "analysis": {
    "filter": {
      "autocomplete_filter": {
        "type": "edge_ngram",
        "min_gram": 1,
        "max_gram": 20
      }
    },
    "analyzer": {
      "charSplit": {
        "type": "custom",
        "tokenizer": "ngram_tokenizer",
        "char_filter": [
          "my_char_filter"
        ],
        "filter": [
          "lowercase",
          "autocomplete_filter"
        ]
      }
    },
    "tokenizer": {
      "ngram_tokenizer": {
        "type": "nGram",
        "min_gram": "1",
        "max_gram": "1"
      }
    },
    "char_filter": {
      "my_char_filter": {
        "type": "mapping",
        "mappings": "specialCharacters"
      }
    }
  }
}
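When a query behaves unexpectedly like this, a useful first step is to look at the tokens the analyzer actually emits (a sketch; my_index stands in for your index name):
POST my_index/_analyze
{
  "analyzer": "charSplit",
  "text": "welcome to how"
}
With an ngram tokenizer whose min_gram and max_gram are both 1, the text is reduced to single-character tokens, which is worth keeping in mind when combining it with default_operator: 'AND'.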

Why does my Elasticsearch multi-match query look only for prefixes?

I am trying to write an Elasticsearch multi-match query (with the Java API) to create a "search-as-you-type" program. The query is applied to two fields, title and description, which are analyzed as ngrams.
My problem is that Elasticsearch seems to match only words that begin with my query. For instance, if I search for "nut", it matches documents featuring "nut", "nuts", "Nutella", etc., but it does not match documents featuring "walnut", which should also match.
Here are my settings:
{
  "index": {
    "analysis": {
      "analyzer": {
        "edgeNGramAnalyzer": {
          "tokenizer": "edgeTokenizer",
          "filter": [
            "word_delimiter",
            "lowercase",
            "unique"
          ]
        }
      },
      "tokenizer": {
        "edgeTokenizer": {
          "type": "edgeNGram",
          "min_gram": "3",
          "max_gram": "8",
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  }
}
Here is the relevant part of my mapping:
{
  "content": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "edgeNGramAnalyzer",
        "fields": {
          "sort": {
            "type": "keyword"
          }
        }
      },
      "description": {
        "type": "text",
        "analyzer": "edgeNGramAnalyzer",
        "fields": {
          "sort": {
            "type": "keyword"
          }
        }
      }
    }
  }
}
And here is my query:
new MultiMatchQueryBuilder(query).field("title", 3).field("description", 1).fuzziness(0).tieBreaker(1).minimumShouldMatch("100%")
Do you have any idea what I could be doing wrong?
That's because you're using an edgeNGram tokenizer instead of an nGram one. The former only indexes prefixes, while the latter indexes prefixes, suffixes, and sub-parts of your data.
Change your analyzer definition to this instead and it should work as expected:
{
  "index": {
    "analysis": {
      "analyzer": {
        "edgeNGramAnalyzer": {
          "tokenizer": "edgeTokenizer",
          "filter": [
            "word_delimiter",
            "lowercase",
            "unique"
          ]
        }
      },
      "tokenizer": {
        "edgeTokenizer": {
          "type": "nGram", <---- change this
          "min_gram": "3",
          "max_gram": "8",
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  }
}
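You can verify the difference with the _analyze API (a sketch; it assumes the updated settings are applied to an index named my_index):
POST my_index/_analyze
{
  "analyzer": "edgeNGramAnalyzer",
  "text": "walnut"
}
With the edgeNGram tokenizer the output is only the prefixes wal, waln, walnu and walnut; with nGram it also contains interior substrings such as aln, lnu and, crucially, nut, so a search for "nut" now matches "walnut".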

How to get a custom analyzer's source from Elasticsearch?

I made a _mapping request to Elasticsearch and saw that a custom analyzer is used for one field. The output for the field looks like this:
"myFieldName": {
  "type": "string",
  "analyzer": "someCustomAnalyzer"
}
So is there a way to get the source of that someCustomAnalyzer? I tried the request curl -XGET localhost:9200/_analyze?analyzer=someCustomAnalyzer
and got:
{
  "error": "ElasticsearchIllegalArgumentException[text is missing]",
  "status": 400
}
If I add a text argument to the query string, I get the analysis output for that text, but what I need is the analyzer definition.
You can see it with settings. It's more readable now in 1.5 than it used to be.
So if I create an index with a non-trivial analyzer:
PUT /test_index
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "filter": {
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 20
        }
      },
      "analyzer": {
        "edge_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "edge_ngram_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "_all": {
        "enabled": true,
        "index_analyzer": "edge_ngram_analyzer",
        "search_analyzer": "standard"
      },
      "properties": {
        "first_name": {
          "type": "string",
          "include_in_all": true
        },
        "last_name": {
          "type": "string",
          "include_in_all": true
        },
        "ssn": {
          "type": "string",
          "index": "not_analyzed",
          "include_in_all": true
        }
      }
    }
  }
}
I can get the index settings with:
GET /test_index/_settings
...
{
  "test_index": {
    "settings": {
      "index": {
        "creation_date": "1430394627755",
        "uuid": "78oYlYU9RS6LZ5YFyeaMRQ",
        "analysis": {
          "filter": {
            "edge_ngram_filter": {
              "min_gram": "2",
              "type": "edge_ngram",
              "max_gram": "20"
            }
          },
          "analyzer": {
            "edge_ngram_analyzer": {
              "type": "custom",
              "filter": [
                "lowercase",
                "edge_ngram_filter"
              ],
              "tokenizer": "standard"
            }
          }
        },
        "number_of_replicas": "1",
        "number_of_shards": "1",
        "version": {
          "created": "1050099"
        }
      }
    }
  }
}
Here is the code I used:
http://sense.qbox.io/gist/4a38bdb0cb7d381caa29b9ce2c3c154b63cdc1f8
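Note that you can also retrieve settings and mappings together with GET /test_index, and on more recent versions trim the settings response down to just the analysis section with filter_path (a sketch; this response-filtering parameter is not available on very old releases):
GET /test_index/_settings?filter_path=*.settings.index.analysis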