Implement autocomplete using custom analyzers in Elasticsearch

Referring to this post, I've created the following mapping:
POST music
{
"song": {
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"song_field_1": {
"type": "string",
"index": "not_analyzed",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
}
}
Inserted the following document:
POST music/song
{
"song_field_1" : "Premeditiated fella"
}
And sent this query:
POST music/song/_search
{
"size": 10,
"query": {
"match": {
"_all": {
"query": "pre"
}
}
}
}
I expected to get the document as an autocomplete option, but didn't get any result.

You need to create your index like this:
POST music
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"song": {
"properties": {
"song_field_1": {
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
}
}
}
So:
song goes inside mappings
No need for "index": "not_analyzed" since you're specifying analyzers
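To verify the fix, reindex the sample document and query the field directly rather than _all; since the nGram filter indexes grams of 2 to 20 characters, the token pre exists in the index and the document should come back:

POST music/song
{
  "song_field_1": "Premeditiated fella"
}

POST music/song/_search
{
  "size": 10,
  "query": {
    "match": {
      "song_field_1": {
        "query": "pre"
      }
    }
  }
}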

Related

Ignore specific character during fuzzy searches analyzer in Elastic search

I have a fuzzy search analyzer in Elasticsearch with the following documents:
PUT test_index
{
"settings": {
"index": {
"max_ngram_diff": 40
},
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "whitespace",
"filter": [
"lowercase",
"autocomplete"
]
},
"autocomplete_search": {
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
},
"filter": {
"autocomplete": {
"type": "ngram",
"min_gram": 2,
"max_gram": 40
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
PUT test_index/_doc/1
{ "title": "HRT 2018-BN18 N-SB" }
PUT test_index/_doc/2
{ "title": "GMC 2019-BN18 A-SB" }
How can I ignore the hyphen ('-') during my fuzzy search so that GMC 2019-BN18 A-SB, gmc 2019, gmc 2019-BN18 A-SB and GMC 2019-BN18 ASB all yield the same document?
I tried to create another analyzer separately, but I am not sure how to apply multiple analyzers to the same field:
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
]
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"- => "
]
}
}
}
}
You're on the right path, you just need to add that character filter to both analyzers to make sure the hyphens get removed at indexing and search time:
PUT test_index
{
"settings": {
"index": {
"max_ngram_diff": 40
},
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"- => "
]
}
},
"analyzer": {
"autocomplete": {
"char_filter": [
"my_char_filter"
],
"tokenizer": "whitespace",
"filter": [
"lowercase",
"autocomplete"
]
},
"autocomplete_search": {
"char_filter": [
"my_char_filter"
],
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
},
"filter": {
"autocomplete": {
"type": "ngram",
"min_gram": 2,
"max_gram": 40
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
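As a quick sanity check, the _analyze API shows that the hyphens are now stripped before tokenization, so a hyphenated and an unhyphenated query analyze to the same tokens:

GET test_index/_analyze
{
  "analyzer": "autocomplete_search",
  "text": "GMC 2019-BN18 ASB"
}

This should return the tokens gmc, 2019bn18 and asb, which match the ngrams indexed for document 2.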

Must specify either an analyzer type, or a tokenizer

I am basically new to Elasticsearch. I am trying to implement fuzzy search, synonym search, edge ngram and autocomplete on the "name_auto" field, but it seems like my index creation is failing.
Another question: can I implement all of these analyzers on the "name" field, and if so, how can I do it?
{
"settings": {
"index": {
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"format": "wordnet",
"synonyms_path": "analysis/wn_s.pl"
}
},
"analyzer": {
"synonym": {
"tokenizer": "whitespace",
"filter": [
"synonym"
]
},
"keyword_analyzer": {
"filter": [
"lowercase",
"asciifolding",
"trim"
],
"char_filter": [],
"type": "custom",
"tokenizer": "keyword"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
},
"edge_ngram_search_analyzer": {
"tokenizer": "lowercase"
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 25,
"token_chars": [
"letter"
]
}
}
},
"mappings": {
"properties": {
"firebaseId": {
"type": "text"
},
"name": {
"fielddata": true,
"type": "text",
"analyzer": "standard"
},
"name_auto": {
"type": "text",
"fields": {
"keywordstring": {
"type": "text",
"analyzer": "keyword_analyzer"
},
"edgengram": {
"type": "text",
"analyzer": "edge_ngram_analyzer",
"search_analyzer": "edge_ngram_search_analyzer"
},
"completion": {
"type": "completion"
},
"synonym_analyzer": {
"type": "synonym",
"analyzer": "synonym"
}
}
}
}
}
}
}
}
}
This is the output:
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "analyzer [tokenizer] must specify either an analyzer type, or a tokenizer"
}
],
"type": "illegal_argument_exception",
"reason": "analyzer [tokenizer] must specify either an analyzer type, or a tokenizer"
},
"status": 400
}
Where am I going wrong? Please guide me in the right direction.
Your tokenizer section is located inside the analyzer section, which is not correct. Two more things need to move: mappings must sit at the top level of the request as a sibling of settings, and the synonym_analyzer sub-field has to be a text field that uses the synonym analyzer, since there is no synonym field type. Try this instead, it should work:
{
"settings": {
"index": {
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"format": "wordnet",
"synonyms_path": "analysis/wn_s.pl"
}
},
"analyzer": {
"synonym": {
"tokenizer": "whitespace",
"filter": [
"synonym"
]
},
"keyword_analyzer": {
"filter": [
"lowercase",
"asciifolding",
"trim"
],
"char_filter": [],
"type": "custom",
"tokenizer": "keyword"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
},
"edge_ngram_search_analyzer": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 25,
"token_chars": [
"letter"
]
}
}
}
}
},
"mappings": {
"properties": {
"firebaseId": {
"type": "text"
},
"name": {
"fielddata": true,
"type": "text",
"analyzer": "standard"
},
"name_auto": {
"type": "text",
"fields": {
"keywordstring": {
"type": "text",
"analyzer": "keyword_analyzer"
},
"edgengram": {
"type": "text",
"analyzer": "edge_ngram_analyzer",
"search_analyzer": "edge_ngram_search_analyzer"
},
"completion": {
"type": "completion"
},
"synonym_analyzer": {
"type": "synonym",
"analyzer": "synonym"
}
}
}
}
}
}
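Once the index creates successfully, you can sanity-check the autocomplete chain with the _analyze API (my_index below is just a placeholder for whatever name you create the index under):

GET my_index/_analyze
{
  "analyzer": "edge_ngram_analyzer",
  "text": "Johnson"
}

This should return the lowercased prefixes j, jo, joh and so on, which is what the name_auto.edgengram sub-field will contain at index time.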

Synonyms for Sankt and St

I'm trying to get synonyms working for my existing setup. Currently I have these settings:
PUT city
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase",
"my_synonym_filter",
"german_normalization",
"my_ascii_folding"
]
},
"autocomplete_search": {
"tokenizer": "lowercase",
"filter": [
"lowercase",
"my_synonym_filter",
"german_normalization",
"my_ascii_folding"
]
}
},
"filter": {
"my_ascii_folding": {
"type": "asciifolding",
"preserve_original": true
},
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st => sankt"
]
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15,
"token_chars": [
"letter",
"digit",
"symbol"
]
}
}
}
},
"mappings": {
"city": {
"properties": {
"name": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
In this city index I have documents like:
St. Wolfgang or Sankt Wolfgang and so on. For me, St. and Sankt are synonyms, so if I search for Sankt, both documents should appear.
I created a new Filter and added the filter to my autocomplete analyzer:
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st."
]
}
So far, so good. But the issues I'm facing are the following:
It's clear that the dot after st is not analyzed and not searchable at the moment, but for the synonym the dot is important.
The second issue is that if I search for sankt, the synonym is st, which gives me all documents that start with st, like Stuttgart. This also happens because the dot is not used.
Do you have any idea how I can achieve this? If you need any more information, please let me know.
Update:
After the discussion I made these changes to my settings:
changed the edge_ngram tokenizer to a standard tokenizer,
added an edgeNGram filter and applied it to my analyzer,
deleted the german_normalization and my_ascii_folding filters from my analyzer to simplify the tests.
PUT city
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase",
"my_synonym_filter",
"edge_filter"
]
},
"autocomplete_search": {
"tokenizer": "autocomplete",
"filter": [
"my_synonym_filter",
"lowercase"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 1,
"max_gram": 15
},
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st => sankt"
]
}
},
"tokenizer": {
"autocomplete": {
"type": "standard"
}
}
}
},
"mappings": {
"city": {
"properties": {
"name": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
I added these 3 documents to the index:
"name":"Sankt Wolfgang",
"name":"Stuttgart",
"name":"St. Wolfgang"
Query String - Result
st -> "St. Wolfgang", "Stuttgart"
st. -> "St. Wolfgang", "Sankt Wolfgang"
sankt -> "St. Wolfgang", "Sankt Wolfgang"
This works pretty well for me. The main point here is to make sure to
put the synonym filter after the lowercase one
put the edge-n-gram filter at the end
use the edge-n-gram only at indexing time
So we create the index:
PUT city
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter",
"edge_filter"
]
},
"autocomplete_search": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 1,
"max_gram": 15
},
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st. => sankt"
]
}
}
}
},
"mappings": {
"city": {
"properties": {
"name": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
Then we index data:
PUT city/city/1
{
"name":"St. Wolfgang"
}
PUT city/city/2
{
"name":"Stuttgart"
}
PUT city/city/3
{
"name":"Sankt Wolfgang"
}
Finally, searching for either st or sankt will only return documents 1 and 3, but not 2:
POST city/_search?q=name:st
POST city/_search?q=name:sankt
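If you prefer request bodies over URI searches, the last check can also be written as:

POST city/_search
{
  "query": {
    "match": {
      "name": "sankt"
    }
  }
}

The match query runs the text through the autocomplete_search analyzer, so the synonym rule is applied at search time just like in the URI form.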

Elasticsearch: Can we apply both n-gram and language analyzers during indexing?

Thanks a lot @Random, I have modified the mapping as follows. For testing I have used "movie" as my type for indexing.
Note: I have added a search_analyzer as well; I was not getting proper results without it.
However, I have the following doubts about using a search_analyzer:
1] Can we use a custom search_analyzer in the case of language analyzers?
2] Am I getting all the results due to the n-gram analyzer I have used, and not due to the English analyzer?
{
"settings": {
"analysis": {
"analyzer": {
"english_ngram": {
"type": "custom",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_stemmer",
"ngram_filter"
],
"tokenizer": "whitespace"
},
"search_analyzer":{
"type": "custom",
"tokenizer": "whitespace",
"filter": "lowercase"
}
},
"filter": {
"english_stop": {
"type": "stop"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"ngram_filter": {
"type": "ngram",
"min_gram": 1,
"max_gram": 25
}
}
}
},
"mappings": {
"movie": {
"properties": {
"title": {
"type": "string",
"fields": {
"en": {
"type": "string",
"analyzer": "english_ngram",
"search_analyzer": "search_analyzer"
}
}
}
}
}
}
}
Update:
Using a search analyzer is also not working consistently and I need more help with this. I'm updating the question with my findings.
I used the following mapping as suggested (note: this mapping does not use a search analyzer); for simplicity let's consider only the English analyzer.
{
"settings": {
"analysis": {
"analyzer": {
"english_ngram": {
"type": "custom",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_stemmer",
"ngram_filter"
],
"tokenizer": "standard"
}
},
"filter": {
"english_stop": {
"type": "stop"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 25
}
}
}
}
}
Indexed a document:
PUT http://localhost:9200/movies/movie/1
{"title":"$peci#l movie"}
Tried the following query:
GET http://localhost:9200/movies/movie/_search
{
"query": {
"multi_match": {
"query": "$peci mov",
"fields": ["title"],
"operator": "and"
}
}
}
I got no results for this. Am I doing anything wrong?
I am trying to get results for:
1] Special characters
2] Partial matches
3] Space-separated partial and full words
Thanks again!
You can create a custom analyzer based on the language analyzers. The only difference is that you add your ngram_filter token filter to the end of the chain. In this case you first get language-stemmed tokens (the default chain), which are then converted to edge ngrams at the end (your filter). You can find the implementation of the language analyzers here https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#english-analyzer in order to override them. Here is an example of this change for the English language:
{
"settings": {
"analysis": {
"analyzer": {
"english_ngram": {
"type": "custom",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_stemmer",
"ngram_filter"
],
"tokenizer": "standard"
}
},
"filter": {
"english_stop": {
"type": "stop"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 25
}
}
}
}
}
UPDATE
To support special characters you can try using the whitespace tokenizer instead of standard. In this case these characters will be part of your tokens:
{
"settings": {
"analysis": {
"analyzer": {
"english_ngram": {
"type": "custom",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_stemmer",
"ngram_filter"
],
"tokenizer": "whitespace"
}
},
"filter": {
"english_stop": {
"type": "stop"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 25
}
}
}
}
}
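You can see the difference with the _analyze API (depending on your Elasticsearch version the parameters may need to be passed as query-string parameters instead of a JSON body):

GET movies/_analyze
{
  "analyzer": "english_ngram",
  "text": "$peci#l movie"
}

With the whitespace tokenizer the first token stays $peci#l, so its edge grams include $pe and $peci and a query like $peci can match; the standard tokenizer would have split on the special characters before the ngram filter ever ran.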

Boost if result begins with the word

I use Elasticsearch for search with autocompletion using an ngram filter. I need to boost a result if it starts with the search keyword.
My query is simple:
"query": {
"match": {
"query": "re",
"operator": "and"
}
}
And these are my results:
Restaurants
Couture et retouches
Restauration rapide
But I want them like this:
Restaurants
Restauration rapide
Couture et retouches
How can I boost a result starting with the keyword?
In case it helps, here is my mapping:
{
"settings": {
"analysis": {
"analyzer": {
"partialAnalyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"filter": ["asciifolding", "lowercase"]
},
"searchAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase"]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [ "letter", "digit" ]
}
}
}
},
"mappings": {
"place": {
"properties": {
"name": {
"type": "string",
"index_analyzer": "partialAnalyzer",
"search_analyzer": "searchAnalyzer",
"term_vector": "with_positions_offsets"
}
}
}
}
}
Regards,
How about this idea? I'm not 100% sure of it, as I think it depends on the data:
create a sub-field of your name field that is analyzed with a keyword analyzer (so the value pretty much stays as is)
change the query to be a bool with shoulds
one should is the query you have now
the other should is a match with phrase_prefix on the sub-field.
The mapping:
{
"settings": {
"analysis": {
"analyzer": {
"partialAnalyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"filter": [
"asciifolding",
"lowercase"
]
},
"searchAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase"
]
},
"keyword_lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"asciifolding",
"lowercase"
]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings": {
"place": {
"properties": {
"name": {
"type": "string",
"index_analyzer": "partialAnalyzer",
"search_analyzer": "searchAnalyzer",
"term_vector": "with_positions_offsets",
"fields": {
"as_is": {
"type": "string",
"analyzer": "keyword_lowercase"
}
}
}
}
}
}
}
The query:
{
"query": {
"bool": {
"should": [
{
"match": {
"name": {
"query": "re",
"operator": "and"
}
}
},
{
"match": {
"name.as_is": {
"query": "re",
"type": "phrase_prefix"
}
}
}
]
}
}
}
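If the prefix clause should dominate the ranking more strongly, you can additionally boost it (the value 2 is only illustrative and worth tuning on your own data):

{
  "match": {
    "name.as_is": {
      "query": "re",
      "type": "phrase_prefix",
      "boost": 2
    }
  }
}

Both clauses stay shoulds, so documents that only match the ngram clause are still returned; the boost just pushes true prefix matches further up.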
