Preventing tokenisation of certain alpha characters in Elasticsearch

I would like to prevent - and / from being tokenised or stemmed for a particular field.
I thought I had the code to achieve this behaviour:
"char_filters": {
"type": "word_delimiter",
"type_table": [
"- => ALPHA",
"/ => ALPHA"
]
},
However, it errors:
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Token filter [char_filters] cannot be used to parse synonyms"
}
],
"type": "illegal_argument_exception",
"reason": "Token filter [char_filters] cannot be used to parse synonyms"
},
"status": 400
}
Looking online I've found PatternReplaceFilterFactory and a few other approaches; however, these substitute the characters, whereas I want the analyzer to treat the two characters as part of the token.
So I would like the string 5/3mm to be tokenised as is, not split into 5 and 3mm.
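For reference, the type_table idea can be tried in isolation with a transient analyzer via _analyze, without creating an index. This is only a rough sketch: it uses the whitespace tokenizer (the standard tokenizer may itself split on /), and it adds "split_on_numerics": false, since word_delimiter also splits at letter/digit boundaries by default; the goal is for 5/3mm to come out as a single token.
POST /_analyze
{
"tokenizer": "whitespace",
"filter": [
{
"type": "word_delimiter",
"split_on_numerics": false,
"type_table": [
"- => ALPHA",
"/ => ALPHA"
]
},
"lowercase"
],
"text": "5/3mm ripcurl wetsuit omega"
}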
Could someone please advise the correct way to achieve this? Here's a simplified PUT and some POST /_analyze requests.
// doc 1 contains what I would like to match
POST /products_example/_doc/1
{
"ProductDescription_stripped":"RipCurl 5/3mm wetsuit omega",
"ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}
// doc 2 contains only 3mm; it should rank below doc 1, which matches 5/3mm
POST /products_example/_doc/2
{
"ProductDescription_stripped":"RipCurl 3mm wetsuit omega",
"ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}
// here you can see that 5 and 3mm come out as separate tokens, whereas 5/3mm should have been preserved
POST /products_example/_analyze
{
"tokenizer": "standard",
"filter": [ "lowercase","asciifolding","synonym","stop","kstem"],
"text": "5/3mm ripcurl wetsuit omega"
}
PUT /products_example
{
"settings": {
"index.mapping.total_fields.limit": 1000000,
"index.max_ngram_diff" : 2,
"analysis": {
"filter": {
"char_filters": {
"type": "word_delimiter",
"type_table": [
"- => ALPHA",
"/ => ALPHA"
]
},
"description_stemmer_da" : {"type" : "stemmer","name" : "danish"},
"stop_da" : {"type" : "stop","stopwords": "_danish_"},
"synonym" : {
"type" : "synonym",
"synonyms" : ["ripcurl, ripccurl => rip curl"]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram", "min_gram": 3, "max_gram": 5,
"token_chars": ["letter","digit"]
}
},
"analyzer": {
"description" : {
"type": "custom",
"tokenizer": "standard",
"filter": [
"char_filters",
"lowercase",
"asciifolding",
"synonym",
"stop",
"kstem"
]
},
"description_da": {
"type":"custom", "tokenizer":"standard",
"filter": [
"char_filters",
"lowercase",
"asciifolding",
"synonym",
"stop_da",
"description_stemmer_da"
]
}
}
}
},
"mappings": {
"properties": {
"ProductDescription_stripped": {
"type": "text",
"analyzer" : "description"
},
"ProductDescription_da_stripped": {
"type": "text",
"analyzer": "danish"
}
}
}
}

Related

Elasticsearch : Root mapping definition has unsupported parameter

I am building a way to search for Thai words with Elasticsearch and Kibana, and I have a problem with the mapping.
PUT test
{
"settings": {
"analysis": {
"analyzer": {
"trigrams": {
"tokenizer": "trigram_tokenizer",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"trigram_tokenizer": {
"type": "ngram",
"min_ngram": 3,
"max_ngram": 3,
"token_chars": []
}
}
}
},
"mappings": {
"true_name": {
"properties": {
"correct": { "type": "text", "analyzer": "trigrams" }
}
}
}
}
and I get an error like this:
{
"error" : {
"root_cause" : [
{
"type" : "mapper_parsing_exception",
"reason" : "Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]"
}
],
"type" : "mapper_parsing_exception",
"reason" : "Failed to parse mapping [_doc]: Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]",
"caused_by" : {
"type" : "mapper_parsing_exception",
"reason" : "Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]"
}
},
"status" : 400
}
Mapping types are deprecated. Refer to the documentation on the removal of mapping types for more details:
Indices created in Elasticsearch 6.0.0 or later may only contain a
single mapping type. Indices created in 5.x with multiple mapping
types will continue to function as before in Elasticsearch 6.x. Types
will be deprecated in APIs in Elasticsearch 7.0.0, and completely
removed in 8.0.0.
So drop the custom type name (true_name) and put properties directly under mappings:
{
"settings": {
"analysis": {
"analyzer": {
"trigrams": {
"tokenizer": "trigram_tokenizer",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"trigram_tokenizer": {
"type": "ngram",
"min_ngram": 3,
"max_ngram": 3,
"token_chars": []
}
}
}
},
"mappings": { // note this
"properties": {
"correct": {
"type": "text",
"analyzer": "trigrams"
}
}
}
}
If your JSON document is like this:
{
"true_name": {
"correct": "mapping types deprecated"
}
}
Then the index mapping will be:
{
"settings": {
"analysis": {
"analyzer": {
"trigrams": {
"tokenizer": "trigram_tokenizer",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"trigram_tokenizer": {
"type": "ngram",
"min_ngram": 3,
"max_ngram": 3,
"token_chars": []
}
}
}
},
"mappings": {
"properties": { // note this
"true_name": {
"properties": {
"correct": {
"type": "text",
"analyzer": "trigrams"
}
}
}
}
}
}
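Once the index has been created with the corrected mapping, the analyzer wiring can be sanity-checked with _analyze. A quick sketch, using the index name test from the PUT above:
POST /test/_analyze
{
"analyzer": "trigrams",
"text": "mapping"
}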

elasticsearch Updating Index settings analyzer

I have a Books index which contains multiple subjects:
chemistry
biology
etc.
Each subject has its own set of synonyms, plus global synonyms.
PUT /books/_settings
{
"analysis": {
"filter": {
"biology_synonyms": {
"type": "synonym",
"synonyms": [
"a, aa, aaa"
]
},
"chemistry_synonyms": {
"type": "synonym",
"synonyms": [
"c, cc, ccc"
]
},
"global_synonyms": {
"type": "synonym",
"synonym": [
"x, xx, xxx"
]
}
},
"analyzer": {
"chemistry_analyzer": {
"filter": [
"global_synonyms", "chemistry_synonyms"
]
},
"biology_analyzer": {
"filter": [
"global_synonyms", "biology_synonyms"
]
}
}
}
}
Let's say that at some point I want to add a new subject named "Astronomy".
Now the problem is: how do I update the index settings to add a new "Astronomy_synonyms" filter and an "Astronomy_analyzer"?
My application requires me to append to the existing filters and analyzers; I don't want to overwrite (replace) the settings.
You can definitely append new token filters and analyzers; however, you need to close your index before updating the settings and reopen it when done. In what follows, I assume the index already exists.
Let's say you create your index with the following initial settings:
PUT /books
{
"settings": {
"analysis": {
"filter": {
"biology_synonyms": {
"type": "synonym",
"synonyms": [
"a, aa, aaa"
]
},
"chemistry_synonyms": {
"type": "synonym",
"synonyms": [
"c, cc, ccc"
]
},
"global_synonyms": {
"type": "synonym",
"synonyms": [
"x, xx, xxx"
]
}
},
"analyzer": {
"chemistry_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"global_synonyms",
"chemistry_synonyms"
]
},
"biology_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"global_synonyms",
"biology_synonyms"
]
}
}
}
}
}
Then you need to close your index:
POST books/_close
Then you can append new analyzers and token filters:
PUT /books/_settings
{
"analysis": {
"filter": {
"astronomy_synonyms": {
"type": "synonym",
"synonyms": [
"x, xx, xxx"
]
}
},
"analyzer": {
"astronomy_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"global_synonyms",
"astronomy_synonyms"
]
}
}
}
}
And finally reopen your index
POST books/_open
If you then check your index settings, you'll see that everything has been properly merged.
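For example, a quick way to check the merged settings (sketch):
GET /books/_settings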
You can only define new analyzers on closed indices.
To add an analyzer, you must close the index, define the analyzer, and reopen the index.
POST /books/_close
PUT /books/_settings
{
"analysis": {
"filter": {
"astronomy_synonyms": {
"type": "synonym",
"synonyms": [
"a, aa, aaa=>a"
]
}
},
"analyzer": {
"astronomy_analyzer": {
"tokenizer" : "whitespace",
"filter": [
"global_synonyms", "astronomy_synonyms"
]
}
}
}
}
POST /books/_open
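After reopening, the new analyzer should resolve on the index. A quick sketch to confirm with _analyze:
POST /books/_analyze
{
"analyzer": "astronomy_analyzer",
"text": "aaa"
}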

Elasticsearch - Special Characters in Query String

I'm having trouble searching for special characters using query_string. I need to search for an email address in the format "xxx#xxx.xxx". At index time I use a custom normalizer which provides lowercasing and ASCII folding. At search time I use a custom analyzer with a whitespace tokenizer and filters that apply lowercasing and ASCII folding. Still, I am not able to search for a simple email address.
This is my mapping
{
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1,
"analysis": {
"analyzer": {
"folding": {
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"normalizer": {
"lowerasciinormalizer": {
"type": "custom",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "integer"
},
"email": {
"type": "keyword",
"normalizer": "lowerasciinormalizer"
}
}
}
}
And this is my search query
{
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "pippo#pluto.it",
"fields": [
"email"
],
"analyzer": "folding"
}
}
]
}
}
}
Searching without special characters works fine. In fact, if I do "query": "pippo*" I get the correct results.
I also tested the tokenizer with:
GET /_analyze
{
"analyzer": "whitespace",
"text": "pippo#pluto.com"
}
and I get what I expect:
{
"tokens" : [
{
"token" : "pippo#pluto.com",
"start_offset" : 0,
"end_offset" : 15,
"type" : "word",
"position" : 0
}
]
}
Any suggestions?
Thanks.
Edit:
I'm using Elasticsearch 7.5.1.
This setup actually works correctly; my problem was somewhere else.
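In case it helps anyone debugging a similar setup: the normalizer itself can also be checked directly with _analyze. A sketch, where my_index stands in for the actual index name:
GET /my_index/_analyze
{
"normalizer": "lowerasciinormalizer",
"text": "PIPPO#Pluto.IT"
}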

Elasticsearch phrase suggester prefix phonetic differences

I was wondering if there is any way for the phrase suggester to correct prefix spelling mistakes based on phonetic differences.
Elasticsearch 5.1.2
Testing in Kibana 5.1.2
For Example:
Instead of "circus" someone wrote "sircus", or instead of "coding" someone wrote "koding".
The funny thing is that instead of "phrase" you can write "frase" and get a suggestion.
Here is my setup.
Settings:
PUT text_index
{
"settings": {
"analysis": {
"analyzer": {
"suggests_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"shingle_filter"
],
"type": "custom"
},
"reverse": {
"type": "custom",
"tokenizer": "standard",
"filter": ["standard", "reverse"]
}
},
"filter": {
"shingle_filter": {
"min_shingle_size": 2,
"max_shingle_size": 5,
"type": "shingle"
}
}
}
},
"mappings": {
"testtype": {
"properties": {
"suggest_field": {
"type": "text",
"analyzer": "suggests_analyzer",
"fields": {
"reverse": {
"type": "text",
"analyzer": "reverse"
}
}
}
}
}
}
}
Some documents:
POST text_index/testtype/_bulk
{"index":{}}
{ "suggest_field": "phrase"}
{"index":{}}
{ "suggest_field": "Circus"}
{"index":{}}
{ "suggest_field": "Coding"}
Querying:
POST /text_index/_search
{
"suggest" : {
"text" : "sircus",
"simple_phrase" : {
"phrase" : {
"field" : "suggest_field",
"max_errors": 0.9,
"highlight": {
"pre_tag": "<em>",
"post_tag": "</em>"
},
"direct_generator" : [ {
"field" : "suggest_field",
"suggest_mode" : "always"
}, {
"field" : "suggest_field.reverse",
"suggest_mode" : "always",
"pre_filter" : "reverse",
"post_filter" : "reverse"
}]
}
}
}
}
Also, I repeated the following steps a few times (between 5 and 10) without changing anything:
delete index
put index, settings & mappings
add documents
query (codign)
Sometimes I get suggestions and sometimes I don't. Is there any explanation for it?
Try setting "prefix_length": 0 in the direct_generator.

Elasticsearch shingle token filter not working

I'm trying this on a local Elasticsearch 1.7.5 installation:
http://localhost:9200/_analyze?filter=shingle&tokenizer=keyword&text=alkis stack
I see this
{
"tokens":[
{
"token":"alkis stack",
"start_offset":0,
"end_offset":11,
"type":"word",
"position":1
}
]
}
And I expected to see something like this
{
"tokens":[
{
"token":"alkis stack",
"start_offset":0,
"end_offset":11,
"type":"word",
"position":1
},
{
"token":"stack alkis",
"start_offset":0,
"end_offset":11,
"type":"word",
"position":1
}
]
}
Am I missing something?
Update
{
"number_of_shards": 2,
"number_of_replicas": 0,
"analysis": {
"char_filter": {
"map_special_chars": {
"type": "mapping",
"mappings": [
"- => \\u0020",
". => \\u0020",
"? => \\u0020",
", => \\u0020",
"` => \\u0020",
"' => \\u0020",
"\" => \\u0020"
]
}
},
"filter": {
"permutate_fullname": {
"type": "shingle",
"max_shingle_size": 4,
"min_shingle_size": 2,
"output_unigrams": true,
"token_separator": " ",
"filler_token": "_"
}
},
"analyzer": {
"fullname_analyzer_search": {
"char_filter": [
"map_special_chars"
],
"filter": [
"asciifolding",
"lowercase",
"trim"
],
"type": "custom",
"tokenizer": "keyword"
},
"fullname_analyzer_index": {
"char_filter": [
"map_special_chars"
],
"filter": [
"asciifolding",
"lowercase",
"trim",
"permutate_fullname"
],
"type": "custom",
"tokenizer": "keyword"
}
}
}
}
And I'm testing it like this:
http://localhost:9200/INDEX_NAME/_analyze?analyzer=fullname_analyzer_index&text=alkis stack
Index the first name and the last name in two separate fields in ES, just as you have them in the DB. The text received as a query can be analyzed (match does it, for example, and so does query_string), and there are ways to search both fields at the same time with all the terms in the search string, as sketched below. I think you are over-complicating the use case by squeezing the full name into a single field and creating name permutations at indexing time.
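For example, something along these lines (a sketch; first_name and last_name are hypothetical field names, and INDEX_NAME is the placeholder from the question):
POST /INDEX_NAME/_search
{
"query": {
"multi_match": {
"query": "alkis stack",
"fields": ["first_name", "last_name"],
"type": "cross_fields",
"operator": "and"
}
}
}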
