Elasticsearch: Root mapping definition has unsupported parameter

I am building a way to search for Thai words using Elasticsearch and Kibana, and I have a problem with the mapping.
PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "trigrams": {
          "tokenizer": "trigram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      },
      "tokenizer": {
        "trigram_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": []
        }
      }
    }
  },
  "mappings": {
    "true_name": {
      "properties": {
        "correct": { "type": "text", "analyzer": "trigrams" }
      }
    }
  }
}
and the error looks like this:
{
  "error" : {
    "root_cause" : [
      {
        "type" : "mapper_parsing_exception",
        "reason" : "Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]"
      }
    ],
    "type" : "mapper_parsing_exception",
    "reason" : "Failed to parse mapping [_doc]: Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]",
    "caused_by" : {
      "type" : "mapper_parsing_exception",
      "reason" : "Root mapping definition has unsupported parameters: [true_name : {properties={correct={analyzer=trigrams, type=text}}}]"
    }
  },
  "status" : 400
}

Mapping types are deprecated; refer to the Elasticsearch documentation on the removal of mapping types:
Indices created in Elasticsearch 6.0.0 or later may only contain a
single mapping type. Indices created in 5.x with multiple mapping
types will continue to function as before in Elasticsearch 6.x. Types
will be deprecated in APIs in Elasticsearch 7.0.0, and completely
removed in 8.0.0.
Since the error shows the mapping being parsed as [_doc], this index is being created on 7.x, so the custom type name true_name must be removed from the root of the mappings:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "trigrams": {
          "tokenizer": "trigram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      },
      "tokenizer": {
        "trigram_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": []
        }
      }
    }
  },
  "mappings": {            // note this: no type name under "mappings"
    "properties": {
      "correct": {
        "type": "text",
        "analyzer": "trigrams"
      }
    }
  }
}
If your JSON document is like this:
{
  "true_name": {
    "correct": "mapping types deprecated"
  }
}
then the index mapping will be:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "trigrams": {
          "tokenizer": "trigram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      },
      "tokenizer": {
        "trigram_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": []
        }
      }
    }
  },
  "mappings": {
    "properties": {          // note this: true_name is now an object field
      "true_name": {
        "properties": {
          "correct": {
            "type": "text",
            "analyzer": "trigrams"
          }
        }
      }
    }
  }
}
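As a quick sanity check, you can run the new analyzer through the _analyze API before indexing any documents; the Thai sample text here is arbitrary:
POST test/_analyze
{
  "analyzer": "trigrams",
  "text": "สวัสดี"
}
The response should contain overlapping three-character tokens, confirming that the trigram tokenizer and the lowercase filter are both applied.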


Preventing tokenisation of certain alpha characters in ElasticSearch

I would like to prevent - and / from being tokenised or stemmed for a particular field.
I thought I had the code to achieve this behaviour:
"char_filters": {
"type": "word_delimiter",
"type_table": [
"- => ALPHA",
"/ => ALPHA"
]
},
However, it errors:
{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Token filter [char_filters] cannot be used to parse synonyms"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "Token filter [char_filters] cannot be used to parse synonyms"
  },
  "status": 400
}
Looking online I've found PatternReplaceFilterFactory and a few other approaches, but these substitute the characters; I want the analyzer to treat the two characters as ordinary letters.
So the string 5/3mm should be tokenised as-is, not split into 5 and 3mm.
Could someone advise the correct way to achieve this? Here's a simplified PUT and some POST /_analyze requests.
// doc 1 contains what I would like to match
POST /products_example/_doc/1
{
  "ProductDescription_stripped": "RipCurl 5/3mm wetsuit omega",
  "ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}
// doc 2 contains only 3mm and should rank below doc 1 for a 5/3mm query
POST /products_example/_doc/2
{
  "ProductDescription_stripped": "RipCurl 3mm wetsuit omega",
  "ProductDescription_da_stripped": "RipCurl 5/3mm wetsuit omega"
}
// here you can see 3mm has been tokenised, whereas 5/3mm should have been preserved
POST /products_example/_analyze
{
  "tokenizer": "standard",
  "filter": [ "lowercase", "asciifolding", "synonym", "stop", "kstem" ],
  "text": "5/3mm ripcurl wetsuit omega"
}
PUT /products/
{
  "settings": {
    "index.mapping.total_fields.limit": 1000000,
    "index.max_ngram_diff": 2,
    "analysis": {
      "filter": {
        "char_filters": {
          "type": "word_delimiter",
          "type_table": [
            "- => ALPHA",
            "/ => ALPHA"
          ]
        },
        "description_stemmer_da": { "type": "stemmer", "name": "danish" },
        "stop_da": { "type": "stop", "stopwords": "_danish_" },
        "synonym": {
          "type": "synonym",
          "synonyms": ["ripcurl, ripccurl => rip curl"]
        }
      },
      "tokenizer": {
        "ngram_tokenizer": {
          "type": "ngram", "min_gram": 3, "max_gram": 5,
          "token_chars": ["letter", "digit"]
        }
      },
      "analyzer": {
        "description": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "char_filters",
            "lowercase",
            "asciifolding",
            "synonym",
            "stop",
            "kstem"
          ]
        },
        "description_da": {
          "type": "custom", "tokenizer": "standard",
          "filter": [
            "char_filters",
            "lowercase",
            "asciifolding",
            "synonym",
            "stop_da",
            "description_stemmer_da"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "ProductDescription_stripped": {
        "type": "text",
        "analyzer": "description"
      },
      "ProductDescription_da_stripped": {
        "type": "text",
        "analyzer": "danish"
      }
    }
  }
}
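The error arises because a synonym filter parses its synonym list by running it through the tokenizer and every filter that precedes it in the chain, and word_delimiter is not allowed in that position. One possible workaround, sketched here rather than taken from an accepted answer, assumes a whitespace tokenizer is acceptable (the standard tokenizer splits 5/3mm into 5 and 3mm before any token filter runs): place the word_delimiter after the synonym filter and disable splitting on letter/digit boundaries. The index and filter names below are illustrative.
PUT /products_wd_sketch
{
  "settings": {
    "analysis": {
      "filter": {
        "preserve_special": {
          "type": "word_delimiter",
          "split_on_numerics": false,     // keep 5/3mm from splitting at digit/letter boundaries
          "type_table": [
            "- => ALPHA",
            "/ => ALPHA"
          ]
        },
        "synonym": {
          "type": "synonym",
          "synonyms": ["ripcurl, ripccurl => rip curl"]
        }
      },
      "analyzer": {
        "description": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding",
            "synonym",           // synonym now only sees whitespace + lowercase + asciifolding
            "preserve_special",  // word_delimiter placed after synonym, so it is not used to parse synonyms
            "kstem"
          ]
        }
      }
    }
  }
}
// with this chain, 5/3mm should survive analysis as a single token:
POST /products_wd_sketch/_analyze
{
  "analyzer": "description",
  "text": "5/3mm ripcurl wetsuit omega"
}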

ElasticSearch "more like this" returning empty result

I made a very simple test to figure out my mistake, but did not find it. I created two indexes and I'm trying to search documents in the ppa index that are similar to a given document in the ods index (like the second example here https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html).
These are my settings, mappings and documents for the ppa index:
PUT /ppa
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "brazilian_stop": {
          "type": "stop",
          "stopwords": "_brazilian_"
        },
        "brazilian_stemmer": {
          "type": "stemmer",
          "language": "brazilian"
        }
      },
      "analyzer": {
        "brazilian": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "brazilian_stop",
            "brazilian_stemmer"
          ]
        }
      }
    }
  }
}
PUT /ppa/_mapping/ppa
{"properties": {"descricao": {"type": "text", "analyzer": "brazilian"}}}
POST /_bulk
{"index":{"_index":"ppa","_type":"ppa"}}
{"descricao": "erradicar a pobreza"}
{"index":{"_index":"ppa","_type":"ppa"}}
{"descricao": "erradicar a pobreza"}
These are my settings, mappings and documents for the ods index:
PUT /ods
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "brazilian_stop": {
          "type": "stop",
          "stopwords": "_brazilian_"
        },
        "brazilian_stemmer": {
          "type": "stemmer",
          "language": "brazilian"
        }
      },
      "analyzer": {
        "brazilian": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "brazilian_stop",
            "brazilian_stemmer"
          ]
        }
      }
    }
  }
}
PUT /ods/_mapping/ods
{"properties": {"metaodsdescricao": {"type": "text", "analyzer": "brazilian"},"metaodsid": {"type": "integer"}}}
POST /_bulk
{"index":{"_index":"ods","_type":"ods", "_id" : "1" }}
{ "metaodsdescricao": "erradicar a pobreza","metaodsid": 1}
{"index":{"_index":"ods","_type":"ods", "_id" : "2" }}
{"metaodsdescricao": "crianças que vivem na pobreza", "metaodsid": 2}
Now, this search doesn't work:
GET /ppa/ppa/_search
{
  "query": {
    "more_like_this": {
      "fields": ["descricao"],
      "like": [
        {
          "_index": "ods",
          "_type": "ods",
          "_id": "1"
        }
      ],
      "min_term_freq": 1,
      "min_doc_freq": 1,
      "max_query_terms": 20
    }
  }
}
But this one does work:
GET /ppa/ppa/_search
{
  "query": {
    "more_like_this": {
      "fields": ["descricao"],
      "like": ["erradicar a pobreza"],
      "min_term_freq": 1,
      "min_doc_freq": 1,
      "max_query_terms": 20
    }
  }
}
What is happening?
Please, help me make this return something other than empty.
The "more like this" query work well when you have indexed a lot of data. The empty result can be symptom of very few documents present in the elastic index.

Get exact match after doing mapping as not_analyzed

I have an Elasticsearch type that I mapped as below:
mappings": {
"jardata": {
"properties": {
"groupID": {
"index": "not_analyzed",
"type": "string"
},
"artifactID": {
"index": "not_analyzed",
"type": "string"
},
"directory": {
"type": "string"
},
"jarFileName": {
"index": "not_analyzed",
"type": "string"
},
"version": {
"index": "not_analyzed",
"type": "string"
}
}
}
}
I left the directory field analyzed because I want to be able to give only the last folder and still get results. But when I search for a specific directory I have to give the whole path, since the same folder name can appear under two different paths; and because the field is analyzed, the search returns everything sharing a path segment instead of the specific directory I want.
In other words, I want the field to act as both analyzed and not_analyzed. Is there a way to do that?
Let's say you have the following document indexed:
{
  "directory": "/home/docs/public"
}
The standard analyzer is not enough in your case, as it creates the following terms while indexing:
[home, docs, public]
Note that the [/home/docs/public] token is missing - characters like "/" act as separators here.
One solution could be to use an ngram tokenizer with the punctuation character class added to the token_chars list. Elasticsearch would then treat "/" as if it were a letter or digit, which allows searching with tokens such as:
[/hom, /home, ..., /home/docs/publi, /home/docs/public, ..., /docs/public, etc...]
Index mapping:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ngram_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 4,
          "max_gram": 18,
          "token_chars": [
            "letter",
            "digit",
            "punctuation"
          ]
        }
      }
    }
  },
  "mappings": {
    "jardata": {
      "properties": {
        "directory": {
          "type": "string",
          "analyzer": "ngram_analyzer"
        }
      }
    }
  }
}
Now both search queries:
{
  "query": {
    "bool": {
      "must": {
        "term": {
          "directory": "/docs/public"
        }
      }
    }
  }
}
and
{
  "query": {
    "bool": {
      "must": {
        "term": {
          "directory": "/home/docs/public"
        }
      }
    }
  }
}
will return the indexed document.
One thing to consider is the maximum token length, specified by the "max_gram" setting. For directory paths it may be necessary to increase it.
An alternative solution is to use the whitespace tokenizer, which breaks the phrase into terms only on whitespace, together with an ngram filter, using the following mapping:
{
  "settings": {
    "analysis": {
      "filter": {
        "ngram_filter": {
          "type": "ngram",
          "min_gram": 4,
          "max_gram": 20
        }
      },
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "ngram_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "jardata": {
      "properties": {
        "directory": {
          "type": "string",
          "analyzer": "my_analyzer"
        }
      }
    }
  }
}
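To preview what this whitespace-plus-ngram chain emits, on Elasticsearch 5.x or later the analysis can be exercised inline through the _analyze API without creating an index; a quick sketch reusing the settings above:
POST _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "lowercase",
    { "type": "ngram", "min_gram": 4, "max_gram": 20 }
  ],
  "text": "/home/docs/public"
}
The output includes grams such as /hom and /home, up to the full /home/docs/public, which is what allows the exact term searches above to match.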
Alternatively, update the mapping of the directory field to contain a raw sub-field, like this:
"directory": {
"type": "string",
"fields": {
"raw": {
"index": "not_analyzed",
"type": "string"
}
}
}
Then modify your query to use directory.raw, which is treated as not_analyzed.
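For example, an exact-match query against the raw sub-field could look like this (a sketch using the sample document from above):
{
  "query": {
    "term": {
      "directory.raw": "/home/docs/public"
    }
  }
}
Because directory.raw is not analyzed, the term query only matches documents whose directory value is exactly the full path.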

Elasticsearch: index first char of string

I'm using version 5.3.
I have a text field a. I'd like to aggregate on the first char of a. I also need the entire original value.
I'm assuming the most efficient way is to have a keyword field a.firstLetter with a custom normalizer. I've tried to achieve this with a pattern replace char filter but am struggling with the regexp.
Am I going at this entirely wrong? Can you help me?
EDIT
This is what I've tried.
settings.json
{
  "settings": {
    "index": {
      "analysis": {
        "char_filter": {
          "first_char": {
            "type": "pattern_replace",
            "pattern": "(?<=^.)(.*)",
            "replacement": ""
          }
        },
        "normalizer": {
          "first_letter": {
            "type": "custom",
            "char_filter": ["first_char"],
            "filter": ["lowercase"]
          }
        }
      }
    }
  }
}
mappings.json
{
  "properties": {
    "a": {
      "type": "text",
      "index_options": "positions",
      "fields": {
        "firstLetter": {
          "type": "keyword",
          "normalizer": "first_letter"
        }
      }
    }
  }
}
I get no buckets when I try to aggregate like so:
"aggregations": {
"grouping": {
"terms": {
"field": "a.firstLetter"
}
}
}
So basically my approach was to replace everything but the first char with an empty string. The regexp is something I was able to gather by googling.
EDIT 2
I had misconfigured the normalizer (I've fixed the examples). The correct configuration reveals that normalizers do not support pattern replace char filters due to issue 23142. Apparently support for it will be implemented earliest in version 5.4.
So are there any other options? I'd hate to do this in code, by adding a field in the doc for the first letter, since I'm using Elasticsearch features for every other aggregation.
You can use the truncate filter with a length of one:
PUT foo
{
  "mappings": {
    "bar": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "my_analyzer"
        }
      }
    }
  },
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "my_analyzer": {
            "type": "custom",
            "tokenizer": "keyword",
            "filter": [ "my_filter", "lowercase" ]
          }
        },
        "filter": {
          "my_filter": {
            "type": "truncate",
            "length": 1
          }
        }
      }
    }
  }
}
GET foo/_analyze
{
  "field": "name",
  "text": "New York"
}
# response
{
  "tokens": [
    {
      "token": "n",
      "start_offset": 0,
      "end_offset": 8,
      "type": "word",
      "position": 0
    }
  ]
}
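Note that in 5.x a terms aggregation on a text field additionally requires fielddata to be enabled. Applied to the original question's field layout, the mapping could look like this (a sketch combining the answer's analyzer with the question's firstLetter sub-field name):
{
  "properties": {
    "a": {
      "type": "text",
      "index_options": "positions",
      "fields": {
        "firstLetter": {
          "type": "text",
          "analyzer": "my_analyzer",
          "fielddata": true
        }
      }
    }
  }
}
The terms aggregation on a.firstLetter from the question should then return one bucket per first letter.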

Elasticsearch phrase suggester prefix phonetic differences

I was wondering if there is any way for the phrase suggester to correct spelling mistakes in the prefix that stem from phonetic differences.
Elasticsearch 5.1.2
Testing in Kibana 5.1.2
For example:
Instead of "circus" someone wrote "sircus", or instead of "coding" someone wrote "koding".
The funny thing is that instead of "phrase" you can write "frase" and you do get a suggestion.
Here is my setup.
Settings:
PUT test_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "suggests_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "shingle_filter"
          ],
          "type": "custom"
        },
        "reverse": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["standard", "reverse"]
        }
      },
      "filter": {
        "shingle_filter": {
          "min_shingle_size": 2,
          "max_shingle_size": 5,
          "type": "shingle"
        }
      }
    }
  },
  "mappings": {
    "test_type": {
      "properties": {
        "suggest_field": {
          "type": "text",
          "analyzer": "suggests_analyzer",
          "fields": {
            "reverse": {
              "type": "text",
              "analyzer": "reverse"
            }
          }
        }
      }
    }
  }
}
Some documents:
POST test_index/test_type/_bulk
{"index":{}}
{ "suggest_field": "phrase"}
{"index":{}}
{ "suggest_field": "Circus"}
{"index":{}}
{ "suggest_field": "Coding"}
Querying:
POST /test_index/_search
{
  "suggest": {
    "text": "sircus",
    "simple_phrase": {
      "phrase": {
        "field": "suggest_field",
        "max_errors": 0.9,
        "highlight": {
          "pre_tag": "<em>",
          "post_tag": "</em>"
        },
        "direct_generator": [ {
          "field": "suggest_field",
          "suggest_mode": "always"
        }, {
          "field": "suggest_field.reverse",
          "suggest_mode": "always",
          "pre_filter": "reverse",
          "post_filter": "reverse"
        } ]
      }
    }
  }
}
Also, I repeated the following steps a few times (between 5 and 10) without changing anything:
delete the index
put the index, settings & mappings
add the documents
query (codign)
Sometimes I get suggestions and sometimes I don't. Is there any explanation for this?
Try setting "prefix_length": 0 in the direct_generator.
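For reference, prefix_length defaults to 1 in a direct generator, so candidate corrections must share their first character with the input; sircus can therefore never produce circus from the forward field. Setting it to 0 lifts that restriction, at some performance cost, since more candidates are examined. The generator section would then look like this, with only prefix_length added and everything else as in the query above:
"direct_generator": [ {
  "field": "suggest_field",
  "suggest_mode": "always",
  "prefix_length": 0
}, {
  "field": "suggest_field.reverse",
  "suggest_mode": "always",
  "pre_filter": "reverse",
  "post_filter": "reverse"
} ]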
