Testing an Elasticsearch custom analyzer - pipe-delimited keywords

I have this index with a custom analyzer named pipe. When I try to test it, it returns every single character instead of pipe-delimited words.
My use case is input lines of keywords that look like: crockpot refried beans|corningware replacement|crockpot lids|recipe refried beans, and Elasticsearch should return matches after the line has been split on the pipes.
{
  "keywords": {
    "aliases": {},
    "mappings": {
      "cloud": {
        "properties": {
          "keywords": {
            "type": "text",
            "analyzer": "pipe"
          }
        }
      }
    },
    "settings": {
      "index": {
        "number_of_shards": "5",
        "provided_name": "keywords",
        "creation_date": "1513890909384",
        "analysis": {
          "analyzer": {
            "pipe": {
              "type": "custom",
              "tokenizer": "pipe"
            }
          },
          "tokenizer": {
            "pipe": {
              "pattern": "|",
              "type": "pattern"
            }
          }
        },
        "number_of_replicas": "1",
        "uuid": "DOLV_FBbSC2CBU4p7oT3yw",
        "version": {
          "created": "6000099"
        }
      }
    }
  }
}
When I try to test it, following the _analyze guide:
curl -XPOST 'http://localhost:9200/keywords/_analyze' -d '{
  "analyzer": "pipe",
  "text": "pipe|pipe2"
}'
I get back char-by-char results.
{
  "tokens": [
    {
      "token": "p",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "i",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "p",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "e",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 3
    },
    ...

Good work, you're almost there. Since the pipe | character is a reserved character in regular expressions, you need to escape it like this:
"tokenizer": {
"pipe": {
"pattern": "\\|", <--- change this
"type": "pattern"
}
}
And then your analyzer will work and produce this:
{
  "tokens": [
    {
      "token": "pipe",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "pipe2",
      "start_offset": 5,
      "end_offset": 10,
      "type": "word",
      "position": 1
    }
  ]
}
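To apply the fix, the index has to be recreated (or closed and its settings updated), since analysis settings cannot be changed on an open index. A minimal sketch, assuming it is acceptable to drop and rebuild the keywords index; note that Elasticsearch 6.x also requires an explicit Content-Type header on these requests:
curl -XDELETE 'http://localhost:9200/keywords'
curl -XPUT 'http://localhost:9200/keywords' -H 'Content-Type: application/json' -d '{
  "settings": {
    "analysis": {
      "analyzer": {
        "pipe": { "type": "custom", "tokenizer": "pipe" }
      },
      "tokenizer": {
        "pipe": { "type": "pattern", "pattern": "\\|" }
      }
    }
  },
  "mappings": {
    "cloud": {
      "properties": {
        "keywords": { "type": "text", "analyzer": "pipe" }
      }
    }
  }
}'
curl -XPOST 'http://localhost:9200/keywords/_analyze' -H 'Content-Type: application/json' -d '{
  "analyzer": "pipe",
  "text": "crockpot refried beans|corningware replacement|crockpot lids"
}'
The last call should now return the three keyword phrases as whole tokens.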

Related

Which analyzer can meet my need in Elasticsearch?

In my situation, my field contains values like "abc,123", and I want it to be searchable by either "abc" or "123".
My index mapping looks like the code below:
{
  "myfield": {
    "type": "text",
    "analyzer": "stop",
    "search_analyzer": "stop"
  }
}
But when I use the ES _analyze API to test it, I get this result:
{
  "tokens": [
    {
      "token": "abc",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    }
  ]
}
"123" was lost.
If I want to meet my situation, do I need to choose some other analyzer or just to add some special configs?
You need to choose standard analyzer instead as stop analyzer breaks text into terms whenever it encounters a character which is not a letter and removes stop words like 'the'. In your case "abc,123" results in token abc when using stop analyzer. Using standard analyzer it returns abc and 123 as shown below
POST _analyze
{
  "analyzer": "standard",
  "text": "abc, 123"
}
Output:
{
  "tokens": [
    {
      "token": "abc",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "123",
      "start_offset": 5,
      "end_offset": 8,
      "type": "<NUM>",
      "position": 1
    }
  ]
}
EDIT 1: Using the Simple Pattern Split Tokenizer
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": ","
        }
      }
    }
  }
}
POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "abc,123"
}
Output:
{
  "tokens": [
    {
      "token": "abc",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "123",
      "start_offset": 4,
      "end_offset": 7,
      "type": "word",
      "position": 1
    }
  ]
}
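To get the same behavior at search time, the custom analyzer also needs to be attached to the field mapping. A minimal sketch, assuming a single mapping type (the _doc type name is just a placeholder; adjust it to your Elasticsearch version):
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": ","
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "myfield": {
          "type": "text",
          "analyzer": "my_analyzer"
        }
      }
    }
  }
}
With this mapping, a match query for either abc or 123 on myfield will find a document whose field value is "abc,123".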

Elasticsearch: query not searching numeric value

I have one field in which I am storing values like O5467508 (starting with the letter "O").
Below is my query:
{
  "from": 0,
  "size": 10,
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "field_LIST_105": {
              "query": "o5467508",
              "type": "phrase_prefix"
            }
          }
        },
        {
          "bool": {
            "must": [
              {
                "match": {
                  "RegionName": "Virginia"
                }
              }
            ]
          }
        }
      ]
    }
  }
}
It gives me the correct result, but when I search for only the numeric value "5467508", the result is empty.
Thanks in advance.
One possible solution that could help: use the word_delimiter filter with the preserve_original option, which keeps the original token as well.
Something like this:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "so_analyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "my_word_delimiter"
          ]
        }
      },
      "filter": {
        "my_word_delimiter": {
          "type": "word_delimiter",
          "preserve_original": true
        }
      }
    }
  },
  "mappings": {
    "my_type": {
      "properties": {
        "field_LIST_105": {
          "type": "text",
          "analyzer": "so_analyzer"
        }
      }
    }
  }
}
I did a quick analysis test, and these are the tokens it gives me:
{
  "tokens": [
    {
      "token": "o5467508",
      "start_offset": 0,
      "end_offset": 8,
      "type": "word",
      "position": 0
    },
    {
      "token": "o",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "5467508",
      "start_offset": 1,
      "end_offset": 8,
      "type": "word",
      "position": 1
    }
  ]
}
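For reference, that analysis test can be reproduced with a request of this shape (a sketch: my_index is just a placeholder, the settings above are assumed to be applied to it, and the JSON body form of _analyze assumes ES 5.x or later):
POST my_index/_analyze
{
  "analyzer": "so_analyzer",
  "text": "O5467508"
}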
For more information - https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html

Using autocomplete with email in Elasticsearch doesn't work

I have a field with the following mapping defined:
"my_field": {
  "properties": {
    "address": {
      "type": "string",
      "analyzer": "email",
      "search_analyzer": "whitespace"
    }
  }
}
My email analyser looks like this:
{
  "analysis": {
    "filter": {
      "email_filter": {
        "type": "edge_ngram",
        "min_gram": "3",
        "max_gram": "255"
      }
    },
    "analyzer": {
      "email": {
        "type": "custom",
        "filter": [
          "lowercase",
          "email_filter",
          "unique"
        ],
        "tokenizer": "uax_url_email"
      }
    }
  }
}
When I try to search for an email id like test.xyz#example.com, searching for partial terms like tes, test.xy, etc. doesn't work. But if I search for test.xyz or the full test.xyz#example.com, it works fine. I tried analyzing the tokens using my email analyzer and it works as expected.
For example, hitting http://localhost:9200/my_index/_analyze?analyzer=email&text=test.xyz#example.com
I get:
{
"tokens": [{
"token": "tes",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.x",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xy",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#e",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#ex",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exa",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exam",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#examp",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exampl",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.c",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.co",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.com",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}]
}
So I know that the tokenization works. But while searching, it fails to match partial strings.
For example, a lookup against http://localhost:9200/my_index/my_field/_search?q=test shows no hits.
Details of my index:
{
"my_index": {
"aliases": {
"alias_default": {}
},
"mappings": {
"my_field": {
"properties": {
"address": {
"type": "string",
"analyzer": "email",
"search_analyzer": "whitespace"
},
"boost": {
"type": "long"
},
"createdat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"instanceid": {
"type": "long"
},
"isdeleted": {
"type": "integer"
},
"object": {
"type": "string"
},
"objecthash": {
"type": "string"
},
"objectid": {
"type": "string"
},
"parent": {
"type": "short"
},
"parentid": {
"type": "integer"
},
"updatedat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
},
"settings": {
"index": {
"creation_date": "1480342980403",
"number_of_replicas": "1",
"max_result_window": "100000",
"uuid": "OUuiTma8CA2VNtw9Og",
"analysis": {
"filter": {
"email_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "255"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"filter": [
"lowercase",
"autocomplete_filter"
],
"tokenizer": "standard"
},
"email": {
"type": "custom",
"filter": [
"lowercase",
"email_filter",
"unique"
],
"tokenizer": "uax_url_email"
}
}
},
"number_of_shards": "5",
"version": {
"created": "2010099"
}
}
},
"warmers": {}
}
}
OK, everything looks correct except your query.
You simply need to specify the address field in your query, like this, and it will work:
http://localhost:9200/my_index/my_field/_search?q=address:test
If you don't specify the address field, the query runs against the _all field, whose search analyzer is the standard one by default, which is why you're not finding anything.
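The same thing with the query DSL, as a minimal sketch (only the address field is queried, so the whitespace search_analyzer is applied to the search terms):
POST my_index/my_field/_search
{
  "query": {
    "match": {
      "address": "test"
    }
  }
}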

Elasticsearch search not working for special character '^' (caret symbol)

The problem is that any character sequence containing the boost operator "^" (caret symbol) does not return any search results.
But as per the Elasticsearch documentation below,
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_reserved_characters
the characters && || ! ( ) { } [ ] ^ " ~ * ? : \ can be escaped with the \ symbol.
I have a requirement to do a contains search using an n-gram analyzer in Elasticsearch.
Below is the mapping structure of the sample use case:
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "nGram_analyzer": {
            "filter": [
              "lowercase",
              "asciifolding"
            ],
            "type": "custom",
            "tokenizer": "ngram_tokenizer"
          },
          "whitespace_analyzer": {
            "filter": [
              "lowercase",
              "asciifolding"
            ],
            "type": "custom",
            "tokenizer": "whitespace"
          }
        },
        "tokenizer": {
          "ngram_tokenizer": {
            "token_chars": [
              "letter",
              "digit",
              "punctuation",
              "symbol"
            ],
            "min_gram": "2",
            "type": "nGram",
            "max_gram": "20"
          }
        }
      }
    }
  },
  "mappings": {
    "employee": {
      "properties": {
        "employeeName": {
          "type": "string",
          "analyzer": "nGram_analyzer",
          "search_analyzer": "whitespace_analyzer"
        }
      }
    }
  }
}
I have an employee name like the one below, with special characters included:
xyz%^&*
And the sample query used for the contains search is below:
GET
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "employeeName": {
              "query": "xyz%^",
              "type": "boolean",
              "operator": "or"
            }
          }
        }
      ]
    }
  }
}
Even if we try to escape it as "query": "xyz%\^", it errors out. So we are not able to do a contains search on any string that includes the "^" (caret symbol).
Any help is greatly appreciated.
There is a bug in the ngram tokenizer, reported as an issue against Elasticsearch.
Essentially, ^ is not classified as symbol, letter, or punctuation by the ngram tokenizer.
As a result, it splits the input on ^.
Example (URL-encoded xyz%^):
GET <index_name>/_analyze?tokenizer=ngram_tokenizer&text=xyz%25%5E
The result of the analyze API shows that no token contains ^, as seen in the response below:
{
  "tokens": [
    {
      "token": "xy",
      "start_offset": 0,
      "end_offset": 2,
      "type": "word",
      "position": 0
    },
    {
      "token": "xyz",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 1
    },
    {
      "token": "xyz%",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 2
    },
    {
      "token": "yz",
      "start_offset": 1,
      "end_offset": 3,
      "type": "word",
      "position": 3
    },
    {
      "token": "yz%",
      "start_offset": 1,
      "end_offset": 4,
      "type": "word",
      "position": 4
    },
    {
      "token": "z%",
      "start_offset": 2,
      "end_offset": 4,
      "type": "word",
      "position": 5
    }
  ]
}
Since '^' is never indexed, there are no matches.
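If upgrading is an option, newer Elasticsearch releases (7.6+) let you work around this with the ngram tokenizer's custom_token_chars option, which whitelists extra characters to keep inside the grams. A sketch under that assumption, not tested against your data (index.max_ngram_diff is also set because min_gram and max_gram are far apart, which 7.x rejects by default):
PUT my_index
{
  "settings": {
    "index": {
      "max_ngram_diff": 18,
      "analysis": {
        "analyzer": {
          "nGram_analyzer": {
            "type": "custom",
            "tokenizer": "ngram_tokenizer",
            "filter": [
              "lowercase",
              "asciifolding"
            ]
          }
        },
        "tokenizer": {
          "ngram_tokenizer": {
            "type": "ngram",
            "min_gram": 2,
            "max_gram": 20,
            "token_chars": [
              "letter",
              "digit",
              "punctuation",
              "symbol",
              "custom"
            ],
            "custom_token_chars": "^"
          }
        }
      }
    }
  }
}
With this, analyzing xyz%^ produces grams such as z%^ and %^, so a contains search for "xyz%^" can match.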

Elasticsearch custom analyzer being ignored

I'm using Elasticsearch 2.2.0 and I'm trying to use the lowercase + asciifolding filters on a field.
This is the output of http://localhost:9200/myindex/
{
  "myindex": {
    "aliases": {},
    "mappings": {
      "products": {
        "properties": {
          "fold": {
            "analyzer": "folding",
            "type": "string"
          }
        }
      }
    },
    "settings": {
      "index": {
        "analysis": {
          "analyzer": {
            "folding": {
              "token_filters": [
                "lowercase",
                "asciifolding"
              ],
              "tokenizer": "standard",
              "type": "custom"
            }
          }
        },
        "creation_date": "1456180612715",
        "number_of_replicas": "1",
        "number_of_shards": "5",
        "uuid": "vBMZEasPSAyucXICur3GVA",
        "version": {
          "created": "2020099"
        }
      }
    },
    "warmers": {}
  }
}
And when I try to test the folding custom analyzer using the _analyze API, this is what I get as the output of http://localhost:9200/myindex/_analyze?analyzer=folding&text=%C3%89sta%20est%C3%A1%20loca
{
  "tokens": [
    {
      "end_offset": 4,
      "position": 0,
      "start_offset": 0,
      "token": "Ésta",
      "type": "<ALPHANUM>"
    },
    {
      "end_offset": 9,
      "position": 1,
      "start_offset": 5,
      "token": "está",
      "type": "<ALPHANUM>"
    },
    {
      "end_offset": 14,
      "position": 2,
      "start_offset": 10,
      "token": "loca",
      "type": "<ALPHANUM>"
    }
  ]
}
As you can see, the returned tokens are Ésta, está, loca instead of esta, esta, loca. What's going on? It seems this folding analyzer is being ignored.
Looks like a simple typo when you are creating your index.
In your "analysis":{"analyzer":{...}} block, this:
"token_filters": [...]
Should be
"filter": [...]
Check the documentation for confirmation of this. Because your filter array wasn't named correctly, ES completely ignored it, and just decided to use the standard analyzer. Here is a small example written using the Sense chrome plugin. Execute them in order:
DELETE /test

PUT /test
{
  "analysis": {
    "analyzer": {
      "folding": {
        "type": "custom",
        "filter": [
          "lowercase",
          "asciifolding"
        ],
        "tokenizer": "standard"
      }
    }
  }
}

GET /test/_analyze
{
  "analyzer": "folding",
  "text": "Ésta está loca"
}
And the results of the last GET /test/_analyze:
"tokens": [
{
"token": "esta",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "esta",
"start_offset": 5,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "loca",
"start_offset": 10,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 2
}
]
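Also worth noting: analysis settings can only be changed on a closed index, and documents indexed before the fix keep their old tokens until they are reindexed. A sketch of fixing the existing myindex in place, under the assumption that briefly closing the index is acceptable:
POST /myindex/_close

PUT /myindex/_settings
{
  "analysis": {
    "analyzer": {
      "folding": {
        "type": "custom",
        "filter": [
          "lowercase",
          "asciifolding"
        ],
        "tokenizer": "standard"
      }
    }
  }
}

POST /myindex/_open
After reopening, reindex the existing documents so the fold field is analyzed with the corrected folding analyzer.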
