Using autocomplete with email in Elasticsearch doesn't work

I have a field with the following mapping defined:
"my_field": {
"properties": {
"address": {
"type": "string",
"analyzer": "email",
"search_analyzer": "whitespace"
}
}
}
My email analyser looks like this:
{
"analysis": {
"filter": {
"email_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "255"
}
},
"analyzer": {
"email": {
"type": "custom",
"filter": [
"lowercase",
"email_filter",
"unique"
],
"tokenizer": "uax_url_email"
}
}
}
}
When I try to search for an email address like test.xyz#example.com, partial terms such as tes or test.xy don't match. But if I search for test.xyz or the full test.xyz#example.com, it works fine. I tried analyzing the tokens produced by my email analyzer and they look as expected.
For example, hitting http://localhost:9200/my_index/_analyze?analyzer=email&text=test.xyz#example.com
I get:
{
"tokens": [{
"token": "tes",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.x",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xy",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#e",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#ex",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exa",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exam",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#examp",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exampl",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.c",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.co",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.com",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}]
}
So I know that the tokenisation works, but searching for partial strings fails.
For example, http://localhost:9200/my_index/my_field/_search?q=test returns no hits.
Details of my index:
{
"my_index": {
"aliases": {
"alias_default": {}
},
"mappings": {
"my_field": {
"properties": {
"address": {
"type": "string",
"analyzer": "email",
"search_analyzer": "whitespace"
},
"boost": {
"type": "long"
},
"createdat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"instanceid": {
"type": "long"
},
"isdeleted": {
"type": "integer"
},
"object": {
"type": "string"
},
"objecthash": {
"type": "string"
},
"objectid": {
"type": "string"
},
"parent": {
"type": "short"
},
"parentid": {
"type": "integer"
},
"updatedat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
},
"settings": {
"index": {
"creation_date": "1480342980403",
"number_of_replicas": "1",
"max_result_window": "100000",
"uuid": "OUuiTma8CA2VNtw9Og",
"analysis": {
"filter": {
"email_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "255"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"filter": [
"lowercase",
"autocomplete_filter"
],
"tokenizer": "standard"
},
"email": {
"type": "custom",
"filter": [
"lowercase",
"email_filter",
"unique"
],
"tokenizer": "uax_url_email"
}
}
},
"number_of_shards": "5",
"version": {
"created": "2010099"
}
}
},
"warmers": {}
}
}

OK, everything looks correct except your query.
You simply need to specify the address field in your query, like this, and it will work:
http://localhost:9200/my_index/my_field/_search?q=address:test
If you don't specify the address field, the query runs against the _all field, whose search analyzer is the standard analyzer by default, which is why you're not finding anything.
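Equivalently, with the query DSL, targeting the address field explicitly (a minimal sketch against the same index and type):
POST my_index/my_field/_search
{
  "query": {
    "match": {
      "address": "test"
    }
  }
}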

Related

Elasticsearch Analyzer first 4 and last 4 characters

With Elasticsearch, I would like to specify a search analyzer where the first 4 characters and last 4 characters are tokenized.
For example: supercalifragilisticexpialidocious => ["supe", "ious"]
I have had a go with an ngram tokenizer as follows:
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 4,
"max_gram": 4
}
}
}
}
}
I am testing the analyzer as follows
POST my_index/_analyze
{
"analyzer": "my_analyzer",
"text": "supercalifragilisticexpialidocious."
}
And I get back "supe", loads of tokens I don't want, and "ous." at the end. The problem for me is: how can I take only the first and last results from the ngram tokenizer specified above?
{
"tokens": [
{
"token": "supe",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
},
{
"token": "uper",
"start_offset": 1,
"end_offset": 5,
"type": "word",
"position": 1
},
...
{
"token": "ciou",
"start_offset": 29,
"end_offset": 33,
"type": "word",
"position": 29
},
{
"token": "ious",
"start_offset": 30,
"end_offset": 34,
"type": "word",
"position": 30
},
{
"token": "ous.",
"start_offset": 31,
"end_offset": 35,
"type": "word",
"position": 31
}
]
}
One way to achieve this is to leverage the pattern_capture token filter and take the first 4 and last 4 characters.
First, define your index like this:
PUT my_index
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"first_last_four"
]
}
},
"filter": {
"first_last_four": {
"type": "pattern_capture",
"preserve_original": false,
"patterns": [
"""(\w{4}).*(\w{4})"""
]
}
}
}
}
}
}
Then, you can test your new custom analyzer:
POST my_index/_analyze
{
"text": "supercalifragilisticexpialidocious",
"analyzer": "my_analyzer"
}
And see that the tokens you expect are there:
{
"tokens" : [
{
"token" : "supe",
"start_offset" : 0,
"end_offset" : 34,
"type" : "word",
"position" : 0
},
{
"token" : "ious",
"start_offset" : 0,
"end_offset" : 34,
"type" : "word",
"position" : 0
}
]
}
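To actually use the analyzer on a field, reference it in the mapping. This is only a sketch: it assumes Elasticsearch 6.x with a single mapping type _doc and a text field named word, none of which appear in the original question. If the four-character tokens should only be produced at query time, set it as search_analyzer instead.
PUT my_index/_mapping/_doc
{
  "properties": {
    "word": {
      "type": "text",
      "analyzer": "my_analyzer"
    }
  }
}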

Different position increment behaviour of the Synonym Filter

My use case is to search for edge_ngrams with synonym support, where the tokens to match should be in sequence.
While trying out the analysis, I observed two different behaviours of the filter chain with respect to position increments.
With the filter chain lowercase, synonym, there is no position increment due to the SynonymFilter.
With the filter chain lowercase, edge_ngram, synonym, there is a position increment due to the SynonymFilter.
Here are the requests I'm running for each case:
Case 1. No position increment
PUT synonym_test
{
"index": {
"analysis": {
"analyzer": {
"by_smart": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"custom_synonym"
]
}
},
"filter": {
"custom_synonym": {
"type": "synonym",
"synonyms": [
"begin => start"
]
}
}
}
}
}
GET synonym_test/_analyze
{
"text": "begin working",
"analyzer": "by_smart"
}
Output:
{
"tokens": [
{
"token": "start",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 0
},
{
"token": "working",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 1
}
]
}
Case 2. Position increment
PUT synonym_test
{
"index": {
"analysis": {
"analyzer": {
"by_smart": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"custom_edge_ngram",
"custom_synonym"
]
}
},
"filter": {
"custom_synonym": {
"type": "synonym",
"synonyms": [
"begin => start"
]
},
"custom_edge_ngram": {
"type": "edge_ngram",
"min_gram": "2",
"max_gram": "60"
}
}
}
}
}
GET synonym_test/_analyze
{
"text": "begin working",
"analyzer": "by_smart"
}
Output:
{
"tokens": [
{
"token": "be",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "beg",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "begi",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "start",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 1
},
{
"token": "wo",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "wor",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "work",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "worki",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "workin",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "working",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
}
]
}
Notice how in Case 1 the token begin is replaced by start at the same position, and there is no position increment. However, in Case 2, when the begin token is replaced by start, the position is incremented for the subsequent tokens in the stream.
Now here are my questions:
Why does this not happen in Case 1 but only in Case 2?
The main issue this causes: when the input query is begi wor with a match_phrase query (and the default slop of 0), it doesn't match begin work, since begi and wor end up two positions apart. Any suggestions on how I can avoid this without impacting my use case?
I'm using Elasticsearch version 5.6.8 with Lucene version 6.6.1.
I've read several documentation links and articles, but I couldn't find one that properly explains why this happens or whether there is a setting that gives me the desired behaviour.
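For reference, a minimal sketch of the kind of phrase query that fails here; the field name my_text and its mapping to the by_smart analyzer are assumptions for illustration, not part of the original setup:
GET synonym_test/_search
{
  "query": {
    "match_phrase": {
      "my_text": {
        "query": "begi wor",
        "analyzer": "by_smart"
      }
    }
  }
}
With the default slop of 0, the gap introduced by the synonym's position increment is enough to make this phrase not match a document containing begin work.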

Testing an Elasticsearch custom analyzer - pipe-delimited keywords

I have this index with a custom analyzer named pipe. When I try to test it, it returns every character rather than pipe-delimited words.
I'm building this for a use case where my input keyword lines look like crockpot refried beans|corningware replacement|crockpot lids|recipe refried beans, and Elasticsearch should return matches after the line has been split on the pipes.
{
"keywords": {
"aliases": {
},
"mappings": {
"cloud": {
"properties": {
"keywords": {
"type": "text",
"analyzer": "pipe"
}
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "keywords",
"creation_date": "1513890909384",
"analysis": {
"analyzer": {
"pipe": {
"type": "custom",
"tokenizer": "pipe"
}
},
"tokenizer": {
"pipe": {
"pattern": "|",
"type": "pattern"
}
}
},
"number_of_replicas": "1",
"uuid": "DOLV_FBbSC2CBU4p7oT3yw",
"version": {
"created": "6000099"
}
}
}
}
}
I am testing it following this guide:
curl -XPOST 'http://localhost:9200/keywords/_analyze' -d '{
"analyzer": "pipe",
"text": "pipe|pipe2"
}'
I get back character-by-character results:
{
"tokens": [
{
"token": "p",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "i",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "p",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 2
},
{
"token": "e",
"start_offset": 3,
"end_offset": 4,
"type": "word",
"position": 3
},
... (one token per character)
]
}

Good work, you're almost there. Since the pipe character (|) is a reserved character in regular expressions, you need to escape it like this:
"tokenizer": {
"pipe": {
"pattern": "\\|", <--- change this
"type": "pattern"
}
}
And then your analyzer will work and produce this:
{
"tokens": [
{
"token": "pipe",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
},
{
"token": "pipe2",
"start_offset": 5,
"end_offset": 10,
"type": "word",
"position": 1
}
]
}
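Note that analyzers and tokenizers live in the index settings, so the fix has to be applied by recreating the index (or by closing it, updating the analysis settings, and reopening it) and then reindexing. A minimal sketch of the recreation, reusing the settings and mapping from the question:
DELETE keywords

PUT keywords
{
  "settings": {
    "analysis": {
      "analyzer": {
        "pipe": {
          "type": "custom",
          "tokenizer": "pipe"
        }
      },
      "tokenizer": {
        "pipe": {
          "type": "pattern",
          "pattern": "\\|"
        }
      }
    }
  },
  "mappings": {
    "cloud": {
      "properties": {
        "keywords": {
          "type": "text",
          "analyzer": "pipe"
        }
      }
    }
  }
}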

Elasticsearch custom analyzer being ignored

I'm using Elasticsearch 2.2.0 and I'm trying to use the lowercase + asciifolding filters on a field.
This is the output of http://localhost:9200/myindex/
{
"myindex": {
"aliases": {},
"mappings": {
"products": {
"properties": {
"fold": {
"analyzer": "folding",
"type": "string"
}
}
}
},
"settings": {
"index": {
"analysis": {
"analyzer": {
"folding": {
"token_filters": [
"lowercase",
"asciifolding"
],
"tokenizer": "standard",
"type": "custom"
}
}
},
"creation_date": "1456180612715",
"number_of_replicas": "1",
"number_of_shards": "5",
"uuid": "vBMZEasPSAyucXICur3GVA",
"version": {
"created": "2020099"
}
}
},
"warmers": {}
}
}
And when I try to test the folding custom analyzer using the _analyze API, this is the output of http://localhost:9200/myindex/_analyze?analyzer=folding&text=%C3%89sta%20est%C3%A1%20loca:
{
"tokens": [
{
"end_offset": 4,
"position": 0,
"start_offset": 0,
"token": "Ésta",
"type": "<ALPHANUM>"
},
{
"end_offset": 9,
"position": 1,
"start_offset": 5,
"token": "está",
"type": "<ALPHANUM>"
},
{
"end_offset": 14,
"position": 2,
"start_offset": 10,
"token": "loca",
"type": "<ALPHANUM>"
}
]
}
As you can see, the returned tokens are Ésta, está, loca instead of esta, esta, loca. What's going on? It seems that the folding analyzer is being ignored.
Looks like a simple typo when you are creating your index.
In your "analysis":{"analyzer":{...}} block, this:
"token_filters": [...]
Should be
"filter": [...]
Check the documentation for confirmation. Because your filter array wasn't named correctly, ES ignored it completely and fell back to the standard analyzer. Here is a small example written using the Sense Chrome plugin; execute the requests in order:
DELETE /test
PUT /test
{
"analysis": {
"analyzer": {
"folding": {
"type": "custom",
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "standard"
}
}
}
}
GET /test/_analyze
{
"analyzer":"folding",
"text":"Ésta está loca"
}
And the results of the last GET /test/_analyze:
"tokens": [
{
"token": "esta",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "esta",
"start_offset": 5,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "loca",
"start_offset": 10,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 2
}
]
}
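Applied to your own index, the corrected definition would look like the sketch below; this assumes you can delete and recreate myindex and then reindex your data, since analysis settings cannot be changed on an open index:
DELETE /myindex

PUT /myindex
{
  "settings": {
    "analysis": {
      "analyzer": {
        "folding": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  },
  "mappings": {
    "products": {
      "properties": {
        "fold": {
          "type": "string",
          "analyzer": "folding"
        }
      }
    }
  }
}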

Pass "-" in Elastic Search Query

When we pass a query containing special characters, Elasticsearch splits the text.
For example, if we pass "test-test" in the query, how can we make Elasticsearch treat it as a single word and not split it up?
This is the analyzer used on the field we are searching:
"text_search_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
},
"standard_stop_filter": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"text_search_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"text_search_filter"
]
}
}
And the query used for the search:
"query": {
"multi_match": {
"query": "test-test",
"type": "cross_fields",
"fields": [
"FIELD_NAME"
]
}
}
{
"tokens": [
{
"token": "'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
}
]
}
In my code I catch all words that contain "-" and add quotes around them.
For example:
joe-doe -> "joe-doe"
The Java code for this:
// Requires java.util.Arrays and java.util.stream.Collectors.
// Wraps every whitespace-separated word that contains "-" in double quotes,
// leaving words that already start with a quote untouched.
static String placeWordsWithDashInQuote(String value) {
    return Arrays.stream(value.split("\\s"))
            .filter(v -> !v.isEmpty())
            .map(v -> v.contains("-") && !v.startsWith("\"") ? "\"" + v + "\"" : v)
            .collect(Collectors.joining(" "));
}
And after this, the example query looks like:
{
"query": {
"bool": {
"must": [
{
"query_string": {
"fields": [
"lastName",
"firstName"
],
"query": "\"joe-doe\"",
"default_operator": "AND"
}
}
]
}
},
"sort": [],
"from": 0,
"size": 10 }
