When we pass a query containing special characters, Elasticsearch splits the text.
E.g. if we pass "test-test" in the query, how can we make Elasticsearch treat it as a single word and not split it up?
Analyzer used on the field we are searching (a fragment of the index settings):
"text_search_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
},
"standard_stop_filter": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"text_search_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"text_search_filter"
]
}
}
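To double-check how this analyzer treats the hyphenated value at index time, you can run something like the following (a sketch; the index name my_index is an assumption). Because the tokenizer is whitespace, "test-test" should come back as a single token plus its edge n-grams rather than being split into two words.
GET my_index/_analyze
{
  "analyzer": "text_search_analyzer",
  "text": "test-test"
}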
Also, the query used for the search:
"query": {
"multi_match": {
"query": "test-test",
"type": "cross_fields",
"fields": [
"FIELD_NAME"
]
}
}
{
"tokens": [
{
"token": "'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
}
]
}
In my code I catch every word that contains "-" and wrap it in quotes.
Example:
joe-doe -> "joe-doe"
Java code for this:
import java.util.Arrays;
import java.util.stream.Collectors;

// Wraps every whitespace-separated word that contains "-" in double quotes,
// e.g. `joe-doe smith` becomes `"joe-doe" smith`.
static String placeWordsWithDashInQuote(String value) {
    return Arrays.stream(value.split("\\s"))
            .filter(v -> !v.isEmpty())
            .map(v -> v.contains("-") && !v.startsWith("\"") ? "\"" + v + "\"" : v)
            .collect(Collectors.joining(" "));
}
After this, the example query looks like:
{
"query": {
"bool": {
"must": [
{
"query_string": {
"fields": [
"lastName",
"firstName"
],
"query": "\"joe-doe\"",
"default_operator": "AND"
}
}
]
}
},
"sort": [],
"from": 0,
"size": 10 }
I am using Elasticsearch 7.15.0.
I want to organize search by priority, with and without spaces.
What does that mean?
Query: "surf coff"
First I want to see records that contain or start with "surf coff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN),
then records that contain or start with "surf" - (SURF, SURF LOVE, ENDLESS SURF),
then records that contain or start with "coff" - (LOVE COFFEE, COFFEE MAN).
Query: "surfcoff"
I want to see only records that contain or start with "surfcoff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN).
I created the analyzer with these filters:
lowercase
word_delimiter_graph
shingle
edge_ngram
pattern_replace (to remove spaces)
{
"settings":{
"index": {
"max_shingle_diff" : 9,
"max_ngram_diff": 9
},
"analysis":{
"analyzer":{
"word_join_analyzer":{
"tokenizer":"standard",
"filter":[
"lowercase",
"word_delimiter_graph",
"my_shingle",
"my_edge_ngram",
"my_char_filter"
]
}
},
"filter":{
"my_shingle":{
"type":"shingle",
"min_shingle_size": 2,
"max_shingle_size": 10
},
"my_edge_ngram": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 10,
"token_chars": ["letter", "digit"]
},
"my_char_filter": {
"type": "pattern_replace",
"pattern": " ",
"replacement": ""
}
}
}
}
}
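For reference, the token listing below comes from an _analyze call along these lines (a sketch; the index name my_index is an assumption):
GET my_index/_analyze
{
  "analyzer": "word_join_analyzer",
  "text": "SURF COFFEE"
}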
So when I analyzed the text "SURF COFFEE", I got this result:
{
"tokens": [
{
"token": "su",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "sur",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "surf",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "su",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "sur",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surf",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surf",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surfc",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surfco",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surfcof",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surfcoff",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "surfcoffe",
"start_offset": 0,
"end_offset": 11,
"type": "shingle",
"position": 0,
"positionLength": 2
},
{
"token": "co",
"start_offset": 5,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cof",
"start_offset": 5,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "coff",
"start_offset": 5,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "coffe",
"start_offset": 5,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "coffee",
"start_offset": 5,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
}
]
}
As you can see, there is a token "surfcoff".
How should my search be organized?
I've tried to combine approaches with a bool should query using
query_string, match_phrase_prefix, match_prefix and others,
but none of them gave correct results.
Can you please help me?
How should my query be built?
Or maybe I should try other analyzer filters?
For example, this query:
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "surf coff",
"default_field": "text",
"default_operator": "AND"
}
},
{
"query_string": {
"query": "surf",
"default_field": "text"
}
},
{
"query_string": {
"query": "coff",
"default_field": "text"
}
}
]
}
}
}
or this query
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "(surf coff) OR (surf) OR (coff)",
"default_field": "text"
}
}
]
}
}
}
or this query
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "((surf AND coff)^3 OR (surf)^2 OR (coff)^1)",
"default_field": "text"
}
}
]
}
}
}
or
{
"query": {
"match_bool_prefix" : {
"text" : "surf coff"
}
}
}
gives
SURF COFFEE SURFING NEVER ALONE
CONOSUR COLCHAGUA CONO SUR
SUNRISE CONCHA TORO SUNRISE 300 DAYS
SUN COFFEE
SURF COFFEE PROPAGANDA
....
but that seems strange to me; I think I'm misunderstanding something.
In the end I got it working with the following wildcard query and the settings below it (edge_ngram removed):
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "(surf* AND coff*)^3 OR (surf*)^2 OR (coff*)^1",
"default_field": "text"
}
}
]
}
}
}
{
"settings":{
"index": {
"max_shingle_diff" : 9,
"max_ngram_diff": 9
},
"analysis":{
"analyzer":{
"word_join_analyzer":{
"tokenizer":"standard",
"filter":[
"lowercase",
"word_delimiter_graph",
"my_shingle",
"my_char_filter"
]
}
},
"filter":{
"my_shingle":{
"type":"shingle",
"min_shingle_size": 2,
"max_shingle_size": 10
},
"my_char_filter": {
"type": "pattern_replace",
"pattern": " ",
"replacement": ""
}
}
}
}
}
Removing edge_ngram and adding a wildcard query with priorities (boosts) resolved my question.
But I still didn't understand why edge_ngram wasn't working.
Finally, I resolved it with the original filter chain:
"filter":[
"lowercase",
"word_delimiter_graph",
"my_shingle",
"my_edge_ngram",
"my_char_filter"
]
The problem was with the search_analyzer; as the docs say, "Sometimes, though, it can make sense to use a different analyzer at search time, such as when using the edge_ngram tokenizer for autocomplete or when using search-time synonyms."
So I added the standard search_analyzer to my text field:
"text": { "type": "text", "analyzer": "word_join_analyzer", "search_analyzer": "standard" }
Search query:
{
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "surf coff",
"default_field": "text"
}
}
]
}
}
}
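To see why the separate search_analyzer helps: the query string is now analyzed with the standard analyzer at search time, so it stays as plain word tokens that can match the shingle and edge-ngram tokens produced at index time. A quick check (no index needed, since standard is built in):
GET _analyze
{
  "analyzer": "standard",
  "text": "surf coff"
}
This returns just the two tokens surf and coff.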
My use case is to search edge n-grams with synonym support, where the tokens to match should be in sequence.
While trying out the analysis, I observed two different behaviours of the filter chain with respect to position increments:
With the filter chain lowercase, synonym there is no position increment due to the SynonymFilter.
With the filter chain lowercase, edge_ngram, synonym there is a position increment due to the SynonymFilter.
Here are the requests I'm running for each case:
Case 1. No position increment
PUT synonym_test
{
"index": {
"analysis": {
"analyzer": {
"by_smart": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"custom_synonym"
]
}
},
"filter": {
"custom_synonym": {
"type": "synonym",
"synonyms": [
"begin => start"
]
}
}
}
}
}
GET synonym_test/_analyze
{
"text": "begin working",
"analyzer": "by_smart"
}
Output:
{
"tokens": [
{
"token": "start",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 0
},
{
"token": "working",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 1
}
]
}
Case 2. Position increment
PUT synonym_test
{
"index": {
"analysis": {
"analyzer": {
"by_smart": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"custom_edge_ngram",
"custom_synonym"
]
}
},
"filter": {
"custom_synonym": {
"type": "synonym",
"synonyms": [
"begin => start"
]
},
"custom_edge_ngram": {
"type": "edge_ngram",
"min_gram": "2",
"max_gram": "60"
}
}
}
}
}
GET synonym_test/_analyze
{
"text": "begin working",
"analyzer": "by_smart"
}
Output:
{
"tokens": [
{
"token": "be",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "beg",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "begi",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0
},
{
"token": "start",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 1
},
{
"token": "wo",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "wor",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "work",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "worki",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "workin",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
},
{
"token": "working",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 2
}
]
}
Notice how in Case 1 the token begin, when replaced by start, keeps the same position and there is no position increment. However, in Case 2, when the begin token is replaced by start, the position gets incremented for the subsequent token stream.
Now here are my questions:
Why is it not happening in Case 1 and only happening in Case 2?
The main issue this causes is that when the input query is begi wor with a match_phrase query (and the default slop of 0, as sketched below), it doesn't match begin working, since begi and wor are 2 positions apart. Any suggestions on how I can achieve this without impacting my use case?
I'm using Elasticsearch version 5.6.8 with Lucene version 6.6.1.
I've read several documentation links and articles, but I couldn't find one that properly explains why this is happening or whether there is a setting to get my desired behaviour.
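For reference, the failing search mentioned above looks roughly like this (a sketch; the field name text and the fact that it uses the by_smart analyzer are assumptions, since the mapping isn't shown):
GET synonym_test/_search
{
  "query": {
    "match_phrase": {
      "text": {
        "query": "begi wor",
        "slop": 0
      }
    }
  }
}
With the Case 2 analysis, begi is indexed at position 0 and wor at position 2, while the analyzed query puts them on adjacent positions, so a zero-slop phrase cannot line them up.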
Below is the Elasticsearch mapping with one field called hostname and another field called catch_all, which is basically a copy_to field (there will be many more fields copying values to it).
{
"settings": {
"analysis": {
"filter": {
"myNGramFilter": {
"type": "edgeNGram",
"min_gram": 1,
"max_gram": 40
}},
"analyzer": {
"myNGramAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "myNGramFilter"]
}
}
}
},
"mappings": {
"test": {
"properties": {
"catch_all": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"store": true,
"ignore_above": 256
},
"grams": {
"type": "text",
"store": true,
"analyzer": "myNGramAnalyzer"
}
}
},
"hostname": {
"type": "text",
"copy_to": "catch_all"
}
}
}
}
}
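For context, copy_to means the value of hostname is additionally indexed into catch_all (and therefore analyzed into catch_all.grams) at index time; it is not added to the _source of catch_all. A sketch of indexing a document (the index name and document id are assumptions):
PUT my_index/test/1
{
  "hostname": "Dell PowerEdge R630"
}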
When I run the following _analyze request:
GET index/_analyze
{
"analyzer": "myNGramAnalyzer",
"text": "Dell PowerEdge R630"
}
I get:
{
"tokens": [
{
"token": "d",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "de",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "del",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "dell",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "p",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "po",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "pow",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powe",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "power",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powere",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powered",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "poweredg",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "poweredge",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "r",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r6",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r63",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r630",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
}
]
}
There is a token called "poweredge".
Right now we search with the query below:
{
"query": {
"multi_match": {
"fields": ["catch_all.grams"],
"query": "poweredge",
"operator": "and"
}
}
}
When we query with "poweredge" we get 1 result, but when we search for just "edge" there are no results.
Even a match query does not yield results for the search word "edge".
Can somebody help here?
I suggest not querying with the multi_match API for your use case, but using a match query instead. The edge_ngram filter works like this: it emits n-grams from each token produced by your tokenizer (standard, in your mapping), anchored to the beginning of the token. As the documentation says about the corresponding tokenizer (read here):
The edge_ngram tokenizer first breaks text down into words whenever it
encounters one of a list of specified characters, then it emits
N-grams of each word where the start of the N-gram is anchored to the
beginning of the word.
As your call to the _analyze API shows, it does not produce "edge" from poweredge as an n-gram, because n-grams are only generated from the beginning of the word; look at the output of your _analyze call. Take a look here: https://www.elastic.co/guide/en/elasticsearch/guide/master/ngrams-compound-words.html
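If you really need infix matches such as edge inside poweredge, one option in the spirit of the guide linked above is a plain ngram filter instead of the edge_ngram one; a minimal sketch, with gram sizes chosen only for illustration (this will grow the index considerably):
"analysis": {
  "filter": {
    "myNGramFilter": {
      "type": "ngram",
      "min_gram": 3,
      "max_gram": 4
    }
  },
  "analyzer": {
    "myNGramAnalyzer": {
      "type": "custom",
      "tokenizer": "standard",
      "filter": ["lowercase", "myNGramFilter"]
    }
  }
}
With that, poweredge produces tokens such as edg and edge, so a match query on catch_all.grams for edge can find it; pairing it with a search analyzer that does not apply the ngram filter (for example standard) is usually worth considering so that query terms are not n-grammed as well.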
How do you see the analyzed data that is stored after you index something?
I know you can just do a search to see it, like this:
http://localhost:9200/local_products_fr/fields/_search/
But what I want to see is the actual indexed data, not the _source;
something like what you get when you call _analyze:
http://localhost:9200/local_products_fr/_analyze?text=<p>my <b>super</b> text</p>&analyzer=analyzer_fr
{
"tokens": [
{
"token": "my",
"start_offset": 3,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "b",
"start_offset": 7,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "sup",
"start_offset": 9,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "b",
"start_offset": 16,
"end_offset": 17,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "text",
"start_offset": 19,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 4
}
]
}
I use this to get the inverted index terms for a field, per document:
{
"query": {
"bool": {
"must": [{
"term": {
"_id": {
"value": "2"
}
}
}]
}
},
"script_fields": {
"terms": {
"script": "doc['product.name'].values"
}
}
}
Hope this works for you
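For example, against the index from your question it would look roughly like this (a sketch; product.name is just the placeholder field from the snippet above, and a document with _id 2 is assumed to exist):
POST local_products_fr/_search
{
  "query": {
    "bool": {
      "must": [
        { "term": { "_id": { "value": "2" } } }
      ]
    }
  },
  "script_fields": {
    "terms": {
      "script": "doc['product.name'].values"
    }
  }
}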
I have a field with the following mapping defined :
"my_field": {
"properties": {
"address": {
"type": "string",
"analyzer": "email",
"search_analyzer": "whitespace"
}
}
}
My email analyser looks like this:
{
"analysis": {
"filter": {
"email_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "255"
}
},
"analyzer": {
"email": {
"type": "custom",
"filter": [
"lowercase",
"email_filter",
"unique"
],
"tokenizer": "uax_url_email"
}
}
}
}
When I try to search for an email id like test.xyz#example.com,
searching for terms like tes, test.xy etc. doesn't work. But if I search for
test.xyz or test.xyz#example.com, it works fine. I tried analyzing the tokens using my email analyzer and it works as expected.
E.g. hitting http://localhost:9200/my_index/_analyze?analyzer=email&text=test.xyz#example.com
I get:
{
"tokens": [{
"token": "tes",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.x",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xy",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#e",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#ex",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exa",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exam",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#examp",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#exampl",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.c",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.co",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}, {
"token": "test.xyz#example.com",
"start_offset": 0,
"end_offset": 20,
"type": "word",
"position": 0
}]
}
So I know that the tokenisation works, but searching fails to match partial strings.
For example, querying http://localhost:9200/my_index/my_field/_search?q=test returns no hits.
Details of my index:
{
"my_index": {
"aliases": {
"alias_default": {}
},
"mappings": {
"my_field": {
"properties": {
"address": {
"type": "string",
"analyzer": "email",
"search_analyzer": "whitespace"
},
"boost": {
"type": "long"
},
"createdat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"instanceid": {
"type": "long"
},
"isdeleted": {
"type": "integer"
},
"object": {
"type": "string"
},
"objecthash": {
"type": "string"
},
"objectid": {
"type": "string"
},
"parent": {
"type": "short"
},
"parentid": {
"type": "integer"
},
"updatedat": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
},
"settings": {
"index": {
"creation_date": "1480342980403",
"number_of_replicas": "1",
"max_result_window": "100000",
"uuid": "OUuiTma8CA2VNtw9Og",
"analysis": {
"filter": {
"email_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "255"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"filter": [
"lowercase",
"autocomplete_filter"
],
"tokenizer": "standard"
},
"email": {
"type": "custom",
"filter": [
"lowercase",
"email_filter",
"unique"
],
"tokenizer": "uax_url_email"
}
}
},
"number_of_shards": "5",
"version": {
"created": "2010099"
}
}
},
"warmers": {}
}
}
OK, everything looks correct except your query.
You simply need to specify the address field in your query, like this, and it will work:
http://localhost:9200/my_index/my_field/_search?q=address:test
If you don't specify the address field, the query runs against the _all field, whose search analyzer is the standard one by default, which is why you're not finding anything.
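The same search as a request body query, if you prefer that form (equivalent to q=address:test):
GET my_index/my_field/_search
{
  "query": {
    "query_string": {
      "query": "address:test"
    }
  }
}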