How to optimize elasticsearch query dsl to improve the query performance? - performance

Dear Elasticsearch experts, I have an Elasticsearch index whose size is almost 18 GB, and it holds about 400,000 documents. There are 3 nodes in the Elasticsearch cluster, and each node has 2 CPUs and 4 GB of memory. It takes almost 90 ms to execute a DSL query and get a search response based on 40K documents. How can I optimize the mapping settings or the query DSL to improve query performance? What principles should I follow to solve this problem?
The mapping settings of my index are as follows:
{
"settings" : {
"similarity": {
"scripted_tfidf": {
"type": "scripted",
"script": {
"source": "double tf = doc.freq; return query.boost * tf;"
}
},
"scripted_tfidf_new": {
"type": "scripted",
"script": {
"source": "double tf = doc.freq; return query.boost * 1;"
}
}
},
"analysis": {
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase"]
}},
"char_filter" : {
"my_filter" : {
"type" : "mapping",
"mappings" : [":=>,",".=>,","!=>,","؛=>,","*=>,","/=>,",")=>,","(=>,","+=>,","-=>,","_=>,","$=>,","#=>,","£=>,","¢=>,","¥=>,","^=>,","°=>,","==>,","{=>,","}=>,","✓=>,","™=>,","®=>,","©=>,","٪=>,","~=>,","`=>,","|=>,","•=>,","√=>,","÷=>,","×=>,","∆=>,","<=>,",">=>,","?=>,",";=>,","。=>,",",=>,",";=>,","#=>,","%=>,",":=>,","“=>,","”=>,","'=>,"]
}
},
"filter": {
"stop_filter":{
"type" : "stop",
"stopwords": "_english_"
},
"pinyin_filter" : {
"remove_duplicated_term" : "true",
"keep_joined_full_pinyin" : "true",
"none_chinese_pinyin_tokenize" : "false",
"type" : "pinyin",
"lowercase": true,
"keep_none_chinese_in_joined_full_pinyin" : "true",
"keep_full_pinyin" : "false"
}
},
"tokenizer" : {
"my_ngram" : {
"token_chars" : [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram" : "2",
"type" : "ngram",
"max_gram" : "3"
},
"my_edge_ngram": {
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol",
"whitespace"
],
"min_gram": "1",
"type": "edge_ngram",
"max_gram": "20"
},
"edge_ngram_letter_digit": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "1",
"type": "edge_ngram",
"max_gram": "30"
}
},
"analyzer" : {
"my_analyze" : {
"char_filter":["my_filter"],
"tokenizer":"standard",
"filter":["lowercase","stop_filter"]
},
"ik_pinyin_analyzer" : {
"filter" : [
"pinyin_filter"
],
"tokenizer" : "my_ngram"
},
"my_analyze_edge_ngram": {
"tokenizer": "my_edge_ngram",
"filter": "lowercase"
},
"edge_ngram_letter_digit_analyze": {
"tokenizer": "edge_ngram_letter_digit",
"filter": "lowercase"
}
}
},
"index" : {
"number_of_shards" : "3",
"number_of_replicas" : "0"
}
},
"mappings" : {
"dynamic" : "strict",
"properties" : {
"name_lang" : {
"properties": {
"ru": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}
}
},
"zh-cn": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"pinyin" : {
"type" : "text",
"term_vector" : "with_positions_offsets",
"analyzer" : "ik_pinyin_analyzer",
"similarity" : "scripted_tfidf_new",
"search_analyzer" : "pinyin"
}
,
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}
},
"analyzer" : "ik_max_word"
},
"es-es": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "spanish",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"th": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "thai",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"mai": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"tl": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"ta": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"bg": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "bulgarian",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"mk": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"lo": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"hi": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "hindi",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
},
"de": {
"type" : "text",
"store" : true,
"similarity":"scripted_tfidf_new",
"analyzer": "german",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}}
}
}
},
"name_lang_list" : {
"type": "text",
"store": true,
"similarity":"scripted_tfidf_new",
"analyzer": "my_analyze",
"fields" : {
"keyword_field" : {
"type" : "keyword",
"store": true,
"similarity":"scripted_tfidf_new",
"normalizer" : "lowercase_normalizer"
},
"edge_ngram_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "my_analyze_edge_ngram"
},
"edge_ngram_letter_digit_field": {
"type": "text",
"store": true,
"similarity": "scripted_tfidf_new",
"analyzer": "edge_ngram_letter_digit_analyze"
}
}
},
"des_list" : {
"type": "text",
"store": true,
"analyzer": "my_analyze"
},
}
}
}
And my query dsl is as follows:
{
"explain":"false",
"from": 0,
"size": 50,
"query": {
"dis_max": {
"queries": [
{ "function_score": {
"query": {
"bool": {
"should": [
{
"dis_max": {
"boost": 1,
"queries": [
{"match_phrase": {
"name_lang.zh-cn.pinyin": {
"query":"google", "boost": 20
}}},
{"match_phrase": {
"name_lang.zh-tw.pinyin": {
"query":"google", "boost": 20}
}},
{"match_phrase": {
"name_lang.zh-hk.pinyin": {
"query":"google", "boost": 20}
}},
{"match": {
"name_lang_list": {
"query": "google",
"minimum_should_match": "50%",
"boost": 30,
"fuzziness": "auto",
"prefix_length": 1,
"max_expansions": 10,
"fuzzy_rewrite":"constant_score_boolean"
}}}
]
}},
{"match": {
"description_list": {
"query": "google",
"boost": 2
}}},
{
"dis_max": {
"boost": 120,
"queries": [
{"term": {"name_lang.ru.keyword_field": "google"}},
{"term": {"name_lang.zh-cn.keyword_field": "google"}},
{"term": {"name_lang.es-es.keyword_field": "google"}},
{"term": {"name_lang.th.keyword_field": "google"}},
{"term": {"name_lang.sr.keyword_field": "google"}},
{"term": {"name_lang.kk.keyword_field": "google"}},
{"term": {"name_lang.si.keyword_field": "google"}},
{"term": {"name_lang.ka.keyword_field": "google"}},
{"term": {"name_lang.sk.keyword_field": "google"}},
{"term": {"name_lang.pt-pt.keyword_field": "google"}},
{"term": {"name_lang.sl.keyword_field": "google"}},
{"term": {"name_lang.bn.keyword_field": "google"}},
{"term": {"name_lang.jv.keyword_field": "google"}},
{"term": {"name_lang.bo.keyword_field": "google"}},
{"term": {"name_lang.bs.keyword_field": "google"}},
{"term": {"name_lang.es-us.keyword_field": "google"}},
{"term": {"name_lang.fi.keyword_field": "google"}},
{"term": {"name_lang.be.keyword_field": "google"}},
{"term": {"name_lang.bg.keyword_field": "google"}},
{"term": {"name_lang.zh-hk.keyword_field": "google"}},
{"term": {"name_lang.zh-tw.keyword_field": "google"}},
{"term": {"name_lang.fa.keyword_field": "google"}},
{"term": {"name_lang.ro.keyword_field": "google"}},
{"term": {"name_lang.nl.keyword_field": "google"}},
{"term": {"name_lang.as.keyword_field": "google"}},
{"term": {"name_lang.vi.keyword_field": "google"}},
{"term": {"name_lang.my-mm.keyword_field": "google"}},
{"term": {"name_lang.nb.keyword_field": "google"}},
{"term": {"name_lang.ja.keyword_field": "google"}},
{"term": {"name_lang.ne.keyword_field": "google"}},
{"term": {"name_lang.az.keyword_field": "google"}},
{"term": {"name_lang.it.keyword_field": "google"}},
{"term": {"name_lang.am.keyword_field": "google"}},
{"term": {"name_lang.iw.keyword_field": "google"}},
{"term": {"name_lang.et.keyword_field": "google"}},
{"term": {"name_lang.eu.keyword_field": "google"}},
{"term": {"name_lang.ar.keyword_field": "google"}},
{"term": {"name_lang.pt-br.keyword_field": "google"}},
{"term": {"name_lang.lt.keyword_field": "google"}},
{"term": {"name_lang.pl.keyword_field": "google"}},
{"term": {"name_lang.da.keyword_field": "google"}},
{"term": {"name_lang.tr.keyword_field": "google"}},
{"term": {"name_lang.hi.keyword_field": "google"}},
{"term": {"name_lang.de.keyword_field": "google"}}
]
}},
{
"dis_max": {
"boost": 20,
"queries": [
{"prefix": {"name_lang.ru.keyword_field": "google"}},
{"prefix": {"name_lang.zh-cn.keyword_field": "google"}},
{"prefix": {"name_lang.es-es.keyword_field": "google"}},
{"prefix": {"name_lang.th.keyword_field": "google"}},
{"prefix": {"name_lang.mai.keyword_field": "google"}},
{"prefix": {"name_lang.tl.keyword_field": "google"}},
{"prefix": {"name_lang.ta.keyword_field": "google"}},
{"prefix": {"name_lang.gu.keyword_field": "google"}},
{"prefix": {"name_lang.cs.keyword_field": "google"}},
{"prefix": {"name_lang.te.keyword_field": "google"}},
{"prefix": {"name_lang.pa.keyword_field": "google"}},
{"prefix": {"name_lang.km.keyword_field": "google"}},
{"prefix": {"name_lang.kn.keyword_field": "google"}},
{"prefix": {"name_lang.sk.keyword_field": "google"}},
{"prefix": {"name_lang.de.keyword_field": "google"}}
]
}},
{
"dis_max": {
"boost": 5,
"queries": [
{"prefix": {"name_lang.ru": "google"}},
{"prefix": {"name_lang.zh-cn": "google"}},
{"prefix": {"name_lang.es-es": "google"}},
{"prefix": {"name_lang.th": "google"}},
{"prefix": {"name_lang.mai": "google"}},
{"prefix": {"name_lang.tl": "google"}},
{"prefix": {"name_lang.ta": "google"}},
{"prefix": {"name_lang.gu": "google"}},
{"prefix": {"name_lang.cs": "google"}},
{"prefix": {"name_lang.te": "google"}},
{"prefix": {"name_lang.pa": "google"}},
{"prefix": {"name_lang.km": "google"}},
{"prefix": {"name_lang.kn": "google"}},
{"prefix": {"name_lang.or": "google"}},
{"prefix": {"name_lang.sv": "google"}},
{"prefix": {"name_lang.ko": "google"}},
{"prefix": {"name_lang.sw": "google"}},
{"prefix": {"name_lang.gl": "google"}},
{"prefix": {"name_lang.en-us": "google"}},
{"prefix": {"name_lang.ca": "google"}},
{"prefix": {"name_lang.sr": "google"}},
{"prefix": {"name_lang.kk": "google"}},
{"prefix": {"name_lang.si": "google"}},
{"prefix": {"name_lang.ka": "google"}},
{"prefix": {"name_lang.sk": "google"}},
{"prefix": {"name_lang.pt-pt": "google"}},
{"prefix": {"name_lang.sl": "google"}},
{"prefix": {"name_lang.bn": "google"}},
{"prefix": {"name_lang.jv": "google"}},
{"prefix": {"name_lang.bo": "google"}},
{"prefix": {"name_lang.bs": "google"}},
{"prefix": {"name_lang.es-us": "google"}},
{"prefix": {"name_lang.de": "google"}}
]
}}
]
}
},
"script_score":{
"script": {
"source": "if(doc['order_num'].size() > 0 && doc['order_num'].value > 0 && doc['popularity_long'].size() > 0 && doc['popularity_long'].value > 0) {return 30.0 * Math.log1p(doc['popularity_long'].value) / Math.log1p(20) + 5.0 * Math.log1p(doc[\"download_num\"].value) / Math.log1p(1000000000) + 30.0 * Math.log1p(10.0 * doc['order_num'].value) / Math.log1p(1000); } if(doc['popularity_long'].size() > 0 && doc['popularity_long'].value > 0) {return 30.0 * Math.log1p(doc['popularity_long'].value) / Math.log1p(20) + 5.0 * Math.log1p(doc[\"download_num\"].value) / Math.log1p(1000000000) / Math.log1p(1000); } if(doc['order_num'].size() > 0 && doc['order_num'].value > 0) {return 5.0 * Math.log1p(doc[\"download_num\"].value) / Math.log1p(1000000000) + 30.0 * Math.log1p(10.0 * doc['order_num'].value) / Math.log1p(1000); } else return 5.0 * Math.log1p(doc[\"download_num\"].value) / Math.log1p(1000000000);"
}
},
"boost_mode": "sum",
"max_boost": 300
}}
]
}
}
}
How can I improve the performance of this query?

Related

How to search in elastic search to return results like this

I'm struggling to return the correct results.
I wanted an autocomplete/search-as-you-type solution, and ES provides that, but it isn't working as expected. For example, if I search for "diabetes", the exact match is displayed as item 50 (of 100), and the first returned result is an entire sentence.
So I was trying to work with boost.
My expectation is that the exact match is returned at the top, then the autocomplete matches, then the matches where the word is in the middle of the sentence. But I don't know how to achieve this.
My attempts, which are not working:
"diabetes"
"diabetes test abcd"
"milde diabetes"
"milde diabetes militus"
My mapping is:
{
"mappings": {
"properties": {
"Descriptions": {
"type": "nested",
"properties": {
"Term": {
"type": "search_as_you_type"
},
"Code": {
"type": "search_as_you_type",
},
"System": {
"type": "keyword",
"index": true
},
"LanguageId": {
"type": "keyword",
"index": true
},
"Purpose": {
"type": "keyword",
"index": true
},
"Active": {
"type": "keyword",
"index": true
}
}
},
"Mappings": {
"properties": {
"To": {
"type": "keyword",
"index": true
},
"ToSys": {
"type": "keyword",
"index": true
}
}
}
}
}
}
Objects:
"Code": "10008220/A1",
"Descriptions": [
{
"Term": "milde diabetes",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6ee",
},
{
"Term": "fracture du bras",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6e3",
}
],
"SendVersion": "2021-12-07T17:01:53.786755Z"
object 2
"Code": "10008220/A1",
"Descriptions": [
{
"Term": "milde diabetes militus",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6ee",
},
{
"Term": "fracture du bras",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6e3",
}
],
"SendVersion": "2021-12-07T17:01:53.786755Z"
object 3
"Code": "10008220/A1",
"Descriptions": [
{
"Term": "diabetes",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6ee",
},
{
"Term": "fracture du bras",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6e3",
}
],
"SendVersion": "2021-12-07T17:01:53.786755Z"
object 4
"Code": "10008220/A1",
"Descriptions": [
{
"Term": "diabetes test abcd",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6ee",
},
{
"Term": "fracture du bras",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6e3",
}
],
"SendVersion": "2021-12-07T17:01:53.786755Z"
object 5
"Code": "10008220/A1",
"Descriptions": [
{
"Term": "this cannot be returned",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6ee",
},
{
"Term": "fracture du bras",
"LanguageId": "3d50c237-0add-43e7-92a2-5edf1ac7c6e3",
}
],
"SendVersion": "2021-12-07T17:01:53.786755Z"
Having a few analyzers for the same field should help:
"Term": {
"type": "search_as_you_type",
"fields": {
"keyword": {"type": "keyword"},
"standard": {"type": "text", "analyzer": "standard"},
"french": {"type": "text", "analyzer": "french"},
}
}
Then you query them all:
"query": {
"bool": {
"should": [
{"match": {"Term": {"query": "diabetes", "boost": 1}}},
{"match_phrase": {"Term": {"query": "diabetes", "boost": 5}}},
{"match_phrase_prefix": {"Term": {"query": "diabetes", "boost": 5}}},
{"match": {"Term.keyword": {"query": "diabetes", "boost": 100}}},
{"match": {"Term.standard": {"query": "diabetes", "boost": 20}}},
{"match": {"Term.french": {"query": "diabetes", "boost": 30}}}
]
}
}
You'll have to tune boosting values to get good results for different queries (single word, phrase, multi word, prefix, etc)

Why elastic search not returning result when query contains "IN" prefix?

The Elasticsearch query below is not returning any results for my application:
"query" : {
"bool" : {
"must" : [
{
"simple_query_string" : {
"query" : "IN-123456",
"fields" : [
"field1.auto^1.0",
"field2.auto^1.0"
],
"flags" : -1,
"default_operator" : "AND",
"analyze_wildcard" : false,
"auto_generate_synonyms_phrase_query" : true,
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"fuzzy_transpositions" : true,
"boost" : 1.0
}
}],
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
}
Note that I have a document present in the Elasticsearch data source with the matching text "IN-123456" for field2.
I am able to find the same document using "123456" as the query text.
Below is the index used:
{
"document_****": {
"aliases": {
"document": {}
},
"mappings": {
"_doc": {
"dynamic": "strict",
"date_detection": false,
"properties": {
"#timestamp": {
"type": "date"
},
"field2": {
"type": "keyword",
"fields": {
"auto": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
},
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "document_***",
"creation_date": "1****",
"analysis": {
"filter": {
"autocomplete_filter_30": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "30"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_30": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter_30"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_nonstop": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1",
"uuid": "***",
"version": {
"created": "6020499"
}
}
}
}
}
Note: A few values are replaced with * for confidentiality reasons.
Check your mapping. The query below works fine.
POST v_upload_branch/_doc
{
"branch_name":"IN-123456",
"branch_head":"Chennai",
}
GET v_upload_branch/_search
{
"query" : {
"bool" : {
"must" : [
{
"simple_query_string" : {
"query" : "IN-123456",
"fields" : [
"branch_head^1.0",
"branch_name^1.0"
],
"flags" : -1,
"default_operator" : "AND",
"analyze_wildcard" : false,
"auto_generate_synonyms_phrase_query" : true,
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"fuzzy_transpositions" : true,
"boost" : 1.0
}
}],
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
}
Below is the index used
{
"document_****": {
"aliases": {
"document": {}
},
"mappings": {
"_doc": {
"dynamic": "strict",
"date_detection": false,
"properties": {
"#timestamp": {
"type": "date"
},
"field2": {
"type": "keyword",
"fields": {
"auto": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
},
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "document_***",
"creation_date": "1****",
"analysis": {
"filter": {
"autocomplete_filter_30": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "30"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_30": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter_30"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_nonstop": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1",
"uuid": "***",
"version": {
"created": "6020499"
}
}
}
}
}
Note: A few values are replaced with * for confidentiality reasons.
After analyzing my index mapping, I found that the stop token filter is removing the prefix IN from the token stream, since "in" is part of the default English stop word list:
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stop-tokenfilter.html
Because of this, Elasticsearch ignores the prefix IN while searching and does not return any results.

Elasticsearch not returns all fields for each hit row

I have a problem with my Elasticsearch index. I'm trying to get some fields for each row, but Elasticsearch does not return all of them when I'm searching. If I 'get' a document by id, it returns all fields.
In my query I'm trying to use the _source field, but it does not work: the query returns only several fields from _source.
Are there any restrictions on it? Restrictions on the number or size of _source fields?
Elastic version 7.1
My mapping:
"video": {
"properties": {
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 128
},
"basic_edge_ngram_analyzer": {
"type": "text",
"analyzer": "basic_edge_ngram_analyzer"
},
"basic_edge_ngram_analyzer_no_digit": {
"type": "text",
"analyzer": "basic_edge_ngram_analyzer_no_digit"
},
"basic_ngram_analyzer": {
"type": "text",
"analyzer": "basic_ngram_analyzer"
},
"basic_ngram_analyzer_no_digit": {
"type": "text",
"analyzer": "basic_ngram_analyzer_no_digit"
},
"numeric_analyzer": {
"type": "text",
"analyzer": "numeric_analyzer"
},
"translit_analyzer": {
"type": "text",
"analyzer": "translit_analyzer"
},
"translit_double_metaphone_analyzer": {
"type": "text",
"analyzer": "translit_double_metaphone_analyzer"
}
}
},
"inverse_title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 128
},
"basic_edge_ngram_analyzer": {
"type": "text",
"analyzer": "basic_edge_ngram_analyzer"
},
"basic_edge_ngram_analyzer_no_digit": {
"type": "text",
"analyzer": "basic_edge_ngram_analyzer_no_digit"
},
"basic_ngram_analyzer": {
"type": "text",
"analyzer": "basic_ngram_analyzer"
},
"basic_ngram_analyzer_no_digit": {
"type": "text",
"analyzer": "basic_ngram_analyzer_no_digit"
},
"numeric_analyzer": {
"type": "text",
"analyzer": "numeric_analyzer"
},
"translit_analyzer": {
"type": "text",
"analyzer": "translit_analyzer"
},
"translit_double_metaphone_analyzer": {
"type": "text",
"analyzer": "translit_double_metaphone_analyzer"
}
}
},
"thumbnail_url": {
"type": "keyword",
"store": "true"
},
"is_classic": {
"type": "boolean",
"store": "true"
},
"is_club": {
"type": "boolean",
"store": "true"
},
"product_id": {
"type": "integer",
"store": "true"
},
"duration": {
"type": "integer",
"store": "true"
},
"feed_name": {
"type": "keyword",
"store": "true"
},
"feed_url": {
"type": "keyword",
"store": "true"
},
"created_ts": {
"type": "date",
"store": "true"
},
"hot_until": {"type": "date", "format": "date_hour_minute_second_fraction"},
"description": {
"type": "keyword"
},
"mi_tv_id": {"type": "integer"},
"total_views": {"type": "long"},
"month_views": {"type": "long"},
"week_views": {"type": "long"},
"day_views": {"type": "long"},
"blocked_countries": {"type": "keyword"},
"linked_persons": {
"type": "nested",
"properties": {
"id": {"type": "integer"},
"name": {"type": "keyword"}
}
},
"linked_tags": {
"type": "nested",
"properties": {
"id": {"type": "integer"},
"name": {"type": "keyword"}
}
},
"linked_hashtags":{
"type": "nested",
"properties": {
"id": {"type": "integer"},
"name": {"type": "keyword"}
}
},
}
}
My query:
GET /video_idx/_search
{
"aggs": {
"mi_tv_id": {
"terms": {
"field": "mi_tv_id",
"size": 10
}
},
"linked_hashtags_id": {
"aggs": {
"linked_hashtags_id": {
"terms": {
"field": "linked_hashtags.id",
"size": 10
}
}
},
"nested": {
"path": "linked_hashtags"
}
},
"author_id": {
"terms": {
"field": "author_id",
"size": 10
}
},
"linked_tags_id": {
"aggs": {
"linked_tags_id": {
"terms": {
"field": "linked_tags.id",
"size": 10
}
}
},
"nested": {
"path": "linked_tags"
}
},
"linked_persons_id": {
"aggs": {
"linked_persons_id": {
"terms": {
"field": "linked_persons.id",
"size": 10
}
}
},
"nested": {
"path": "linked_persons"
}
}
},
"highlight": {
"fields": {
"inverse_title": {
"pre_tags": ["<b>"],
"type": "plain",
"post_tags": ["</b>"]
},
"title": {
"pre_tags": ["<b>"],
"type": "plain",
"post_tags": ["</b>"]
}
}
},
"from": 0,
"size": 20,
"_source": {
"includes":[ "mi_tv_id", "author_id", "hot_until", "id", "linked_persons", "linked_hashtags", "linked_tags", "total_views", "thumbnail_url", "feed_name", "feed_url", "duration", "is_club", "is_classic", "product_id", "created_ts", "title", "inverse_title", "description"]
},
"query": {
"function_score": {
"script_score": {
"script": "\n double total = _score;\n \n if (doc['total_views'].size() > 0) {total = total * Math.log(10 + 0.000087 * doc['total_views'].value)}\n if (doc['month_views'].size() > 0) {total = total * Math.log(10 + 0.00025 * doc['month_views'].value)}\n if (doc['week_views'].size() > 0) {total = total * Math.log(10 + 0.00077 * doc['week_views'].value)}\n if (doc['day_views'].size() > 0) {total = total * Math.log(10 + 0.0025 * doc['day_views'].value)}\n if (doc['hot_until'].size() > 0) {total = 1.5 * total}\n \n if (doc['mi_tv_id'].size() > 0) {total = total * 1.5}\n \n return total \n "
},
"query": {
"bool": {
"minimum_should_match": "20%",
"should": [{
"multi_match": {
"fields": ["title.basic_ngram_analyzer", "inverse_title.basic_ngram_analyzer"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "65%",
"type": "cross_fields",
"boost": 5.5,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.keyword", "inverse_title.keyword"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "100%",
"type": "cross_fields",
"boost": 12.5,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.translit_analyzer", "inverse_title.translit_analyzer"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "65%",
"type": "cross_fields",
"boost": 3,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.numeric_analyzer", "inverse_title.numeric_analyzer"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "100%",
"type": "cross_fields",
"boost": 6,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.basic_ngram_analyzer_no_digit", "inverse_title.basic_ngram_analyzer_no_digit"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "65%",
"type": "cross_fields",
"boost": 5.5,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.basic_edge_ngram_analyzer", "inverse_title.basic_edge_ngram_analyzer"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "65%",
"type": "cross_fields",
"boost": 5.5,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["title.translit_double_metaphone_analyzer", "inverse_title.translit_double_metaphone_analyzer"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "65%",
"type": "cross_fields",
"boost": 1,
"query": "\u0434\u043e\u043c 2"
}
}, {
"multi_match": {
"fields": ["description"],
"operator": "and",
"tie_breaker": 1.0,
"minimum_should_match": "100%",
"type": "cross_fields",
"boost": 1.0,
"query": "\u0434\u043e\u043c 2"
}
}],
"must_not": [{
"terms": {
"blocked_countries": ["RU"]
}
}]
}
}
}
}
}
You need to add all the stored fields to the stored_fields parameter in your query:
"_source": {
"includes":[ "mi_tv_id", "author_id", "hot_until", "id", "linked_persons", "linked_hashtags", "linked_tags", "total_views", "thumbnail_url", "feed_name", "feed_url", "duration", "is_club", "is_classic", "product_id", "created_ts", "title", "inverse_title", "description"]
},
"stored_fields": ["feed_name", "feed_url", "duration", "is_club", ...],

Elastic query BOOL with AND & OR

I am new to Elastic. I am trying to do a simple query that in SQL would be:
Select * from [movies] where is_adult = false AND (movie_title like '%xxx%' OR genre = 'xxxx')
In Elastic, the closest I could get was:
GET /idxsearch/movies/_search
{
"size": 10,
"query": {
"bool": {
"filter": {
"term": {
"is_adult": false
}
},
"must": [
{
"multi_match": {
"query": "xxx",
"operator": "and",
"fields": [ "movie_title.default^5", "movie_title.shingles" ]
}
}
],
"should": [
{"term": {
"genre.name": {
"value": "xxxx"
}
}}
]
}
}
}
But testing this query with some data, I see that it is not working.
cheers
Just to check, here is part of the custom analyser:
"shingle_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "shingle_filter"]
}
"shingle_filter": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 2
}
"match_title": {
"type": "text",
"fields": {
"default": {"type": "text", "analyzer": "default_analyzer" },
"snowball": {"type": "text", "analyzer": "snowball_analyzer"},
"shingles": {"type": "text", "analyzer": "shingle_analyzer" },
"ngrams": {"type": "text", "analyzer": "edgengram_analyzer",
"search_analyzer": "default_analyzer"}
}

Some Multi word synonyms are not working in elasticsearch for nested fields

I am trying to use a synonym analyzer at query time and am not getting the expected results. Can someone shed some light on this?
Here is my mapping for the index:
{
"jobs_user_profile_v2": {
"mappings": {
"profile": {
"_all": {
"enabled": false
},
"_ttl": {
"enabled": true
},
"properties": {
"rsa": {
"type": "nested",
"properties": {
"answer": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "synonym",
"position_offset_gap": 100
},
"answerId": {
"type": "long"
},
"answerOriginal": {
"type": "string",
"index": "not_analyzed"
},
"createdAt": {
"type": "long"
},
"label": {
"type": "string",
"index": "not_analyzed"
},
"labelOriginal": {
"type": "string",
"index": "not_analyzed"
},
"question": {
"type": "string",
"index": "not_analyzed"
},
"questionId": {
"type": "long"
},
"questionOriginal": {
"type": "string"
},
"source": {
"type": "integer"
},
"updatedAt": {
"type": "long"
}
}
}
}
}
}
}
}
The field to focus on is rsa.answer, which is the field I am querying.
My synonym mapping:
Beautician,Stylist,Make up artist,Massage therapist,Therapist,Spa,Hair Dresser,Salon,Beauty Parlour,Parlor => Beautician
Carpenter,Wood Worker,Furniture Carpenter => Carpenter
Cashier,Store Manager,Store Incharge,Purchase Executive,Billing Executive,Billing Boy => Cashier
Content Writer,Writer,Translator,Writing,Copywriter,Content Creation,Script Writer,Freelance Writer,Freelance Content Writer => Content Writer
My Search Query:
http://{{domain}}/jobs_user_profile_v2/_search
{
"query": {
"nested":{
"path": "rsa",
"query":{
"query_string": {
"query": "hair dresser",
"fields": ["answer"],
"analyzer" :"synonym"
}
},
"inner_hits": {
"explain": true
}
}
},
"explain" : true,
"sort" : [ {
"_score" : { }
} ]
}
It shows the proper `Beautician` and `Cashier` profiles for the search queries `Hair Dresser` and `billing executive`, but it does not show anything for the `wood worker => carpenter` case.
My analyzer results:
http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=hair dresser
{
"tokens": [
{
"token": "beautician",
"start_offset": 0,
"end_offset": 12,
"type": "SYNONYM",
"position": 1
}
]
}
And for the `wood worker` case:
http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=wood worker
{
"tokens": [
{
"token": "carpenter",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 1
}
]
}
It is also not working in a few other cases.
My analyzer setting for index:
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "10"
}
},
"analyzer": {
"text_en_splitting_search": {
"type": "custom",
"filter": [
"stop",
"lowercase",
"porter_stem",
"word_delimiter"
],
"tokenizer": "whitespace"
},
"synonym": {
"filter": [
"stop",
"lowercase",
"synonym"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"text_en_splitting": {
"filter": [
"lowercase",
"porter_stem",
"word_delimiter"
],
"type": "custom",
"tokenizer": "whitespace"
},
"text_general": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "standard"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "edge_ngram_tokenizer"
},
"autocomplete_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "whitespace"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "2",
"type": "edgeNGram",
"max_gram": "10"
}
}
}
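For the above case, a multi_match query is a better fit than query_string.
Unlike query_string, multi_match does not split the query text into separate terms before analyzing it. With query_string, the text is split on whitespace first and each term is analyzed on its own, so multi-word synonyms may not work as expected.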
Example:
{
"query": {
"nested": {
"path": "rsa",
"query": {
"multi_match": {
"query": "wood worker",
"fields": [
"rsa.answer"
],
"type" : "cross_fields",
"analyzer": "synonym"
}
}
}
}
}
If for some reason you prefer query_string, you need to wrap the entire query in double quotes to ensure it is not split into separate terms before analysis.
Example:
post test/_search
{
"query": {
"nested": {
"path": "rsa",
"query": {
"query_string": {
"query": "\"wood worker\"",
"fields": [
"rsa.answer"
],
"analyzer": "synonym"
}
}
}
}
}

Resources