Getting elasticsearch synonyms to work? - elasticsearch

I'm trying a simple test of Elasticsearch synonyms without success. This is what I have so far:
POST /mysearch
{
  "settings": {
    "number_of_shards": 5,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "my_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "my_stopwords": {
          "type": "stop",
          "stopwords": []
        },
        "mysynonym": {
          "type": "synonym",
          "synonyms": [
            "foo => bar"
          ]
        }
      },
      "char_filter": {
        "my_htmlstrip": {
          "type": "html_strip"
        }
      },
      "analyzer": {
        "index_text_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "lowercase", "my_stopwords", "my_ascii_folding" ]
        },
        "index_html_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "char_filter": "my_htmlstrip",
          "filter": [ "lowercase", "my_stopwords", "my_ascii_folding" ]
        },
        "search_text_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "mysynonym", "lowercase", "my_stopwords" ]
        }
      }
    }
  },
  "mappings": {
    "news": {
      "_source": { "enabled": true },
      "_all": { "enabled": false },
      "properties": {
        "name": { "type": "string", "index": "analyzed", "store": "yes", "analyzer": "index_text_analyzer", "search_analyzer": "search_text_analyzer" }
      }
    }
  }
}
Add some documents:
POST /mysearch/news
{
  "name": "foo kar"
}
POST /mysearch/news
{
  "name": "bar kar"
}
Do a search:
POST /mysearch/_search?q=name:foo
{
}
It gives me a result that matches foo, not bar. Why?

I think you are doing it wrong, for the following reasons:
Why do you use foo => bar? This means that you replace foo with bar, whereas if they are synonyms, they should both be indexed. So I would use foo,bar instead.
Why, at indexing time, are you using a different analyzer than at search time? At indexing time you want your text to be indexed together with its synonyms.
Let me give you an example: assume you index foo kar. Since bar is a synonym of foo, you'd want to index the synonym as well, so that the index will contain foo, bar, kar. This way, if you search for foo or bar, that document WILL be found in the index even though the original text didn't contain bar.
That being said, I would suggest the following:
POST /mysearch
{
  "settings": {
    "number_of_shards": 5,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "my_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "my_stopwords": {
          "type": "stop",
          "stopwords": []
        },
        "mysynonym": {
          "type": "synonym",
          "synonyms": [
            "foo,bar"
          ]
        }
      },
      "char_filter": {
        "my_htmlstrip": {
          "type": "html_strip"
        }
      },
      "analyzer": {
        "index_text_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_stopwords",
            "my_ascii_folding"
          ]
        },
        "index_html_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "char_filter": "my_htmlstrip",
          "filter": [
            "lowercase",
            "my_stopwords",
            "my_ascii_folding"
          ]
        },
        "search_text_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "mysynonym",
            "lowercase",
            "my_stopwords"
          ]
        }
      }
    }
  },
  "mappings": {
    "news": {
      "_source": {
        "enabled": true
      },
      "_all": {
        "enabled": false
      },
      "properties": {
        "name": {
          "type": "string",
          "index": "analyzed",
          "store": "yes",
          "analyzer": "search_text_analyzer"
        }
      }
    }
  }
}
Or, if you don't want to index the synonyms, but rather index only the original text and search for the synonyms at search time only, make the following changes:
use "synonyms": ["foo,bar"] because, as I mentioned above, you would otherwise replace foo with bar
explicitly specify the two analyzers:
"index_analyzer": "index_text_analyzer",
"search_analyzer": "search_text_analyzer"
The two changes above will result in your text being indexed as is (with no synonyms), but at search time, when you search for foo, Elasticsearch will search for its synonym as well: foo or bar.
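A quick way to sanity-check either setup is the _analyze API (a sketch only, assuming the index above and an Elasticsearch version that still accepts the query-string form, as used later on this page):
GET /mysearch/_analyze?analyzer=index_text_analyzer&text=foo kar
GET /mysearch/_analyze?analyzer=search_text_analyzer&text=foo kar
With the foo,bar rule in place, the second call should show foo expanded into both foo and bar at the same position, which is exactly what lets a query for bar match a document that only contained foo.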

Related

Elastic search multiple analyzers on index

I have an index with a Name field.
I want to use a soundex analyzer and a synonym analyzer on that field.
I want to achieve both in a single index. Is it even possible?
Please help me, experts out there.
Index 1
{
  "settings": {
    "index": {
      "number_of_shards": "1",
      "provided_name": "phonetic_sample",
      "creation_date": "1603097131476",
      "analysis": {
        "filter": {
          "my_soundex": {
            "replace": "false",
            "type": "phonetic",
            "encoder": "soundex"
          }
        },
        "analyzer": {
          "my_analyzer": {
            "filter": [
              "lowercase",
              "my_soundex"
            ],
            "tokenizer": "standard"
          }
        }
      }
    }
  }
}
I query for Catherine and match Catherine, Katherine and Kathryn.
Index 2
{
  "settings": {
    "index": {
      "number_of_shards": "1",
      "provided_name": "phonetic_synonym",
      "creation_date": "1603121439096",
      "analysis": {
        "filter": {
          "synonym": {
            "format": "wordnet",
            "type": "synonym",
            "synonyms": [
              "s(100000001,1,'Bill',v,1,0).",
              "s(100000001,2,'William',v,1,0).",
              "s(100000001,3,'Wilhelm',v,1,0)."
            ]
          }
        },
        "analyzer": {
          "synonym": {
            "filter": [
              "synonym"
            ],
            "tokenizer": "whitespace"
          }
        }
      }
    }
  }
}
I query for Bill and match Bill, William and Wilhelm.
You can use multi-fields with multiple analyzers: you can declare sub-fields for the name field, each with a different analyzer. Below is the modified index mapping.
Index Mapping:
{
  "settings": {
    "index": {
      "analysis": {
        "filter": {
          "my_soundex": {
            "type": "phonetic",
            "encoder": "metaphone",
            "replace": false
          },
          "synonym": {
            "format": "wordnet",
            "type": "synonym",
            "synonyms": [
              "s(100000001,1,'Bill',v,1,0).",
              "s(100000001,2,'William',v,1,0).",
              "s(100000001,3,'Wilhelm',v,1,0)."
            ]
          }
        },
        "analyzer": {
          "synonym": {
            "filter": [
              "synonym"
            ],
            "tokenizer": "whitespace"
          },
          "my_analyzer": {
            "filter": [
              "lowercase",
              "my_soundex"
            ],
            "tokenizer": "standard"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "synonym",
        "search_analyzer": "synonym",
        "fields": {
          "content": {
            "type": "text",
            "analyzer": "my_analyzer",
            "search_analyzer": "my_analyzer"
          }
        }
      }
    }
  }
}
Then you can refer to name and name.content in your queries. Your search query will be like this:
{
  "query": {
    "multi_match": {
      "query": "Bill",
      "fields": [
        "name",
        "name.content"
      ],
      "type": "most_fields"
    }
  }
}
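To see what each sub-field actually indexes, you can run both analyzers through the _analyze API (a sketch only; the index name phonetic_synonym is taken from the question, and the phonetic plugin must be installed):
GET /phonetic_synonym/_analyze
{
  "analyzer": "synonym",
  "text": "Bill"
}
GET /phonetic_synonym/_analyze
{
  "analyzer": "my_analyzer",
  "text": "Catherine"
}
The first call should show the WordNet synonyms emitted at the same position; the second should show the phonetic encoding that makes Catherine, Katherine and Kathryn collide.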

Adding new documents to elasticsearch with mapping via elasticsearch.index, body structure

I'm building a blog-like app with Flask (based on Miguel Grinberg's Megatutorial) and I'm trying to set up ES indexing that would support an autocomplete feature. I'm struggling with setting up the indexing correctly.
I started with a (working) simple indexing mechanism:
from flask import current_app

def add_to_index(index, model):
    if not current_app.elasticsearch:
        return
    payload = {}
    for field in model.__searchable__:
        payload[field] = getattr(model, field)
    current_app.elasticsearch.index(index=index, id=model.id, body=payload)
and after some fun with Google I found out that my body could look something like this (probably with fewer analyzers, but I'm copying it exactly as I found it somewhere, where the author claims it works):
{
  "settings": {
    "index": {
      "analysis": {
        "filter": {},
        "analyzer": {
          "keyword_analyzer": {
            "filter": [
              "lowercase",
              "asciifolding",
              "trim"
            ],
            "char_filter": [],
            "type": "custom",
            "tokenizer": "keyword"
          },
          "edge_ngram_analyzer": {
            "filter": [
              "lowercase"
            ],
            "tokenizer": "edge_ngram_tokenizer"
          },
          "edge_ngram_search_analyzer": {
            "tokenizer": "lowercase"
          }
        },
        "tokenizer": {
          "edge_ngram_tokenizer": {
            "type": "edge_ngram",
            "min_gram": 2,
            "max_gram": 5,
            "token_chars": [
              "letter"
            ]
          }
        }
      }
    }
  },
  "mappings": {
    field: {
      "properties": {
        "name": {
          "type": "text",
          "fields": {
            "keywordstring": {
              "type": "text",
              "analyzer": "keyword_analyzer"
            },
            "edgengram": {
              "type": "text",
              "analyzer": "edge_ngram_analyzer",
              "search_analyzer": "edge_ngram_search_analyzer"
            },
            "completion": {
              "type": "completion"
            }
          },
          "analyzer": "standard"
        }
      }
    }
  }
}
I figured out that I can modify the original mechanism to something like:
fields = {}
for field in model.__searchable__:
    temp = getattr(model, field)
    fields[field] = {
        "properties": {
            "type": "text",
            "fields": {
                "keywordstring": {
                    "type": "text",
                    "analyzer": "keyword_analyzer"
                },
                "edgengram": {
                    "type": "text",
                    "analyzer": "edge_ngram_analyzer",
                    "search_analyzer": "edge_ngram_search_analyzer"
                },
                "completion": {
                    "type": "completion"
                }
            },
            "analyzer": "standard"
        }
    }
payload = {
    "settings": {
        "index": {
            "analysis": {
                "filter": {},
                "analyzer": {
                    "keyword_analyzer": {
                        "filter": [
                            "lowercase",
                            "asciifolding",
                            "trim"
                        ],
                        "char_filter": [],
                        "type": "custom",
                        "tokenizer": "keyword"
                    },
                    "edge_ngram_analyzer": {
                        "filter": [
                            "lowercase"
                        ],
                        "tokenizer": "edge_ngram_tokenizer"
                    },
                    "edge_ngram_search_analyzer": {
                        "tokenizer": "lowercase"
                    }
                },
                "tokenizer": {
                    "edge_ngram_tokenizer": {
                        "type": "edge_ngram",
                        "min_gram": 2,
                        "max_gram": 5,
                        "token_chars": [
                            "letter"
                        ]
                    }
                }
            }
        }
    },
    "mappings": fields
}
but that's where I'm lost. Where do I put the actual content (temp = getattr(model, field)) in this document so that the whole thing works? I couldn't find any example or relevant part of the documentation that covers updating an index with slightly more complex mappings, so is this even correct/doable? Every guide I see covers bulk indexing, and somehow I fail to make the connection.
I think you are a little bit confused, let me try to explain. What you want is to add one document to Elasticsearch with:
current_app.elasticsearch.index(index=index, id=model.id, body=payload)
which uses the index() method defined in the elasticsearch-py lib.
Check the example here:
https://elasticsearch-py.readthedocs.io/en/master/index.html#example-usage
body must be your document, a simple dict, as shown in the example from the doc.
What you are passing instead is the settings of the index, which is a different thing. To take a database analogy: you are putting the schema of a table inside a row.
If you want to apply the given settings, you need to use put_settings, as defined here:
https://elasticsearch-py.readthedocs.io/en/master/api.html?highlight=settings#elasticsearch.client.ClusterClient.put_settings
I hope it helps you.
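To make the separation concrete, here is a minimal sketch (assuming elasticsearch-py; index_body is a hypothetical name for the settings/mappings dict built in the question): create the index with its analyzers and mappings once, then keep indexing plain documents whose bodies contain only field values.
from flask import current_app

def init_index(index, index_body):
    # One-time setup: create the index with its settings and mappings.
    # index_body is the dict holding "settings" and "mappings";
    # ignore=400 skips the "index already exists" error on re-runs.
    current_app.elasticsearch.indices.create(index=index, body=index_body, ignore=400)

def add_to_index(index, model):
    if not current_app.elasticsearch:
        return
    # The document body stays a plain dict of field values; the analyzers
    # configured on the index are applied to it automatically at index time.
    payload = {field: getattr(model, field) for field in model.__searchable__}
    current_app.elasticsearch.index(index=index, id=model.id, body=payload)
With this split, the autocomplete sub-fields defined in the mapping apply to every document indexed through add_to_index, and the indexing helper never needs to mention analyzers at all.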

Synonyms for Sankt and St

I'm trying to get synonyms working for my existing setup. Currently I have these settings:
PUT city
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "german_normalization",
            "my_ascii_folding"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "lowercase",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "german_normalization",
            "my_ascii_folding"
          ]
        }
      },
      "filter": {
        "my_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "my_synonym_filter": {
          "type": "synonym",
          "ignore_case": "true",
          "synonyms": [
            "sankt, st => sankt"
          ]
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 15,
          "token_chars": [
            "letter",
            "digit",
            "symbol"
          ]
        }
      }
    }
  },
  "mappings": {
    "city": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
In this city index I have documents like St. Wolfgang or Sankt Wolfgang and so on. For me, St. and Sankt are synonyms, so if I search for Sankt, both documents should appear.
I created a new filter and added it to my autocomplete analyzer:
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st."
]
}
So far so good. But these are the issues I'm facing:
It's clear that the dot after st is not analyzed and is not searchable at the moment. But for the synonym, the dot is important.
The second issue: if I search for sankt, the synonym is st, which gives me all documents starting with st, like Stuttgart. This happens, again, because the dot is not used.
Do you have any idea how I can achieve this? If you need any more information, please let me know.
Update:
After discussions I made these changes to my settings:
changed the edge_ngram tokenizer to a standard tokenizer.
added an edgeNGram filter and added it to my analyzer.
deleted the filters german_normalization and my_ascii_folding from my analyzer to simplify the tests.
PUT city
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "edge_filter"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "autocomplete",
          "filter": [
            "my_synonym_filter",
            "lowercase"
          ]
        }
      },
      "filter": {
        "edge_filter": {
          "type": "edgeNGram",
          "min_gram": 1,
          "max_gram": 15
        },
        "my_synonym_filter": {
          "type": "synonym",
          "ignore_case": "true",
          "synonyms": [
            "sankt, st => sankt"
          ]
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "standard"
        }
      }
    }
  },
  "mappings": {
    "city": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
I added these 3 documents to the index:
"name":"Sankt Wolfgang",
"name":"Stuttgart",
"name":"St. Wolfgang"
Query string -> Result
st -> "St. Wolfgang", "Stuttgart"
st. -> "St. Wolfgang", "Sankt Wolfgang"
sankt -> "St. Wolfgang", "Sankt Wolfgang"
This works pretty well for me. The main points here are to make sure to:
put the synonym filter after the lowercase one
put the edge-n-gram filter at the end
use the edge-n-gram filter only at indexing time
So we create the index:
PUT city
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "edge_filter"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_synonym_filter"
          ]
        }
      },
      "filter": {
        "edge_filter": {
          "type": "edgeNGram",
          "min_gram": 1,
          "max_gram": 15
        },
        "my_synonym_filter": {
          "type": "synonym",
          "ignore_case": "true",
          "synonyms": [
            "sankt, st. => sankt"
          ]
        }
      }
    }
  },
  "mappings": {
    "city": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
Then we index data:
PUT city/city/1
{
  "name": "St. Wolfgang"
}
PUT city/city/2
{
  "name": "Stuttgart"
}
PUT city/city/3
{
  "name": "Sankt Wolfgang"
}
Finally, searching for either st or sankt will return only documents 1 and 3, but not 2:
POST city/_search?q=name:st
POST city/_search?q=name:sankt
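If the results ever look off, it helps to inspect what each analyzer emits with the _analyze API (a debugging sketch; the exact token output depends on the Elasticsearch version's synonym handling, so it is omitted here):
GET city/_analyze
{
  "analyzer": "autocomplete",
  "text": "St. Wolfgang"
}
GET city/_analyze
{
  "analyzer": "autocomplete_search",
  "text": "sankt"
}
The first call shows the edge-n-grammed tokens that go into the index, the second shows the search-side tokens after synonym expansion, which makes it easy to see why a given query matches or misses each document.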

Query Elastic Search where word indexed has space

I recently tried to use Elasticsearch. However, I am struggling to query for the following scenario:
I have my index set up with this:
"analysis": {
"index_analyzer": {
"my_index_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["standard", "lowercase", "nGram"],
"char-filter": ["my_pattern"]
}
},
"search_analyzer": {
"my_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["standard", "lowercase", "nGram"],
"char-filter": ["my_pattern"]
}
},
"filter": {
"nGram": {
"type": "nGram",
"min_gram": 3,
"max_gram": 40
}
},
"char_filter" : {
"my_pattern":{
"type":"pattern_replace",
"pattern":"\u0020",
"replacement":""
}
}
And the documents that are indexed are:
{
  name: 'My self'
},
{
  name: 'Hell o'
}
If I search for Myself, I am expecting it to return the first JSON object; however, this is not happening.
I am searching using this (where term is just the string being searched):
var query = {
  match: {
    location: term
  }
};

client.search({
  index: 'requests',
  analyzer: 'my_search_analyzer',
  body: {
    query: query
  }
})
I would really appreciate some guidance on this!
Kind Regards
JB
You are almost there; your index definition only has some small issues and typos, which we will fix:
You don't need index_analyzer and search_analyzer: simply define my_index_analyzer and my_search_analyzer directly under the analyzer element.
char-filter should read char_filter (with an underscore).
Your space pattern needs an additional backslash.
These are the corrected settings/mappings that I used:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_index_analyzer": {      <--- 1. directly under analyzer
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "standard",
            "lowercase",
            "nGram"
          ],
          "char_filter": [          <--- 2. underscore
            "my_pattern"
          ]
        },
        "my_search_analyzer": {     <--- 1. directly under analyzer
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "standard",
            "lowercase",
            "nGram"
          ],
          "char_filter": [          <--- 2. underscore
            "my_pattern"
          ]
        }
      },
      "filter": {
        "nGram": {
          "type": "nGram",
          "min_gram": 3,
          "max_gram": 40
        }
      },
      "char_filter": {
        "my_pattern": {
          "type": "pattern_replace",
          "pattern": "\\u0020",     <--- 3. additional backslash
          "replacement": ""
        }
      }
    }
  },
  "mappings": {
    "request": {
      "properties": {
        "location": {
          "type": "string",
          "index_analyzer": "my_index_analyzer"
        }
      }
    }
  }
}
Then you can index your two sample documents:
curl -XPUT localhost:9200/requests/request/1 -d '{"location":"My self"}'
curl -XPUT localhost:9200/requests/request/2 -d '{"location":"Hell o"}'
And you'll get what you expect:
curl -XPOST localhost:9200/requests/request/_search -d '{
  "query": {
    "match": {
      "location": "Myself"
    }
  }
}'
will return the document with My self
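You can also double-check the index-side analysis directly (a quick sketch, using the same query-string _analyze form this Elasticsearch generation accepts). Since my_pattern strips the space before tokenization, My self becomes the single token myself, which is then n-grammed:
curl -XGET 'localhost:9200/requests/_analyze?analyzer=my_index_analyzer&text=My self'
This should return n-grams such as mys, myse, mysel and myself; the query Myself is analyzed the same way, so the match query finds the document.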

How to match query terms containing hyphens or trailing space in elasticsearch

The mapping char_filter section of the Elasticsearch documentation is kind of vague, and I'm having a lot of difficulty understanding if and how to use it: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-mapping-charfilter.html
Basically, the data we are storing in the index are ids of type String that look like this: "008392342000". I want to be able to search such ids when the query terms actually contain a hyphen or a trailing space, like this: "008392342-000 ".
How would you advise I set up the analyzer?
Currently this is the definition of the field:
"mappings": {
"client": {
"properties": {
"ucn": {
"type": "multi_field",
"fields": {
"ucn_autoc": {
"type": "string",
"index": "analyzed",
"index_analyzer": "autocomplete_index",
"search_analyzer": "autocomplete_search"
},
"ucn": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
Here are the settings for the index, containing the analyzers etc.:
"settings": {
"analysis": {
"filter": {
"autocomplete_ngram": {
"max_gram": 15,
"min_gram": 1,
"type": "edge_ngram"
},
"ngram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 8
}
},
"analyzer": {
"lowercase_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "keyword"
},
"autocomplete_index": {
"filter": [
"lowercase",
"autocomplete_ngram"
],
"tokenizer": "keyword"
},
"ngram_index": {
"filter": [
"ngram_filter",
"lowercase"
],
"tokenizer": "keyword"
},
"autocomplete_search": {
"filter": [
"lowercase"
],
"tokenizer": "keyword"
},
"ngram_search": {
"filter": [
"lowercase"
],
"tokenizer": "keyword"
}
},
"index": {
"number_of_shards": 6,
"number_of_replicas": 1
}
}
}
You haven't provided your actual analyzers, what data goes in, and what your expectations are, but based on the info you provided I would start with this:
{
  "settings": {
    "analysis": {
      "char_filter": {
        "my_mapping": {
          "type": "mapping",
          "mappings": [
            "-=>"
          ]
        }
      },
      "analyzer": {
        "autocomplete_search": {
          "tokenizer": "keyword",
          "char_filter": [
            "my_mapping"
          ],
          "filter": [
            "trim"
          ]
        },
        "autocomplete_index": {
          "tokenizer": "keyword",
          "filter": [
            "trim"
          ]
        }
      }
    }
  },
  "mappings": {
    "test": {
      "properties": {
        "ucn": {
          "type": "multi_field",
          "fields": {
            "ucn_autoc": {
              "type": "string",
              "index": "analyzed",
              "index_analyzer": "autocomplete_index",
              "search_analyzer": "autocomplete_search"
            },
            "ucn": {
              "type": "string",
              "index": "not_analyzed"
            }
          }
        }
      }
    }
  }
}
The char_filter would replace - with nothing: -=>. I would also use the trim filter to get rid of any trailing or leading white spaces. I have no idea what your autocomplete_index analyzer looks like, so I just used a keyword one.
Testing the analyzer with GET /my_index/_analyze?analyzer=autocomplete_search&text= 0123-34742-000 results in:
"tokens": [
{
"token": "012334742000",
"start_offset": 0,
"end_offset": 17,
"type": "word",
"position": 1
}
]
which means it does eliminate the - and the white spaces.
And the typical query would be:
{
  "query": {
    "match": {
      "ucn.ucn_autoc": " 0123-34742-000 "
    }
  }
}
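And since the ucn sub-field is kept not_analyzed, exact lookups can bypass analysis entirely with a term query (a small usage sketch; the value must match the stored id exactly, with no hyphen and no spaces):
{
  "query": {
    "term": {
      "ucn.ucn": "008392342000"
    }
  }
}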
