Query Elasticsearch where the indexed word has a space

I recently tried to use Elasticsearch. However, I am struggling to query the following scenario:
I have my index set up with this:
"analysis": {
"index_analyzer": {
"my_index_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["standard", "lowercase", "nGram"],
"char-filter": ["my_pattern"]
}
},
"search_analyzer": {
"my_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["standard", "lowercase", "nGram"],
"char-filter": ["my_pattern"]
}
},
"filter": {
"nGram": {
"type": "nGram",
"min_gram": 3,
"max_gram": 40
}
},
"char_filter" : {
"my_pattern":{
"type":"pattern_replace",
"pattern":"\u0020",
"replacement":""
}
}
And the documents that are indexed are:
{
  name: 'My self'
},
{
  name: 'Hell o'
}
If I search for Myself, I expect it to return the first JSON object; however, this is not happening.
I am searching using this (where term is just the string being searched):
var query = {
  match: {
    location: term
  }
};

client.search({
  index: 'requests',
  analyzer: 'my_search_analyzer',
  body: {
    query: query
  }
})
I would really appreciate some guidance on this!
Kind Regards
JB

You are almost there; your index definition just has a few small issues and typos, which we will fix:
1. You don't need index_analyzer and search_analyzer; simply define my_index_analyzer and my_search_analyzer directly under the analyzer element.
2. char-filter should read char_filter (with an underscore).
3. Your space pattern needs an additional backslash.
Here are the corrected settings/mappings that I used:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_index_analyzer": {          <--- 1. directly under analyzer
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "standard",
            "lowercase",
            "nGram"
          ],
          "char_filter": [              <--- 2. underscore
            "my_pattern"
          ]
        },
        "my_search_analyzer": {         <--- 1. directly under analyzer
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "standard",
            "lowercase",
            "nGram"
          ],
          "char_filter": [              <--- 2. underscore
            "my_pattern"
          ]
        }
      },
      "filter": {
        "nGram": {
          "type": "nGram",
          "min_gram": 3,
          "max_gram": 40
        }
      },
      "char_filter": {
        "my_pattern": {
          "type": "pattern_replace",
          "pattern": "\\u0020",         <--- 3. additional backslash
          "replacement": ""
        }
      }
    }
  },
  "mappings": {
    "request": {
      "properties": {
        "location": {
          "type": "string",
          "index_analyzer": "my_index_analyzer"
        }
      }
    }
  }
}
Then you can index your two sample documents:
curl -XPUT localhost:9200/requests/request/1 -d '{"location":"My self"}'
curl -XPUT localhost:9200/requests/request/2 -d '{"location":"Hell o"}'
And you'll get what you expect:
curl -XPOST localhost:9200/requests/request/_search -d '{
"query": {
"match": {
"location": "Myself"
}
}
}'
will return the document with My self
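If you want to see why it matches, you can inspect the tokens the index analyzer produces; on a 1.x-era cluster like this one, a quick _analyze check should look roughly like this (a sanity check, not required for the fix to work):
curl -XGET 'localhost:9200/requests/_analyze?analyzer=my_index_analyzer' -d 'My self'
The char_filter strips the space first, so the ngrams are generated from myself and a search for Myself can match them.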

Related

Ignore specific character during fuzzy search analyzer in Elasticsearch

I have a fuzzy search analyzer in Elasticsearch with the following documents:
PUT test_index
{
  "settings": {
    "index": {
      "max_ngram_diff": 40
    },
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "autocomplete"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "whitespace",
          "filter": [
            "lowercase"
          ]
        }
      },
      "filter": {
        "autocomplete": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 40
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search"
      }
    }
  }
}
PUT test_index/_doc/1
{ "title": "HRT 2018-BN18 N-SB" }
PUT test_index/_doc/2
{ "title": "GMC 2019-BN18 A-SB" }
How can I ignore the hyphen ('-') during my fuzzy search, so that GMC 2019-BN18 A-SB, gmc 2019, gmc 2019-BN18 A-SB and GMC 2019-BN18 ASB all yield the same document?
I tried to create another analyzer separately, but I am not sure how we can apply multiple analyzers to the same field:
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
]
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"- => "
]
}
}
}
}
You're on the right path; you just need to add that character filter to both analyzers to make sure the hyphens get removed at both indexing and search time:
PUT test_index
{
  "settings": {
    "index": {
      "max_ngram_diff": 40
    },
    "analysis": {
      "char_filter": {
        "my_char_filter": {
          "type": "mapping",
          "mappings": [
            "- => "
          ]
        }
      },
      "analyzer": {
        "autocomplete": {
          "char_filter": [
            "my_char_filter"
          ],
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "autocomplete"
          ]
        },
        "autocomplete_search": {
          "char_filter": [
            "my_char_filter"
          ],
          "tokenizer": "whitespace",
          "filter": [
            "lowercase"
          ]
        }
      },
      "filter": {
        "autocomplete": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 40
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search"
      }
    }
  }
}
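To verify the hyphens are really being ignored, a quick check (a sketch against the index created above) is to run a match query with one of the hyphen-less variants from the question and confirm the second document comes back:
POST test_index/_search
{
  "query": {
    "match": {
      "title": "GMC 2019-BN18 ASB"
    }
  }
}
All four variants listed in the question should now return the same document.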

Adding new documents to elasticsearch with mapping via elasticsearch.index, body structure

I'm building a blog-like app with Flask (based on the Miguel Grinberg Megatutorial) and I'm trying to set up ES indexing that supports an autocomplete feature. I'm struggling with setting up the indexing correctly.
I started with a (working) simple indexing mechanism:
from flask import current_app

def add_to_index(index, model):
    if not current_app.elasticsearch:
        return
    payload = {}
    for field in model.__searchable__:
        payload[field] = getattr(model, field)
    current_app.elasticsearch.index(index=index, id=model.id, body=payload)
and after some fun with Google I found out that my body could look something like this (probably with fewer analyzers, but I'm copying it exactly as I found it, where the author claims it works):
{
  "settings": {
    "index": {
      "analysis": {
        "filter": {},
        "analyzer": {
          "keyword_analyzer": {
            "filter": [
              "lowercase",
              "asciifolding",
              "trim"
            ],
            "char_filter": [],
            "type": "custom",
            "tokenizer": "keyword"
          },
          "edge_ngram_analyzer": {
            "filter": [
              "lowercase"
            ],
            "tokenizer": "edge_ngram_tokenizer"
          },
          "edge_ngram_search_analyzer": {
            "tokenizer": "lowercase"
          }
        },
        "tokenizer": {
          "edge_ngram_tokenizer": {
            "type": "edge_ngram",
            "min_gram": 2,
            "max_gram": 5,
            "token_chars": [
              "letter"
            ]
          }
        }
      }
    }
  },
  "mappings": {
    field: {
      "properties": {
        "name": {
          "type": "text",
          "fields": {
            "keywordstring": {
              "type": "text",
              "analyzer": "keyword_analyzer"
            },
            "edgengram": {
              "type": "text",
              "analyzer": "edge_ngram_analyzer",
              "search_analyzer": "edge_ngram_search_analyzer"
            },
            "completion": {
              "type": "completion"
            }
          },
          "analyzer": "standard"
        }
      }
    }
  }
}
I figured out that I can modify the original mechanism to something like:
for field in model.__searchable__:
    temp = getattr(model, field)
    fields[field] = {"properties": {
        "type": "text",
        "fields": {
            "keywordstring": {
                "type": "text",
                "analyzer": "keyword_analyzer"
            },
            "edgengram": {
                "type": "text",
                "analyzer": "edge_ngram_analyzer",
                "search_analyzer": "edge_ngram_search_analyzer"
            },
            "completion": {
                "type": "completion"
            }
        },
        "analyzer": "standard"
    }}

payload = {
    "settings": {
        "index": {
            "analysis": {
                "filter": {},
                "analyzer": {
                    "keyword_analyzer": {
                        "filter": [
                            "lowercase",
                            "asciifolding",
                            "trim"
                        ],
                        "char_filter": [],
                        "type": "custom",
                        "tokenizer": "keyword"
                    },
                    "edge_ngram_analyzer": {
                        "filter": [
                            "lowercase"
                        ],
                        "tokenizer": "edge_ngram_tokenizer"
                    },
                    "edge_ngram_search_analyzer": {
                        "tokenizer": "lowercase"
                    }
                },
                "tokenizer": {
                    "edge_ngram_tokenizer": {
                        "type": "edge_ngram",
                        "min_gram": 2,
                        "max_gram": 5,
                        "token_chars": [
                            "letter"
                        ]
                    }
                }
            }
        }
    },
    "mappings": fields
}
but that's where I'm lost. Where do I put the actual content (temp = getattr(model, field)) in this document so that the whole thing works? I couldn't find any example or relevant part of the documentation that covers updating an index with slightly more complex mappings and so on; is this even correct/doable? Every guide I see covers bulk indexing, and somehow I fail to make the connection.
I think you are a little bit confused; let me try to explain. What you want is to add one document to Elasticsearch with:
current_app.elasticsearch.index(index=index, id=model.id, body=payload)
which uses the index() method defined in the elasticsearch-py lib.
Check the example here:
https://elasticsearch-py.readthedocs.io/en/master/index.html#example-usage
body must be your document, a simple dict, as shown in the example from the docs.
What you are setting there is the settings of the index, which is something different. To use a database analogy, you are putting the schema of a table inside a row (the document).
If you want to apply the given settings, you need to use put_settings, as defined here:
https://elasticsearch-py.readthedocs.io/en/master/api.html?highlight=settings#elasticsearch.client.ClusterClient.put_settings
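As a minimal sketch of how the two calls fit together (the index name "blog" and the title field are just placeholders, and this assumes a 7.x-style elasticsearch-py client): the settings and mappings go into indices.create() once, while each call to index() only carries the document itself.
from elasticsearch import Elasticsearch

es = Elasticsearch()

# One-time index creation: settings + mappings live here, not in the document.
es.indices.create(index="blog", body={
    "settings": {
        "analysis": {
            "analyzer": {
                "keyword_analyzer": {
                    "type": "custom",
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "asciifolding", "trim"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "keyword_analyzer"}
        }
    }
})

# Per-document indexing: body is just a plain dict with the field values.
es.index(index="blog", id=1, body={"title": "My first post"})
In the add_to_index() helper from the question, payload can stay exactly as it was; only the index creation, done once (for example at application startup), carries the analyzers and mappings.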
I hope this helps.

Synonyms for Sankt and St

I'm trying to get synonyms working for my existing setup. Currently I have these settings:
PUT city
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "german_normalization",
            "my_ascii_folding"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "lowercase",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "german_normalization",
            "my_ascii_folding"
          ]
        }
      },
      "filter": {
        "my_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "my_synonym_filter": {
          "type": "synonym",
          "ignore_case": "true",
          "synonyms": [
            "sankt, st => sankt"
          ]
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 15,
          "token_chars": [
            "letter",
            "digit",
            "symbol"
          ]
        }
      }
    }
  },
  "mappings": {
    "city": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
In this city index I have documents like St. Wolfgang or Sankt Wolfgang, and so on. For me, St. and Sankt are synonyms, so if I search for Sankt, both documents should appear.
I created a new filter and added it to my autocomplete analyzer:
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st."
]
}
So far so good, but the issues I faced are the following:
It's clear that the dot after st is not analyzed and not searchable at the moment, but for the synonym the dot is important.
The second issue is that if I search for sankt, the synonym is st, which gives me all documents starting with st, like Stuttgart. This also happens because the dot is not used.
Do you have any idea how I can achieve this? If you need any more information, please let me know.
Update:
After discussions I made these changes to my settings:
changed the edge_ngram tokenizer to a standard tokenizer.
added an edgeNGram filter and added this filter to my analyzer.
removed the german_normalization and my_ascii_folding filters from my analyzer to simplify the tests.
PUT city
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase",
            "my_synonym_filter",
            "edge_filter"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "autocomplete",
          "filter": [
            "my_synonym_filter",
            "lowercase"
          ]
        }
      },
      "filter": {
        "edge_filter": {
          "type": "edgeNGram",
          "min_gram": 1,
          "max_gram": 15
        },
        "my_synonym_filter": {
          "type": "synonym",
          "ignore_case": "true",
          "synonyms": [
            "sankt, st => sankt"
          ]
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "standard"
        }
      }
    }
  },
  "mappings": {
    "city": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
I added these 3 documents to the index:
"name":"Sankt Wolfgang",
"name":"Stuttgart",
"name":"St. Wolfgang"
Query String - Result
st -> "St. Wolfgang", "Stuttgart"
st. -> "St. Wolfgang", "Sankt Wolfgang"
sankt -> "St. Wolfgang", "Sankt Wolfgang"
This works pretty well for me. The main points here are to make sure to:
put the synonym filter after the lowercase one,
put the edge-n-gram filter at the end, and
use the edge-n-gram filter only at indexing time.
So we create the index:
PUT city
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter",
"edge_filter"
]
},
"autocomplete_search": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
},
"filter": {
"edge_filter": {
"type": "edgeNGram",
"min_gram": 1,
"max_gram": 15
},
"my_synonym_filter": {
"type": "synonym",
"ignore_case": "true",
"synonyms": [
"sankt, st. => sankt"
]
}
}
}
},
"mappings": {
"city": {
"properties": {
"name": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
Then we index data:
PUT city/city/1
{
"name":"St. Wolfgang"
}
PUT city/city/2
{
"name":"Stuttgart"
}
PUT city/city/3
{
"name":"Sankt Wolfgang"
}
Finally, searching for either st or sankt will only return documents 1 and 3, but not 2:
POST city/_search?q=name:st
POST city/_search?q=name:sankt
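If you want to see what the synonym filter actually emits, you can feed a sample string through the search analyzer with the _analyze API (a quick check on a 5.x-or-later cluster, assuming the index above exists):
POST city/_analyze
{
  "analyzer": "autocomplete_search",
  "text": "St. Wolfgang"
}
If the synonym rule is applied as intended, the abbreviated form comes back as the single token sankt, which is why both spellings end up matching documents 1 and 3.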

Search partial word in elasticsearch

I'm kind of new to Elasticsearch, but I would like to search for a partial match within a word.
For example, to find "helloworld", is it possible to type only "world"?
Right now it works perfectly for the case "hello": Elasticsearch returns the suggestion helloworld for me.
Here is the code:
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "filter": {
        "autocomplete_filter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 20
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "autocomplete_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "word": {
      "properties": {
        "text": {
          "type": "string",
          "analyzer": "autocomplete"
        }
      }
    }
  }
}
Can anyone give me any suggestion?

Elasticsearch edge_ngram match query on _all being ignored

I'm using Elasticsearch version 1.5.2 and I'm trying to implement an edge_ngram autocomplete search. I have the following mapping:
curl -XPUT 'localhost:8080/users' -d '{
  "settings": {
    "analysis": {
      "filter": {
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10
        }
      },
      "analyzer": {
        "edge_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "edge_ngram_filter"
          ]
        },
        "whitespace_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    },
    "mappings": {
      "user": {
        "_all": {
          "type": "string",
          "index_analyzer": "edge_ngram_analyzer",
          "search_analyzer": "whitespace_analyzer"
        },
        "properties": {
          "id": {
            "type": "integer",
            "index": "no",
            "include_in_all": false
          },
          "email": {
            "type": "string"
          },
          "firstName": {
            "type": "string"
          },
          "lastName": {
            "type": "string"
          }
        }
      }
    }
  }
}'
I then index a "user" document:
curl -XPUT 'localhost:8080/users/user/1' -d '{
  "email": "a.smith@gmail.com",
  "firstName": "Alexander",
  "lastName": "Smith"
}'
When I run the following query nothing is returned:
curl -XGET 'localhost:8080/users/_search' -d '{
  "query": {
    "match": {
      "_all": {
        "query": "ale",
        "operator": "and"
      }
    }
  }
}'
Why is the _all match query not matching on the user document?
You can achieve the autocomplete functionality with edge_ngram without overriding the _all field analysis. This is done by renaming the analyzers you have defined to default_index and default_search (you can alias them to reflect your original names, "edge_ngram_analyzer" and "whitespace_analyzer", if you want). Here is your configuration with the relevant changes:
curl -XPUT 'localhost:8080/users' -d '{
  "settings": {
    "analysis": {
      "filter": {
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10
        }
      },
      "analyzer": {
        "default_index": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "edge_ngram_filter"
          ]
        },
        "default_search": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    },
    "mappings": {
      "user": {
        "properties": {
          "id": {
            "type": "integer",
            "index": "no",
            "include_in_all": false
          },
          "email": {
            "type": "string"
          },
          "firstName": {
            "type": "string"
          },
          "lastName": {
            "type": "string"
          }
        }
      }
    }
  }
}'
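After recreating the index and reindexing the sample user, the original query from the question should now return the document (a quick check, assuming the same local setup on port 8080):
curl -XGET 'localhost:8080/users/_search' -d '{
  "query": {
    "match": {
      "_all": {
        "query": "ale",
        "operator": "and"
      }
    }
  }
}'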
Hope I have managed to help :)
