Elasticsearch `Procter&Gamble` and `Procter & Gamble` issues - elasticsearch

My task is:
* Make procter&gamble and procter & gamble produce the same results including score
* Make it universal, not via synonyms, as it can be any other Somehow&Somewhat
* Highlight procter&gamble or procter & gamble, not separate tokens if the phrase matches
* I want to use simple_query_string as I allow search operators
* Make AT&T searchable as well
Here is my snippet. The problem is that the procter&gamble and procter & gamble searches produce different scores and thus different documents as the result.
But the user expects the same result for procter&gamble or procter & gamble
DELETE /english_example
PUT /english_example
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": ["example"]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"acronymns": {
"type": "word_delimiter_graph",
"catenate_all" : true,
"preserve_original":true
},
"acronymns_": {
"type": "word_delimiter_graph",
"catenate_all" : true,
"preserve_original":true
},
"custom_stop_words_filter": {
"type": "stop",
"ignore_case": true,
"stopwords": [ "t" ]
}
},
"analyzer": {
"default": {
"tokenizer": "whitespace",
"char_filter": [
"ampersand_filter"
],
"filter": [
"english_possessive_stemmer",
"lowercase",
"acronymns",
"flatten_graph",
"english_stop",
"custom_stop_words_filter",
"english_keywords",
"english_stemmer"
]
}
},
"char_filter": {
"ampersand_filter": {
"type": "pattern_replace",
"pattern": "(?=[^&]*)( {0,}& {0,})(?=[^&]*)",
"replacement": "_and_"
},
"ampersand_filter2": {
"type": "mapping",
"mappings": [
"& => _and_"
]
}
}
}
}
}
PUT /english_example/_bulk
{ "index" : { "_id" : "1" } }
{ "description" : "wi-fi AT&T BB&T Procter & Gamble, some\nOther $500 games with Peter's", "contents" : "Much text with somewhere I meet Procter or Gamble" }
{ "index" : { "_id" : "2" } }
{ "description" : "Procter & Gamble", "contents" : "Much text with somewhere I meet Procter and Gamble" }
{ "index" : { "_id" : "3" } }
{ "description" : "Procter&Gamble", "contents" : "Much text with somewhere I meet Procter & Gamble" }
{ "index" : { "_id" : "4" } }
{ "description" : "Come Procter&Gamble", "contents" : "Much text with somewhere I meet Procter&Gamble" }
{ "index" : { "_id" : "5" } }
{ "description" : "Tome Procter & Gamble", "contents" : "Much text with somewhere I don't meet AT&T" }
# "query": "procter & gamble",
GET english_example/_search
{
"query": {
"simple_query_string": {
"query": "procter & gamble",
"default_operator": "or",
"fields": [
"description^2",
"contents^80"
]
}
},
"highlight": {
"fields": {
"description": {},
"contents": {}
}
}
}
# "query": "procter&gamble",
GET english_example/_search
{
"query": {
"simple_query_string": {
"query": "procter&gamble",
"default_operator": "or",
"fields": [
"description^2",
"contents^80"
]
}
},
"highlight": {
"fields": {
"description": {},
"contents": {}
}
}
}
# "query": "at&t",
GET english_example/_search
{
"query": {
"simple_query_string": {
"query": "at&t",
"default_operator": "or",
"fields": [
"description^2",
"contents^80"
]
}
},
"highlight": {
"fields": {
"description": {},
"contents": {}
}
}
}
In my snippet I redefine the default analyzer using word_delimiter_graph and whitespace tokenizer to search AT&T matches as well.

One option I can think of is to use a should query with a "standard analyzer" and your custom analyzer.
For "procter & gamble" the tokens generated using both the custom and the standard analyzer will be "procter", "gamble"
For "procter&gamble" the tokens generated using the custom analyzer will be "procter", "gamble", "procter&gamble", and using the standard analyzer they will be "procter" and "gamble"
So in the should clause we can use a standard analyzer to look for "procter" or "gamble" and a custom analyzer to look for "procter&gamble"
GET english_example/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"description": {
"query": "Procter&Gamble",
"analyzer": "standard"
}
}
},
{
"match": {
"description": {
"query": "Procter&Gamble"
}
}
}
],
"minimum_should_match": 1
}
}
}
A second option would be to use synonyms, where you define all variations in which procter and gamble can appear to mean a single thing

I just realized that you are searching a description field and not a company field. So a keyword analyzer won't work. I have updated my answer accordingly.
You can potentially try adding a custom field with a lowercase and whitespace analyzer and use the same custom analyzer for search as well. When you perform a search, search in both the standard field and this custom field as a multi_match search. That should allow you to support both. You can boost the score for the custom field so that exact matches come at the top of the search results.
The trick is to convert user input to lower case before performing the search. You shouldn't use user input as is. Otherwise this approach won't work.
You can use below scripts to try it out.
DELETE /test1
PUT /test1
{
"settings": {
"analysis": {
"analyzer": {
"lowercase_analyzer" : {
"filter" : ["lowercase"],
"type" : "custom",
"tokenizer" : "whitespace"
}
}
}
},
"mappings": {
"properties": {
"description" : {
"type": "text",
"analyzer": "standard",
"fields": {
"custom" : {
"type" : "text",
"analyzer" : "lowercase_analyzer",
"search_analyzer" : "lowercase_analyzer"
}
}
}
}
}
}
PUT /test1/_bulk
{ "index" : { "_id" : "1" } }
{ "description" : "wi-fi AT&T BB&T Procter & Gamble, some\nOther $500 games with Peter's" }
{ "index" : { "_id" : "2" } }
{ "description" : "Procter & Gamble" }
{ "index" : { "_id" : "3" } }
{ "description" : "Procter&Gamble" }
GET test1/_search
{
"query": {
"multi_match": {
"query": "procter&gamble",
"fields": ["description", "description.custom"]
}
},
"highlight": {
"fields": {
"description": {},
"description.custom": {}
}
}
}
GET test1/_search
{
"query": {
"multi_match": {
"query": "procter",
"fields": ["description", "description.custom"]
}
},
"highlight": {
"fields": {
"description": {},
"description.custom": {}
}
}
}
GET test1/_search
{
"query": {
"multi_match": {
"query": "at&t",
"fields": ["description", "description.custom"]
}
},
"highlight": {
"fields": {
"description": {},
"description.custom": {}
}
}
}
GET test1/_search
{
"query": {
"multi_match": {
"query": "procter & gamble",
"fields": ["description", "description.custom"]
}
},
"highlight": {
"fields": {
"description": {},
"description.custom": {}
}
}
}
You can add highlighting and try it out.

Related

Elasticsearch Multi-Term Auto Completion

I'm trying to implement the Multi-Term Auto Completion that's presented here.
Filtering down to the correct documents works, but when aggregating the completion_terms they are not filtered to those that match the current partial query, but instead include all completion_terms from any matched documents.
Here are the mappings:
{
"mappings": {
"dynamic" : "false",
"properties" : {
"completion_ngrams" : {
"type" : "text",
"analyzer" : "completion_ngram_analyzer",
"search_analyzer" : "completion_ngram_search_analyzer"
},
"completion_terms" : {
"type" : "keyword",
"normalizer" : "completion_normalizer"
}
}
}
}
Here are the settings:
{
"settings" : {
"index" : {
"analysis" : {
"filter" : {
"edge_ngram" : {
"type" : "edge_ngram",
"min_gram" : "1",
"max_gram" : "10"
}
},
"normalizer" : {
"completion_normalizer" : {
"filter" : [
"lowercase",
"german_normalization"
],
"type" : "custom"
}
},
"analyzer" : {
"completion_ngram_search_analyzer" : {
"filter" : [
"lowercase"
],
"tokenizer" : "whitespace"
},
"completion_ngram_analyzer" : {
"filter" : [
"lowercase",
"edge_ngram"
],
"tokenizer" : "whitespace"
}
}
}
}
}
}
}
I'm then indexing data like this:
{
"completion_terms" : ["Hammer", "Fortis", "Tool", "2000"],
"completion_ngrams": "Hammer Fortis Tool 2000"
}
Finally, the autocomplete search looks like this:
{
"query": {
"bool": {
"must": [
{
"term": {
"completion_terms": "fortis"
}
},
{
"term": {
"completion_terms": "hammer"
}
},
{
"match": {
"completion_ngrams": "too"
}
}
]
}
},
"aggs": {
"autocomplete": {
"terms": {
"field": "completion_terms",
"size": 100
}
}
}
}
This correctly returns documents matching the search string "fortis hammer too", but the aggregations include ALL completion terms that are included in any of the matched documents, e.g. for the query above:
"buckets": [
{ "key": "fortis" },
{ "key": "hammer" },
{ "key": "tool" },
{ "key": "2000" },
]
Ideally, I'd expect
"buckets": [
{ "key": "tool" }
]
I could filter out the terms that are already covered by the search query ("fortis" and "hammer" in this case) in the app, but the "2000" doesn't make any sense from a user's perspective, because it doesn't partially match any of the provided search terms.
I understand why this is happening, but I can't think of a solution. Can anyone help?
try filters agg please
{
"query": {
"bool": {
"must": [
{
"term": {
"completion_terms": "fortis"
}
},
{
"term": {
"completion_terms": "hammer"
}
},
{
"match": {
"completion_ngrams": "too"
}
}
]
}
},
"aggs": {
"findOuthammerAndfortis": {
"filters": {
"filters": {
"fortis": {
"term": {
"completion_terms": "fortis"
}
},
"hammer": {
"term": {
"completion_terms": "hammer"
}
}
}
}
}
}
}

Search by slug in Elasticsearch

I have an index named homes. Here is the simplified mapping of it:
{
"template": "homes",
"index_patterns": "homes",
"settings": {
"index.refresh_interval": "60s"
},
"mappings": {
"properties": {
"status": {
"type": "keyword"
},
"address": {
"type": "keyword",
"fields": {
"suggest": {
"type": "search_as_you_type"
},
"search": {
"type": "text"
}
}
}
}
}
}
As you can see, there is an address field which I query this way:
{
"query": {
"bool": {
"filter": [
{
"term": {
"status": "sale"
}
},
{
"term": {
"address": "406 - 533 Richmond St W"
}
}
]
}
}
}
Now my problem is that I need to be able to query with the slugified version of the address field as well. For example, I need to query like this:
{
"query": {
"bool": {
"filter": [
{
"term": {
"status": "sale"
}
},
{
"term": {
"address": "406-533-richmond-st-w"
}
}
]
}
}
}
So, instead of 406 - 533 Richmond St W I need to query 406-533-richmond-st-w. How can I do that? I was thinking of adding a new field address_slug which is the slugified version of address, but I need it to be auto populated so I don't need to manually fill this field every time that I insert or update a document in the index.
If you create a custom analyzer with the token filters below and another field for search that uses the custom analyzer, you can achieve this. Here is an example analyze result and output:
GET {index}/_analyze
{
"tokenizer": "keyword",
"filter": [
{
"type": "lowercase"
},
{
"type": "pattern_replace",
"pattern": """[^A-Za-z0-9]+""",
"replacement": "-"
}
],
"text": "406 - 533 Richmond St W"
}
Output:
{
"tokens" : [
{
"token" : "406-533-richmond-st-w",
"start_offset" : 0,
"end_offset" : 23,
"type" : "word",
"position" : 0
}
]
}

ElasticSearch Search-as-you-type field type field with partial search

I recently updated my ngram implementation settings to use the Search-as-you-type field type.
https://www.elastic.co/guide/en/elasticsearch/reference/7.x/search-as-you-type.html
This worked great but I noticed that partial searching does not work.
If I search for number 00060434 I get the desired result but I would also like to be able to search for 60434, then it should return document 3.
Is there a way to do it with the Search-as-you-type field type, or can I only do this with ngrams?
PUT searchasyoutype_example
{
"settings": {
"analysis": {
"analyzer": {
"englishAnalyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"trim",
"ascii_folding"
]
}
},
"filter": {
"ascii_folding": {
"type": "asciifolding",
"preserve_original": true
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "search_as_you_type",
"analyzer": "englishAnalyzer"
},
"fullName": {
"type": "search_as_you_type",
"analyzer": "englishAnalyzer"
}
}
}
}
PUT searchasyoutype_example/_doc/1
{
"number" : "00069794",
"fullName": "Employee 1"
}
PUT searchasyoutype_example/_doc/2
{
"number" : "00059840",
"fullName": "Employee 2"
}
PUT searchasyoutype_example/_doc/3
{
"number" : "00060434",
"fullName": "Employee 3"
}
GET searchasyoutype_example/_search
{
"query": {
"multi_match": {
"query": "00060434",
"type": "bool_prefix",
"fields": [
"number",
"number._index_prefix",
"fullName",
"fullName._index_prefix"
]
}
}
}
I think you need to query on number,number._2gram & number._3gram like below:
GET searchasyoutype_example/_search
{
"query": {
"multi_match": {
"query": "00060434",
"type": "bool_prefix",
"fields": [
"number",
"number._2gram",
"number._3gram"
]
}
}
}
search_as_you_type creates the 3 sub fields. You can check more on this article how it works:
https://ashish.one/blogs/search-as-you-type/

Elastic synonyms are taking over other words

On this sequence of commands:
Create the index:
PUT /test_index?
{
"settings": {
"analysis": {
"analyzer": {
"GermanCompoundWordsAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"german_compound_synonym",
"german_normalization"
]
}
},
"filter": {
"german_compound_synonym": {
"type": "synonym",
"synonyms": [
"teppichläufer, auslegware läufer"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"name": {
"type": "text",
"analyzer": "GermanCompoundWordsAnalyzer"
}
}
}
}
}
Adding a few documents:
POST test_index/_doc/
{
"sku" : "kimchy",
"name" : "teppichläufer alfa"
}
POST test_index/_doc/
{
"sku" : "kimchy",
"name" : "teppichläufer beta"
}
Searching, I would expect one document, but 2 are returned :(
GET /test_index/_search
{
"query": {
"match": {
"name": {
"query": "teppichläufer beta",
"operator": "and"
}
}
}
}
I will get both documents since the synonym teppichläufer, auslegware läufer, läufer will end up at position 1 and 'substitute' the beta. If I remove the "analyzer": "GermanCompoundWordsAnalyzer", I will just get one document as expected.
How do I use this synonyms and don't have this issue?
POST /test_index/_search
{
"query": {
"bool" : {
"should": [
{
"query_string": {
"default_field": "name",
"query": "teppichläufer beta"
, "default_operator": "AND"
}
}
]
}
}
}
After a little more searching I found it in the documentation. This is an RTFM problem, sorry guys.
I tried with:
https://www.elastic.co/guide/en/elasticsearch/reference/master/analysis-synonym-graph-tokenfilter.html
The funny part is that it makes the NDCG of the results worst :)

Elasticsearch nested geo-shape query

Suppose I have the following mapping:
"mappings": {
"doc": {
"properties": {
"name": {
"type": "text"
},
"location": {
"type": "nested",
"properties": {
"point": {
"type": "geo_shape"
}
}
}
}
}
}
}
There is one document in the index:
POST /example/doc?refresh
{
"name": "Wind & Wetter, Berlin, Germany",
"location": {
"type": "point",
"coordinates": [13.400544, 52.530286]
}
}
How can I make a nested geo-shape query?
Example of usual geo-shape query from the documentation (the "bool" block can be skipped):
{
"query":{
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_shape": {
"location": {
"shape": {
"type": "envelope",
"coordinates" : [[13.0, 53.0], [14.0, 52.0]]
},
"relation": "within"
}
}
}
}
}
}
Example of a nested query is:
{
"query": {
"nested" : {
"path" : "obj1",
"score_mode" : "avg",
"query" : {
"bool" : {
"must" : [
{ "match" : {"obj1.name" : "blue"} },
{ "range" : {"obj1.count" : {"gt" : 5}} }
]
}
}
}
}
}
Now how to combine them? In the documentation it is mentioned that nested filter has been replaced by nested query. And that it behaves as a query in “query context” and as a filter in “filter context”.
If I try query for intersect with the point:
{
"query": {
"nested": {
"path": "location",
"query": {
"geo_shape": {
"location.point": {
"shape": {
"type": "point",
"coordinates": [
13.400544,
52.530286
]
},
"relation": "disjoint"
}
}
}
}
}
}
I still get back the document even if relation is "disjoint", so it's not correct. I tried different combinations, with "bool" and "filter", etc. but query is ignored, returning the whole index. Maybe it's impossible with this type of mapping?
Clearly I am missing something here. Can somebody help me out with that, please? Any help is greatly appreciated.

Resources