ElasticSearch wildcard highlighting with hyphen - elasticsearch

I am having trouble with a wildcard query. When the text contains a hyphen, nothing after it gets highlighted. I have played with the highlight settings but have not found any solution yet. Is this normal behavior?
First I create an index:
PUT testhighlight
PUT testhighlight/_mapping/_doc
{
"properties": {
"title": {
"type": "text",
"term_vector": "with_positions_offsets"
},
"content": {
"type": "text",
"term_vector": "with_positions_offsets"
}
}
}
Then I create documents:
PUT testhighlight/_doc/1
{
"title": "1",
"content": "test-input"
}
PUT testhighlight/_doc/2
{
"title": "2",
"content": "test input"
}
PUT testhighlight/_doc/3
{
"title": "3",
"content": "testinput"
}
Then I execute this search request:
GET testhighlight/_search
{
"query": {
"bool": {
"must": [
{
"query_string": {
"fields": [
"title",
"content"
],
"query": "test*"
}
}
]
}
},
"highlight": {
"fields": {
"content": {
"boundary_max_scan": 10,
"fragment_offset": 5,
"fragment_size": 250,
"type": "fvh",
"number_of_fragments": 5,
"order": "score",
"boundary_scanner": "word",
"post_tags": [
"</span>"
],
"pre_tags": [
"""<span class="highlight-search">"""
]
}
}
}
}
It returns these hits:
"hits": [
{
"_index": "testhighlight",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"title": "2",
"content": "test input"
},
"highlight": {
"content": [
"""<span class="highlight-search">test</span> input"""
]
}
},
{
"_index": "testhighlight",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"title": "1",
"content": "test-input"
},
"highlight": {
"content": [
"""<span class="highlight-search">test</span>-input"""
]
}
},
{
"_index": "testhighlight",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"title": "3",
"content": "testinput"
},
"highlight": {
"content": [
"""<span class="highlight-search">testinput</span>"""
]
}
}
]
It looks alright, but it didn't highlight the whole "test-input" in the document with ID 1. Is there any way to do so?

Related

How can I prioritize documents in Elasticsearch query

I have products in my index. Documents are basically structured like these:
{
"_id": "product",
"_source": {
...
"type": "product",
"id": 1,
"mainTaxon": {
"name": "T-SHIRT",
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA T-SHIRT"
],
"score": 50
},
]
}
},
{
"_id": "product",
"_source": {
...
"type": "product",
"id": 2,
"mainTaxon": {
"name": "JEANS",
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA JEANS"
],
"score": 50
},
]
}
}
}
When I search for 'BANANA' I would like to prioritize products with a mainTaxon different from JEANS. So, every product with the mainTaxon name T-SHIRT or anything else would be listed before products with mainTaxon JEANS.
You can use boosting query to prioritize documents
{
"query": {
"boosting": {
"positive": {
"match": {
"attributes.value": "banana"
}
},
"negative": {
"match": {
"mainTaxon.name": "JEANS"
}
},
"negative_boost": 0.5
}
}
}
Search Result will be
"hits": [
{
"_index": "67164768",
"_type": "_doc",
"_id": "1",
"_score": 0.5364054,
"_source": {
"type": "product",
"id": 1,
"mainTaxon": {
"name": "T-SHIRT"
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA T-SHIRT"
],
"score": 50
}
]
}
},
{
"_index": "67164768",
"_type": "_doc",
"_id": "2",
"_score": 0.32743764,
"_source": {
"type": "product",
"id": 2,
"mainTaxon": {
"name": "JEANS"
},
"attributes": [
{
"code": "name",
"name": "Name",
"value": [
"BANANA JEANS"
],
"score": 50
}
]
}
}
]

How to make flattened sub-field in the nested field in elastic search?

Here, I have an indexed document like:
doc = {
"id": 1,
"content": [
{
"txt": "I",
"time": 0
},
{
"txt": "have",
"time": 1
},
{
"txt": "a book",
"time": 2
},
{
"txt": "do not match this block",
"time": 3
}
]
}
And I want to match "I have a book", and return the matched time: 0,1,2. Is there anyone who knows how to build the index and the query for this situation?
I think the "content.txt" should be flattened but "content.time" should be nested?
want to match "I have a book", and return the matched time: 0,1,2.
Adding a working example with index mapping, search query, and search result.
Index Mapping:
{
"mappings": {
"properties": {
"content": {
"type": "nested"
}
}
}
}
Search Query:
{
"query": {
"nested": {
"path": "content",
"query": {
"bool": {
"must": [
{
"match": {
"content.txt": "I have a book"
}
}
]
}
},
"inner_hits": {}
}
}
}
Search Result:
"inner_hits": {
"content": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.5226097,
"hits": [
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 2
},
"_score": 2.5226097,
"_source": {
"txt": "a book",
"time": 2
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 0
},
"_score": 1.5580825,
"_source": {
"txt": "I",
"time": 0
}
},
{
"_index": "64752029",
"_type": "_doc",
"_id": "1",
"_nested": {
"field": "content",
"offset": 1
},
"_score": 1.5580825,
"_source": {
"txt": "have",
"time": 1
}
}
]
}
}
}
}

Edge n-gram suggestions and 'starts with' keyword in Elasticsearch

I'm trying to build a food search engine on Elasticsearch that should meet following use cases -
If the user searches for 'coff' then it should return all the documents with phrase 'coffee' in their name and the priority should be for food items that have 'coffee' at the starting of their name.
If the user searches for 'green tea' then it should give priority to the documents that have both the phrases 'green tea' instead of splitting 'green' and 'tea'
If the phrase does not exist in the 'name' then it should also search in the alias field.
To manage the first case, I've used the edge n-grams analyzer.
Mapping -
{
"settings": {
"index": {
"analysis": {
"filter": {},
"analyzer": {
"analyzer_keyword": {
"tokenizer": "standard",
"filter": "lowercase"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "edge_ngram_tokenizer"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 5,
"token_chars": [
"letter"
]
}
}
}
}
},
"mappings": {
"doc": {
"properties": {
"alias": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"search_analyzer": "analyzer_keyword",
"analyzer": "edge_ngram_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
This is the search query that I'm using but it's not exactly returning the relevant search results
{
"query": {
"multi_match": {
"query": "coffee",
"fields": ["name^2", "alias"]
}
}
}
There are over 1500 food items with 'coffee' in their name but the above query is only returning 2
{
"took": 745,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 8.657346,
"hits": [
{
"_index": "food-master",
"_type": "doc",
"_id": "a9uzinABb4g7LgmgoI1I",
"_score": 8.657346,
"_source": {
"id": 17463,
"name": "Rotiboy, coffee bun",
"alias": [
"Mexican Coffee Bun (Rotiboy)",
"Mexican coffee bun"
],
}
},
{
"_index": "food-master",
"_type": "doc",
"_id": "TNuzinABb4g7LgmgoFVI",
"_score": 7.0164866,
"_source": {
"id": 1344,
"name": "Coffee with sugar",
"alias": [
"Heart Friendly",
"Coffee With Sugar",
"Coffee With Milk and Sugar",
"Gluten Free",
"Hypertension Friendly"
],
}
}
]
}
}
In the mapping, if I remove the analyzer_keyword then it returns relevant results but the documents that start with 'coffee' are not prioritized
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1323,
"max_score": 57.561867,
"hits": [
{
"_index": "food-master-new",
"_type": "doc",
"_id": "nduzinABb4g7LgmgoINI",
"_score": 57.561867,
"_source": {
"name": "Egg Coffee",
"alias": [],
"id": 12609
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "dNuzinABb4g7LgmgoFVI",
"_score": 55.811295,
"_source": {
"name": "Coffee (Black)",
"alias": [
"Weight Loss",
"Diabetes Friendly",
"Gluten Free",
"Lactose Free",
"Heart Friendly",
"Hypertension Friendly"
],
"id": 1341
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "NduzinABb4g7LgmgoHxI",
"_score": 54.303185,
"_source": {
"name": "Brewed Coffee",
"alias": [
"StarBucks"
],
"id": 15679
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "ltuzinABb4g7LgmgoJJI",
"_score": 54.303185,
"_source": {
"name": "Coffee - Masala",
"alias": [],
"id": 11329
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "oduzinABb4g7LgmgoGpI",
"_score": 53.171227,
"_source": {
"name": "Coffee, German",
"alias": [],
"id": 12257
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "YNuzinABb4g7LgmgoFRI",
"_score": 52.929176,
"_source": {
"name": "Soy Milk Coffee",
"alias": [
"Gluten Free",
"Lactose Free",
"Weight Loss",
"Diabetes Friendly",
"Heart Friendly",
"Hypertension Friendly"
],
"id": 978
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "8duzinABb4g7LgmgoFRI",
"_score": 52.068523,
"_source": {
"name": "Cold Coffee (Soy Milk)",
"alias": [
"Soy Milk"
],
"id": 1097
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "tNuzinABb4g7LgmgoF9I",
"_score": 50.956154,
"_source": {
"name": "Coffee Frappe",
"alias": [],
"id": 3142
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "ZduzinABb4g7LgmgoF5I",
"_score": 49.810112,
"_source": {
"name": "Big Apple Coffee",
"alias": [],
"id": 3130
}
},
{
"_index": "food-master-new",
"_type": "doc",
"_id": "eduzinABb4g7LgmgoHtI",
"_score": 49.62197,
"_source": {
"name": "Mexican Coffee",
"alias": [],
"id": 13604
}
}
]
}
}
If I change the tokenizer to 'keyword' from 'standard' then I face the same problem and it also splits phrases into individual words - 'green tea' to 'green' and 'tea'
Any suggestions on what I might be getting wrong with respect to analyzers? I've tried all possible combinations but meeting all 3 scenarios with high accuracy is getting a little difficult.

Agg counts on nested objects in Elasticsearch query counted only once

We have a mapping with nested objects.
We use it for storing variants in an e-commerce environment.
To build filters in e-commerce we check the agg bucket counts.
What happens now is that we get agg counts for every variant that is matching.
But I would like to count only once per parent (even if more than one child matches).
here is our mapping:
{
"products": {
"mappings": {
"dynamic": "false",
"properties": {
"product_id": {
"type": "keyword"
},
"variants": {
"type": "nested",
"properties": {
"brand": {
"type": "keyword"
},
"color": {
"type": "keyword"
}
}
}
}
}
}
example of documents:
{
"_index": "products",
"_type": "_doc",
"_id": "5e42f759de235d5b42e6e35524141bd2cfcbc5d0",
"_score": 1.0,
"_source": {
"product_id": "5e42f759de235d5b42e6e35524141bd2cfcbc5d0",
"variants": [
{
"color": [
"Black"
],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "Fackelmann"
}
]
}
},
{
"_index": "products",
"_type": "_doc",
"_id": "768af68018654b04f20d32003348ee9bd81d9f65",
"_score": 1.0,
"_source": {
"product_id": "768af68018654b04f20d32003348ee9bd81d9f65",
"variants": [
{
"color": [
"Grey"
],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "Fackelmann"
},
{
"color": [],
"brand": "IKEA"
}
]
}
}
in this example we get those counts for aggs:
brand:Fackelmann[7]
brand:IKEA[1]
color:Grey[1]
color:Black[1]
What i expect to get is:
brand:Fackelmann[2]
brand:IKEA[1]
color:Grey[1]
color:Black[1]

Autocomplete in elastic search

I am planning to make an Elasticsearch-based autocomplete module for an e-commerce website. I am using edge_ngram for suggestions. I am trying out this configuration.
**My index creation :**
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
"lowercase"
]
},
"autocomplete_search": {
"tokenizer": "lowercase"
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 10,
"token_chars": [
"letter","digit"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
}
**Inserting Data**
PUT my_index/doc/1
{
"title": "iphone s"
}
PUT my_index/doc/9
{
"title": "iphone ka"
}
PUT my_index/doc/11
{
"title": "iphone ka t"
}
PUT my_index/doc/15
{
"title": "iphone 6"
}
PUT my_index/doc/14
{
"title": "iphone 6 16GB"
}
PUT my_index/doc/3
{
"title": "iphone k"
}
POST my_index/_refresh
POST my_index/_analyze
{
"tokenizer": "autocomplete",
"text": "iphone 6"
}
POST my_index/_analyze
{
"analyzer": "pattern",
"text": "iphone 6"
}
**Autocomplete suggestions**
When I try to find the closest match to "iphone 6", it is not showing the correct result.
GET my_index/_search
{
"query": {
"match": {
"title": {
"query": "iphone 6",
"operator": "and"
}
}
}
}
**The above query yields:**
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0.28582606,
"hits": [
{
"_index": "my_index",
"_type": "doc",
"_id": "1",
"_score": 0.28582606,
"_source": {
"title": "iphone s"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "9",
"_score": 0.25811607,
"_source": {
"title": "iphone ka"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "14",
"_score": 0.24257512,
"_source": {
"title": "iphone 6 16GB"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "3",
"_score": 0.19100356,
"_source": {
"title": "iphone k"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "15",
"_score": 0.1862728,
"_source": {
"title": "iphone 6"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "11",
"_score": 0.16358379,
"_source": {
"title": "iphone ka t"
}
},
{
"_index": "my_index",
"_type": "doc",
"_id": "2",
"_score": 0.15861572,
"_source": {
"title": "iphone 5 s"
}
}
]
}
}
But the result should be:
{
"_index": "my_index",
"_type": "doc",
"_id": "15",
"_score": 1,
"_source": {
"title": "iphone 6"
}
}
Please let me know if I am missing something; I am new to this, so I am not aware of any other method that may yield better results.
You are using autocomplete_search as your search_analyzer. Look at how your text is analyzed using the search analyzer you specified:
POST my_index/_analyze
{
"analyzer": "autocomplete_search",
"text": "iphone 6"
}
You will get
{
"tokens": [
{
"token": "iphone", ===> Only one token
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 0
}
]
}
Since all the documents have this (iphone) token in the inverted index, all the documents are returned.
To get the desired results, you can use the same analyzer that was used while indexing:
{
"query": {
"match": {
"title": {
"query": "iphone 6",
"operator": "and",
"analyzer" : "autocomplete"
}
}
}
}

Resources