Elasticsearch: Highlight the result of script fields

In my last question I asked how to remove the HTML tags from my search results. After that I thought I could highlight the results with a common query, but the highlight field still contains the HTML that the script removed. Would you please help me highlight the results without the HTML tags that I saved in my DB?
My mapping and settings:
{
"settings": {
"analysis": {
"filter": {
"my_pattern_replace_filter": {
"type": "pattern_replace",
"pattern": "\n",
"replacement": ""
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase"
],
"char_filter": [
"html_strip"
]
},
"parsed_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"char_filter": [
"html_strip"
],
"filter": [
"my_pattern_replace_filter"
]
}
}
}
},
"mappings": {
"properties": {
"html": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"raw": {
"type": "text",
"fielddata": true,
"analyzer": "parsed_analyzer"
}
}
}
}
}
}
Search Query:
POST idx_test/_search
{
"script_fields": {
"raw": {
"script": "doc['html.raw']"
}
},
"query": {
"match": {
"html": "more"
}
},"highlight": {
"fields": {
"*":{ "pre_tags" : ["<strong>"], "post_tags" : ["</strong>"] }
}
}
}
Result:
"hits": [
{
"_index": "idx_test2",
"_type": "_doc",
"_id": "GijDsYMBjgX3UBaguGxc",
"_score": 0.2876821,
"fields": {
"raw": [
"Test More test"
]
},
"highlight": {
"html": [
"<html><body><h1 style=\"font-family: Arial\">Test</h1> <span><strong>More</strong> test</span></body></html>"
]
}
}
]
Result that I want to get:
"hits": [
{
"_index": "idx_test2",
"_type": "_doc",
"_id": "GijDsYMBjgX3UBaguGxc",
"_score": 0.2876821,
"fields": {
"raw": [
"Test <strong>More</strong> test"
]
}
}
]

I thought of another solution: you could index two fields, the original html and an html_extract field that contains only the text.
You would have to use an ingest processor to index just the text extracted from the html field, and highlighting would then work.
Mapping
PUT idx_html_strip
{
"mappings": {
"properties": {
"html": {
"type": "text"
},
"html_extract": {
"type": "text"
}
}
}
}
Processor Pipeline
PUT /_ingest/pipeline/pipe_html_strip
{
"description": "_description",
"processors": [
{
"html_strip": {
"field": "html",
"target_field": "html_extract"
}
},
{
"script": {
"lang": "painless",
"source": "ctx['html_raw'] = ctx['html_raw'].replace('\n',' ').trim()"
}
}
]
}
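Before indexing, you can check what the pipeline produces with the simulate API; a minimal sketch using the sample document from the question (the response should show html_extract containing only the stripped text):
POST _ingest/pipeline/pipe_html_strip/_simulate
{
  "docs": [
    {
      "_source": {
        "html": """<html><body><h1 style="font-family: Arial">Test</h1> <span><strong>More</strong> test</span></body></html>"""
      }
    }
  ]
}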
Index Data
Note the use of ?pipeline=pipe_html_strip
POST idx_html_strip/_doc?pipeline=pipe_html_strip
{
"html": """<html><body><h1 style=\"font-family: Arial\">Test</h1> <span><strong>More</strong> test</span></body></html>"""
}
Query
GET idx_html_strip/_search?filter_path=hits.hits._source,hits.hits.highlight
{
"query": {
"multi_match": {
"query": "More",
"fields": ["html", "html_extract"]
}
},"highlight": {
"fields": {
"*":{ "pre_tags" : ["<strong>"], "post_tags" : ["</strong>"] }
}
}
}
Results
{
"hits": {
"hits": [
{
"_source": {
"html": """<html><body><h1 style=\"font-family: Arial\">Test</h1> <span><strong>More</strong> test</span></body></html>""",
"html_extract": "Test More test"
},
"highlight": {
"html": [
"""<html><body><h1 style=\"font-family: Arial\">Test</h1> <span><strong><strong>More</strong></strong> test</span></body>"""
],
"html_extract": [
"Test <strong>More</strong> test"
]
}
}
]
}
}
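If you only want the stripped text and its highlight back (close to the desired result in the original question), here is a minimal sketch that restricts both the query and the response to the html_extract field, assuming the same index and pipeline as above:
GET idx_html_strip/_search?filter_path=hits.hits._source,hits.hits.highlight
{
  "_source": ["html_extract"],
  "query": {
    "match": { "html_extract": "More" }
  },
  "highlight": {
    "fields": {
      "html_extract": { "pre_tags": ["<strong>"], "post_tags": ["</strong>"] }
    }
  }
}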

Related

Elasticsearch completion suggester skip_duplicates skips non-duplicates

I am confused by the skip_duplicates option of the Elasticsearch completion suggester. The settings and mappings are like this:
{
"settings":{
"index" :{
"analysis": {
"analyzer": {
"autocomplete_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
]
}
}
}
}
},
"mappings":
{
"properties": {
"name": {
"type": "text"
},
"category": {
"type": "text"
},
"category_suggest": {
"type": "completion",
"analyzer": "autocomplete_analyzer" }
}
}
}
I have docs like this.
"category_suggest": {
"input": [
"Automotive",
"Auto",
]
}
and also have docs like this
"category_suggest": {
"input": [
"Automotive",
"Car",
]
}
If I use the _search endpoint and submit this query with skip_duplicates=true:
"suggest":
{
"suggestions":{
"prefix": "Aut",
"completion":{
"field": "category_suggest",
"size": 5,
"skip_duplicates": true
}
}
}
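(A minimal runnable sketch of the full request, assuming the index name my_index that appears in the response:)
POST my_index/_search
{
  "suggest": {
    "suggestions": {
      "prefix": "Aut",
      "completion": {
        "field": "category_suggest",
        "size": 5,
        "skip_duplicates": true
      }
    }
  }
}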
It only returns
"suggest": {
"suggestions": [
{
"text": "Aut",
"offset": 0,
"length": 3,
"options": [
{
"text": "Auto",
"_index": "my_index",
"_type": "_doc",
"_id": "oCNRa4IAiXc1hBrN9UnM",
"_score": 1.0
}
]
}
]
}
If I set skip_duplicates=false, both 'Auto' and 'Automotive' are returned. Is that because, in one of the documents, the category_suggest field has both 'Auto' and 'Automotive'? I am using ES 7.10 with the Python client.
Thanks

Elasticsearch phrase_prefix isn't working

I am trying to return all documents that contain a string in the userName and documentName fields.
Data:
{
"userName" : "johnwick",
"documentName": "john",
"office":{
"name":"my_office"
}
},
{
"userName" : "johnsnow",
"documentName": "snowy",
"office": {
"name":"Abraham deVilliers"
}
},
{
"userName" : "johnnybravo",
"documentName": "bravo",
"office": {
"name":"blabla"
}
},
{
"userName" : "moana",
"documentName": "disney",
"office": {
"name":"deVilliers"
}
},
{
"userName" : "stark",
"documentName": "marvel",
"office": {
"name":"blabla"
}
}
I can perform an exact string match with:
{
"_source": [ "userName", "documentName"],
"query": {
"multi_match": {
"query": "johnsnow",
"fields": [ "userName", "documentName"]
}
}
}
This successfully returns:
{
"userName" : "johnsnow",
"documentName": "snowy",
"office": {
"name":"Abraham deVilliers"
}
}
If I use type: phrase_prefix with john, I also successfully get 3 results.
But then I try with:
{
"query": {
"multi_match": {
"query": "ohn", // <---- match all docs that contain 'ohn'
"type": "phrase_prefix"
"fields": [ "userName", "documentName"]
}
}
}
Zero results are returned.
What you are looking for is infix search, and you need an n-gram token filter with a search-time analyzer to achieve that.
Complete example with your sample data
Index mapping and settings
{
"settings": {
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "Ingram", --> note this
"min_gram": 1,
"max_gram": 10
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
},
"index.max_ngram_diff" : 10 --> this you can reduce based on your requirement.
},
"mappings": {
"properties": {
"userName": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"documentName": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
}
}
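To see why this enables infix matching, a quick check with the _analyze API (assuming the index is named infix, as in the hits below) shows that grams such as ohn are emitted for johnwick:
POST infix/_analyze
{
  "analyzer": "autocomplete",
  "text": "johnwick"
}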
Index your sample docs and then use the same search query. I indexed only the first and last docs for brevity, and it returned the first doc.
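A minimal sketch of that search query, reusing the multi_match from the question (with the missing comma added) against the assumed index name infix:
POST infix/_search
{
  "query": {
    "multi_match": {
      "query": "ohn",
      "type": "phrase_prefix",
      "fields": [ "userName", "documentName" ]
    }
  }
}
which returns: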
"hits": [
{
"_index": "infix",
"_type": "_doc",
"_id": "1",
"_score": 5.7100673,
"_source": {
"userName": "johnwick",
"documentName": "john"
}
}
]

Field type as text and completion in Elasticsearch

I am trying to have the title field as both text and completion types in Elasticsearch, as shown below.
PUT playlist
{
"settings": {
"number_of_shards": 2,
"number_of_replicas": 2,
"analysis": {
"filter": {
"custom_english_stemmer": {
"type": "stemmer",
"name": "english"
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"custom_lowercase_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"english_stop",
"custom_english_stemmer"
]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "long",
"index": false,
"doc_values": false
},
"title": {
"type": "text",
"analyzer": "custom_lowercase_analyzer",
"fields": {
"raw": {
"type": "completion"
}
}
}
}
}
}
The suggestion query below works:
POST media/_search
{
"_source": ["id", "title"],
"suggest": {
"job-suggest": {
"prefix": "sri",
"completion": {
"field": "title"
}
}
}
}
But a normal search fails on the same title:
GET media/_search
{
"_source": ["id", "title"],
"query" : {
"query_string": {
"query" : "*sri*",
"fields" : [
"title"
]
}
}
}
Please help me solve this problem

Highlight words with whitespace in Elasticsearch 7.6

I would like to use Elasticsearch highlight to obtain matched keywords found inside a text.
These are my settings/mappings:
{
"settings": {
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"- => _",
]
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "my_analyzer"
},
"description": {
"type": "text",
"analyzer": "my_analyzer",
"fielddata": True
}
}
}
}
I am using a char_filter to search for and highlight hyphenated words.
This is my example document:
{
"_index": "test_tokenizer",
"_type": "_doc",
"_id": "DbBIxXEBL7VGAl98vIRl",
"_score": 1.0,
"_source": {
"title": "Best places: New Mexico and Sedro-Woolley",
"description": "This is an example text containing some cities like New York, Toronto, Rome and many other. So, there are also Milton-Freewater and Las Vegas!"
}
}
and this is the query I use
{
"query": {
"query_string" : {
"query" : "\"New York\" OR \"Rome\" OR \"Milton-Freewater\"",
"default_field": "description"
}
},
"highlight" : {
"pre_tags" : ["<key>"],
"post_tags" : ["</key>"],
"fields" : {
"description" : {
"number_of_fragments" : 0
}
}
}
}
and this is the output I have
...
"hits": [
{
"_index": "test_tokenizer",
"_type": "_doc",
"_id": "GrDNz3EBL7VGAl98EITg",
"_score": 0.72928625,
"_source": {
"title": "Best places: New Mexico and Sedro-Woolley",
"description": "This is an example text containing some cities like New York, Toronto, Rome and many other. So, there are also Milton-Freewater and Las Vegas!"
},
"highlight": {
"description": [
"This is an example text containing some cities like <key>New</key> <key>York</key>, Toronto, <key>Rome</key> and many other. So, there are also <key>Milton-Freewater</key> and Las Vegas!"
]
}
}
]
...
Rome and Milton-Freewater are highlighted correctly; New York is not.
How can I have <key>New York</key> instead of <key>New</key> and <key>York</key>?
There is an open PR regarding this but I'd suggest the following interim solution:
Add a term_vector setting
PUT test_tokenizer
{
"settings": {
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"- => _"
]
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_char_filter"
],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "my_analyzer"
},
"description": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector": "with_positions_offsets",
"fielddata": true
}
}
}
}
Index a doc
POST test_tokenizer/_doc
{"title":"Best places: New Mexico and Sedro-Woolley","description":"This is an example text containing some cities like New York, Toronto, Rome and many other. So, there are also Milton-Freewater and Las Vegas!"}
Convert your query_string into a bunch of bool-should match_phrase queries inside the highlight_query, and use type: fvh:
GET test_tokenizer/_search
{
"query": {
"query_string": {
"query": "'New York' OR 'Rome' OR 'Milton-Freewater'",
"default_field": "description"
}
},
"highlight": {
"pre_tags": [
"<key>"
],
"post_tags": [
"</key>"
],
"fields": {
"description": {
"highlight_query": {
"bool": {
"should": [
{
"match_phrase": {
"description": "New York"
}
},
{
"match_phrase": {
"description": "Rome"
}
},
{
"match_phrase": {
"description": "Milton-Freewater"
}
}
]
}
},
"type": "fvh",
"number_of_fragments": 0
}
}
}
}
yielding
{
"highlight":{
"description":[
"This is an example text containing some cities like <key>New York</key>, Toronto, <key>Rome</key> and many other. So, there are also <key>Milton-Freewater</key> and Las Vegas!"
]
}
}

More like this with nested object does not work when specifying a record

I have an Elasticsearch index with a nested mapping like this:
{
"my_index": {
"aliases": {},
"mappings": {
"esdata": {
"properties": {
"id": {
"type": "integer"
},
"record": {
"type": "nested",
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "my_analyzer"
}
}
}
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "my_index",
"creation_date": "1502272783802",
"analysis": {
"analyzer": {
"my_analyzer": {
"filter": [
"lowercase",
"stop"
],
"char_filter": [
"html_strip"
],
"type": "custom",
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1",
"uuid": "35O9YT3eQGKhukPFbUTflw",
"version": {
"created": "5050199"
}
}
}
}
}
When I execute a query with more_like_this and specify a like text, it works correctly:
GET my_index/_search
{
"query": {
"nested": {
"path": "record",
"query": {
"bool": {
"must": [
{ "match": { "record.name": "Report" }},
{"more_like_this": {
"fields": [
"record.value"
],
"like": "some thing is wrong",
"min_term_freq": 1,
"max_query_terms": 12
}}
]
}
}
}
}
}
However, when I use more_like_this with a specific document id, it does not work (for information, I duplicated a record, so I should get at least one result for the duplicated record):
GET my_index/_search
{
"query": {
"nested": {
"path": "record",
"query": {
"bool": {
"must": [
{ "match": { "record.name": "Report" }},
{"more_like_this": {
"fields": [
"record.value"
],
"like": [
{
"_id": "2"
}
],
"min_term_freq": 1,
"max_query_terms": 12
}}
]
}
}
}
}
}
