How to highlight characters within words using Elasticsearch

I have implemented autosuggest using Elasticsearch, where I give suggestions to users based on the typed value. Most of it works fine if I type the full word or the first few characters of a word. I want to highlight only the specific characters the user typed: say the user types 'ca', then suggestions should highlight just the 'Ca' in 'California' and not the whole word.
The highlight tags should produce <b>Ca</b>lifornia and not <b>California</b>.
Here are my index settings:
{
"settings": {
"index": {
"analysis": {
"filter": {
"edge_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 50
},
"lowercase_filter":{
"type":"lowercase",
"language": "greek"
},
"metro_synonym": {
"type": "synonym",
"synonyms_path": "metro_synonyms.txt"
},
"profession_specialty_synonym": {
"type": "synonym",
"synonyms_path": "profession_specialty_synonyms.txt"
}
},
"analyzer": {
"auto_suggest_analyzer": {
"filter": [
"lowercase",
"edge_filter"
],
"type": "custom",
"tokenizer": "whitespace"
},
"auto_suggest_search_analyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "whitespace"
},
"lowercase": {
"filter": [
"trim",
"lowercase"
],
"type": "custom",
"tokenizer": "keyword"
}
}
}
}
},
"mappings": {
"properties": {
"what_auto_suggest": {
"type": "text",
"analyzer": "auto_suggest_analyzer",
"search_analyzer": "auto_suggest_search_analyzer",
"fields": {
"raw":{
"type":"keyword"
}
}
},
"company": {
"type": "text",
"analyzer": "lowercase"
},
"where_auto_suggest": {
"type": "text",
"analyzer": "auto_suggest_analyzer",
"search_analyzer": "auto_suggest_search_analyzer",
"fields": {
"raw":{
"type":"keyword"
}
}
},
"tags_auto_suggest": {
"type": "text",
"analyzer": "auto_suggest_analyzer",
"search_analyzer": "auto_suggest_search_analyzer",
"fields": {
"raw":{
"type":"keyword"
}
}
}
}
}
}
The query I am using to pull suggestions:
GET /autosuggest_index_test/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"where_auto_suggest": {
"query": "ca",
"operator": "and"
}
}
}
]
}
},
"aggs": {
"NAME": {
"terms": {
"field": "where_auto_suggest.raw",
"size": 10
}
}
},
"highlight": {
"pre_tags": [
"<b>"
],
"post_tags": [
"</b>"
],
"fields": {
"where_auto_suggest": {
}
}
}
}
One of the JSON results I am getting:
{
"_index" : "autosuggest_index_test",
"_type" : "_doc",
"_id" : "Calabasas CA",
"_score" : 5.755663,
"_source" : {
"where_auto_suggest" : "Calabasas CA"
},
"highlight" : {
"where_auto_suggest" : [
"<b>Calabasas</b> <b>CA</b>"
]
}
}
Can someone please suggest how to get output here (in where_auto_suggest) like "<b>Ca</b>labasas <b>CA</b>"?

If you use an edge_ngram tokenizer instead of an edge_ngram filter, you get highlighted characters instead of highlighted words. The reason is that highlighting is driven by token offsets: an edge_ngram token filter keeps the start/end offsets of the whole original token, so the highlighter wraps the entire word, whereas an edge_ngram tokenizer emits each gram with offsets covering only those characters.
So in your settings, you could declare such a tokenizer:
"settings": {
"index": {
"analysis": {
"tokenizer": {
"edge_tokenizer": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
...
}
}
}
And change your analyzer to:
"analyzer": {
"auto_suggest_analyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "edge_tokenizer"
}
...
}
Your example request will then return:
{
...
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.2876821,
"hits": [
{
"_index": "autosuggest_index_test",
"_type": "_doc",
"_id": "grIzo28BY9R4-IxJhcFv",
"_score": 0.2876821,
"_source": {
"where_auto_suggest": "california"
},
"highlight": {
"where_auto_suggest": [
"<b>ca</b>lifornia"
]
}
}
]
}
...
}
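To see why this works, you can inspect the token offsets directly. A quick sketch using the _analyze API with the index and analyzer names from above:
GET /autosuggest_index_test/_analyze
{
  "analyzer": "auto_suggest_analyzer",
  "text": "California"
}
With the tokenizer-based analyzer, each emitted gram ("c", "ca", "cal", ...) carries its own start_offset and end_offset, e.g. "ca" spans offsets 0-2, so the highlighter wraps only those characters. With the filter-based analyzer, every gram keeps the offsets of the whole original token (0-10 for "california"), which is why the entire word was wrapped in <b> tags.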

Related

Elasticsearch mixing NGram with Simple query string query

Currently, I am using an ngram tokenizer to do partial matching of employees.
I can match on FullName, Email address and Employee Number.
My current setup looks as follows:
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
The problem I am facing is that an Employee Number can be 1 character long, and because of the min_gram and max_gram I can never match it. I can't make min_gram 1 either, because then the results do not look correct.
So I tried to mix the ngram tokenizer with a standard tokenizer, and instead of a multi_match search I am doing a simple_query_string.
This also seems to work partially.
My question is: how can I partially match on all 3 fields, bearing in mind that an employee number can be 1 or 2 characters long, and get an exact match when I put quotes around a word or number?
In the example below, how can I search for 11 and return documents 4 and 5?
Also, I would like document 2 to be returned if I search for 706, which is a partial match; but if I search for "7061", only document 2 should be returned.
Full Code
PUT index
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"my_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "text",
"analyzer": "english",
"fields": {
"exact": {
"type": "text",
"analyzer": "english_exact"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "my_analyzer"
}
},
"analyzer": "standard"
}
}
}
}
PUT index/_doc/1
{
"number" : 1,
"fullName": "Brenda eaton"
}
PUT index/_doc/2
{
"number" : 7061,
"fullName": "Bruce wayne"
}
PUT index/_doc/3
{
"number" : 23,
"fullName": "Bruce Banner"
}
PUT index/_doc/4
{
"number" : 111,
"fullName": "Cat woman"
}
PUT index/_doc/5
{
"number" : 1112,
"fullName": "0723568521"
}
GET index/_search
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact"],
"query": "11"
}
}
}
You need to change the analyzer of the number.exact field and reduce the min_gram to 2. Modify the index mapping as shown below.
A working example:
Index Mapping:
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"my_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "keyword", // note this
"fields": {
"exact": {
"type": "text",
"analyzer": "my_analyzer"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "my_analyzer"
}
},
"analyzer": "standard"
}
}
}
}
Search Query:
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact"],
"query": "11"
}
}
}
Search Result:
"hits": [
{
"_index": "66311552",
"_type": "_doc",
"_id": "4",
"_score": 0.9929736,
"_source": {
"number": 111,
"fullName": "Cat woman"
}
},
{
"_index": "66311552",
"_type": "_doc",
"_id": "5",
"_score": 0.8505551,
"_source": {
"number": 1112,
"fullName": "0723568521"
}
}
]
Update 1:
If you just need to search for 1, modify the data type of the number field from text type to keyword type, as shown in the index mapping above.
Search Query:
{
"query": {
"simple_query_string": {
"fields": [ "fullName.ngram", "number.exact","number"],
"query": "1"
}
}
}
Search Result will be
"hits": [
{
"_index": "66311552",
"_type": "_doc",
"_id": "1",
"_score": 1.3862942,
"_source": {
"number": 1,
"fullName": "Brenda eaton"
}
}
]
Update 2:
You can use two separate analyzers with n-gram tokenizer for the fullName field and number field. Modify with the below index mapping:
{
"settings": {
"analysis": {
"analyzer": {
"english_exact": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"name_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "name_tokenizer"
},
"number_analyzer": {
"filter": [
"lowercase",
"asciifolding"
],
"tokenizer": "number_tokenizer"
}
},
"tokenizer": {
"name_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
},
"number_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
},
"normalizer": {
"lowersort": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"number": {
"type": "keyword",
"fields": {
"exact": {
"type": "text",
"analyzer": "number_analyzer"
}
}
},
"fullName": {
"type": "text",
"fields": {
"ngram": {
"type": "text",
"analyzer": "name_analyzer"
}
},
"analyzer": "standard"
}
}
}
}
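With this mapping, a partial search now behaves as asked. A minimal sketch reusing the simple_query_string from the question: searching for 706 matches document 2, because number_analyzer emits the 3-gram "706" for "7061":
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "fullName.ngram", "number.exact", "number" ],
      "query": "706"
    }
  }
}
Quoting the term ("7061") turns it into a phrase, which still resolves to an exact match on document 2 through the keyword-typed number field.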

Elasticsearch 5.6 multi_match search with edge_ngram returning zero results

I have documents like the following in Elasticsearch:
{
"_index" : "demo_index",
"_type" : "doc",
"_id" : "user_122",
"_version" : 4,
"found" : true,
"_source" : {
"id" : 1520488,
"doc_type" : "user",
"user_email" : "neeraj#test.com",
"user_first_name" : "Neeraj",
"user_last_name" : "Goel"
}
}
And the following query returns zero results.
Note: this case occurs only when the full name is split across the user_first_name and user_last_name fields in the index.
{
"index": "demo_index",
"type": "doc",
"body": {
"from": 0,
"size": "200",
"query": {
"bool": {
"must": [
{
"term": {
"doc_type": "user"
}
},
{
"term": {
"user_status": 1
}
},
{
"multi_match": {
"query": "neeraj goel",
"operator": "AND",
"fuzziness": "AUTO",
"fields": [
"user_first_name.edge_ngram",
"user_last_name.edge_ngram"
]
}
}
],
"should": {
"prefix": {
"user_first_name.sort": "neeraj goel"
}
}
}
},
"sort": {
"_score": {
"order": "desc"
}
}
}
}
The edge_ngram settings used in the index are:
{
"edge_ngram_tokenizer": {
"token_chars": [
"letter"
],
"min_gram": "2",
"type": "edge_ngram",
"max_gram": "8"
}
}
Mappings:
{
"user_first_name": {
"type": "text",
"fields": {
"edge_ngram": {
"type": "text",
"analyzer": "edge_ngram_analyzer"
},
"ngram": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"raw": {
"type": "keyword"
}
},
"analyzer": "standard"
}
}
I am not able to figure out what's wrong with my query because, as mentioned above, the query works fine when there is no user_last_name in the document.
Can anyone please help me out with this query?
Thanks.
As your index settings and mapping are incomplete and it's not clear exactly what you want to achieve, I added an example that works and is close to your use case as I understand it; I will change it once you add more details.
Index definition with mapping and settings:
{
"settings": {
"analysis": {
"analyzer": {
"edge_ngram_analyzer": {
"tokenizer": "edge_ngram_tokenizer"
},
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"token_chars": [
"letter"
],
"min_gram": "2",
"type": "edge_ngram",
"max_gram": "8"
},
"ngram_tokenizer": {
"token_chars": [
"letter"
],
"min_gram": "2",
"type": "ngram",
"max_gram": "8"
}
}
},
"index.max_ngram_diff": 10 // note this
},
"mappings": {
"properties": {
"user_first_name": {
"type": "text",
"fields": {
"edge_ngram": {
"type": "text",
"analyzer": "edge_ngram_analyzer"
},
"ngram": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"raw": {
"type": "keyword"
}
},
"analyzer": "standard"
},
"user_last_name": {
"type": "text",
"fields": {
"edge_ngram": {
"type": "text",
"analyzer": "edge_ngram_analyzer"
},
"ngram": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"raw": {
"type": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
And add a sample doc to the index:
{
"user_first_name" : "Neeraj",
"user_last_name" : "Goel"
}
And the search query:
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "neeraj goel",
"operator": "or", // note this
"fuzziness": "AUTO",
"fields": [
"user_first_name.edge_ngram",
"user_last_name.edge_ngram"
]
}
}
],
"should": {
"prefix": {
"user_first_name.sort": "neeraj goel"
}
}
}
}
}
And the search results:
"hits": [
{
"_index": "myindexedge",
"_type": "_doc",
"_id": "1",
"_score": 1.0500396,
"_source": {
"user_first_name": "Neeraj",
"user_last_name": "Goel"
}
}
]
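An alternative, as a sketch against the same index definition (not part of the original answer): keep operator and but let the terms match across fields with the cross_fields type. Note that cross_fields does not allow fuzziness, so that parameter has to be dropped:
{
  "query": {
    "multi_match": {
      "query": "neeraj goel",
      "operator": "and",
      "type": "cross_fields",
      "fields": [
        "user_first_name.edge_ngram",
        "user_last_name.edge_ngram"
      ]
    }
  }
}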

Why is Elasticsearch not returning results when the query contains an "IN" prefix?

The Elasticsearch query below is not returning any results for my application:
{
"query" : {
"bool" : {
"must" : [
{
"simple_query_string" : {
"query" : "IN-123456",
"fields" : [
"field1.auto^1.0",
"field2.auto^1.0"
],
"flags" : -1,
"default_operator" : "AND",
"analyze_wildcard" : false,
"auto_generate_synonyms_phrase_query" : true,
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"fuzzy_transpositions" : true,
"boost" : 1.0
}
}],
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
}
Note that I have a document present in the Elasticsearch data source with the matching text "IN-123456" for field2.
I am able to find the same document with "123456" as the query text.
Below is the index used
{
"document_****": {
"aliases": {
"document": {}
},
"mappings": {
"_doc": {
"dynamic": "strict",
"date_detection": false,
"properties": {
"#timestamp": {
"type": "date"
},
"field2": {
"type": "keyword",
"fields": {
"auto": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
},
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "document_***",
"creation_date": "1****",
"analysis": {
"filter": {
"autocomplete_filter_30": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "30"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_30": {
"filter": [
"lowercase",
"stop",
"autocomplete_filter_30"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete_nonstop": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1",
"uuid": "***",
"version": {
"created": "6020499"
}
}
}
}
}
Note: Few values are replaced with * for confidentiality reason
Check your mapping. The query below works fine:
POST v_upload_branch/_doc
{
"branch_name":"IN-123456",
"branch_head":"Chennai",
}
GET v_upload_branch/_search
{
"query" : {
"bool" : {
"must" : [
{
"simple_query_string" : {
"query" : "IN-123456",
"fields" : [
"branch_head^1.0",
"branch_name^1.0"
],
"flags" : -1,
"default_operator" : "AND",
"analyze_wildcard" : false,
"auto_generate_synonyms_phrase_query" : true,
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"fuzzy_transpositions" : true,
"boost" : 1.0
}
}],
"adjust_pure_negative" : true,
"boost" : 1.0
}
}
}
After analyzing my index mapping, I found that the stop token filter is removing the prefix "IN" from the token stream, since "in" is part of the default English stop word list:
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stop-tokenfilter.html
Because of this, Elasticsearch ignores the "IN" prefix while searching and returns no results.
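You can confirm this with the _analyze API; a sketch assuming the analyzer names from the settings above:
GET document/_analyze
{
  "analyzer": "autocomplete",
  "text": "IN-123456"
}

GET document/_analyze
{
  "analyzer": "autocomplete_nonstop",
  "text": "IN-123456"
}
The standard tokenizer splits "IN-123456" into the tokens "in" and "123456"; the stop filter in the autocomplete analyzer then removes "in", so only grams of "123456" are indexed. Pointing the field at an analyzer without the stop filter (such as the autocomplete_nonstop already defined in the settings) makes the "IN" prefix searchable.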

Get suggestion on field Elasticsearch

I am trying to build a suggestion feature with Elasticsearch, following this article: https://qbox.io/blog/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams
What I have now works, but not for two words in the same sentence.
The data I currently have in ES:
{
"_index": "books",
"_type": "book",
"_id": "AVJp8p4ZTfM-Ee45GnF5",
"_score": 1,
"_source": {
"title": "Making a dish",
"author": "Jim haunter"
}
},
{
"_index": "books",
"_type": "book",
"_id": "AVJp8jaZTfM-Ee45GnF4",
"_score": 1,
"_source": {
"title": "The big fish",
"author": "Jane Stewart"
}
},
{
"_index": "books",
"_type": "book",
"_id": "AVJp8clRTfM-Ee45GnF3",
"_score": 1,
"_source": {
"title": "The Hunter",
"author": "Jame Franco"
}
}
Here are the mapping and settings:
{"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"books": {
"_all": {
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"properties": {
"title": {
"type": "string",
"index": "no"
},
"author": {
"type": "string",
"index": "no"
}
}
}
}
}
Here is the search:
{
"size": 10,
"query": {
"match": {
"_all": {
"query": "Hunter",
"operator": "and",
"fuzziness": 1
}
}
}
}
When I search for "The" I get
"The big fish" and
"The hunter".
However, when I enter "The Hunt" I get nothing.
To get the book again I need to enter "The Hunte".
Any suggestions?
Any help appreciated.
Removing "index": "no" from the fields worked for me. Also, since I'm using ES 2.x, I had to replace "index_analyzer" with "analyzer". So here is the mapping:
PUT /test_index
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"books": {
"_all": {
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"properties": {
"title": {
"type": "string"
},
"author": {
"type": "string"
}
}
}
}
}
Here's some code I used to test it:
http://sense.qbox.io/gist/0140ee0f5043f66e76cc3109a18d573c1d09280b
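With that mapping, the original query from the question now matches on a partial second word, since "hunt" is among the 2-20 character grams indexed into _all for "The Hunter". A minimal sketch (endpoint assumed):
POST /test_index/_search
{
  "size": 10,
  "query": {
    "match": {
      "_all": {
        "query": "The Hunt",
        "operator": "and",
        "fuzziness": 1
      }
    }
  }
}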

Trying to form an Elasticsearch query for autocomplete

I've read a lot, and it seems that using EdgeNGrams is a good way to implement an autocomplete feature for search applications. I've already configured EdgeNGrams in the settings for my index.
PUT /bigtestindex
{
"settings":{
"analysis":{
"analyzer":{
"autocomplete":{
"type":"custom",
"tokenizer":"standard",
"filter":[ "standard", "stop", "kstem", "ngram" ]
}
},
"filter":{
"edgengram":{
"type":"ngram",
"min_gram":2,
"max_gram":15
}
},
"highlight": {
"pre_tags" : ["<em>"],
"post_tags" : ["</em>"],
"fields": {
"title.autocomplete": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
}
}
So if I have the EdgeNGram filter configured in my settings, how do I add it to the search query?
What I have so far is a match query with highlight:
GET /bigtestindex/doc/_search
{
"query": {
"match": {
"content": {
"query": "thing and another thing",
"operator": "and"
}
}
},
"highlight": {
"pre_tags" : ["<em>"],
"post_tags" : ["</em>"],
"field": {
"_source.content": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
How would I add autocomplete to the search query using EdgeNGrams configured in the settings for the index?
UPDATE
For the mapping, would it be ideal to do something like this:
"title": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
},
Or do I need to use multi_field type:
"title": {
"type": "multi_field",
"fields": {
"title": {
"type": "string"
},
"autocomplete": {
"analyzer": "autocomplete",
"type": "string",
"index": "not_analyzed"
}
}
},
I'm using ES 1.4.1 and want to use the title field for autocomplete purposes.
Short answer: you need to use it in a field mapping. As in:
PUT /test_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"stop",
"kstem",
"ngram"
]
}
},
"filter": {
"edgengram": {
"type": "ngram",
"min_gram": 2,
"max_gram": 15
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field1": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
}
}
}
For a bit more discussion, see:
http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams
and
http://blog.qbox.io/an-introduction-to-ngrams-in-elasticsearch
Also, I don't think you want the "highlight" section in your index definition; that belongs in the query.
EDIT: Upon trying out your code, I found a couple of problems with it. One was the highlight issue I already mentioned. Another is that you named your filter "edgengram" even though it is of type "ngram" rather than type "edgeNGram", and then referenced the filter "ngram" in your analyzer, which picks up the default ngram filter and probably doesn't give you what you want. (Hint: you can use term vectors to figure out what your analyzer is doing to your documents; you probably want to turn them off in production, though.)
So what you actually want is probably something like this:
PUT /test_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"stop",
"kstem",
"edgengram_filter"
]
}
},
"filter": {
"edgengram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 15
}
}
}
},
"mappings": {
"doc": {
"properties": {
"content": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
}
}
}
When I indexed these two docs:
POST test_index/doc/_bulk
{"index":{"_id":1}}
{"content":"hello world"}
{"index":{"_id":2}}
{"content":"goodbye world"}
And ran this query (there was an error in your "highlight" block as well; it should have said "fields" rather than "field"):
POST /test_index/doc/_search
{
"query": {
"match": {
"content": {
"query": "good wor",
"operator": "and"
}
}
},
"highlight": {
"pre_tags": [
"<em>"
],
"post_tags": [
"</em>"
],
"fields": {
"content": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
I get back this response, which seems to be what you're looking for, if I understand you correctly:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2712221,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 0.2712221,
"_source": {
"content": "goodbye world"
},
"highlight": {
"content": [
"<em>goodbye</em> <em>world</em>"
]
}
}
]
}
}
Here is some code I used to test it out:
http://sense.qbox.io/gist/3092992993e0328f7c4ee80e768dd508a0bc053f
