Wrong indexation elasticsearch using the analyser - elasticsearch

I did a pretty simple test. I build a student index and a type, then I define a mapping:
POST student
{
"mappings" : {
"ing3" : {
"properties" : {
"quote": {
"type": "string",
"analyzer": "english"
}
}
}
}
}
After that I add 3 students to this index:
POST /student/ing3/1
{
"name": "Smith",
"first_name" : "John",
"quote" : "Learning is so cool!!"
}
POST /student/ing3/2
{
"name": "Roosevelt",
"first_name" : "Franklin",
"quote" : "I learn everyday"
}
POST /student/ing3/3
{
"name": "Black",
"first_name" : "Mike",
"quote" : "I learned a lot at school"
}
At this point I thought that the english tokeniser will tokenise all the word in my quotes so if I'm making a search like:
GET /etudiant/ing3/_search
{
"query" : {
"term" : { "quote" : "learn" }
}
}
I will have all the document as a result since my tokeniser will make equal "learn, learning, learned" and I was right. But when I try this request:
GET /student/ing3/_search
{
"query" : {
"term" : { "quote" : "learned" }
}
}
I got zero hit and in my opinion I should have the 3rd document (at least?). But for me Elasticsearch is also supposed to index learned and learning not only learn. Am I wrong? Is my request wrong?

If you check:
GET 'index/_analyze?field=quote' -d "I learned a lot at school"
you will see that your sentence is analyzed as:
{
"tokens":[
{
"token":"i",
"start_offset":0,
"end_offset":1,
"type":"<ALPHANUM>",
"position":0
},
{
"token":"learn",
"start_offset":2,
"end_offset":9,
"type":"<ALPHANUM>",
"position":1
},
{
"token":"lot",
"start_offset":12,
"end_offset":15,
"type":"<ALPHANUM>",
"position":3
},
{
"token":"school",
"start_offset":19,
"end_offset":25,
"type":"<ALPHANUM>",
"position":5
}
]
}
So english analyzer removes punctions and stop words and tokenize words in their root form.
https://www.elastic.co/guide/en/elasticsearch/guide/current/using-language-analyzers.html
You can use match query which will also analyze your search text so will match:
GET /etudiant/ing3/_search
{
"query" : {
"match" : { "quote" : "learned" }
}
}

There is another way. You can both stem the terms (the english analyzer does have a stemmer), but also keep the original terms, by using a keyword_repeat token filter and then using a unique token filter with "only_on_same_position": true to remove unnecessary duplicates after the stemming:
PUT student
{
"settings": {
"analysis": {
"analyzer": {
"myAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"keyword_repeat",
"english_stemmer",
"unique_stem"
]
}
},
"filter": {
"unique_stem": {
"type": "unique",
"only_on_same_position": true
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
}
}
},
"mappings": {
"ing3": {
"properties": {
"quote": {
"type": "string",
"analyzer": "myAnalyzer"
}
}
}
}
}
In this case the term query will work, as well. If you look at what terms are actually being indexed:
GET /student/_search
{
"fielddata_fields": ["quote"]
}
it will be clear why now it matches:
"hits": [
{
"_index": "student",
"_type": "ing3",
"_id": "2",
"_score": 1,
"_source": {
"name": "Roosevelt",
"first_name": "Franklin",
"quote": "I learn everyday"
},
"fields": {
"quote": [
"everydai",
"everyday",
"i",
"learn"
]
}
},
{
"_index": "student",
"_type": "ing3",
"_id": "1",
"_score": 1,
"_source": {
"name": "Smith",
"first_name": "John",
"quote": "Learning is so cool!!"
},
"fields": {
"quote": [
"cool",
"learn",
"learning",
"so"
]
}
},
{
"_index": "student",
"_type": "ing3",
"_id": "3",
"_score": 1,
"_source": {
"name": "Black",
"first_name": "Mike",
"quote": "I learned a lot at school"
},
"fields": {
"quote": [
"i",
"learn",
"learned",
"lot",
"school"
]
}
}
]

Related

ElasticSearch Keyword usage with a prefix search

I have a requirement to be able to search a sentence as complete or with prefix. The UI library (reactive search) I am using is generating the query in this way:
"simple_query_string": {
"query": "\"Louis George Maurice Adolphe\"",
"fields": [
"field1",
"field2",
"field3"
],
"default_operator": "or"
}
I am expecting it to returns results for eg.
Louis George Maurice Adolphe (Roche)
but NOT just records containing partial terms like Louis or George
Currently, I have code like this but it only brings the record if I search with complete word Louis George Maurice Adolphe (Roche) but not a prefix Louis George Maurice Adolphe.
{
"settings": {
"analysis": {
"char_filter": {
"space_remover": {
"type": "mapping",
"mappings": [
"\\u0020=>"
]
}
},
"normalizer": {
"lower_case_normalizer": {
"type": "custom",
"char_filter": [
"space_remover"
],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"field3": {
"type": "keyword",
"normalizer": "lower_case_normalizer"
}
}
}
}
}
Any guidance on the above is appreciated. Thanks.
You are not using the prefix query hence not getting result for prefix search terms, I used same mapping and sample doc, but changed the search query which gives the expected results
Index mapping
{
"settings": {
"analysis": {
"char_filter": {
"space_remover": {
"type": "mapping",
"mappings": [
"\\u0020=>"
]
}
},
"normalizer": {
"lower_case_normalizer": {
"type": "custom",
"char_filter": [
"space_remover"
],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"field3": {
"type": "keyword",
"normalizer": "lower_case_normalizer"
}
}
}
}
Indexed sample doc
{
"field3" : "Louis George Maurice Adolphe (Roche)"
}
Search query
{
"query": {
"prefix": {
"field3": {
"value": "Louis George Maurice Adolphe"
}
}
}
}
Search result
"hits": [
{
"_index": "normal",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"field3": "Louis George Maurice Adolphe (Roche)"
}
}
]
The underlying issue stems from the fact that you're applying a whitespace remover. What this practically means is that when you ingest your docs:
GET your_index_name/_analyze
{
"text": "Louis George Maurice Adolphe (Roche)",
"field": "field3"
}
they're indexed as
{
"tokens" : [
{
"token" : "louisgeorgemauriceadolphe(roche)",
"start_offset" : 0,
"end_offset" : 36,
"type" : "word",
"position" : 0
}
]
}
So if you indend to use simple_string, you may want to rethink your normalizers.
#Ninja's answer fails when you search for George Maurice Adolphe, i.e. no prefix intersection.

How to do partial search and get relevant score in Elasticsearch

I am new to Elasticsearch, trying to do some search.
I have names of objects like :
Homework
work
jobroles
jobs
I am using wildcard query, but its returning score of 1.0 for each docs.
I want score based on how well it matched. Ex
Ex. If I type
work
score of work > homework
Its a good question and directly you can't get the exact match on top, what you need is ngram analyzer which provides the partial matches and another field which stores the exact tokens in lowercase(text field with standard analyzer will solve it).
I've reproduced your issue and solved it using above mentioned approach, Please refer my blog on autocomplete and my this SO answer for in-depth read of various autocomplete/partial searches and why/what/how part of it.
Working example
Create index mapping
{
"settings": {
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "ngram",
"min_gram": 1,
"max_gram": 10
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
},
"index.max_ngram_diff" : 10
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"title_lowercase" :{
"type" : "text"
}
}
}
}
Index your sample docs
{
"title" : "Homework",
"title_lowercase" : "Homework"
}
{
"title" : "work",
"title_lowercase" : "work"
}
Search query
{
"query": {
"bool": {
"should": [
{
"match": {
"title": {
"query": "work"
}
}
},
{
"match": {
"title_lowercase": {
"query": "work"
}
}
}
]
}
}
}
And expected result
"hits": [
{
"_index": "internaledge",
"_type": "_doc",
"_id": "1",
"_score": 0.9926754, /note score of `work` is much higher than`homework`
"_source": {
"title": "work",
"title_lowercase": "work"
}
},
{
"_index": "internaledge",
"_type": "_doc",
"_id": "2",
"_score": 0.2995283,
"_source": {
"title": "Homework",
"title_lowercase": "Homework"
}
}
]

ElasticSearch: Highlighting with Stemming

I have read this question and attempted to understand the documentation here, but this is complicated.
The problem (I think):
[update 1]
I am using Scala for my code and interface with ES High Level Java API.
I have a stemming analyzer configured. If I search for responsibilities i get results for responsibilities and responsibility. That's great.
BUT
Only the documents with the term responsibilities return highlights.
This is because the search is on the stemmed content , i.e., responsib. However, the highlight is against the unstemmed content. Hence, it finds responsibilities which was a search criteria, but not responsibility, which wasn't.
If I set the highlighter to highlight on the stemmed content, it returns nothing at all. I guess because it is comparing resonsib with responsibilities
Search
I an using the Java high level API. The problem is not the code itself.
Currently, I am highlighting only the content field, returning only responsibilities. Highlighting content.english seems to return nothing
private def buildHighlighter(): HighlightBuilder = {
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder
val highlightBuilder = new HighlightBuilder
val highlightContent = new HighlightBuilder.Field("content")
highlightContent.highlighterType("unified")
highlightBuilder.field(highlightContent)
highlightBuilder
}
Mapping (adumbrated)
{
"settings": {
"number_of_shards": 3,
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": []
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"title": {
"type": "text",
"fields": {
"english": {
"type": "text",
"analyzer": "english"
}
}
},
"content": {
"type": "text",
"fields": {
"english": {
"type": "text",
"analyzer": "english"
}
}
}
}
}
}
[update 2]
Scala code to implement search:
def searchByField(indices: Seq[ESIndexName], terms: Seq[(String, String)], size: Int = 20): SearchResponse = {
val searchRequest = new SearchRequest
searchRequest.indices(indices.map(idx => idx.completeIndexName()): _*)
searchRequest.source(buildTargetFieldsMatchQuery(terms, size))
searchRequest.indicesOptions(IndicesOptions.strictSingleIndexNoExpandForbidClosed())
client.search(searchRequest, RequestOptions.DEFAULT)
}
and query is built as follows:
private def buildTargetFieldsMatchQuery(termsByField: Seq[(String, String)], size: Int): SearchSourceBuilder = {
val query = new BoolQueryBuilder
termsByField.foreach {
case (field, term) =>
if (field == "content") {
logger.debug(field + " should have " + term)
query.should(new MatchQueryBuilder(field+standardAnalyzer, term.toLowerCase))
query.should(new MatchQueryBuilder(field, term.toLowerCase))
}
else if (field == "title"){
logger.debug(field + " should have " + term)
query.should(new MatchQueryBuilder(field+standardAnalyzer, term.toLowerCase())).boost
}
else {
logger.debug(field + " should have " + term)
query.should(new MatchQueryBuilder(field, term.toLowerCase))
}
}
val sourceBuilder: SearchSourceBuilder = new SearchSourceBuilder()
sourceBuilder.query(query)
sourceBuilder.from(0)
sourceBuilder.size(size)
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS))
sourceBuilder.highlighter(buildHighlighter())
}
With plain REST the following is working fine for me:
PUT test
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": []
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"content": {
"type": "text",
"fields": {
"english": {
"type": "text",
"analyzer": "english"
}
}
}
}
}
}
}
POST test/_doc/
{
"content": "This is my responsibility"
}
POST test/_doc/
{
"content": "These are my responsibilities"
}
GET test/_search
{
"query": {
"match": {
"content.english": "responsibilities"
}
},
"highlight": {
"fields": {
"content.english": {
"type": "unified"
}
}
}
}
The result is then:
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "5D5PPGoBqgTTLzdtM-_Y",
"_score" : 0.18232156,
"_source" : {
"content" : "This is my responsibility"
},
"highlight" : {
"content.english" : [
"This is my <em>responsibility</em>"
]
}
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "5T5PPGoBqgTTLzdtZe8U",
"_score" : 0.18232156,
"_source" : {
"content" : "These are my responsibilities"
},
"highlight" : {
"content.english" : [
"These are my <em>responsibilities</em>"
]
}
}
]
Looking at your Java / Groovy (?) code it looks close enough to the example in the docs. Could you log the actual query you are running, so we can figure out what is going wrong? Generally it should work like this.

How Elasticsearch relevance score gets calculated?

I am using multi_match with phrase_prefix for full text search in Elasticsearch 5.5. ES query looks like
{
query: {
bool: {
must: {
multi_match: {
query: "butt",
type: "phrase_prefix",
fields: ["item.name", "item.keywords"],
max_expansions: 10
}
}
}
}
}
I am getting following response
[
{
"_index": "items_index",
"_type": "item",
"_id": "2",
"_score": 0.61426216,
"_source": {
"item": {
"keywords": "amul butter, milk, butter milk, flavoured",
"name": "Flavoured Butter"
}
}
},
{
"_index": "items_index",
"_type": "item",
"_id": "1",
"_score": 0.39063013,
"_source": {
"item": {
"keywords": "amul butter, milk, butter milk",
"name": "Butter Milk"
}
}
}
]
Mappings is as follows(I am using default mappings)
{
"items_index" : {
"mappings" : {
"parent_doc": {
...
"properties": {
"item" : {
"properties" : {
"keywords" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
How item with "name": "Flavoured Butter" getting higher score of 0.61426216 against the document with "name": "Butter Milk" and score 0.39063013?
I tried applying boost to "item.name" and removing "item.keywords" form search fields getting same results.
How scores in Elasticsearch works? Are above results correct in terms of relavance?
The scoring for phrase_prefix is similar to that of best_fields, meaning that score of a document is the score obtained from the best_field, which here is item.keywords.
So, item.name isn't adding to score
Refer: multi-match-types
You can use 2 multi_match queries to combine the score from keywords and name.
{
"query": {
"bool": {
"must": [{
"multi_match": {
"query": "butt",
"type": "phrase_prefix",
"fields": [
"item.keywords"
],
"max_expansions": 10
}
},{
"multi_match": {
"query": "butt",
"type": "phrase_prefix",
"fields": [
"item.name"
],
"max_expansions": 10
}
}]
}
}
}

Understanding Elasticsearch synonym

Being very new in Elasticsearch, I'm not sure what's the best way to use synonym.
I have two fields, one is hashtag and another one is name. Hashtag containing names in lower case without whitespace whereas name contains actual name in camel case format.
I want to search based on name in the right format and want to get all matching names along with those docs where it matches hashtag as well.
For example, name contains "Tom Cruise" and hashtag is "tomcruise". I want to search "Tom Cruise" and expected result is it will return all docs which has either name "Tom Cruise" or hashtag "tomcruise".
Here is the way I'm creating this index:
PUT /my_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"synonym" : {
"type" : "synonym",
"ignore_case" : true,
"synonyms" : [
"tom cruise => tomcruise, tom cruise"
]
}
},
"analyzer": {
"synonym" : {
"tokenizer" : "whitespace",
"filter" : ["synonym"]
}
}
}
}
}
PUT /my_index/my_type/_mapping
{
"my_type": {
"properties": {
"hashtag": {
"type": "string",
"search_analyzer": "synonym",
"analyzer": "standard"
},
"name":{
"type": "keyword"
}
}
}
}
POST /my_index/my_type/_bulk
{ "index": { "_id": 1 }}
{ "hashtag": "tomcruise", "name": "abc" }
{ "index": { "_id": 2 }}
{ "hashtag": "tomhanks", "name": "efg" }
{ "index": { "_id": 3 }}
{ "hashtag": "tomcruise" , "name": "efg" }
{ "index": { "_id": 4 }}
{ "hashtag": "news" , "name": "Tom Cruise"}
{ "index": { "_id": 5 }}
{ "hashtag": "celebrity", "name": "Kate Winslet" }
{ "index": { "_id": 6 }}
{ "hashtag": "celebrity", "name": "Tom Cruise" }
When I do analyze, it looks like I get the right tokens: [tomcruise, tom, cruise]
GET /my_index/_analyze
{
"text": "Tom Cruise",
"analyzer": "synonym"
}
Here's how I'm searching:
POST /my_index/my_type/_search?pretty
{
"query":
{
"multi_match": {
"query": "Tom Cruise",
"fields": [ "hashtag", "name" ]
}
}
}
Is this the right way to archive my search requirement?
What's the best way to search like this on Kibana? I have to use the entire query but what I need to do if I want to just type "Tom Cruise" and want to get the expected result? I tried with "_all" but didn't work.
Updated:
After discussing with Russ Cam and with my little knowledge of Elasticsearch, I thought it will be overkill to use synonym for my search requirement. So I changed search analyzer to generate same token and got the same result. Still want to know whether I'm doing it in the right way.
PUT /my_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"word_joiner": {
"type": "word_delimiter",
"catenate_all": true
}
},
"analyzer": {
"test_analyzer" : {
"type": "custom",
"tokenizer" : "keyword",
"filter" : ["lowercase", "word_joiner"]
}
}
}
}
}

Resources