Elasticsearch: Shingles not working with odd no. of search words

Elasticsearch: Shingles not working with odd no. of search words - elasticsearch

When trying to search with odd no. of words with my shingle analyzer no result shows up.
'new delhi abcd' does not return any results but 'new delhi abcd xyz' returns
My analyzers look like below
{
"search": {
"settings": {
"index": {
"analysis": {
"filter": {
"my_shingle_filter": {
"max_shingle_size": "2",
"min_shingle_size": "2",
"output_unigrams": "false",
"type": "shingle"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "40"
}
},
"analyzer": {
"my_shingle_analyzer": {
"filter": [
"lowercase",
"my_shingle_filter"
],
"type": "custom",
"tokenizer": "whitespace"
},
"shingle_with_autocomplete": {
"filter": [
"lowercase",
"my_shingle_filter",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "whitespace"
}
}
}
}
}
}
}
The mapping for the type is shared below
{
"search": {
"mappings": {
"address": {
"properties": {
"full_address": {
"type": "text",
"norms": false,
"fields": {
"edge_n_grams": {
"type": "text",
"norms": false,
"analyzer": "autocomplete"
},
"shingles": {
"type": "text",
"norms": false,
"analyzer": "shingle_with_autocomplete"
},
"synonym": {
"type": "text",
"norms": false,
"analyzer": "synonym_autocomplete"
}
},
"analyzer": "whitespace"
}
}
}
}
}
}
Query 1
GET search/address/_search?_source=full_address
{
"query": {
"match": {
"full_address.shingles": {
"query": "new delhi",
"analyzer": "my_shingle_analyzer"
}
}
}
}
Results
{
...
"hits": {
"total": 21801,
"max_score": 8.015874,
"hits": [{
"_index": "search",
"_type": "address",
"_id": "581c50297fd84ecc35420570",
"_score": 8.015874,
"_source": {
"full_address": "new delhi nagar palika adarsh vidyalay new delhi nagar palika adarsh vidyalay tilak lane tilak marg area new delhi delhi 110001"
}
}, {
"_index": "search",
"_type": "address",
"_id": "581c502a7fd84ecc35422010",
"_score": 7.013889,
"_source": {
"full_address": "kingdom hall of jehovant witness gyan bharti public school saket new delhi new delhi delhi 110017"
}
},
...
]
}
}
Query 2
GET search/address/_search?_source=full_address
{
"query": {
"match": {
"full_address.shingles": {
"query": "new delhi nag",
"analyzer": "my_shingle_analyzer"
}
}
}
}
Results
{
...,
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
'new delhi nagar pal' returns proper results.
Any help/insights would be appreciated

This is a bug in ES 5.2.x. Issue is with the single filter in new version

Related

why is shingle token filter with analyser isn't yielding expected results?

Hi here are my index details:
PUT shingle_test
{
"settings": {
"analysis": {
"analyzer": {
"evolutionAnalyzer": {
"tokenizer": "standard",
"filter": [
"standard",
"custom_shingle"
]
}
},
"filter": {
"custom_stop": {
"type": "stop",
"stopwords": "_english_"
},
"custom_shingle": {
"type": "shingle",
"min_shingle_size": "2",
"max_shingle_size": "10",
"output_unigrams": false
}
}
}
},
"mappings": {
"legacy" : {
"properties": {
"name": {
"type": "text",
"fields": {
"shingles": {
"type": "text",
"analyzer": "standard",
"search_analyzer": "evolutionAnalyzer"
},
"as_is": {
"type": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
}
Added 2 docs
PUT shingle_test/legacy/1
{
"name": "Chandni Chowk 2 Banglore"
}
PUT shingle_test/legacy/2
{
"name": "Chandni Chowk"
}
Nothing is being returned if I do this,
GET shingle_test/_search
{
"query": {
"match": {
"name": {
"query": "Chandni Chowk",
"analyzer": "evolutionAnalyzer"
}
}
}
}
Looked at all possible solutions online, didn't get any.
Also, if I do "output_unigrams": true, then it just works like match query and gives results.
The thing I'm trying to achieve:
Having these documents:
Chandni Chowk 2 Bangalore
Chandni Chowk
CCD Bangalore
Istah shawarma and biryani
Istah
So,
searching for "Chandni Chowk 2 Bangalore" should return 1, 2
searching for "Chandni Chowk" should return 1, 2
searching for "Istah shawarma and biryani" should return 4, 5
searching for "Istah" should return 4, 5
searching for "CCD Bangalore" should return 3
note: search keyword will always be exactly equal to value of the name field in the document ex: In this particular index, we can query "Chandni Chowk 2 Bangalore", "Chandni Chowk", "CCD Bangalore", "Istah shawarma and biryani", "Istah". "CCD" won't be queried on this index.

The analyzer parameter specifies the analyzer used for text analysis when indexing or searching a text field.
Modify your index mapping as
{
"settings": {
"analysis": {
"analyzer": {
"evolutionAnalyzer": {
"tokenizer": "standard",
"filter": [
"standard",
"custom_shingle"
]
}
},
"filter": {
"custom_stop": {
"type": "stop",
"stopwords": "_english_"
},
"custom_shingle": {
"type": "shingle",
"min_shingle_size": "2",
"max_shingle_size": "10",
"output_unigrams": true // note this
}
}
}
},
"mappings": {
"legacy" : {
"properties": {
"name": {
"type": "text",
"fields": {
"shingles": {
"type": "text",
"analyzer": "evolutionAnalyzer", // note this
"search_analyzer": "evolutionAnalyzer"
},
"as_is": {
"type": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
}
And, the modified search query will be
{
"query": {
"match": {
"name.shingles": {
"query": "Chandni Chowk"
}
}
}
}
Search Results:
"hits": [
{
"_index": "66127416",
"_type": "_doc",
"_id": "2",
"_score": 0.25759193,
"_source": {
"name": "Chandni Chowk"
}
},
{
"_index": "66127416",
"_type": "_doc",
"_id": "1",
"_score": 0.19363807,
"_source": {
"name": "Chandni Chowk 2 Banglore"
}
}
]

Synonym analyzer not working

Here are my settings:
{
"countries": {
"aliases": {},
"mappings": {
"country": {
"properties": {
"countryName": {
"type": "string"
}
}
}
},
"settings": {
"index": {
"creation_date": "1472140045116",
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"synonym"
],
"tokenizer": "whitespace"
}
}
},
"number_of_shards": "5",
"number_of_replicas": "1",
"uuid": "7-fKyD9aR2eG3BwUNdadXA",
"version": {
"created": "2030599"
}
}
},
"warmers": {}
}
}
My synonym.txt file is in the config folder inside the main elasticsearch folder.
Here is my query:
query: {
query_string: {
fields: ["countryName"],
default_operator: "AND",
query: searchInput,
analyzer: "synonym"
}
}
The words in synonym.txt are: us, u.s., united states.
So this doesn't work. What's interesting is that search works as normal, except for when I enter any of the words in the synonym.txt file. So for example, when I usually type in us into the search, I would get results. With this analyzer, us doesn't give me anything.
I've done close and open to my ES server, and still it doesn't work.
EDIT
An example of a document:
{
"_index": "countries",
"_type": "country",
"_id": "57aabeb80057405968de152b",
"_score": 1,
"_source": {
"countryName": "United States"
}
Example of searchInput (this is coming from the front-end):
united states
EDIT #2:
Here is my updated index config file:
{
"countries": {
"aliases": {},
"mappings": {},
"settings": {
"index": {
"number_of_shards": "5",
"creation_date": "1472219634083",
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"synonym"
],
"tokenizer": "whitespace"
}
}
},
"country": {
"properties": {
"countryName": {
"type": "string",
"analyzer": "synonym"
},
"number_of_replicas": "1",
"uuid": "50ZwpIVFTqeD_rJxlmd59Q",
"version": {
"created": "2030599"
}
}
},
"warmers": {}
}
}
}
}
When I try adding documents, and doing a search on said documents, the synonym analyzer does not work for me.
EDIT #3
Here are 2 documents in the index:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [{
"_index": "stocks",
"_type": "stock",
"_id": "2",
"_score": 1,
"_source": {
"countryName": "United States"
}
}, {
"_index": "stocks",
"_type": "stock",
"_id": "1",
"_score": 1,
"_source": {
"countryName": "Canada"
}
}]
}
}

You are close, but I suggest reading thoroughly this section from the documentation to understand better this functionality.
As a solution:
PUT /countries
{
"mappings": {
"country": {
"properties": {
"countryName": {
"type": "string",
"analyzer": "synonym"
}
}
}
},
"settings": {
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
}
},
"analyzer": {
"synonym": {
"filter": [
"lowercase",
"synonym"
],
"tokenizer": "whitespace"
}
}
}
}
}
You need to delete the index and create it again with the mapping above.
Then use this query:
"query": {
"query_string": {
"fields": [
"countryName"
],
"default_operator": "AND",
"query": "united states"
}
}

Have you deleted/created the index after pushing the txt ?
I think you should remove the "synonyms": "" if you are using "synonyms_path"

elasticsearch not returning text when entered partial word

I have my analyzers set like this:
"analyzer": {
"edgeNgram_autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "autocomplete"]
},
"full_name": {
"filter":["standard","lowercase","asciifolding"],
"type":"custom",
"tokenizer":"standard"
}
My filter:
"filter": {
"autocomplete": {
"type": "edgeNGram",
"side":"front",
"min_gram": 1,
"max_gram": 50
}
Name field analyzer:
"textbox": {
"_parent": {
"type": "document"
},
"properties": {
"text": {
"fields": {
"text": {
"type":"string",
"analyzer":"full_name"
},
"autocomplete": {
"type": "string",
"index_analyzer": "edgeNgram_autocomplete",
"search_analyzer": "full_name",
"analyzer": "full_name"
}
},
"type":"multi_field"
}
}
}
Put all together, makes up my mapping for docstore index:
PUT http://localhost:9200/docstore
{
"settings": {
"analysis": {
"analyzer": {
"edgeNgram_autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "autocomplete"]
},
"full_name": {
"filter":["standard","lowercase","asciifolding"],
"type":"custom",
"tokenizer":"standard"
}
},
"filter": {
"autocomplete": {
"type": "edgeNGram",
"side":"front",
"min_gram": 1,
"max_gram": 50
} }
}
},
"mappings": {
"space": {
"properties": {
"name": {
"type": "string",
"index": "not_analyzed"
}
}
},
"document": {
"_parent": {
"type": "space"
},
"properties": {
"name": {
"type": "string",
"index": "not_analyzed"
}
}
},
"textbox": {
"_parent": {
"type": "document"
},
"properties": {
"bbox": {
"type": "long"
},
"text": {
"fields": {
"text": {
"type":"string",
"analyzer":"full_name"
},
"autocomplete": {
"type": "string",
"index_analyzer": "edgeNgram_autocomplete",
"search_analyzer": "full_name",
"analyzer":"full_name"
}
},
"type":"multi_field"
}
}
},
"entity": {
"_parent": {
"type": "document"
},
"properties": {
"bbox": {
"type": "long"
},
"name": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
Add a space to hold all docs:
POST http://localhost:9200/docstore/space
{
"name": "Space 1"
}
When user enters word: proj
this should return, all text:
SampleProject
Sample Project
Project Name
myProjectname
firstProjectName
my ProjectName
But it returns nothing.
My query:
POST http://localhost:9200/docstore/textbox/_search
{
"query": {
"match": {
"text": "proj"
}
},
"filter": {
"has_parent": {
"type": "document",
"query": {
"term": {
"name": "1-a1-1001.pdf"
}
}
}
}
}
If I search by project, I get:
{ "took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 3.0133555,
"hits": [
{
"_index": "docstore",
"_type": "textbox",
"_id": "AVRuV2d_f4y6IKuxK35g",
"_score": 3.0133555,
"_routing": "AVRuVvtLf4y6IKuxK33f",
"_parent": "AVRuV2cMf4y6IKuxK33g",
"_source": {
"bbox": [
8750,
5362,
9291,
5445
],
"text": [
"Sample Project"
]
}
},
{
"_index": "docstore",
"_type": "textbox",
"_id": "AVRuV2d_f4y6IKuxK35Y",
"_score": 2.4106843,
"_routing": "AVRuVvtLf4y6IKuxK33f",
"_parent": "AVRuV2cMf4y6IKuxK33g",
"_source": {
"bbox": [
8645,
5170,
9070,
5220
],
"text": [
"Project Name and Address"
]
}
}
]
}
}
Maybe my edgengram is not suited for this?
I am saying:
side":"front"
Should I do it differently?
Does anyone know what I am doing wrong?

The problem is with the autocomplete indexing analyzer field name.
Change:
"index_analyzer": "edgeNgram_autocomplete"
To:
"analyzer": "edgeNgram_autocomplete"
And also search like (#Andrei Stefan) showed in his answer:
POST http://localhost:9200/docstore/textbox/_search
{
"query": {
"match": {
"text.autocomplete": "proj"
}
}
}
And it will work as expected!
I have tested your configuration on Elasticsearch 2.3
By the way, type multi_field is deprecated.
Hope I have managed to help :)

Your query should actually try to match on text.autocomplete and not text:
"query": {
"match": {
"text.autocomplete": "proj"
}
}

Get suggestion on field Elasticsearch

I am trying to make a suggestion feature with Elasticsearch.
following this article https://qbox.io/blog/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams
What I have now works but not for two words in the same sentence.
The data I have now in ES is.
{
"_index": "books",
"_type": "book",
"_id": "AVJp8p4ZTfM-Ee45GnF5",
"_score": 1,
"_source": {
"title": "Making a dish",
"author": "Jim haunter"
}
},
{
"_index": "books",
"_type": "book",
"_id": "AVJp8jaZTfM-Ee45GnF4",
"_score": 1,
"_source": {
"title": "The big fish",
"author": "Jane Stewart"
}
},
{
"_index": "books",
"_type": "book",
"_id": "AVJp8clRTfM-Ee45GnF3",
"_score": 1,
"_source": {
"title": "The Hunter",
"author": "Jame Franco"
}
}
Here is the mapping and settings.
{"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"books": {
"_all": {
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"properties": {
"title": {
"type": "string",
"index": "no"
},
"author": {
"type": "string",
"index": "no"
}
}
}
}
}
Here is the search
{
"size": 10,
"query": {
"match": {
"_all": {
"query": "Hunter",
"operator": "and",
"fuzziness": 1
}
}
}
}
when I search for "The" I get
"The big fish" and
"The hunter".
However when I enter "The Hunt" I get nothing.
To get the book again I need to enter "The Hunte".
Any suggestions?
Any help appreciated.

Removing "index": "no" from the fields worked for me. Also, since I'm using ES 2.x, I had to replace "index_analyzer" with "analyzer". So here is the mapping:
PUT /test_index
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"books": {
"_all": {
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"properties": {
"title": {
"type": "string"
},
"author": {
"type": "string"
}
}
}
}
}
Here's some code I used to test it:
http://sense.qbox.io/gist/0140ee0f5043f66e76cc3109a18d573c1d09280b

Trying to form an Elasticsearch query for autocomplete

I've read a lot and it seems that using EdgeNGrams is a good way to go for implementing an autocomplete feature for search applications. I've already configured the EdgeNGrams in my settings for my index.
PUT /bigtestindex
{
"settings":{
"analysis":{
"analyzer":{
"autocomplete":{
"type":"custom",
"tokenizer":"standard",
"filter":[ "standard", "stop", "kstem", "ngram" ]
}
},
"filter":{
"edgengram":{
"type":"ngram",
"min_gram":2,
"max_gram":15
}
},
"highlight": {
"pre_tags" : ["<em>"],
"post_tags" : ["</em>"],
"fields": {
"title.autocomplete": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
}
}
So if in my settings I have the EdgeNGram filter configured how do I add that to the search query?
What I have so far is a match query with highlight:
GET /bigtestindex/doc/_search
{
"query": {
"match": {
"content": {
"query": "thing and another thing",
"operator": "and"
}
}
},
"highlight": {
"pre_tags" : ["<em>"],
"post_tags" : ["</em>"],
"field": {
"_source.content": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
How would I add autocomplete to the search query using EdgeNGrams configured in the settings for the index?
UPDATE
For the mapping, would it be ideal to do something like this:
"title": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
},
Or do I need to use multi_field type:
"title": {
"type": "multi_field",
"fields": {
"title": {
"type": "string"
},
"autocomplete": {
"analyzer": "autocomplete",
"type": "string",
"index": "not_analyzed"
}
}
},
I'm using ES 1.4.1 and want to use the title field for autocomplete purposes.... ?

Short answer: you need to use it in a field mapping. As in:
PUT /test_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"stop",
"kstem",
"ngram"
]
}
},
"filter": {
"edgengram": {
"type": "ngram",
"min_gram": 2,
"max_gram": 15
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field1": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
}
}
}
For a bit more discussion, see:
http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams
and
http://blog.qbox.io/an-introduction-to-ngrams-in-elasticsearch
Also, I don't think you want the "highlight" section in your index definition; that belongs in the query.
EDIT: Upon trying out your code, there are a couple of problems with it. One was the highlight issue I already mentioned. Another is that you named your filter "edgengram", even though it is of type "ngram" rather than type "edgeNGram", but then you referenced the filter "ngram" in your analyzer, which will use the default ngram filter, which probably doesn't give you what you want. (Hint: you can use term vectors to figure out what your analyzer is doing to your documents; you probably want to turn them off in production, though.)
So what you actually want is probably something like this:
PUT /test_index
{
"settings": {
"analysis": {
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"standard",
"stop",
"kstem",
"edgengram_filter"
]
}
},
"filter": {
"edgengram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 15
}
}
}
},
"mappings": {
"doc": {
"properties": {
"content": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "standard"
}
}
}
}
}
When I indexed these two docs:
POST test_index/doc/_bulk
{"index":{"_id":1}}
{"content":"hello world"}
{"index":{"_id":2}}
{"content":"goodbye world"}
And ran this query (there was an error in your "highlight" block as well; should have said "fields" rather than "field")"
POST /test_index/doc/_search
{
"query": {
"match": {
"content": {
"query": "good wor",
"operator": "and"
}
}
},
"highlight": {
"pre_tags": [
"<em>"
],
"post_tags": [
"</em>"
],
"fields": {
"content": {
"number_of_fragments": 1,
"fragment_size": 250
}
}
}
}
I get back this response, which seems to be what you're looking for, if I understand you correctly:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2712221,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 0.2712221,
"_source": {
"content": "goodbye world"
},
"highlight": {
"content": [
"<em>goodbye</em> <em>world</em>"
]
}
}
]
}
}
Here is some code I used to test it out:
http://sense.qbox.io/gist/3092992993e0328f7c4ee80e768dd508a0bc053f

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Elasticsearch: Shingles not working with odd no. of search words - elasticsearch

This is a bug in ES 5.2.x. Issue is with the single filter in new version

Related

why is shingle token filter with analyser isn't yielding expected results?

Synonym analyzer not working

elasticsearch not returning text when entered partial word

Get suggestion on field Elasticsearch

Trying to form an Elasticsearch query for autocomplete

Categories

Resources