Elasticsearch accented and non-accented words management

I created an index:
PUT members
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "analyzer": {
        "accentedNames": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        },
        "standardNames": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "member": {
      "properties": {
        "id": {
          "type": "text"
        },
        "name": {
          "type": "text",
          "analyzer": "standardNames",
          "fields": {
            "accented": {
              "type": "text",
              "analyzer": "accentedNames"
            }
          }
        }
      }
    }
  }
}
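As a quick sanity check on the two analyzers, the _analyze API can be run against the index (a sketch; the expected tokens follow from the lowercase and asciifolding filters):
GET members/_analyze
{
  "analyzer": "accentedNames",
  "text": "Maéllys"
}
This should return the single token maellys, while the same request with "analyzer": "standardNames" should return maéllys with the accent preserved.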
Assume that documents like these are indexed (EDIT):
{"1", "Maéllys Macron"};
{"2", "Maêllys Alix"};
{"3", "Maëllys Rosa"};
{"4", "Maèllys Alix"};
{"5", "Maellys du Bois"};
This is the result I want: when I search for "Maéllys", I expect the document with the exact accented form, "Maéllys Macron", to be the best match, and all the other documents to share the same lower score.
What I did was use both analyzers with a request like this:
GET members/member/_search
{
  "query": {
    "multi_match": {
      "query": "Maéllys",
      "fields": [ "name", "name.accented" ]
    }
  }
}
"Maéllys Richard" has the best score. The documents "Ma(ê|ë|é|è)llys Richard have the same score that is higher than "Maellys Richard" document.
Can someone help me?
Thanks.

Related

How to filter aggregation results in elasticsearch (v 6.3)

I have an array for the field commodity_line, e.g. [3,35,1,11,12], [3,12]. I am trying to query the field for autocomplete results, and I need the output to be 3 and 35 when I match on 3. My indexing works fine for all scenarios except when I am working with an array data type.
I need to filter the aggregation results to give 3 and 35, which I am unable to retrieve. I need to use facet_filter or a filter with a prefix, similar to facet.prefix in Solr.
Let me know if I need to change the query or the mapping.
Query:
GET contracts/doc/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": {
        "match": {
          "commodity_line.autocomplete": "3"
        }
      }
    }
  },
  "aggs": {
    "names": {
      "terms": {
        "field": "commodity_line.keyword"
      }
    }
  }
}
Mapping:
PUT contracts
{
  "settings": {
    "analysis": {
      "filter": {
        "gramFilter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 20,
          "token_chars": [
            "letter",
            "symbol",
            "digit"
          ]
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "trim",
            "gramFilter",
            "asciifolding"
          ]
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "properties": {
        "commodity_line": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            },
            "autocomplete": {
              "type": "text",
              "analyzer": "autocomplete",
              "search_analyzer": "standard"
            }
          }
        }
      }
    }
  }
}
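Note that because gramFilter is an edge_ngram filter with min_gram 1, each array value is indexed under all of its prefixes; this can be checked with the _analyze API (a quick sanity check against the mapping above):
GET contracts/_analyze
{
  "analyzer": "autocomplete",
  "text": "35"
}
This returns the tokens 3 and 35, which is why a query for 3 also matches documents whose array contains 35.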
I have found a solution:
I had to match terms against a prefix in the aggregation rather than filtering the results.
"aggs" : {
"names":{
"terms":{
"field":"commodity_line.keyword",
"include" : "3.*"
}
}
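For reference, the include parameter of the terms aggregation takes a regular expression that every bucket key must match (there is a corresponding exclude parameter as well), so the complete request would look roughly like this:
GET contracts/doc/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": {
        "match": {
          "commodity_line.autocomplete": "3"
        }
      }
    }
  },
  "aggs": {
    "names": {
      "terms": {
        "field": "commodity_line.keyword",
        "include": "3.*"
      }
    }
  }
}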

Why can't I get data from Elasticsearch?

Elasticsearch version 6.2.4.
I set up an Elasticsearch environment with the following index settings and mappings:
{
  "state": "open",
  "settings": {
    "index": {
      "number_of_shards": "5",
      "provided_name": "lara_cart",
      "creation_date": "1529082175034",
      "analysis": {
        "filter": {
          "engram": {
            "type": "edgeNGram",
            "min_gram": "1",
            "max_gram": "36"
          },
          "maxlength": {
            "type": "length",
            "max": "36"
          },
          "word_delimiter": {
            "split_on_numerics": "false",
            "generate_word_parts": "true",
            "preserve_original": "true",
            "generate_number_parts": "true",
            "catenate_all": "true",
            "split_on_case_change": "true",
            "type": "word_delimiter",
            "catenate_numbers": "true"
          }
        },
        "char_filter": {
          "normalize": {
            "mode": "compose",
            "name": "nfkc",
            "type": "icu_normalizer"
          },
          "whitespaces": {
            "pattern": "\\s[2,]",
            "type": "pattern_replace",
            "replacement": "\u0020"
          }
        },
        "analyzer": {
          "keyword_analyzer": {
            "filter": [
              "lowercase",
              "trim",
              "maxlength"
            ],
            "char_filter": [
              "normalize",
              "whitespaces"
            ],
            "type": "custom",
            "tokenizer": "keyword"
          },
          "autocomplete_index_analyzer": {
            "filter": [
              "lowercase",
              "trim",
              "maxlength",
              "engram"
            ],
            "char_filter": [
              "normalize",
              "whitespaces"
            ],
            "type": "custom",
            "tokenizer": "keyword"
          },
          "autocomplete_search_analyzer": {
            "filter": [
              "lowercase",
              "trim",
              "maxlength"
            ],
            "char_filter": [
              "normalize",
              "whitespaces"
            ],
            "type": "custom",
            "tokenizer": "keyword"
          }
        },
        "tokenizer": {
          "engram": {
            "type": "edgeNGram",
            "min_gram": "1",
            "max_gram": "36"
          }
        }
      },
      "number_of_replicas": "1",
      "uuid": "5xyW07F-RRCuIJlvBufNbA",
      "version": {
        "created": "6020499"
      }
    }
  },
  "mappings": {
    "products": {
      "properties": {
        "sale_end_at": {
          "format": "yyyy-MM-dd HH:mm:ss",
          "type": "date"
        },
        "image_5": {
          "type": "text"
        },
        "image_4": {
          "type": "text"
        },
        "created_at": {
          "format": "yyyy-MM-dd HH:mm:ss",
          "type": "date"
        },
        "description": {
          "analyzer": "keyword_analyzer",
          "type": "text",
          "fields": {
            "autocomplete": {
              "search_analyzer": "autocomplete_search_analyzer",
              "analyzer": "autocomplete_index_analyzer",
              "type": "text"
            }
          }
        },
        "sale_start_at": {
          "format": "yyyy-MM-dd HH:mm:ss",
          "type": "date"
        },
        "sale_price": {
          "type": "integer"
        },
        "category_id": {
          "type": "integer"
        },
        "updated_at": {
          "format": "yyyy-MM-dd HH:mm:ss",
          "type": "date"
        },
        "price": {
          "type": "integer"
        },
        "image_1": {
          "type": "text"
        },
        "name": {
          "analyzer": "keyword_analyzer",
          "type": "text",
          "fields": {
            "autocomplete": {
              "search_analyzer": "autocomplete_search_analyzer",
              "analyzer": "autocomplete_index_analyzer",
              "type": "text"
            },
            "keyword": {
              "analyzer": "keyword_analyzer",
              "type": "text"
            }
          }
        },
        "image_3": {
          "type": "text"
        },
        "categories": {
          "type": "nested",
          "properties": {
            "parent_category_id": {
              "type": "integer"
            },
            "updated_at": {
              "type": "text",
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "name": {
              "analyzer": "keyword_analyzer",
              "type": "text",
              "fields": {
                "autocomplete": {
                  "search_analyzer": "autocomplete_search_analyzer",
                  "analyzer": "autocomplete_index_analyzer",
                  "type": "text"
                }
              }
            },
            "created_at": {
              "type": "text",
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "id": {
              "type": "long"
            }
          }
        },
        "id": {
          "type": "long"
        },
        "image_2": {
          "type": "text"
        },
        "stock": {
          "type": "integer"
        }
      }
    }
  },
  "aliases": [],
  "primary_terms": {
    "0": 1,
    "1": 1,
    "2": 1,
    "3": 1,
    "4": 1
  },
  "in_sync_allocations": {
    "0": [
      "clYoJWUKTru2Z78h0OINwQ"
    ],
    "1": [
      "MGQC73KiQsuigTPg4SQG4g"
    ],
    "2": [
      "zW6v82gNRbe3wWKefLOAug"
    ],
    "3": [
      "5TKrfz7HRAatQsJudKX9-w"
    ],
    "4": [
      "gqiblStYSYy_NA6fYtkghQ"
    ]
  }
}
I want to use suggest-style search via the autocomplete field.
So I added a document like this:
{
  "_index": "lara_cart",
  "_type": "products",
  "_id": "19",
  "_version": 1,
  "_score": 1,
  "_source": {
    "id": 19,
    "name": "Conqueror, whose.",
    "description": "I should think you'll feel it a bit, if you wouldn't mind,' said Alice: 'besides, that's not a regular rule: you invented it just missed her. Alice caught the flamingo and brought it back, the fight.",
    "category_id": 81,
    "stock": 79,
    "price": 11533,
    "sale_price": 15946,
    "sale_start_at": null,
    "sale_end_at": null,
    "image_1": "https://lorempixel.com/640/480/?56260",
    "image_2": "https://lorempixel.com/640/480/?15012",
    "image_3": "https://lorempixel.com/640/480/?14138",
    "image_4": "https://lorempixel.com/640/480/?94728",
    "image_5": "https://lorempixel.com/640/480/?99832",
    "created_at": "2018-06-01 16:12:41",
    "updated_at": "2018-06-01 16:12:41",
    "deleted_at": null,
    "categories": {
      "id": 81,
      "name": "A secret, kept.",
      "parent_category_id": "33",
      "created_at": "2018-06-01 16:12:41",
      "updated_at": "2018-06-01 16:12:41",
      "deleted_at": null
    }
  }
}
After that, I tried to search with the query below, but it doesn't return anything.
Do you know how to resolve it?
I think the cause lies in the mapping and settings.
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "name.autocomplete": "Conqueror"
          }
        }
      ],
      "must_not": [],
      "should": []
    }
  },
  "from": 0,
  "size": 10,
  "sort": [],
  "aggs": {}
}
It's simply because the field you are using is analyzed, and the term query does not analyze its input, so it cannot match here.
You can try a match query on the field whose analyzer is autocomplete; some basic knowledge of autocomplete and n-grams will help you understand this problem better.
E.g., say you defined the following analyzer:
PUT /my_index
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "filter": {
        "autocomplete_filter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 20
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "autocomplete_filter"
          ]
        }
      }
    }
  }
}
After that, you can test the autocomplete analyzer with the following request:
GET /my_index/_analyze
{
  "analyzer": "autocomplete",
  "text": "quick brown"
}
As configured above, the analyzer generates edge n-grams from 1 to 20 characters for the input. The tokens returned for this request are:
q
qu
qui
quic
quick
b
br
bro
brow
brown
As we all know, the term query searches for documents whose field contains the exact query term, without analysis, much like a WHERE condition in MySQL.
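Applied to the question above, the fix would be to swap the term query for a match query (a sketch against the original mapping; match analyzes "Conqueror" with the field's search analyzer, so it can line up with the lowercased edge n-grams):
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "name.autocomplete": "Conqueror"
          }
        }
      ]
    }
  }
}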

Elasticsearch nGram search query containing blanks

I have created the following index:
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "blocks": {
      "read_only_allow_delete": false,
      "read_only": false
    },
    "analysis": {
      "filter": {
        "autocomplete_filter": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 30
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "autocomplete_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "movie": {
      "properties": {
        "title": {
          "type": "text"
        },
        "actors": {
          "type": "nested",
          "include_in_all": true,
          "properties": {
            "name": {
              "type": "text",
              "analyzer": "autocomplete",
              "search_analyzer": "standard"
            },
            "age": {
              "type": "long",
              "index": "false"
            }
          }
        }
      }
    }
  }
}
And I have inserted the following data via the _bulk endpoint:
{"index":{"_index":"movies","_type":"movie","_id":1}}
{"title":"Ocean's 11", "actors":[{"name":"Brad Pitt","age":54}, {"name":"George Clooney","age":56}, {"name":"Julia Roberts","age":50}, {"name":"Andy Garcia","age":61}]}
{"index":{"_index":"movies","_type":"movie","_id":2}}
{"title":"Usual suspects", "actors":[{"name":"Kevin Spacey","age":58}, {"name":"Benicio del Toro","age":50}]}
{"index":{"_index":"movies","_type":"movie","_id":3}}
{"title":"Fight club", "actors":[{"name":"Brad Pitt","age":54}, {"name":"Edward Norton","age":48}, {"name":"Helena Bonham Carter","age":51}, {"name":"Jared Leto","age":46}]}
{"index":{"_index":"movies","_type":"movie","_id":24}
{"title":"Fight club", "actors":[{"name":"Brad Garrett","age":57}, {"name":"Ben Stiller","age":52}, {"name":"Robin Williams","age":63}]}
Now I want to search the index by actor name. For instance, when I search for brad I get all movies having an actor named brad, which is good.
But when I search for rad p I want only the movies with Brad Pitt, and not Brad Garrett, but I get Brad Garrett.
This is my search query:
{
  "query": {
    "nested": {
      "path": "actors",
      "query": {
        "match": {
          "actors.name": {
            "query": "rad p",
            "analyzer": "standard"
          }
        }
      },
      "inner_hits": {}
    }
  }
}
The endpoint I am calling is
/movies/movie/_search?pretty
My question is: how do I correctly implement this feature?
Thanks.
BTW, the Elasticsearch version is 6.1.0.
This is because of the standard tokenizer, which splits the input into tokens on whitespace and punctuation, so Brad Pitt becomes brad and pitt; hence you will never have a token containing rad p.
What you need to do is change the tokenizer to (e.g.) keyword, so that the full input is treated as a single token, on which you can then apply ngram.
Or, easier, you can simply use the ngram tokenizer instead of the token filter.
As Val has said, you have to use the nGram tokenizer to do this, and I also had to change my search query to:
{
  "query": {
    "nested": {
      "path": "searchable",
      "query": {
        "bool": {
          "must": {
            "match": {
              "searchable.searchKeyword": {
                "query": "%1$s"
              }
            }
          }
        }
      },
      "inner_hits": {}
    }
  }
}
My new index with the nGram tokenizer:
{
  "number_of_shards": 1,
  "number_of_replicas": 0,
  "blocks": {
    "read_only_allow_delete": false,
    "read_only": false
  },
  "analysis": {
    "analyzer": {
      "autocomplete": {
        "tokenizer": "search_tokenizer",
        "filter": [
          "lowercase",
          "asciifolding"
        ]
      }
    },
    "tokenizer": {
      "search_tokenizer": {
        "type": "ngram",
        "token_chars": [
          "letter",
          "digit",
          "whitespace",
          "punctuation",
          "symbol"
        ],
        "min_gram": 3,
        "max_gram": 30
      }
    }
  }
}
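With whitespace included in token_chars, the ngram tokenizer emits grams that span word boundaries, which is what lets rad p match Brad Pitt but not Brad Garrett. A quick way to verify this (a sketch, assuming the index is named movies and the field uses the autocomplete analyzer at index time):
GET movies/_analyze
{
  "analyzer": "autocomplete",
  "text": "Brad Pitt"
}
The token list should contain rad p among the 3- to 30-character grams.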

Elasticsearch not analyzed and lowercase

I'm trying to make a field lowercase and not analyzed in Elasticsearch 5+, in order to search in lowercase for strings containing spaces (they are indexed in mixed case).
Before Elasticsearch v5 we could use an analyzer like this one to accomplish it:
"settings":{
"index":{
"analysis":{
"analyzer":{
"analyzer_keyword":{
"tokenizer":"keyword",
"filter":"lowercase"
}
}
}
}
}
This however doesn't work for me right now, and I believe the problem is that "string" is deprecated and automatically converted to either keyword or text.
Does anyone here know how to accomplish this? I thought about adding a "fields" section to my mapping, along the lines of:
"fields": {
"lowercase": {
"type": "string"
**somehow convert to lowercase**
}
}
This would make working with it slightly more complicated, and I have no idea how to do the lowercase conversion either.
Below you'll find a test setup which reproduces my exact problem.
Create the index:
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_keyword": {
            "tokenizer": "keyword",
            "filter": "lowercase"
          }
        }
      }
    }
  },
  "mappings": {
    "test": {
      "properties": {
        "name": {
          "analyzer": "analyzer_keyword",
          "type": "string"
        }
      }
    }
  }
}
Add a test record:
{
"name": "city test"
}
Query that should match:
{
  "size": 20,
  "from": 0,
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "wildcard": {
                  "name": "*city t*"
                }
              }
            ]
          }
        }
      ]
    }
  }
}
When creating your index, you need to make sure that the analysis section is right under the settings section, and not inside the settings > index section, otherwise it won't work.
Then you also need to use the text data type for your field instead of string. Wipe your index, make those two changes, and it will work:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "analyzer_keyword": {
          "tokenizer": "keyword",
          "filter": "lowercase"
        }
      }
    }
  },
  "mappings": {
    "test": {
      "properties": {
        "name": {
          "analyzer": "analyzer_keyword",
          "type": "text"
        }
      }
    }
  }
}
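To confirm that the field is now stored as a single lowercased token, you can run the analyzer by hand (a sanity check, assuming the index is named test as above):
GET test/_analyze
{
  "analyzer": "analyzer_keyword",
  "text": "City Test"
}
This should return the single token city test, which is why the wildcard query *city t* can now match it.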

Multi field analyzer not working as expected

I'm confused. I have the following document indexed:
POST test/topic
{
"title": "antiemetics"
}
With the following query:
{
  "query": {
    "query_string": {
      "fields": ["title*"],
      "default_operator": "AND",
      "query": "anti emetics",
      "use_dis_max": true
    }
  },
  "highlight": {
    "fields": {
      "*": {
        "fragment_size": 200,
        "pre_tags": ["<mark>"],
        "post_tags": ["</mark>"]
      }
    }
  }
}
and the following settings and mappings:
POST test
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "analysis": {
        "analyzer": {
          "merge": {
            "type": "custom",
            "tokenizer": "keyword",
            "filter": [
              "lowercase"
            ],
            "char_filter": [
              "hyphen",
              "space",
              "html_strip"
            ]
          }
        },
        "char_filter": {
          "hyphen": {
            "type": "pattern_replace",
            "pattern": "[-]",
            "replacement": ""
          },
          "space": {
            "type": "pattern_replace",
            "pattern": " ",
            "replacement": ""
          }
        }
      }
    }
  },
  "mappings": {
    "topic": {
      "properties": {
        "title": {
          "analyzer": "standard",
          "search_analyzer": "standard",
          "type": "string",
          "fields": {
            "specialised": {
              "type": "string",
              "index": "analyzed",
              "analyzer": "standard",
              "search_analyzer": "merge"
            }
          }
        }
      }
    }
  }
}
I know my use of a multi-field doesn't make sense here, as I'm using the same index analyzer as for title, so please just ignore that; I'm more interested in my understanding of analyzers. I was expecting the merge search analyzer to change the query "anti emetics" into "antiemetics", and I was hoping the multi-field that has this analyzer applied would then match the token "antiemetics". But I don't get any results back, even though I have verified with the _analyze API that the analyzer removes the whitespace from the query. Any idea why?
This seems to work with your setup:
POST /test_index/_search
{
  "query": {
    "match": {
      "title.specialised": "anti emetics"
    }
  }
}
Here's some code I set up to play with it:
http://sense.qbox.io/gist/3ef6926644213cf7db568557a801fec6cb15eaf9
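Presumably this works because the match query honours the field's search_analyzer: merge collapses "anti emetics" into the single token "antiemetics", which is exactly the token the standard index analyzer stored for the document. The search-side token can be checked with _analyze (assuming the index from the question is named test):
GET test/_analyze
{
  "analyzer": "merge",
  "text": "anti emetics"
}
This should return the single token antiemetics.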
