Elastic query BOOL with AND & OR - elasticsearch

I am new to Elasticsearch. I am trying to write a simple query that in SQL would be:
Select * from [movies] where is_adult = false AND (movie_title like '%xxx%' OR genre = 'xxxx')
The closest I could get in Elasticsearch was:
GET /idxsearch/movies/_search
{
"size": 10,
"query": {
"bool": {
"filter": {
"term": {
"is_adult": false
}
},
"must": [
{
"multi_match": {
"query": "xxx",
"operator": "and",
"fields": [ "movie_title.default^5", "movie_title.shingles" ]
}
}
],
"should": [
{"term": {
"genre.name": {
"value": "xxxx"
}
}}
]
}
}
}
But testing this query with some data, I see that it is not working as expected.
cheers
Just to check, here is part of the custom analyser:
"shingle_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "shingle_filter"]
}
"shingle_filter": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 2
}
"match_title": {
"type": "text",
"fields": {
"default": {"type": "text", "analyzer": "default_analyzer" },
"snowball": {"type": "text", "analyzer": "snowball_analyzer"},
"shingles": {"type": "text", "analyzer": "shingle_analyzer" },
"ngrams": {"type": "text", "analyzer": "edgengram_analyzer",
"search_analyzer": "default_analyzer"}
}
}
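One likely culprit: in a bool query, should clauses become optional as soon as a must or filter clause is present (minimum_should_match defaults to 0), so the query above requires the title match and merely boosts on genre rather than OR-ing the two. A sketch of one way to express the SQL's AND (... OR ...), untested against this exact mapping:
GET /idxsearch/movies/_search
{
  "size": 10,
  "query": {
    "bool": {
      "filter": [
        { "term": { "is_adult": false } }
      ],
      "must": [
        {
          "bool": {
            "should": [
              {
                "multi_match": {
                  "query": "xxx",
                  "operator": "and",
                  "fields": [ "movie_title.default^5", "movie_title.shingles" ]
                }
              },
              { "term": { "genre.name": "xxxx" } }
            ],
            "minimum_should_match": 1
          }
        }
      ]
    }
  }
}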

Related

How to make edge_ngram tokens match with a certain quantity of words between them?

I'm trying to make a search request that retrieves results only when fewer than 5 words appear between the requested tokens.
{
"settings": {
"index": {
"analysis": {
"filter": {
"stopWords": {
"type": "stop",
"stopwords": [
"_english_"
]
}
},
"normalizer": {
"lowercaseNormalizer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"char_filter": []
}
},
"analyzer": {
"autoCompleteAnalyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "autoCompleteTokenizer"
},
"autoCompleteSearchAnalyzer": {
"type": "custom",
"tokenizer": "lowercase"
},
"charGroupAnalyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "charGroupTokenizer"
}
},
"tokenizer": {
"charGroupTokenizer": {
"type": "char_group",
"max_token_length": "20",
"tokenize_on_chars": [
"whitespace",
"-",
"\n"
]
},
"autoCompleteTokenizer": {
"token_chars": [
"letter"
],
"min_gram": "3",
"type": "edge_ngram",
"max_gram": "20"
}
}
}
}
}
}
The mappings:
{
"mappings": {
"_doc": {
"properties": {
"description": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 64
}
},
"analyzer": "autoCompleteAnalyzer",
"search_analyzer": "autoCompleteSearchAnalyzer"
},
"text": {
"type": "text",
"analyzer": "charGroupAnalyzer"
}
}
}
}
}
}
}
And I make a bool request:
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"fields": [
"description.name"
],
"operator": "and",
"query": "rounded elephant",
"fuzziness": 1
}
},
{
"match_phrase": {
"description.text": {
"analyzer": "charGroupAnalyzer",
"query": "rounded elephant",
"slop": 5,
"boost": 20
}
}
}
]
}
}
}
I expect the request to retrieve documents where the description contains:
... rounded very interesting elephant ...
This works well when I use complete words, like rounded elephant.
But when I enter prefixed words, like round eleph, it fails.
Obviously, description.name and description.text use different tokenizers (name contains ngram tokens, while text contains word tokens), so I get completely wrong results.
How can I configure mappings and search, to be able to use ngrams with distance between tokens?
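One approach, sketched here with invented analyzer names and not tested against this data: apply edge_ngram as a token filter rather than as a tokenizer. A token filter emits every gram at the position of its source word, so a match_phrase query with slop can still measure the distance between words; the field is then searched with a plain, non-ngram analyzer.
"settings": {
  "analysis": {
    "filter": {
      "edge_ngram_filter": {
        "type": "edge_ngram",
        "min_gram": 3,
        "max_gram": 20
      }
    },
    "analyzer": {
      "prefix_index_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [ "lowercase", "edge_ngram_filter" ]
      },
      "prefix_search_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": [ "lowercase" ]
      }
    }
  }
}
With description.text mapped to "analyzer": "prefix_index_analyzer" and "search_analyzer": "prefix_search_analyzer", a match_phrase for "round eleph" with "slop": 5 should match "... rounded very interesting elephant ...", because round and eleph are indexed at the positions of rounded and elephant.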

Any way to limit elasticsearch to only match the closest token? [Edge n-gram, Fuzziness]

Using a tokenizer, fuzziness, and edge n-grams, I have three documents:
"Star Trek I"
"Star Trekian"
"Star Trakian: A Star Trek Documentary"
Searching "Star Trek" with fuzziness gives "Star Trekian" a higher score than "Star Trek" because of additional tokens that match "Trek" (=> "Treki"). Is the best way to combat this to match additionally with less or no fuzziness?
Furthermore "Star Trakian: A Star Trek Documentary" gets an even higher score because it matches "Trak" and "Trek". Is there a way to match only the best token or any other method to score it the same as "Star Trek I" (because both contain "Star Trek")?
Edit:
Mappings & Settings:
PUT /stackoverflow
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"edge_n_gram": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "50"
}
},
"analyzer": {
"autocomplete": {
"filter": [
"lowercase",
"asciifolding",
"edge_n_gram"
],
"type": "custom",
"tokenizer": "autocomplete"
},
"autocomplete_search": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "char_group"
},
"full_word": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"tokenizer": "char_group"
}
},
"tokenizer": {
"autocomplete": {
"type": "standard"
},
"char_group": {
"type": "char_group",
"tokenize_on_chars": [
"whitespace",
"-",
"."
]
}
}
}
},
"mappings": {
"properties": {
"search_field_full": {
"type": "text",
"similarity": "boolean",
"fields": {
"raw": {
"type": "text",
"similarity": "boolean",
"analyzer": "full_word",
"search_analyzer": "autocomplete_search"
}
},
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search"
}
}
}
}
Documents:
POST stackoverflow/_doc/
{
"search_field_full": "Star Trek I"
}
POST stackoverflow/_doc/
{
"search_field_full": "Star Trakian: A Star Trek Documentary"
}
POST stackoverflow/_doc/
{
"search_field_full": "Star Trekian"
}
Query:
GET stackoverflow/_search
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"fields": [
"search_field_full"
],
"fuzziness": "AUTO:4,7",
"max_expansions": 500,
"minimum_should_match": 2,
"operator": "or",
"query": "Star Trek",
"type": "best_fields"
}
}
],
"should": [
{
"multi_match": {
"fields": [
"search_field_full.raw^30"
],
"fuzziness": 0,
"operator": "or",
"query": "Star Trek",
"type": "best_fields"
}
},
{
"multi_match": {
"fields": [
"search_field_full.raw^20"
],
"fuzziness": 1,
"operator": "or",
"query": "Star Trek",
"type": "best_fields"
}
}
]
}
}
}
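One idea worth sketching, untested against this index: since scores from fuzzy expansions accumulate per matching index term, you can flatten them by wrapping the fuzzy clause in constant_score. Every document that passes the fuzzy gate then gets the same base score, and the exact-match should clauses alone decide the ranking:
{
  "query": {
    "bool": {
      "must": [
        {
          "constant_score": {
            "filter": {
              "multi_match": {
                "fields": [ "search_field_full" ],
                "fuzziness": "AUTO:4,7",
                "minimum_should_match": 2,
                "operator": "or",
                "query": "Star Trek"
              }
            }
          }
        }
      ],
      "should": [
        ... the two search_field_full.raw clauses from above ...
      ]
    }
  }
}
With this shape, "Star Trekian" no longer outranks "Star Trek I" on its extra fuzzy token, and "Star Trakian: A Star Trek Documentary" scores the same as "Star Trek I" on the boolean-similarity raw field, since both contain the exact tokens star and trek.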

Some multi-word synonyms are not working in Elasticsearch for nested fields

I am trying to use a synonym analyzer at query time and am not getting the expected results. Can someone shed some light on this?
Here is my mapping for the index:
{
"jobs_user_profile_v2": {
"mappings": {
"profile": {
"_all": {
"enabled": false
},
"_ttl": {
"enabled": true
},
"properties": {
"rsa": {
"type": "nested",
"properties": {
"answer": {
"type": "string",
"index_analyzer": "autocomplete",
"search_analyzer": "synonym",
"position_offset_gap": 100
},
"answerId": {
"type": "long"
},
"answerOriginal": {
"type": "string",
"index": "not_analyzed"
},
"createdAt": {
"type": "long"
},
"label": {
"type": "string",
"index": "not_analyzed"
},
"labelOriginal": {
"type": "string",
"index": "not_analyzed"
},
"question": {
"type": "string",
"index": "not_analyzed"
},
"questionId": {
"type": "long"
},
"questionOriginal": {
"type": "string"
},
"source": {
"type": "integer"
},
"updatedAt": {
"type": "long"
}
}
}
}
}
}
}
}
The field to focus on is rsa.answer, which is the field I am querying.
My synonym mapping:
Beautician,Stylist,Make up artist,Massage therapist,Therapist,Spa,Hair Dresser,Salon,Beauty Parlour,Parlor => Beautician
Carpenter,Wood Worker,Furniture Carpenter => Carpenter
Cashier,Store Manager,Store Incharge,Purchase Executive,Billing Executive,Billing Boy => Cashier
Content Writer,Writer,Translator,Writing,Copywriter,Content Creation,Script Writer,Freelance Writer,Freelance Content Writer => Content Writer
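(These are contraction rules: the => syntax replaces every variant on the left with the single term on the right at analysis time, which is why the _analyze calls below return only beautician and carpenter.)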
My Search Query:
http://{{domain}}/jobs_user_profile_v2/_search
{
"query": {
"nested":{
"path": "rsa",
"query":{
"query_string": {
"query": "hair dresser",
"fields": ["answer"],
"analyzer" :"synonym"
}
},
"inner_hits": {
"explain": true
}
}
},
"explain" : true,
"sort" : [ {
"_score" : { }
} ]
}
It is showing proper Beautician and Cashier profiles for the search queries Hair Dresser and billing executive, but not showing anything for the wood worker => carpenter case.
My analyzer results:
http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=hair dresser
{
"tokens": [
{
"token": "beautician",
"start_offset": 0,
"end_offset": 12,
"type": "SYNONYM",
"position": 1
}
]
}
and for the wood worker case:
http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=wood worker
{
"tokens": [
{
"token": "carpenter",
"start_offset": 0,
"end_offset": 11,
"type": "SYNONYM",
"position": 1
}
]
}
It is also not working in a few other cases.
My analyzer setting for index:
"analysis": {
"filter": {
"synonym": {
"ignore_case": "true",
"type": "synonym",
"synonyms_path": "synonym.txt"
},
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "3",
"max_gram": "10"
}
},
"analyzer": {
"text_en_splitting_search": {
"type": "custom",
"filter": [
"stop",
"lowercase",
"porter_stem",
"word_delimiter"
],
"tokenizer": "whitespace"
},
"synonym": {
"filter": [
"stop",
"lowercase",
"synonym"
],
"type": "custom",
"tokenizer": "standard"
},
"autocomplete": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"type": "custom",
"tokenizer": "standard"
},
"text_en_splitting": {
"filter": [
"lowercase",
"porter_stem",
"word_delimiter"
],
"type": "custom",
"tokenizer": "whitespace"
},
"text_general": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "standard"
},
"edge_ngram_analyzer": {
"filter": [
"lowercase"
],
"type": "custom",
"tokenizer": "edge_ngram_tokenizer"
},
"autocomplete_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "whitespace"
}
},
"tokenizer": {
"edge_ngram_tokenizer": {
"token_chars": [
"letter",
"digit"
],
"min_gram": "2",
"type": "edgeNGram",
"max_gram": "10"
}
}
}
For the above case, multi_match is a better fit than query_string.
Unlike multi_match, query_string tokenizes the query text before analyzing it, so multi-word synonyms may not work as expected with it.
Example:
{
"query": {
"nested": {
"path": "rsa",
"query": {
"multi_match": {
"query": "wood worker",
"fields": [
"rsa.answer"
],
"type" : "cross_fields",
"analyzer": "synonym"
}
}
}
}
}
If for some reason you prefer query_string, then you need to pass the entire query in double quotes to ensure it is not tokenized:
Example:
POST test/_search
{
"query": {
"nested": {
"path": "rsa",
"query": {
"query_string": {
"query": "\"wood worker\"",
"fields": [
"rsa.answer"
],
"analyzer": "synonym"
}
}
}
}
}
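As a side note beyond this answer: on Elasticsearch 6.0 and later, the synonym_graph token filter is the supported way to handle multi-word synonyms. It is meant to be used in a search-time analyzer, where match and multi_match expand the resulting token graph correctly. A sketch, assuming the same synonym.txt:
"filter": {
  "synonym_graph_filter": {
    "type": "synonym_graph",
    "synonyms_path": "synonym.txt"
  }
},
"analyzer": {
  "synonym_search": {
    "type": "custom",
    "tokenizer": "standard",
    "filter": [ "lowercase", "synonym_graph_filter" ]
  }
}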

Highlight with fuzziness and ngram

I guess the title of the topic spoiled you enough :D
I use edge_ngram and highlighting to build an autocomplete search. I added fuzziness to the query to let users misspell their search, but it breaks the highlighting a bit.
When I write Sport, this is what I get:
<em>Spor</em>t
<em>Spor</em>t mécanique
<em>Spor</em>t nautique
I guess it's because it matches the token spor generated by the ngram tokenizer.
The query:
{
"query": {
"bool": {
"should": [
{
"match": {
"name": {
"query": "sport",
"operator": "and",
"fuzziness": "AUTO"
}
}
},
{
"match_phrase_prefix": {
"name.raw": {
"query": "sport"
}
}
}
]
}
},
"highlight": {
"fields": {
"name": {
"term_vector": "with_positions_offsets"
}
}
}
}
And the mapping:
{
"settings": {
"analysis": {
"analyzer": {
"partialAnalyzer": {
"type": "custom",
"tokenizer": "ngram_tokenizer",
"filter": ["asciifolding", "lowercase"]
},
"keywordAnalyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["asciifolding", "lowercase"]
},
"searchAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["asciifolding", "lowercase"]
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [ "letter", "digit" ]
}
}
}
},
"mappings": {
"place": {
"properties": {
"name": {
"type": "string",
"index_analyzer": "partialAnalyzer",
"search_analyzer": "searchAnalyzer",
"term_vector": "with_positions_offsets",
"fields": {
"raw": {
"type": "string",
"analyzer": "keywordAnalyzer"
}
}
}
}
}
}
}
I tried adding a new match clause without fuzziness to the query, hoping the keyword would match before the fuzzy clause, but it changed nothing.
"match": {
  "name": {
    "query": "sport",
    "operator": "and"
  }
}
Any idea how I can handle this?
Regards, Raphaël
You could do that with highlight_query, I guess.
Try this in your highlight section:
"highlight": {
"fields": {
"name": {
"term_vector": "with_positions_offsets",
"highlight_query": {
"match": {
"name.raw": {
"query": "spotr",
"fuzziness": 2
}
}
}
}
}
}
I hope it helps.
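One more thing worth sketching (not part of the original answer): highlight_query accepts any query, so instead of a single hard-coded match you can mirror the whole bool from the search against name.raw, which keeps the partial ngram tokens out of the highlighting. With the user's input substituted for the literal strings:
"highlight": {
  "fields": {
    "name": {
      "highlight_query": {
        "bool": {
          "should": [
            { "match": { "name.raw": { "query": "sport", "fuzziness": "AUTO" } } },
            { "match_phrase_prefix": { "name.raw": { "query": "sport" } } }
          ]
        }
      }
    }
  }
}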

ElasticSearch filter on array of terms is filtering out all terms

In my application, users belong to a list of roles and objects have a list of roles associated with them to determine visibility. I'm trying to create a query that ensures the user belongs to at least one of the groups that is required by the object.
Here is my index configuration:
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 12,
"token_chars": []
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"team" : {
"dynamic": "strict",
"properties" : {
"id": {
"type": "string",
"index": "not_analyzed"
},
"object": {
"type": "string",
"index": "not_analyzed"
},
"roles": {
"type": "string",
"index": "not_analyzed"
},
"name": {
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"text": {
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
}
}
}
Here is some sample data that I have indexed:
(verified via localhost:9200/index/_search?q=name:employee_1&pretty)
{
"id":"lsJ17K4sgQVfd",
"roles: ["OwnerslsJ17K21px6VX","AdminslsJ17K21px6VX"],
"object":"contact",
"name":"employee_1",
"text":"lsJ17K4sgQVfd employee_1 employee_1 employee_1#lsj17k1nysk75.com"
}
Here is my query that I am trying to execute to find that same contact:
{
"_source": ["id", "object", "name"],
"size": 30,
"query": {
"filtered": {
"query": {
"bool": {
"should": {
"multi_match": {
"query": "employee_1",
"type": "cross_fields",
"operator": "or",
"fields": ["name^2", "text"],
"minimum_should_match": "50%",
"fuzziness": "AUTO"
}
},
...,
"minimum_should_match": 1
}
},
"filter": {
"terms": {
"roles": [ "AdminslsJ17K21px6VX", "lsJ17K3gHCH4P" ]
}
}
}
},
"suggest": {
"text": "employee_1",
"text_suggestion": {
"term": {
"size": 3,
"field": "name",
"sort": "score",
"suggest_mode": "missing",
"prefix_length": 1
}
}
}
}
If I remove the filter clause then I get results, but as soon as I add it back everything gets filtered out again. What is the right way to express that I want the results to have at least one role in common?
The query above works as expected; the only problem was that my test case was executing before the index was fully populated. Adding a short wait before making the first query solved the problem.
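For context: Elasticsearch search is near-real-time, so newly indexed documents only become searchable after the next refresh, which happens once per second by default; that is why a short wait worked. In a test you can make this deterministic instead of sleeping, using the same index placeholder as the search above:
POST /index/_refresh
On Elasticsearch 5.0 and later you can also index with ?refresh=wait_for, which makes the indexing call return only once the document is visible to search.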
