Querying elasticsearch with OR and wildcards - elasticsearch

I'm trying to run a simple query against my Elasticsearch _type that matches multiple fields with wildcards. My first attempt was like this:
POST my_index/my_type/_search
{
"sort" : { "date_field" : {"order" : "desc"}},
"query" : {
"filtered" : {
"filter" : {
"or" : [
{
"term" : { "field1" : "4848" }
},
{
"term" : { "field2" : "6867" }
}
]
}
}
}
}
This example will successfully match every record when field1 OR field2 are exactly equal to 4848 and 6867 respectively.
What I'm trying to do is match any record where field1 contains 4848 or field2 contains 6867, but I'm not really sure how to do it.
I appreciate any help I can get :)

It sounds like your problem has mostly to do with analysis. The appropriate solution depends on the structure of your data and what you want to match. I'll provide a couple of examples.
First, let's assume that your data is such that we can get what we want just using the standard analyzer. This analyzer tokenizes text fields on whitespace, punctuation and symbols, so the text "1234-5678-90" will be broken into the terms "1234", "5678", and "90". A "term" query or filter for any of those terms will therefore match that document. More concretely:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"field1":{
"type": "string",
"analyzer": "standard"
},
"field2":{
"type": "string",
"analyzer": "standard"
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"field1": "1212-2323-4848","field2": "1234-5678-90"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"field1": "0000-0000-0000","field2": "0987-6543-21"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"field1": "1111-2222-3333","field2": "6867-4545-90"}
POST test_index/_search
{
"query": {
"filtered": {
"filter": {
"or": [
{
"term": { "field1": "4848" }
},
{
"term": { "field2": "6867" }
}
]
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "1212-2323-4848",
"field2": "1234-5678-90"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field1": "1111-2222-3333",
"field2": "6867-4545-90"
}
}
]
}
}
(Explicitly writing "analyzer": "standard" is redundant since that is the default analyzer used if you do not specify one; I just wanted to make it obvious.)
On the other hand, if the text is embedded in such a way that the standard analysis doesn't provide what you want, say something like "121223234848" where you want to match on "4848", you will have to do something a little more sophisticated, using ngrams. Here is an example of that (notice the difference in the data):
DELETE /test_index
PUT /test_index
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field1":{
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"field2":{
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"field1": "121223234848","field2": "1234567890"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"field1": "000000000000","field2": "0987654321"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"field1": "111122223333","field2": "6867454590"}
POST test_index/_search
{
"query": {
"filtered": {
"filter": {
"or": [
{
"term": { "field1": "4848" }
},
{
"term": { "field2": "6867" }
}
]
}
}
}
}
...
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "121223234848",
"field2": "1234567890"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field1": "111122223333",
"field2": "6867454590"
}
}
]
}
}
There is a lot going on here, so I won't attempt to explain it in this post. If you want more explanation I would encourage you to read this blog post: http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams. Hope you'll forgive the shameless plug. ;)
Hope that helps.

Related

Elasticsearch template to support case insensitive searches

I've set up a normalizer on an index field to support case-insensitive searches, but I can't seem to get it to work.
GET users/
Returns the following mapping:
{
"users": {
"aliases": {},
"mappings": {
"user": {
"properties": {
"active": {
"type": "boolean"
},
"first_name": {
"type": "keyword",
"fields": {
"normalize": {
"type": "keyword",
"normalizer": "search_normalizer"
}
}
},
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "users",
"creation_date": "1567936315432",
"analysis": {
"normalizer": {
"search_normalizer": {
"filter": [
"lowercase"
],
"type": "custom"
}
}
},
"number_of_replicas": "1",
"uuid": "5SknFdwJTpmF",
"version": {
"created": "6040299"
}
}
}
}
}
Although first_name is normalized to lowercase, queries on the first_name field are case sensitive.
Using the following query for a user with first name Dave
GET users/_search
{
"query": {
"bool": {
"should": [
{
"regexp": {
"first_name": {
"value": ".*dave.*"
}
}
}
]
}
}
}
GET users/_analyze
{
"analyzer" : "standard",
"text": "Dave"
}
returns
{
"tokens": [
{
"token": "dave",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
}
]
}
Although "Dave" is tokenized to "dave" the following query
GET users/_search
{
"query": {
"match": {
"first_name": "dave"
}
}
}
Returns no hits.
Is there an issue with my current mapping? or the query?
I think you have missed first_name.normalize in the query.
Indexing Records
{"first_name": "Daveraj"}
{"index": {}}
{"first_name": "RajdaveN"}
{"index": {}}
{"first_name": "Dave"}
Query
"query": {
"bool": {
"should": [
{
"regexp": {
"first_name.normalize": {
"value": ".*dave.*"
}
}
}
]
}
}
}
Result
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.0,
"hits": [
{
"_index": "test3",
"_type": "test3_type",
"_id": "M8-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "Dave"
}
},
{
"_index": "test3",
"_type": "test3_type",
"_id": "Mc-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "Daveraj"
}
},
{
"_index": "test3",
"_type": "test3_type",
"_id": "Ms-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "RajdaveN"
}
}
]
}
}```
You have created a normalized multi-field, first_name.normalize, but you are searching on the original field first_name. That field is a plain keyword with no normalizer, so its values are indexed verbatim and matching on it is case sensitive.
The examples given here might help:
https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html
You need to explicitly specify the multi-field you want to search on. Note that even though a multi-field can't have its own content, it may index different terms than its parent (although not always), because it can be processed with different analyzers, char filters, or token filters.

How to control scoring or ordering of results while using ngram in Elasticsearch?

I am using Elasticsearch 6.x.
I have created an index test_index with index type doc as follows:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "my_ngram_tokenizer"
}
},
"tokenizer": {
"my_ngram_tokenizer": {
"type": "nGram",
"min_gram": "1",
"max_gram": "7",
"token_chars": [
"letter",
"digit",
"punctuation"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"my_text": {
"type": "text",
"fielddata": true,
"fields": {
"ngram": {
"type": "text",
"fielddata": true,
"analyzer": "my_analyzer"
}
}
}
}
}
}
}
I have indexed data as follows:
PUT /text_index/doc/1
{
"my_text": "ohio"
}
PUT /text_index/doc/2
{
"my_text": "ohlin"
}
PUT /text_index/doc/3
{
"my_text": "john"
}
Then I used search query:
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "oh",
"fields": [
"my_text^5",
"my_text.ngram"
]
}
}
]
}
}
}
And got the response:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 1.0042334,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1.0042334,
"_source": {
"my_text": "ohio"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 0.97201055,
"_source": {
"my_text": "john"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 0.80404717,
"_source": {
"my_text": "ohlin"
}
}
]
}
}
Here, we can see that when I searched for oh, I got results in the order:
-> ohio
-> john
-> ohlin
But, I want to have scoring and order of the results in a way which gives higher priority to matching prefix:
-> ohio
-> ohlin
-> john
How can I achieve such a result? What approaches can I take here?
Thanks in advance.
You should add a new subfield with a new analyzer that uses the edge_ngram tokenizer, then add that subfield to your multi_match query.
You then need to use the type most_fields for your multi_match query. Only the documents starting with the search term will match on this subfield, so they will be boosted above the other matching documents.

Getting the document with searched term first, before its synonyms [Elastic]

I think I should explain my problem with an example:
Assume that I've created an index with a synonym analyzer and I declare that "laptop", "phone" and "tablet" are similar words that can be generalized as "mobile":
PUT synonym
{
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 2,
"analysis": {
"analyzer": {
"synonym": {
"tokenizer": "whitespace",
"filter": [
"synonym"
]
}
},
"filter": {
"synonym": {
"type": "synonym",
"synonyms": [
"phone, tablet, laptop => mobile"
]
}
}
}
}
},
"mappings": {
"synonym" : {
"properties" : {
"field1" : {
"type" : "text",
"analyzer": "synonym",
"search_analyzer": "synonym"
}
}
}
}
}
Now I am creating some docs:
PUT synonym/synonym/1
{
"field1" : "phone"
}
PUT synonym/synonym/2
{
"field1" : "tablet"
}
PUT synonym/synonym/3
{
"field1" : "laptop"
}
Now when I run a match query for laptop, tablet or phone, the result is always:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.2876821,
"hits": [
{
"_index": "synonym",
"_type": "synonym",
"_id": "2",
"_score": 0.2876821,
"_source": {
"field1": "tablet"
}
},
{
"_index": "synonym",
"_type": "synonym",
"_id": "1",
"_score": 0.18232156,
"_source": {
"field1": "phone"
}
},
{
"_index": "synonym",
"_type": "synonym",
"_id": "3",
"_score": 0.18232156,
"_source": {
"field1": "laptop"
}
}
]
}
}
You can see that the score of tablet is always higher even when I search for laptop.
I know that is because I declared them as similar words.
However, I am trying to figure out how I can query so that the document containing the search term appears first in the result list, before the documents containing its similar words.
It can be done by boosting, but there must be a simpler approach..
Multi-fields to your rescue.
Index the field1 in two ways, one with the synonym analyzer, and the other with a standard analyzer.
Now you can simply use a bool-should query to add score for match on field1 (synonym) and on field1.raw (standard).
So, your mappings should be like so:
PUT synonym
{
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 2,
"analysis": {
"analyzer": {
"synonym": {
"tokenizer": "whitespace",
"filter": [
"synonym"
]
}
},
"filter": {
"synonym": {
"type": "synonym",
"synonyms": [
"phone, tablet, laptop => mobile"
]
}
}
}
}
},
"mappings": {
"synonym": {
"properties": {
"field1": {
"type": "text",
"analyzer": "synonym",
"search_analyzer": "synonym",
"fields": {
"raw": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
}
}
And you can query using:
GET synonyms/_search?search_type=dfs_query_then_fetch
{
"query": {
"bool": {
"should": [
{
"match": {
"field1": "tablet"
}
},
{
"match": {
"field1.raw": "tablet"
}
}
]
}
}
}
Notice: I've used search_type=dfs_query_then_fetch. Since you're testing on 3 shards and have very few documents, the scores you're getting aren't what they should be. This is because the frequencies are calculated per shard. You can use dfs_query_then_fetch while testing but it is discouraged for production. See: https://www.elastic.co/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch

ElasticSearch match score

I have a simple field of type "text" in my index.
"keywordName": {
"type": "text"
}
And I have these documents already inserted : "samsung", "samsung galaxy", "samsung cover", "samsung charger".
If I make a simple "match" query, the results are disturbing:
Query:
GET keywords/_search
{
"query": {
"match": {
"keywordName": "samsung"
}
}
}
Results:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.113083,
"hits": [
{
"_index": "keywords",
"_type": "keyword",
"_id": "samsung galaxy",
"_score": 1.113083,
"_source": {
"keywordName": "samsung galaxy"
}
},
{
"_index": "keywords",
"_type": "keyword",
"_id": "samsung charger",
"_score": 0.9433406,
"_source": {
"keywordName": "samsung charger"
}
},
{
"_index": "keywords",
"_type": "keyword",
"_id": "samsung",
"_score": 0.8405092,
"_source": {
"keywordName": "samsung"
}
},
{
"_index": "keywords",
"_type": "keyword",
"_id": "samsung cover",
"_score": 0.58279467,
"_source": {
"keywordName": "samsung cover"
}
}
]
}
}
First question: Why doesn't "samsung" have the highest score?
Second question: How can I write a query or analyzer that gives "samsung" the highest score?
Starting from the same index settings (analyzers, filters, mappings) as in my previous reply, I suggest the following solution. But, as I mentioned, you need to lay down all the requirements in terms of what you need to search for in this index and consider all of this as a complete solution.
DELETE test
PUT test
{
"settings": {
"analysis": {
"analyzer": {
"custom_stop": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"my_stop",
"my_snow",
"asciifolding"
]
}
},
"filter": {
"my_stop": {
"type": "stop",
"stopwords": "_french_"
},
"my_snow": {
"type": "snowball",
"language": "French"
}
}
}
},
"mappings": {
"test": {
"properties": {
"keywordName": {
"type": "text",
"analyzer": "custom_stop",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
}
}
POST /test/test/_bulk
{"index":{}}
{"keywordName":"samsung galaxy"}
{"index":{}}
{"keywordName":"samsung charger"}
{"index":{}}
{"keywordName":"samsung cover"}
{"index":{}}
{"keywordName":"samsung"}
GET /test/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"keywordName": {
"query": "samsungs",
"operator": "and"
}
}
},
{
"term": {
"keywordName.raw": {
"value": "samsungs"
}
}
},
{
"fuzzy": {
"keywordName.raw": {
"value": "samsungs",
"fuzziness": 1
}
}
}
]
}
},
"size": 10
}

Make a full word have more score than a Edge NGram subset

I'm trying to get a higher score on a document where the full name is matched, instead of the Edge NGram subset with the same value.
So the results are:
Pos Name _score _id
1 Baritone horn 7.56878 1786
2 Baritone ukulele 7.56878 2313
3 Bari 7.56878 2360
4 Baritone voice 7.56878 1787
I expected the third result ("Bari") to have a higher score since it's the full name. However, the edge ngram decomposition causes all the other documents to have exactly the term "bari" indexed as well. So, as you can see in the results table, the score is equal for all of them, and I don't even know how Elasticsearch orders them, since the _ids are not sequential and the names are not sorted.
How can I achieve this?
Thanks
Example 'code'
Settings
{
"analysis": {
"filter": {
"edgeNGram_filter": {
"type": "edgeNGram",
"min_gram": 3,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
"analyzer": {
"edgeNGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"edgeNGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}
source
Mapping:
{
"name": {
"type": "string",
"index": "not_analyzed"
},
"suggest": {
"type": "completion",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer",
"payloads": true
}
}
Query:
POST /attribute-tree/attribute/_search
{
"query": {
"match": {
"suggest": "Bari"
}
}
}
Results:
(only left relevant data)
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 7.56878,
"hits": [
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "1786",
"_score": 7.56878,
"_source": {
"name": "Baritone horn",
"suggest": {
"input": [
"Baritone",
"horn"
],
"output": "Baritone horn"
}
}
},
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "2313",
"_score": 7.56878,
"_source": {
"name": "Baritone ukulele",
"suggest": {
"input": [
"Baritone",
"ukulele"
],
"output": "Baritone ukulele"
}
}
},
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "2360",
"_score": 7.56878,
"_source": {
"name": "Bari",
"suggest": {
"input": [
"Bari"
],
"output": "Bari"
}
}
},
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "1787",
"_score": 7.568078,
"_source": {
"name": "Baritone voice",
"suggest": {
"input": [
"Baritone",
"voice"
],
"output": "Baritone voice"
}
}
}
]
}
}
You can use the bool query operator and its should clause to add score to exact matches like this :
POST /attribute-tree/attribute/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"suggest": "Bari"
}
}
],
"should": [
{
"match": {
"name": "Bari"
}
}
]
}
}
}
The query in the should clause is called a signal clause in the Elasticsearch definitive guide, and this is how you can distinguish between perfect matches and ngram ones. You will get all documents that match the must clause, but the documents that also match the should queries will receive a higher score due to the bool query scoring formula:
score = ("must" queries total score + matching "should" queries total score) / (total number of "must" queries and "should" queries)
The result is what you expect: Bari is the first result (far ahead in scoring :) ):
"hits": {
"total": 3,
"max_score": 0.4339554,
"hits": [
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "2360",
"_score": 0.4339554,
"_source": {
"name": "Bari",
"suggest": {
"input": [
"Bari"
],
"output": "Bari"
}
}
},
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "1786",
"_score": 0.04500804,
"_source": {
"name": "Baritone horn",
"suggest": {
"input": [
"Baritone",
"horn"
],
"output": "Baritone horn"
}
}
},
{
"_index": "attribute-tree",
"_type": "attribute",
"_id": "2313",
"_score": 0.04500804,
"_source": {
"name": "Baritone ukulele",
"suggest": {
"input": [
"Baritone",
"ukulele"
],
"output": "Baritone ukulele"
}
}
}
]

Resources