Elasticsearch. Full text search for the Russian language - elasticsearch

Right now I am using a hunspell dictionary for my search in ES. It behaves strangely and I don't understand why. For example, I have several entries in my index with the word "перец" in different forms:
1 ч. л. смеси перцев горошком;
2–3 колечка красного перца чили с семенами;
черный молотый перец;
and several entries with the word "колодец" in different forms:
несколько колодцев;
3 колодца;
1 колодец;
My index has the following settings:
PUT http://localhost:9200/ingredient
Content-Type: application/json
{
"settings": {
"analysis": {
"analyzer": {
"custom_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"ru_RU",
"my_stemmer"
],
"char_filter": [
"html_strip"
]
}
},
"filter": {
"my_stemmer": {
"type": "stemmer",
"language": "russian"
},
"ru_RU": {
"type": "hunspell",
"locale": "ru_RU"
}
}
}
},
"mappings": {
"properties": {
"name": {
"type": "text",
"analyzer": "custom_analyzer"
}
}
}
}
When I make my search query for "колодец" like this:
GET http://localhost:9200/ingredient/_search?pretty
Content-Type: application/json
{
"query": {
"query_string": {
"query": "колодец",
"default_field": "name"
}
}
}
I receive the following JSON:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 5.0841255,
"hits": [
{
"_index": "ingredient",
"_type": "_doc",
"_id": "2940d2bc-59ca-4c41-98d6-803d50913d04",
"_score": 5.0841255,
"_source": {
"name": "несколько колодцев",
"id": "2940d2bc-59ca-4c41-98d6-803d50913d04",
"_meta": {}
}
},
{
"_index": "ingredient",
"_type": "_doc",
"_id": "2940d2bc-59ca-4c41-98d6-803d50913d05",
"_score": 5.0841255,
"_source": {
"name": "3 колодца",
"id": "2940d2bc-59ca-4c41-98d6-803d50913d05",
"_meta": {}
}
},
{
"_index": "ingredient",
"_type": "_doc",
"_id": "2940d2bc-59ca-4c41-98d6-803d50913d06",
"_score": 5.0841255,
"_source": {
"name": "1 колодец",
"id": "2940d2bc-59ca-4c41-98d6-803d50913d06",
"_meta": {}
}
}
]
}
}
Response code: 200 (OK); Time: 45ms; Content length: 1199 bytes
But when I make the similar request with "перец":
GET http://localhost:9200/ingredient/_search?pretty
Content-Type: application/json
{
"query": {
"query_string": {
"query": "перец",
"default_field": "name"
}
}
}
I only get this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 23,
"relation": "eq"
},
"max_score": 3.1693017,
"hits": [
{
"_index": "ingredient",
"_type": "_doc",
"_id": "9c72cba2-2986-40dd-b15b-0df0288e91f1",
"_score": 2.8541024,
"_source": {
"name": "свежемолотый черный перец",
"id": "9c72cba2-2986-40dd-b15b-0df0288e91f1",
"_meta": {}
}
},
]
}
}
I get neither 1 ч. л. смеси перцев горошком nor 2–3 колечка красного перца чили с семенами.
It seems strange to me because колодец and перец form their morphological variants in a similar way. Do I have this problem because my hunspell dictionary is not complete enough? If so, where can I find the most complete hunspell dictionary, or another dictionary, for the Russian language?

Related

Normalizing keyword field: ascii should match diacritic, but not vice versa

I have a keyword field that can contain characters with diacritics. Queries without diacritics should return results with those diacritics, but not vice versa. The first part can be resolved by using a normalizer, the configuration for which is also described in a related question. If I use that for e.g. {"title": "Sulgi"} and {"title": "Šulgi"}, searching for "Sulgi" will (correctly) return both documents. However, searching for "Šulgi" also returns both documents, instead of just the one with the diacritic. It seems ES is also normalizing the query input, which is generally good, but is it possible to change that behavior?
PUT _template/test
{
"index_patterns": ["*"],
"settings": {
"analysis": {
"normalizer": {
"exact": {
"type": "custom",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "keyword",
"normalizer": "exact"
}
}
}
}
POST test/_doc/1
{
"title": "Sulgi"
}
POST test/_doc/2
{
"title": "Šulgi"
}
Example search query:
POST test/_search
{
"query": {
"term": {
"title":"Šulgi"
}
}
}
{
"took": 294,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.18232156,
"hits": [
{
"_index": "test",
"_type": "_doc",
"_id": "1",
"_score": 0.18232156,
"_source": {
"title": "Šulgi"
}
},
{
"_index": "test",
"_type": "_doc",
"_id": "2",
"_score": 0.18232156,
"_source": {
"title": "Sulgi"
}
}
]
}
}

What does the total value show inside the _search query result in Elasticsearch?

When we call the elasticsearch, say as follows:
POST https://<host>/<index-name>/_search with body:
{
"from": 0,
"size": 1,
"query": {
"bool": {
"must": [
{
"range": {
"createdAt": {
"gt": "2019-11-11T10:00:00"
}
}
}
]
}
},
"sort": [
{
"createdAt" : {
"order" : "desc"
}
}
]
}
I see that I get only 1 result as pagination is set to 1 but total inside hits in response shows 2. This is the response I get:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": “<index-name>”,
"_type": "_doc",
"_id": "5113c843-dff3-499f-a12e-44c7ac103bcf_0",
"_score": null,
"_source": {
"oId": "5113c843-dff3-499f-a12e-44c7ac103bcf",
"oItemId": 0,
"createdAt": "2019-11-13T11:00:00"
},
"sort": [
1573642800000
]
}
]
}
}
So total doesn’t capture the pagination part, and only cares about the query itself? It should show the total count of items matching the query irrespective of the pagination set, right?
Yes, you are right that total doesn't capture the pagination part and only cares about the query, i.e. it is the total number of documents that match the given query.
To be precise, it is as explained in the official ES docs:
total (Object) Metadata about the number of returned documents.
Returned parameters include:
value: Total number of returned documents.
relation: Indicates whether the number of returned documents in the value parameter is accurate or a lower bound. Returned values are:
eq: Accurate
gte: Lower bound, including returned documents
It means it's the total number of documents matching the query, but as size is set to 1 in your example, the inner hits array has just 1 document. You can cross-check this understanding easily by creating a sample example as below:
Create a sample index with just 1 text field:
URL:- http://localhost:9200/{your-index-name}/ --> PUT method
{
"mappings": {
"properties": {
"name": {
"type": "text"
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"number_of_replicas": "1"
}
}
}
Once the above index is created, index the 4 documents below:
URL:- http://localhost:9200/{your-index-name}/_doc/{1,2,like..} --> POST method
{
"name": "foo 1"
}
{
"name": "foo bar"
}
{
"name": "foo"
}
{
"name": "foo 2"
}
Now when you run the search query below without pagination:
{
"query": {
"bool": {
"must": [
{
"match": {
"name": "foo"
}
}
]
}
}
}
It gives below response:
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4, --> Note 4 here
"relation": "eq"
},
"max_score": 0.12199639,
"hits": [
{
"_index": "59638303",
"_type": "_doc",
"_id": "1",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "3",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "2",
"_score": 0.09271725,
"_source": {
"name": "foo bar"
}
},
{
"_index": "59638303",
"_type": "_doc",
"_id": "4",
"_score": 0.09271725,
"_source": {
"name": "foo 1"
}
}
]
}
}
But when you hit a search query with pagination:
{
"from": 0,
"size": 1,--> note size 1
"query": {
"bool": {
"must": [
{
"match": {
"name": "foo"
}
}
]
}
}
}
it gives below response
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4, --> this is still 4
"relation": "eq"
},
"max_score": 0.12199639,
"hits": [
{
"_index": "59638303",
"_type": "_doc",
"_id": "1",
"_score": 0.12199639,
"_source": {
"name": "foo"
}
}
]
}
}
Now, in the above query, you can change the size and check that only the inner hits array changes, while the outer hits object, which contains total, always remains the same at 4. This confirms your understanding is correct.

Elasticsearch returns documents with a query must_not exists

Elasticsearch: 6.5.4
Issue: I'm executing a bool query (sample to follow) where I'm checking for the existence of a specific field. The issue is, I'm getting results back where the field does exist but has an empty array.
My question is: how do I properly execute a query and only get results where nlp is not added to the document at all?
Sample query:
{
"size": 100,
"sort": [{
"publishedAt": {
"order": "asc"
}
}],
"_source": {
"includes": ["nlp"]
},
"query": {
"bool": {
"must_not": {
"exists": {
"field": "nlp.categories.gcp"
}
}
}
}
}
Sample Mapping:
(This was automatically created by Elasticsearch, with the exception of the null_value, which I tried adding.)
{
"mapping": {
"article": {
"properties": {
"nlp": {
"properties": {
"categories": {
"properties": {
"gcp": {
"properties": {
"confidence": {
"type": "float"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"null_value": "[]",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
}
}
}
Sample Result:
{
"took": 68,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1126581,
"max_score": null,
"hits": [
{
"_index": "news",
"_type": "article",
"_id": "UTuVmmsBE1H01hY9Rn6i",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": []
}
}
},
"sort": [
1509940860000
]
},
{
"_index": "news",
"_type": "article",
"_id": "2w6PmmsBIpi-jAhhO13F",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": []
}
}
},
"sort": [
1510027260000
]
}
]
}
}
When the nlp.categories.gcp has values in it, a typical response would look like this.
{
"took": 26,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 475690,
"max_score": null,
"hits": [
{
"_index": "news",
"_type": "article",
"_id": "6Q6JmmsBIpi-jAhhAlcm",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": [
{
"confidence": 0.8999999761581421,
"name": "/Travel/Hotels & Accommodations"
}
]
}
}
},
"sort": [
1510215565000
]
},
{
"_index": "news",
"_type": "article",
"_id": "rzunmmsBE1H01hY9sLyE",
"_score": null,
"_source": {
"nlp": {
"categories": {
"gcp": [
{
"confidence": 0.9399999976158142,
"name": "/Travel/Hotels & Accommodations"
}
]
}
}
},
"sort": [
1510228881000
]
}
]
}
}

sorting on aggregate of value in a given field in elasticsearch

I have the following field in my index
field1:{key:value}
Is it possible to sort my query results on the sum of the values in field1?
Thanks
Here's one way you could do this, assuming you know the fields ahead of time. It should be possible with some minor refinements if you need to wildcard the fields. This assumes the sibling fields on the nested type are numeric.
Example mapping:
"test": {
"mappings": {
"type1": {
"properties": {
"field1": {
"properties": {
"key1": {
"type": "integer"
},
"key2": {
"type": "integer"
}
}
}
}
}
}
}
Default results:
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "type1",
"_id": "AV8O7956gIcGI2d5A_5g",
"_score": 1,
"_source": {
"field1": {
"key1": 11,
"key2": 17
}
}
},
{
"_index": "test",
"_type": "type1",
"_id": "AV8O78FqgIcGI2d5A_5f",
"_score": 1,
"_source": {
"field1": {
"key1": 5,
"key2": 6
}
}
}
]
}
Query with script:
GET /test/_search
{
"query": {
"function_score": {
"query": {
"match_all": {}
},
"functions": [
{
"script_score": {
"script": "return (doc['field1.key1'].value + doc['field1.key2'].value) * -1"
}
}
]
}
}
}
Logic taking the lowest score as the best score (least negative in this case):
{
"took": 18,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": -11,
"hits": [
{
"_index": "test",
"_type": "type1",
"_id": "AV8O78FqgIcGI2d5A_5f",
"_score": -11,
"_source": {
"field1": {
"key1": 5,
"key2": 6
}
}
},
{
"_index": "test",
"_type": "type1",
"_id": "AV8O7956gIcGI2d5A_5g",
"_score": -28,
"_source": {
"field1": {
"key1": 11,
"key2": 17
}
}
}
]
}
}
Hopefully this gives you the gist of whatever specific scoring logic you need

Elasticsearch - Return nested values in format

How can I make Elasticsearch return nested values in the format of hits {value1:..., value2..., value3..., etc..}?
This is my request:
{
"_source": 0,
"query": {
"bool": {
"must": [
{
"nested": {
"path": "photo",
"query": {
"bool": {
"must": [
{
"match": {
"photo.hello": "true"
}
}
]
}
},
"inner_hits" : {}
}
}
]
}}}
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "6UAaCls5iSgavEtFE2qMX902Xmb2",
"_score": 1.2231436,
"inner_hits": {
"photo": {
"hits": {
"total": 1,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "6UAaCls5iSgavEtFE2qMX902Xmb2",
"_nested": {
"field": "photo",
"offset": 0
},
"_score": 1.2231436,
"_source": {
"hello": "true",
"i_am_superCOOL": "true",
"xoxox": "true",
"id": "-KSDRx5BN54JHitoq7Wb"
}
}
]
}
}
}
},
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_score": 1.2231436,
"inner_hits": {
"photo": {
"hits": {
"total": 2,
"max_score": 1.2231436,
"hits": [
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_nested": {
"field": "photo",
"offset": 1
},
"_score": 1.2231436,
"_source": {
"alahu": "true",
"hello": "true",
"same": "true",
"smukais": "true",
"id": "-KSDJzyUC_N5je-cR2aT"
}
},
{
"_index": ".3eautiful",
"_type": "profile",
"_id": "KDFbeXrOedf7b6NVRGMO0HDIFgx1",
"_nested": {
"field": "photo",
"offset": 0
},
"_score": 1.2231436,
"_source": {
"hello": "true",
"same": "true",
"selfyyy": "true",
"superSexy": "true",
"id": "-KPn4p7spS8NO7IVSLdF"
}
}
]
}
}
}
}
]
}
}
I am using a 2-dimensional dynamic attribute search. The problem with this approach is that the results can be 20 from 1 user, but I need to make it propriety based.
Just sticked to the same format.

Resources