Keyword normalizer not applied on document - elasticsearch

I'm using Elasticsearch 6.8.
Here is my mapping:
{
"index_patterns": [
"my_index_*"
],
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 1
},
"analysis": {
"analyzer": {
"lower_ascii_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"normalizer": {
"my_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"audit_conformity": {
"dynamic": "false",
"properties": {
"country": {
"type": "keyword",
"normalizer": "my_normalizer"
},
[…]
Then I post a document with this body
{
"_source": {
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1"
}
}
When I search for the document, the country is still capitalized
GET /my_index_country/_search
I get
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "my_index_country",
"_type": "my_index",
"_id": "LOT0fYIBCNP9gFG_7cet",
"_score": 1,
"_source": {
"_source": {
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1",
}
}
}
]
}
}
What am I doing wrong?

You're doing nothing wrong, but normalizers (and analyzers alike) never modify your source document, only what is indexed from it.
This means that the source document keeps holding MX but underneath mx will be indexed for the country field.
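You can verify that with a term query on the lowercased value; a quick sketch, reusing the index and field from the question:
GET /my_index_country/_search
{
"query": {
"term": {
"country": "mx"
}
}
}
It will still return the document even though its _source shows MX.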
If you want to lowercase the country field in the source document itself, you should use an ingest pipeline with a lowercase processor instead, which will modify your document before indexing it:
PUT _ingest/pipeline/lowercase-pipeline
{
"processors": [
{
"lowercase": {
"field": "country"
}
}
]
}
Then use it when indexing your documents:
PUT my_index_country/my_index/LOT0fYIBCNP9gFG_7cet?pipeline=lowercase-pipeline
{
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "MX",
"user_entity_id": "1",
}
GET my_index_country/my_index/LOT0fYIBCNP9gFG_7cet
Result =>
{
"company_id": "a813bec1-f9f3-44c7-96ac-11157f64b79b",
"country": "mx",
"user_entity_id": "1",
}
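If you don't want to pass ?pipeline= on every indexing call, another option (assuming Elasticsearch 6.5 or later, which covers 6.8) is to declare it as the index's default pipeline:
PUT my_index_country/_settings
{
"index.default_pipeline": "lowercase-pipeline"
}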

Related

Is there any way to sort an index before the aggregation

I have the following index template
{
"index_patterns": "notificationtiles*",
"order": 1,
"version": 1,
"aliases": {
"notificationtiles": {}
},
"settings": {
"number_of_shards": 5,
"analysis": {
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"char_filter": [],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"dynamic": "false",
"properties": {
"id": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"influencerId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"friendId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"message": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"type": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"sponsorshipCharityId": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
},
"createdTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"updatedTimestampEpochInMilliseconds": {
"type": "date",
"format": "epoch_millis",
"index": false
},
"createdDate": {
"type": "date"
},
"updatedDate": {
"type": "date"
}
}
}
}
with the following query
{
"query": {
"bool": {
"must": [
{
"match": {
"influencerId": "52407710-f7be-49c1-bc15-6d52363144a6"
}
},
{
"match": {
"type": "friend_completed_sponsorship"
}
}
]
}
},
"size": 0,
"aggs": {
"friendId": {
"terms": {
"field": "friendId",
"size": 2
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"createdDate": {
"order": "desc"
}
}
],
"_source": {
"includes": [
"sponsorshipCharityId",
"message",
"createdDate"
]
},
"size": 1
}
}
}
}
}
}
which returns
{
"took": 72,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 12,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"friendId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 7,
"buckets": [
{
"key": "cf750fd8-998f-4dcd-9c88-56b2b6d6fce9",
"doc_count": 3,
"latest": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "416a8e07-fd72-46d4-ade1-b9442ef46978",
"_score": null,
"_source": {
"createdDate": "2020-06-24T17:35:17.816842Z",
"sponsorshipCharityId": "336de13c-f522-4796-9218-f373ff0b4373",
"message": "Contact Test 788826 Completed Sponsorship!"
},
"sort": [
1593020117816
]
}
]
}
}
},
{
"key": "93ab55c5-795f-44b0-900c-912e3e186da0",
"doc_count": 2,
"latest": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "66913b8f-94fe-49fd-9483-f332329b80dd",
"_score": null,
"_source": {
"createdDate": "2020-06-24T17:57:17.816842Z",
"sponsorshipCharityId": "dbad136c-5002-4470-b85d-e5ba1eff515b",
"message": "Contact Test 788826 Completed Sponsorship!"
},
"sort": [
1593021437816
]
}
]
}
}
}
]
}
}
}
However, I'd like the results to include the latest documents (ordered by createdDate desc), for example the following document
{
"_index": "notificationtiles-1",
"_type": "_doc",
"_id": "68a2a0a8-27aa-4347-8751-d7afccfa797d",
"_score": 1.0,
"_source": {
"id": "68a2a0a8-27aa-4347-8751-d7afccfa797d",
"influencerId": "52407710-f7be-49c1-bc15-6d52363144a6",
"friendId": "af342805-1990-4794-9d67-3bb2dd1e36dc",
"message": "Contact Test 788826 Completed Sponsorship!",
"type": "friend_completed_sponsorship",
"sponsorshipCharityId": "b2db72e6-a70e-414a-bf8b-558e6314e7ec",
"createdDate": "2020-06-25T17:35:17.816842Z",
"updatedDate": "2020-06-25T17:35:17.816876Z",
"createdTimestampEpochInMilliseconds": 1593021437817,
"updatedTimestampEpochInMilliseconds": 1593021437817
}
}
I need to get the 2 latest documents grouped by friendId, with the latest document per friendId. The grouping by friendId with the latest document per friendId works fine. However, I'm unable to sort the index by createdDate desc before the aggregation happens.
Essentially, I'd like to sort the index by createdDate desc before the aggregation takes place. I don't want a parent aggregation on createdDate, since that wouldn't result in unique friendId values. How can that be achieved?
It looks like you need to set the order property of your terms aggregation. By default, the buckets are ordered by document count. You want them ordered by the max createdDate, so add a sub-aggregation that calculates the max createdDate, and then use that sub-aggregation's name to order the parent terms aggregation.
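As a sketch, reusing the aggregation from the question and assuming a max sub-aggregation named latest_created (the query part is unchanged and omitted here):
{
"size": 0,
"aggs": {
"friendId": {
"terms": {
"field": "friendId",
"size": 2,
"order": {
"latest_created": "desc"
}
},
"aggs": {
"latest_created": {
"max": {
"field": "createdDate"
}
},
"latest": {
"top_hits": {
"sort": [
{
"createdDate": {
"order": "desc"
}
}
],
"_source": {
"includes": [
"sponsorshipCharityId",
"message",
"createdDate"
]
},
"size": 1
}
}
}
}
}
}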

Elasticsearch template to support case insensitive searches

I've set up a normalizer on an index field to support case-insensitive searches, but I can't seem to get it to work.
GET users/
Returns the following mapping:
{
"users": {
"aliases": {},
"mappings": {
"user": {
"properties": {
"active": {
"type": "boolean"
},
"first_name": {
"type": "keyword",
"fields": {
"normalize": {
"type": "keyword",
"normalizer": "search_normalizer"
}
}
}
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "users",
"creation_date": "1567936315432",
"analysis": {
"normalizer": {
"search_normalizer": {
"filter": [
"lowercase"
],
"type": "custom"
}
}
},
"number_of_replicas": "1",
"uuid": "5SknFdwJTpmF",
"version": {
"created": "6040299"
}
}
}
}
}
Although first_name is normalized to lowercase, queries on the first_name field are case sensitive.
Using the following query for a user with first name Dave
GET users/_search
{
"query": {
"bool": {
"should": [
{
"regexp": {
"first_name": {
"value": ".*dave.*"
}
}
}
]
}
}
}
GET users/_analyze
{
"analyzer" : "standard",
"text": "Dave"
}
returns
{
"tokens": [
{
"token": "dave",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
}
]
}
Although "Dave" is tokenized to "dave" the following query
GET users/_search
{
"query": {
"match": {
"first_name": "dave"
}
}
}
Returns no hits.
Is there an issue with my current mapping, or with the query?
I think you have missed using first_name.normalize in the query.
Indexing records:
POST test3/test3_type/_bulk
{"index": {}}
{"first_name": "Daveraj"}
{"index": {}}
{"first_name": "RajdaveN"}
{"index": {}}
{"first_name": "Dave"}
Query
"query": {
"bool": {
"should": [
{
"regexp": {
"first_name.normalize": {
"value": ".*dave.*"
}
}
}
]
}
}
}
Result
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.0,
"hits": [
{
"_index": "test3",
"_type": "test3_type",
"_id": "M8-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "Dave"
}
},
{
"_index": "test3",
"_type": "test3_type",
"_id": "Mc-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "Daveraj"
}
},
{
"_index": "test3",
"_type": "test3_type",
"_id": "Ms-lEG0BLCpzI1hbBWYC",
"_score": 1.0,
"_source": {
"first_name": "RajdaveN"
}
}
]
}
}
You have created a normalized multi-field, first_name.normalize, but you are searching on the original field first_name, which has no normalizer attached, so its values are indexed exactly as provided (case preserved).
The examples given here might help:
https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html
You need to explicitly specify the multi-field you want to search on. Note that even though a multi-field can't have its own content, it can index different terms than its parent (although not always) because it may be analyzed with different analyzers/char/token filters.
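Applying the same change to the original match query from the question, a sketch (assuming the mapping above) would be:
GET users/_search
{
"query": {
"match": {
"first_name.normalize": "dave"
}
}
}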

Elasticsearch concatenate two words into one

I have a field ManufacturerName
"ManufacturerName": {
"type": "keyword",
"normalizer" : "keyword_lowercase"
},
And a normalizer
"normalizer": {
"keyword_lowercase": {
"type": "custom",
"filter": ["lowercase"]
}
}
When searching for 'ripcurl' it matches; however, when searching for 'rip curl' it doesn't.
What would I use to concatenate certain words, i.e. 'rip curl' -> 'ripcurl'?
Apologies if this is a duplicate; I've spent some time seeking a solution to this.
You would want to make use of a text field for what you are looking for, and handle this kind of requirement via the Ngram Tokenizer.
Below is a sample mapping, query and response:
Mapping:
PUT mysomeindex
{
"mappings": {
"mydocs":{
"properties": {
"ManufacturerName":{
"type": "text",
"analyzer": "my_analyzer",
"fields":{
"keyword":{
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
},
"settings": {
"analysis": {
"normalizer": {
"my_normalizer":{
"type": "custom",
"char_filter": [],
"filter": ["lowercase", "asciifolding"]
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer",
"filter": [ "synonyms" ]
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 5,
"token_chars": [
"letter",
"digit"
]
}
},
"filter": {
"synonyms":{
"type": "synonym",
"synonyms" : ["henry loyd, henry loid, henry lloyd => henri lloyd"]
}
}
}
}
}
Notice that the field ManufacturerName is a multi-field with both a text type and a sibling keyword type. That way you can use the keyword field for exact matches and for aggregation queries, while for this requirement you can use the text field.
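For example, an exact-match lookup against the keyword sub-field could look like this (a sketch, assuming the mapping above):
POST mysomeindex/_search
{
"query": {
"term": {
"ManufacturerName.keyword": "ripcurl"
}
}
}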
Sample Document:
POST mysomeindex/mydocs/1
{
"ManufacturerName": "ripcurl"
}
POST mysomeindex/mydocs/2
{
"ManufacturerName": "henri lloyd"
}
When you ingest the above documents, Elasticsearch creates tokens of length 3 to 5 and stores them in the inverted index, e.g. rip, ipc, pcu, and so on.
You can execute the below query to see which tokens get created:
POST mysomeindex/_analyze
{
"text": "ripcurl",
"analyzer": "my_analyzer"
}
I'd also suggest looking into the Edge Ngram tokenizer to see if it fits your requirement better.
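For reference, a minimal edge_ngram tokenizer definition would look something like this (the parameters are illustrative, not a recommendation):
"tokenizer": {
"my_edge_tokenizer": {
"type": "edge_ngram",
"min_gram": 3,
"max_gram": 10,
"token_chars": [
"letter",
"digit"
]
}
}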
Query:
POST mysomeindex/_search
{
"query": {
"match": {
"ManufacturerName": "rip curl"
}
}
}
Response:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.25316024,
"hits": [
{
"_index": "mysomeindex",
"_type": "mydocs",
"_id": "1",
"_score": 0.25316024,
"_source": {
"ManufacturerName": "ripcurl"
}
}
]
}
}
Query for Synonyms:
POST mysomeindex/_search
{
"query": {
"match": {
"ManufacturerName": "henri lloyd"
}
}
}
Response:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 2.2784421,
"hits": [
{
"_index": "mysomeindex",
"_type": "mydocs",
"_id": "2",
"_score": 2.2784421,
"_source": {
"ManufacturerName": "henry lloyd"
}
}
]
}
}
Note: If you intend to make use of synonyms, the best way is to keep them in a text file and reference it relative to the config folder location, as mentioned here.
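In that case the filter references the file instead of listing the synonyms inline; a sketch, assuming a file at config/analysis/synonyms.txt:
"filter": {
"synonyms": {
"type": "synonym",
"synonyms_path": "analysis/synonyms.txt"
}
}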
Hope this helps!

Elastic Search : Restricting the search result in array

My index metadata:
{
"never": {
"aliases": {},
"mappings": {
"userDetails": {
"properties": {
"Residence_address": {
"type": "nested",
"include_in_parent": true,
"properties": {
"Address_type": {
"type": "string",
"analyzer": "standard"
},
"Pincode": {
"type": "string",
"analyzer": "standard"
},
"address": {
"type": "string",
"analyzer": "standard"
}
}
}
}
}
},
"settings": {
"index": {
"creation_date": "1468850158519",
"number_of_shards": "5",
"number_of_replicas": "1",
"version": {
"created": "1060099"
},
"uuid": "v2njuC2-QwSau4DiwzfQ-g"
}
},
"warmers": {}
}
}
My setting:
POST never
{
"settings": {
"number_of_shards" : 5,
"analysis": {
"analyzer": {
"standard": {
"tokenizer": "keyword",
"filter" : ["lowercase","reverse"]
}
}
}
}
}
My data:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
My query:
POST /never/_search?pretty
{
"query": {
"match": {
"Residence_address.address": "Omega"
}
}
}
My result:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.375,
"hits": [
{
"_index": "never",
"_type": "userDetails",
"_id": "1",
"_score": 0.375,
"_source": {
"Residence_address": [
{
"address": "Omega Residency",
"Address_type": "Owned",
"Pincode": "500004"
},
{
"address": "Collage of Engineering",
"Address_type": "Rented",
"Pincode": "411005"
}
]
}
}
]
}
}
Is there any way to restrict my result to only the object containing address = Omega Residency and NOT the other object having address = Collage of Engineering?
You can only do it with a nested query and inner_hits. I see that you have include_in_parent: true but are not using nested queries. If you only want to get the matched nested objects, you need to use inner_hits with a nested query:
GET /never/_search?pretty
{
"_source": false,
"query": {
"nested": {
"path": "Residence_address",
"query": {
"match": {
"Residence_address.address": "Omega Residency"
}
},
"inner_hits" : {}
}
}
}
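With "_source": false at the top level, the response contains only the matching nested object (the Omega Residency entry) under inner_hits, rather than the whole Residence_address array.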

Querying elasticsearch with OR and wildcards

I'm trying to do a simple query against my Elasticsearch _type and match multiple fields with wildcards. My first attempt was like this:
POST my_index/my_type/_search
{
"sort" : { "date_field" : {"order" : "desc"}},
"query" : {
"filtered" : {
"filter" : {
"or" : [
{
"term" : { "field1" : "4848" }
},
{
"term" : { "field2" : "6867" }
}
]
}
}
}
}
This example successfully matches every record where field1 OR field2 is exactly equal to 4848 or 6867, respectively.
What I'm trying to do is match any value in field1 that contains 4848 and any value in field2 that contains 6867, but I'm not really sure how to do it.
I appreciate any help I can get :)
It sounds like your problem has mostly to do with analysis. The appropriate solution depends on the structure of your data and what you want to match. I'll provide a couple of examples.
First, let's assume that your data is such that we can get what we want just using the standard analyzer. This analyzer will tokenize text fields on whitespace, punctuation and symbols. So the text "1234-5678-90" will be broken into the terms "1234", "5678", and "90", so a "term" query or filter for any of those terms will match that document. More concretely:
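You can confirm how a value is tokenized with the _analyze API; a quick sketch (request-body syntax as in recent Elasticsearch versions; older 1.x releases take analyzer and text as query parameters instead):
POST _analyze
{
"analyzer": "standard",
"text": "1234-5678-90"
}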
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"field1":{
"type": "string",
"analyzer": "standard"
},
"field2":{
"type": "string",
"analyzer": "standard"
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"field1": "1212-2323-4848","field2": "1234-5678-90"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"field1": "0000-0000-0000","field2": "0987-6543-21"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"field1": "1111-2222-3333","field2": "6867-4545-90"}
POST test_index/_search
{
"query": {
"filtered": {
"filter": {
"or": [
{
"term": { "field1": "4848" }
},
{
"term": { "field2": "6867" }
}
]
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "1212-2323-4848",
"field2": "1234-5678-90"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field1": "1111-2222-3333",
"field2": "6867-4545-90"
}
}
]
}
}
(Explicitly writing "analyzer": "standard" is redundant since that is the default analyzer used if you do not specify one; I just wanted to make it obvious.)
On the other hand, if the text is embedded in such a way that the standard analysis doesn't provide what you want, say something like "121223234848" where you want to match on "4848", you will have to do something a little more sophisticated, using ngrams. Here is an example of that (notice the difference in the data):
DELETE /test_index
PUT /test_index
{
"settings": {
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
},
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
},
"whitespace_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"doc": {
"properties": {
"field1":{
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
},
"field2":{
"type": "string",
"index_analyzer": "nGram_analyzer",
"search_analyzer": "whitespace_analyzer"
}
}
}
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"field1": "121223234848","field2": "1234567890"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"field1": "000000000000","field2": "0987654321"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"field1": "111122223333","field2": "6867454590"}
POST test_index/_search
{
"query": {
"filtered": {
"filter": {
"or": [
{
"term": { "field1": "4848" }
},
{
"term": { "field2": "6867" }
}
]
}
}
}
}
...
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"field1": "121223234848",
"field2": "1234567890"
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"field1": "111122223333",
"field2": "6867454590"
}
}
]
}
}
There is a lot going on here, so I won't attempt to explain it in this post. If you want more explanation I would encourage you to read this blog post: http://blog.qbox.io/multi-field-partial-word-autocomplete-in-elasticsearch-using-ngrams. Hope you'll forgive the shameless plug. ;)
Hope that helps.
