How to apply custom analyser? - elasticsearch

I just discovered an issue with our Elasticsearch setup: it does not return anything when the field value contains '&'. After some googling I think I need a custom analyser. I have never worked with ES before, so I assume I'm missing something basic here.
This is what I have got and it is not working as expected.
PUT custom_analyser
{
"settings": {
"analysis": {
"analyzer": {
"suggest_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [ "lowercase", "my_synonym_filter" ]
}
},
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"&, and",
"foo, bar" ]
}
}
}
}
}
And trying to use it like:
GET custom_analyser/_search
{
"aggs": {
"section": {
"terms": {
"field": "section",
"size": 10,
"shard_size": 500,
"include": "jill & jerry" //Not returning anything back for this field using default analyser
}
}
}
}
Output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
},
"aggregations": {
"section": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
Mappings
"_doc":{
"dynamic":"false",
"date_detection":false,
"properties":{
"section":{
"type":"keyword"
}
}
}
GET custom_analyser:
{
"custom_analyser": {
"aliases": {},
"mappings": {},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "custom_analyser",
"creation_date": "1565971369814",
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"&, and",
"foo, bar"
]
}
},
"analyzer": {
"suggest_analyzer": {
"filter": [
"lowercase",
"my_synonym_filter"
],
"type": "custom",
"tokenizer": "whitespace"
}
}
},
"number_of_replicas": "1",
"uuid": "oVMOU5wPQ--vKhE3dDFG2Q",
"version": {
"created": "6030199"
}
}
}
}
}

I think there is a slight confusion here: An analyzer won't help you, because you are (correctly) using a keyword field for the aggregation, but those are not analyzed. You could only use a normalizer on those fields.
For your specific problem: The include (and exclude) are regular expressions — you'll need to escape the & to make this work as expected.
Full example
Mapping and sample data:
PUT test
{
"mappings": {
"properties": {
"section": {
"type": "keyword"
}
}
}
}
PUT test/_doc/1
{
"section": "jill & jerry"
}
PUT test/_doc/2
{
"section": "jill jerry"
}
PUT test/_doc/3
{
"section": "jill"
}
PUT test/_doc/4
{
"section": "jill & jerry"
}
Query — you need a double backslash for the escape to work here (and I'm also excluding the actual documents with "size": 0 to keep the response shorter):
GET test/_search
{
"size": 0,
"aggs": {
"section": {
"terms": {
"field": "section",
"include": "jill \\& jerry"
}
}
}
}
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"section" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "jill & jerry",
"doc_count" : 2
}
]
}
}
}

Related

Synonym search in ElasticSearch

I want to retrieve the data from the index using the notion of synonym. When I perform a search with title A I also want to retrieve the documents whose title contains B. For that I set up the following mapping :
{
"settings": {
"index" : {
"analysis" : {
"filter" : {
"synonym_filter" : {
"type" : "synonym",
"synonyms" : [
"A=>A,B"
]
}
},
"analyzer" : {
"synonym_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["synonym_filter"]
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer" : "synonym_analyzer"
}
}
}
}
I then added 3 documents to my index
{
"title": "C"
}
{
"title": "B"
}
{
"title": "A"
}
I then used the analysis api to see if it works (everything is ok):
curl -X GET "localhost:9200/my_custom_index_title/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"analyzer": "synonym_analyzer",
"text": "A"
}
'
{
"tokens" : [
{
"token" : "A",
"start_offset" : 0,
"end_offset" : 1,
"type" : "SYNONYM",
"position" : 0
},
{
"token" : "B",
"start_offset" : 0,
"end_offset" : 1,
"type" : "SYNONYM",
"position" : 0
}
]
}
curl -X GET "localhost:9200/my_custom_index_title/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"analyzer": "synonym_analyzer",
"text": "B"
}
'
{
"tokens" : [
{
"token" : "B",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
}
]
}
When I search for title A results are correct :
{
"query": {
"match": {
"title": {
"query": "A"
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.6951314,
"hits": [
{
"_index": "my_custom_index_title",
"_id": "i5bb_4IBqFAXxSLAgrDj",
"_score": 0.6951314,
"_source": {
"title": "A"
}
},
{
"_index": "my_custom_index_title",
"_id": "jJbb_4IBqFAXxSLAlLBj",
"_score": 0.52354836,
"_source": {
"title": "B"
}
}
]
}
}
But when I search for B the results are not correct: I only want the documents that contain B, not the ones that contain A.
{
"query": {
"match": {
"title": {
"query": "B"
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.52354836,
"hits": [
{
"_index": "my_custom_index_title",
"_id": "i5bb_4IBqFAXxSLAgrDj",
"_score": 0.52354836,
"_source": {
"title": "A"
}
},
{
"_index": "my_custom_index_title",
"_id": "jJbb_4IBqFAXxSLAlLBj",
"_score": 0.52354836,
"_source": {
"title": "B"
}
}
]
}
}
For example when I search for computer I wish to obtain laptop, computer, mac. But when I search for mac I only want to get the results for it (not laptop and computer)
I do not understand why the result for the search with B does not return only one result
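As I understand it, because you applied synonym_analyzer as the field's analyzer, the synonyms were applied at index time: the document with title A was indexed with both tokens A and B, so a search for B also matches it.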
To solve it, you can use synonyms only at search time, adding the parameter "search_analyzer". Note that I added the lowercase filter in the synonym_analyzer because the standard analyzer applies lowercase by default.
To apply the synonyms only to the search term, so that a search for B returns only B, do this:
{
"settings": {
"index": {
"analysis": {
"filter": {
"synonym_filter": {
"type": "synonym",
"expand":"false",
"synonyms": [
"A=>A,B"
]
}
},
"analyzer": {
"synonym_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"synonym_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "standard",
"search_analyzer": "synonym_analyzer"
}
}
}
}

Getting empty buckets array in elasticsearch aggregation

I'm using version 5.4.1 of ElasticSearch.
When I try to perform a groupBy aggregate/bucket aggregate, I'm not getting any values in the bucket array.
This is my index:
curl -X PUT localhost:9200/urldata -d '{
"mappings" : {
"components" : {
"properties" : {
"name" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"status" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
}
}'
And this is the aggregation query:
curl -XGET 'localhost:9200/urldata/_search?pretty' -H 'Content-Type: application/json' -d'
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name.keyword"
}
}
}
}
'
Output:
{
"took":2,
"timed_out":false,
"_shards":{
"total":5,
"successful":5,
"failed":0
},
"hits":{
"total":3,
"max_score":0.0,
"hits":[
]
},
"aggregations":{
"components":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
]
}
}
}
Where am I going wrong??
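Try this, it should do it. Your mapping already defines name as a keyword field, so there is no name.keyword sub-field to aggregate on; aggregate on name directly: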
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
EDIT:
Here are all the steps to replicate your use case:
PUT test
{
"settings" : {
"index" : {
"number_of_shards" : 1,
"number_of_replicas" : 0
}
}
}
PUT test/_mapping/people_name
{
"properties":{
"name":{
"type":"keyword",
"index":"not_analyzed"
},
"status":{
"type":"keyword",
"index":"not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
POST test/people_name
{
"name": "A",
"status": "success",
"created_at": "2017-08-17"
}
POST test/people_name
{
"name": "A",
"status": "success_2",
"created_at": "2017-06-15"
}
POST test/people_name
{
"name": "B",
"status": "success",
"created_at": "2017-09-15"
}
GET test/people_name/_search
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
The result of the aggregation is:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"components": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 2
},
{
"key": "B",
"doc_count": 1
}
]
}
}
}

Phrase suggester Elasticsearch not correcting two words

I have the following mapping to my phrase suggester:
{
"settings": {
"analysis": {
"analyzer": {
"suggests_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"shingle_filter"
],
"type": "custom"
}
},
"filter": {
"shingle_filter": {
"min_shingle_size": 2,
"max_shingle_size": 6,
"type": "shingle"
}
}
}
},
"mappings": {
"sample_data": {
"properties": {
"name": {
"type": "string",
"analyzer": "suggests_analyzer"
}
}
}
}
}
I have "lung cancer", "colorectal cancer" and "breast cancer" indexed in my index. But when I send a misspelt query in which both words are misspelt, such as "lhng cancar", it returns zero results when I use the collate functionality. My sample query is as follows.
{
"suggest": {
"text": "lhng cancar",
"simple_phrase": {
"phrase": {
"field": "name",
"size": 5,
"real_word_error_likelihood": 0.95,
"max_errors": 0.5,
"direct_generator": [
{
"field": "name",
"suggest_mode": "always",
"size": 5
}
],
"collate": {
"query": {
"inline": {
"match_phrase": {
"{{field_name}}": "{{suggestion}}"
}
}
},
"params": {
"field_name": "name"
},
"prune": false
}
}
}
},
"size": 0
}
The response to the above query is:
{
"took": 17,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1868381,
"max_score": 0,
"hits": []
},
"suggest": {
"simple_phrase": [
{
"text": "lhng cancar",
"offset": 0,
"length": 11,
"options": []
}
]
}
}
What changes do I need to make to the query so that I get the expected result "lung cancer" in the suggestions?
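You have to raise max_errors to 0.8 or more: both words in your query are misspelt, and with max_errors set to 0.5 the suggester will not treat both of the two query terms as misspellings.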
Same answer is given here
ElasticSearch - Phrase Suggestor
Raising the max_errors parameter to 2 solved my problem.

Not able to aggregate on nested fields in elasticsearch

I have set a field to nested and now I am not able to aggregate on it.
Sample document -
{
"attributes" : [
{ "name" : "snake" , "type" : "reptile" },
{ "name" : "cow" , "type" : "mamal" }
]
}
The attributes field is nested.
The following terms aggregation is not working on it:
{
"aggs" : {
"terms" : { "field" : "attributes.name" }
}
}
How can I do the aggregation in elasticsearch?
Use a nested aggregation.
As a simple example, I created an index with a nested property matching what you posted:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"attributes": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
Then added your document:
PUT /test_index/doc/1
{
"attributes": [
{ "name": "snake", "type": "reptile" },
{ "name": "cow", "type": "mammal" }
]
}
Now I can get "attributes.name" terms as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_attributes": {
"nested": {
"path": "attributes"
},
"aggs": {
"name_terms": {
"terms": {
"field": "attributes.name"
}
}
}
}
}
}
...
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_attributes": {
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cow",
"doc_count": 1
},
{
"key": "snake",
"doc_count": 1
}
]
}
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/0e3ed9c700f240e523be08a27551707d4448a9df

Elasticsearch aggregation buckets count an email address as two different bucket keys

I have a field stored as "user1#user.com".
I am using this aggregation JSON query:
"aggregations": {
"email-terms": {
"terms": {
"field": "l_obj.email",
"size": 0,
"shard_size": 0,
"order": {
"_count": "desc"
}
}
}
}
I am getting response :
"buckets" : [
{
"key" : "user.com",
"doc_count" : 1
},
{
"key" : "user1",
"doc_count" : 1
}
instead of
"buckets" : [
{
"key" : "user1#user.com",
"doc_count" : 1
}
]
The same issue persists for string values like user1.user2.user.com. I am doing a terms aggregation.
Am I missing something here?
You need to set "index": "not_analyzed" on the "email" field in your mapping.
If I set up a toy index without specifying an analyzer (or to not use one), the standard analyzer will be used, which will split on whitespace and symbols like "#". So, with this index definition:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"email": {
"type": "string"
}
}
}
}
}
if I add a single doc:
PUT /test_index/doc/1
{
"email": "user1#user.com"
}
and then ask for a terms aggregation, I get back two terms:
POST /test_index/_search?search_type=count
{
"aggregations": {
"email-terms": {
"terms": {
"field": "email"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"email-terms": {
"buckets": [
{
"key": "user.com",
"doc_count": 1
},
{
"key": "user1",
"doc_count": 1
}
]
}
}
}
But if I rebuild the index with "index": "not_analyzed" in that field, and again index the same document:
DELETE /test_index
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"email": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
PUT /test_index/doc/1
{
"email": "user1#user.com"
}
and run the same terms aggregation, I only get back a single term for that email address:
POST /test_index/_search?search_type=count
{
"aggregations": {
"email-terms": {
"terms": {
"field": "email"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"email-terms": {
"buckets": [
{
"key": "user1#user.com",
"doc_count": 1
}
]
}
}
}
Here is the code I used, altogether:
http://sense.qbox.io/gist/a73a28bf7450b637138b02a371fb15cabf344ab6
We can use an index template to predefine field types (see http://www.elastic.co/guide/en/elasticsearch/reference/1.3/indices-templates.html).
For example:
Use a REST client or Elasticsearch Sense:
PUT/POST http://escluster:port/_template
{
"testtemplate": {
"aliases": {},
"mappings": {
"test1": {
"_all": {
"enabled": false
},
"_source": {
"enabled": true
},
"properties": {
"email": {
"fielddata": {
"format": "doc_values"
},
"index": "not_analyzed",
"type": "string"
}...

Resources