How to apply custom analyser?

How to apply custom analyser? - elasticsearch

I just discovered an issue with our Elasticsearch setup: it is not returning anything for '&' in a field value. I did some googling and I think I need a custom analyser. I have never worked with ES before, so my assumption is that I'm missing something basic here.
This is what I have got and it is not working as expected.
PUT custom_analyser
{
"settings": {
"analysis": {
"analyzer": {
"suggest_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [ "lowercase", "my_synonym_filter" ]
}
},
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"&, and",
"foo, bar" ]
}
}
}
}
}
And trying to use it like:
GET custom_analyser/_search
{
"aggs": {
"section": {
"terms": {
"field": "section",
"size": 10,
"shard_size": 500,
"include": "jill & jerry" //Not returning anything back for this field using default analyser
}
}
}
}
Output:
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
},
"aggregations": {
"section": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
Mappings
"_doc":{
"dynamic":"false",
"date_detection":false,
"properties":{
"section":{
"type":"keyword"
}
}
}
GET custom_analyser:
{
"custom_analyser": {
"aliases": {},
"mappings": {},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "custom_analyser",
"creation_date": "1565971369814",
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"&, and",
"foo, bar"
]
}
},
"analyzer": {
"suggest_analyzer": {
"filter": [
"lowercase",
"my_synonym_filter"
],
"type": "custom",
"tokenizer": "whitespace"
}
}
},
"number_of_replicas": "1",
"uuid": "oVMOU5wPQ--vKhE3dDFG2Q",
"version": {
"created": "6030199"
}
}
}
}
}

I think there is a slight confusion here: An analyzer won't help you, because you are (correctly) using a keyword field for the aggregation, but those are not analyzed. You could only use a normalizer on those fields.
For your specific problem: The include (and exclude) are regular expressions — you'll need to escape the & to make this work as expected.
Full example
Mapping and sample data:
PUT test
{
"mappings": {
"properties": {
"section": {
"type": "keyword"
}
}
}
}
PUT test/_doc/1
{
"section": "jill & jerry"
}
PUT test/_doc/2
{
"section": "jill jerry"
}
PUT test/_doc/3
{
"section": "jill"
}
PUT test/_doc/4
{
"section": "jill & jerry"
}
Query — you need a double backslash for the escape to work here (and I'm also excluding the actual documents with "size": 0 to keep the response shorter):
GET test/_search
{
"size": 0,
"aggs": {
"section": {
"terms": {
"field": "section",
"include": "jill \\& jerry"
}
}
}
}
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"section" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "jill & jerry",
"doc_count" : 2
}
]
}
}
}

Related

Synonym search in ElasticSearch

I want to retrieve the data from the index using the notion of synonym. When I perform a search with title A I also want to retrieve the documents whose title contains B. For that I set up the following mapping :
{
"settings": {
"index" : {
"analysis" : {
"filter" : {
"synonym_filter" : {
"type" : "synonym",
"synonyms" : [
"A=>A,B"
]
}
},
"analyzer" : {
"synonym_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["synonym_filter"]
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer" : "synonym_analyzer"
}
}
}
}
I then added 3 documents to my index
{
"title": "C"
}
{
"title": "B"
}
{
"title": "A"
}
I then used the analysis api to see if it works (everything is ok):
curl -X GET "localhost:9200/my_custom_index_title/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"analyzer": "synonym_analyzer",
"text": "A"
}
'
{
"tokens" : [
{
"token" : "A",
"start_offset" : 0,
"end_offset" : 1,
"type" : "SYNONYM",
"position" : 0
},
{
"token" : "B",
"start_offset" : 0,
"end_offset" : 1,
"type" : "SYNONYM",
"position" : 0
}
]
}
curl -X GET "localhost:9200/my_custom_index_title/_analyze?pretty" -H 'Content-Type: application/json' -d'
{
"analyzer": "synonym_analyzer",
"text": "B"
}
'
{
"tokens" : [
{
"token" : "B",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
}
]
}
When I search for title A results are correct :
{
"query": {
"match": {
"title": {
"query": "A"
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.6951314,
"hits": [
{
"_index": "my_custom_index_title",
"_id": "i5bb_4IBqFAXxSLAgrDj",
"_score": 0.6951314,
"_source": {
"title": "A"
}
},
{
"_index": "my_custom_index_title",
"_id": "jJbb_4IBqFAXxSLAlLBj",
"_score": 0.52354836,
"_source": {
"title": "B"
}
}
]
}
}
But when I search for B the results are not correct: I only want the results that contain B, not the ones that contain A.
{
"query": {
"match": {
"title": {
"query": "B"
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.52354836,
"hits": [
{
"_index": "my_custom_index_title",
"_id": "i5bb_4IBqFAXxSLAgrDj",
"_score": 0.52354836,
"_source": {
"title": "A"
}
},
{
"_index": "my_custom_index_title",
"_id": "jJbb_4IBqFAXxSLAlLBj",
"_score": 0.52354836,
"_source": {
"title": "B"
}
}
]
}
}
For example when I search for computer I wish to obtain laptop, computer, mac. But when I search for mac I only want to get the results for it (not laptop and computer)
I do not understand why the result for the search with B does not return only one result

I understand, in this case as you applied synonym_analyzer as a field analyzer, you indexed the synonyms.
To solve it, you can use synonyms only at search time, adding the parameter "search_analyzer". Note that I added the lowercase filter in the synonym_analyzer because the standard analyzer applies lowercase by default.
To get token synonyms for Term B do this:
{
"settings": {
"index": {
"analysis": {
"filter": {
"synonym_filter": {
"type": "synonym",
"expand":"false",
"synonyms": [
"A=>A,B"
]
}
},
"analyzer": {
"synonym_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"synonym_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "standard",
"search_analyzer": "synonym_analyzer"
}
}
}
}

Getting empty buckets array in elasticsearch aggregation

I'm using version 5.4.1 of ElasticSearch.
When I try to perform a groupBy aggregate/bucket aggregate, I'm not getting any values in the bucket array.
This is my index:
curl -X PUT localhost:9200/urldata -d '{
"mappings" : {
"components" : {
"properties" : {
"name" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"status" : {
"type" : "keyword",
"index" : "not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
}
}'
And this is the aggregate query:
curl -XGET 'localhost:9200/urldata/_search?pretty' -H 'Content-Type: application/json' -d'
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name.keyword"
}
}
}
}
'
Output:
{
"took":2,
"timed_out":false,
"_shards":{
"total":5,
"successful":5,
"failed":0
},
"hits":{
"total":3,
"max_score":0.0,
"hits":[
]
},
"aggregations":{
"components":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
]
}
}
}
Where am I going wrong??

Try this, it should do it:
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
EDIT:
Here are all the steps to replicate your use case:
PUT test
{
"settings" : {
"index" : {
"number_of_shards" : 1,
"number_of_replicas" : 0
}
}
}
PUT test/_mapping/people_name
{
"properties":{
"name":{
"type":"keyword",
"index":"not_analyzed"
},
"status":{
"type":"keyword",
"index":"not_analyzed"
},
"timestamp":{
"type":"date",
"index":"not_analyzed"
}
}
}
POST test/people_name
{
"name": "A",
"status": "success",
"created_at": "2017-08-17"
}
POST test/people_name
{
"name": "A",
"status": "success_2",
"created_at": "2017-06-15"
}
POST test/people_name
{
"name": "B",
"status": "success",
"created_at": "2017-09-15"
}
GET test/people_name/_search
{
"size": 0,
"aggs": {
"components": {
"terms": {
"field": "name"
}
}
}
}
The result of the aggregation is:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"components": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "A",
"doc_count": 2
},
{
"key": "B",
"doc_count": 1
}
]
}
}
}

Phrase suggester Elasticsearch not correcting two words

I have the following mapping to my phrase suggester:
{
"settings": {
"analysis": {
"analyzer": {
"suggests_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"shingle_filter"
],
"type": "custom"
}
},
"filter": {
"shingle_filter": {
"min_shingle_size": 2,
"max_shingle_size": 6,
"type": "shingle"
}
}
}
},
"mappings": {
"sample_data": {
"properties": {
"name": {
"type": "string",
"analyzer": "suggests_analyzer"
}
}
}
}
}
I have "lung cancer", "colorectal cancer", and "breast cancer" indexed in my index. But when I send a misspelt query in which both words are misspelt, like "lhng cancar", it returns zero results when I use the collate functionality. My sample query is as follows.
{
"suggest": {
"text": "lhng cancar",
"simple_phrase": {
"phrase": {
"field": "name",
"size": 5,
"real_word_error_likelihood": 0.95,
"max_errors": 0.5,
"direct_generator": [
{
"field": "name",
"suggest_mode": "always",
"size": 5
}
],
"collate": {
"query": {
"inline": {
"match_phrase": {
"{{field_name}}": "{{suggestion}}"
}
}
},
"params": {
"field_name": "name"
},
"prune": false
}
}
}
},
"size": 0
}
The response to the above query is:
{
"took": 17,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1868381,
"max_score": 0,
"hits": []
},
"suggest": {
"simple_phrase": [
{
"text": "lhng cancar",
"offset": 0,
"length": 11,
"options": []
}
]
}
}
What changes do I need to make to the query so that I get the expected result "lung cancer" in the suggestions?

You have to raise max_errors to 0.8 or more.
Same answer is given here
ElasticSearch - Phrase Suggestor

Raising the max_errors parameter to 2 solved my problem.

Not able to aggregate on nested fields in elasticsearch

I have set a field to nested and now I am not able to aggregate on it.
Sample document -
{
"attributes" : [
{ "name" : "snake" , "type" : "reptile" },
{ "name" : "cow" , "type" : "mamal" }
]
}
The attributes field is nested.
The following terms aggregation is not working on it:
{
"aggs" : {
"terms" : { "field" : "attributes.name" }
}
}
How can I do the aggregation in elasticsearch?

Use a nested aggregation.
As a simple example, I created an index with a nested property matching what you posted:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"attributes": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
Then added your document:
PUT /test_index/doc/1
{
"attributes": [
{ "name": "snake", "type": "reptile" },
{ "name": "cow", "type": "mammal" }
]
}
Now I can get "attributes.name" terms as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_attributes": {
"nested": {
"path": "attributes"
},
"aggs": {
"name_terms": {
"terms": {
"field": "attributes.name"
}
}
}
}
}
}
...
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_attributes": {
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cow",
"doc_count": 1
},
{
"key": "snake",
"doc_count": 1
}
]
}
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/0e3ed9c700f240e523be08a27551707d4448a9df

Elastic search aggregations buckets counting email format as two different bucket key .

I have a field stored as "user1#user.com".
Using this aggregations JSON query:
"aggregations": {
"email-terms": {
"terms": {
"field": "l_obj.email",
"size": 0,
"shard_size": 0,
"order": {
"_count": "desc"
}
}
}
}
I am getting this response:
"buckets" : [
{
"key" : "user.com",
"doc_count" : 1
},
{
"key" : "user1",
"doc_count" : 1
}
instead of
"buckets" : [
{
"key" : "user1#user.com",
"doc_count" : 1
}
]
The same issue persists for strings like user1.user2.user.com. I am doing a terms aggregation.
Am I missing something here?

You need to set "index": "not_analyzed" on the "email" field in your mapping.
If I set up a toy index without specifying an analyzer (or to not use one), the standard analyzer will be used, which will split on whitespace and symbols like "#". So, with this index definition:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"email": {
"type": "string"
}
}
}
}
}
if I add a single doc:
PUT /test_index/doc/1
{
"email": "user1#user.com"
}
and then ask for a terms aggregation, I get back two terms:
POST /test_index/_search?search_type=count
{
"aggregations": {
"email-terms": {
"terms": {
"field": "email"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"email-terms": {
"buckets": [
{
"key": "user.com",
"doc_count": 1
},
{
"key": "user1",
"doc_count": 1
}
]
}
}
}
But if I rebuild the index with "index": "not_analyzed" in that field, and again index the same document:
DELETE /test_index
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"email": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
PUT /test_index/doc/1
{
"email": "user1#user.com"
}
and run the same terms aggregation, I only get back a single term for that email address:
POST /test_index/_search?search_type=count
{
"aggregations": {
"email-terms": {
"terms": {
"field": "email"
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"email-terms": {
"buckets": [
{
"key": "user1#user.com",
"doc_count": 1
}
]
}
}
}
Here is the code I used, altogether:
http://sense.qbox.io/gist/a73a28bf7450b637138b02a371fb15cabf344ab6

We can use an index template to predefine field types (see http://www.elastic.co/guide/en/elasticsearch/reference/1.3/indices-templates.html).
For example:
Use a REST client or Elasticsearch Sense:
PUT/POST http://escluster:port/_template
{
"testtemplate": {
"aliases": {},
"mappings": {
"test1": {
"_all": {
"enabled": false
},
"_source": {
"enabled": true
},
"properties": {
"email": {
"fielddata": {
"format": "doc_values"
},
"index": "not_analyzed",
"type": "string"
}...

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

How to apply custom analyser? - elasticsearch

Related

Synonym search in ElasticSearch

Getting empty buckets array in elasticsearch aggregation

Phrase suggester Elasticsearch not correcting two words

Not able to aggregate on nested fields in elasticsearch

Elastic search aggregations buckets counting email format as two different bucket key .

Categories

Resources