Elasticsearch custom analyzer by specific character

How do I create a custom analyzer that tokenizes a field by the '/' character only?
I have URL strings in my field, for example: "http://stackoverflow.com/questions/ask"
I want it tokenized like this: "http", "stackoverflow.com", "questions" and "ask"

This seems to do what you want, using a pattern tokenizer:
PUT /test_index
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "analyzer": {
        "slash_analyzer": {
          "type": "pattern",
          "pattern": "[/:]+",
          "lowercase": true
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "properties": {
        "url": {
          "type": "string",
          "index_analyzer": "slash_analyzer",
          "search_analyzer": "standard",
          "term_vector": "yes"
        }
      }
    }
  }
}
PUT /test_index/doc/1
{
  "url": "http://stackoverflow.com/questions/ask"
}
I added term vectors in the mapping (you probably don't want to do this in production), so we can see what terms are generated:
GET /test_index/doc/1/_termvector
...
{
  "_index": "test_index",
  "_type": "doc",
  "_id": "1",
  "_version": 1,
  "found": true,
  "took": 1,
  "term_vectors": {
    "url": {
      "field_statistics": {
        "sum_doc_freq": 4,
        "doc_count": 1,
        "sum_ttf": 4
      },
      "terms": {
        "ask": {
          "term_freq": 1
        },
        "http": {
          "term_freq": 1
        },
        "questions": {
          "term_freq": 1
        },
        "stackoverflow.com": {
          "term_freq": 1
        }
      }
    }
  }
}
Here's the code I used:
http://sense.qbox.io/gist/669fbdd681895d7e9f8db13799865c6e8be75b11

The standard analyzer already does that for you.
curl -XGET 'localhost:9200/_analyze?analyzer=standard&pretty' -d 'http://stackoverflow.com/questions/ask'
You get this:
{
  "tokens" : [ {
    "token" : "http",
    "start_offset" : 0,
    "end_offset" : 4,
    "type" : "<ALPHANUM>",
    "position" : 1
  }, {
    "token" : "stackoverflow.com",
    "start_offset" : 7,
    "end_offset" : 24,
    "type" : "<ALPHANUM>",
    "position" : 2
  }, {
    "token" : "questions",
    "start_offset" : 25,
    "end_offset" : 34,
    "type" : "<ALPHANUM>",
    "position" : 3
  }, {
    "token" : "ask",
    "start_offset" : 35,
    "end_offset" : 38,
    "type" : "<ALPHANUM>",
    "position" : 4
  } ]
}
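If the standard analyzer's behaviour is good enough for you, you can therefore leave the field with its default mapping and query it directly. A minimal sketch (the index name urls and the sample document are assumptions, not taken from the answer above):
PUT urls/doc/1
{
  "url": "http://stackoverflow.com/questions/ask"
}
GET urls/_search
{
  "query": {
    "match": {
      "url": "questions"
    }
  }
}
The match query is analyzed with the same standard analyzer, so searching for any path segment ("questions", "ask") or the host ("stackoverflow.com") finds the document.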

Related

Why can't I search email domain name when using `text` type in Elasticsearch

I have an email field in a document saved in an Elasticsearch index. I am able to search the value before the # but I can't find anything by searching the domain value.
For example, the query below gives me nothing:
GET transaction-green/_search
{
  "query": {
    "match": {
      "email": "gmail"
    }
  },
  "_source": {
    "includes": [
      "email"
    ]
  }
}
but it returns the document if I search for test#gmail.com or just test.
The mapping for this email field is the default text type:
"email" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
Why is the domain name ignored when searching?
This happens because of the standard analyzer. Since you are using the default analyzer, it analyzes your value as shown below.
You can use the _analyze API to check the analyzer:
POST email/_analyze
{
  "analyzer": "standard",
  "text": ["test#gmail.com"]
}
{
  "tokens" : [
    {
      "token" : "test",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "gmail.com",
      "start_offset" : 5,
      "end_offset" : 14,
      "type" : "<ALPHANUM>",
      "position" : 1
    }
  ]
}
You can define a custom analyzer with a character filter as shown below, and your query will work:
PUT /email
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "\\.",
          "replacement": " "
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "email": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}
Now you can analyze the value using the analyzer below and see that it creates 3 separate tokens for the email.
POST email/_analyze
{
  "analyzer": "my_analyzer",
  "text": ["test#gmail.com"]
}
{
  "tokens" : [
    {
      "token" : "test",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "gmail",
      "start_offset" : 5,
      "end_offset" : 10,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "com",
      "start_offset" : 11,
      "end_offset" : 14,
      "type" : "<ALPHANUM>",
      "position" : 2
    }
  ]
}
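With that analyzer in place, a match query for just the domain term should now find the document. A quick check, assuming a document such as {"email": "test#gmail.com"} has already been indexed into the new email index:
GET email/_search
{
  "query": {
    "match": {
      "email": "gmail"
    }
  }
}
Because the indexed tokens are now test, gmail and com, the term gmail matches.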

Elasticsearch can't handle space after adding synonym analyzer

I created an index called my_index with this command:
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "filter": {
        "synonym": {
          "type": "synonym",
          "lenient": "true",
          "synonyms": [
            ...
            ...
            ...
          ]
        }
      },
      "analyzer": {
        "synonym": {
          "filter": [
            "uppercase",
            "synonym"
          ],
          "tokenizer": "whitespace"
        }
      }
    }
  },
  "mappings": {
    "items": {
      "properties": {
        "country": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "information": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          },
          "analyzer": "synonym"
        },
        "person": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        }
      }
    }
  }
}
Inside information, I have data that looks like this: 100 /INDIA/2022 (note the space after 100). If I search for 100/INDIA/2022 (no space after 100), Elasticsearch returns nothing. If I create a new index with no analyzer, 100/INDIA/2022 returns the expected result. Can someone help me with this problem?
The synonym analyzer defined in your index settings tokenizes the text on whitespace. So, when analyzing the text 100 /INDIA/2022
GET 71595890/_analyze
{
  "text": "100 /INDIA/2022",
  "analyzer": "synonym"
}
the following tokens are produced:
{
  "tokens" : [
    {
      "token" : "100",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "/INDIA/2022",
      "start_offset" : 4,
      "end_offset" : 15,
      "type" : "word",
      "position" : 1
    }
  ]
}
Since you have not explicitly defined any search_analyzer, the search analyzer defaults to the index analyzer (the analyzer you have defined in your index mapping).
So, when you search for 100/INDIA/2022, the text gets tokenized into
{
  "tokens" : [
    {
      "token" : "100/INDIA/2022",
      "start_offset" : 0,
      "end_offset" : 14,
      "type" : "word",
      "position" : 0
    }
  ]
}
No matching token is produced (compared to the indexed tokens 100 and /INDIA/2022), therefore no documents match.
In the second case, when you create a new index with no analyzer, the standard analyzer is used by default.
With the standard analyzer, the following tokens are produced:
{
  "tokens" : [
    {
      "token" : "100",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "<NUM>",
      "position" : 0
    },
    {
      "token" : "india",
      "start_offset" : 5,
      "end_offset" : 10,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "2022",
      "start_offset" : 11,
      "end_offset" : 15,
      "type" : "<NUM>",
      "position" : 2
    }
  ]
}
The tokens produced for 100 /INDIA/2022 and for 100/INDIA/2022 with the standard analyzer are the same as shown above.
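Purely as an illustration (this is not part of the original answer): one way to make both forms produce the same tokens in the synonym index is to normalize the / to a space with a pattern_replace character filter, similar to the email example earlier, before the whitespace tokenizer runs. A minimal sketch, assuming the index is recreated with the same mappings, the synonym filter defined exactly as in the question, and the documents reindexed:
PUT my_index
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "char_filter": {
        "slash_to_space": {
          "type": "pattern_replace",
          "pattern": "/",
          "replacement": " "
        }
      },
      "filter": {
        "synonym": {
          "type": "synonym",
          "lenient": "true",
          "synonyms": [
            ...
          ]
        }
      },
      "analyzer": {
        "synonym": {
          "char_filter": ["slash_to_space"],
          "tokenizer": "whitespace",
          "filter": ["uppercase", "synonym"]
        }
      }
    }
  }
}
With this change both 100 /INDIA/2022 and 100/INDIA/2022 are reduced to the tokens 100, INDIA and 2022 at index time and at search time, so the match query finds the document.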

ElasticSearch Edge NGram Preserve Numbers

I'm working on creating an autocompletion API for residential addresses.
I would like to preserve the numbers, so I don't get the following problem:
Let's say the index contains a couple of documents:
{"fullAddressLine": "Kooimanweg 10 1442BZ Purmerend", "streetName": "Kooimanweg", houseNumber: "10", "postCode": "1442BZ", "cityName": "Purmerend"}
{"fullAddressLine": "Kooimanweg 1009 1442BZ Purmerend", "streetName": "Kooimanweg", houseNumber: "1009", "postCode": "1442BZ", "cityName": "Purmerend"}
{"fullAddressLine": "Kooimanweg 1011 1442BZ Purmerend", "streetName": "Kooimanweg", houseNumber: "1011", "postCode": "1442BZ", "cityName": "Purmerend"}
{"fullAddressLine": "Kooimanweg 1013 1442BZ Purmerend", "streetName": "Kooimanweg", houseNumber: "1013", "postCode": "1442BZ", "cityName": "Purmerend"}
These are the settings and mappings:
{
  "settings": {
    "analysis": {
      "filter": {
        "EdgeNGramFilter": {
          "type": "edgeNGram",
          "min_gram": 1,
          "max_gram": 40
        }
      },
      "analyzer": {
        "EdgeNGramAnalyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "EdgeNGramFilter"
          ]
        },
        "keywordAnalyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "asciifolding",
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "fullAddressLine": {
        "type": "text",
        "analyzer": "EdgeNGramAnalyzer",
        "search_analyzer": "standard",
        "fields": {
          "raw": {
            "type": "text",
            "analyzer": "keywordAnalyzer"
          }
        }
      }
    }
  }
}
And this would be the ElasticSearch query:
{
  "query": {
    "bool": {
      "must": [{
        "match": {
          "fullAddressLine": {
            "query": "kooiman 10",
            "operator": "and"
          }
        }
      }]
    }
  }
}
The result of this is:
Kooimanweg 10 1442BZ Purmerend
Kooimanweg 1009 1442BZ Purmerend
Kooimanweg 1011 1442BZ Purmerend
Kooimanweg 1013 1442BZ Purmerend
This works, but I would only like to see this:
Kooimanweg 10 1442BZ Purmerend
How can I change the query or mappings/settings to achieve this result?
When using the "EdgeNGramAnalyzer" analyzer on "Test 1009" I get:
{
  "tokens" : [
    {
      "token" : "t",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "te",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "tes",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "test",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "1",
      "start_offset" : 5,
      "end_offset" : 9,
      "type" : "<NUM>",
      "position" : 1
    },
    {
      "token" : "10",
      "start_offset" : 5,
      "end_offset" : 9,
      "type" : "<NUM>",
      "position" : 1
    },
    {
      "token" : "100",
      "start_offset" : 5,
      "end_offset" : 9,
      "type" : "<NUM>",
      "position" : 1
    },
    {
      "token" : "1009",
      "start_offset" : 5,
      "end_offset" : 9,
      "type" : "<NUM>",
      "position" : 1
    }
  ]
}
I want to preserve numbers so they don't get split.
Thanks to everyone in advance.
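One possible direction, offered only as a hedged sketch rather than a verified answer: since each document already stores the house number separately in houseNumber, the application can split the trailing number off the user's input and match it exactly, while the street part still goes through the ngram-analyzed fullAddressLine. The index name addresses and the dynamically mapped houseNumber.keyword subfield are assumptions here:
GET addresses/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "fullAddressLine": {
              "query": "kooiman",
              "operator": "and"
            }
          }
        },
        {
          "term": {
            "houseNumber.keyword": "10"
          }
        }
      ]
    }
  }
}
With an exact term on the house number, "10" no longer matches the edge-grams of 1009, 1011 or 1013, so only "Kooimanweg 10 1442BZ Purmerend" is returned.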

Can't get proper results from Elasticsearch based on query and document tokenization

I'm trying to implement a search system in which I need to use the Edge NGram tokenizer. The settings for creating the index are shown below. I have used the same tokenizer for both the documents and the search query.
(The documents are in the Persian language.)
PUT /test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "autocomplete"
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": [
            "letter"
          ]
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "title": {
          "type": "text",
          "analyzer": "autocomplete",
          "search_analyzer": "autocomplete_search"
        }
      }
    }
  }
}
The problem is that I get 0 hits (results) when searching for the term 'آلمانی', even though I have a doc with the data 'آلمان خوب است'.
As you can see, the result of analyzing the term 'آلمانی' shows that it generates the token 'آلمان' and works properly.
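The output below was presumably produced with an _analyze request along these lines (the analyzer name is an assumption; both analyzers in the settings share the same edge-ngram tokenizer):
POST /test/_analyze
{
  "analyzer": "autocomplete_search",
  "text": ["آلمانی"]
}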
{
  "tokens" : [
    {
      "token" : "آ",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "آل",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "آلم",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "آلما",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "آلمان",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "آلمانی",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 0
    }
  ]
}
The search query shown below gets 0 hits.
GET /test/_search
{
  "query": {
    "match": {
      "title": {
        "query": "آلمانی",
        "operator": "and"
      }
    }
  }
}
However, searching for the term 'آلما' returns the doc with the data 'آلمان خوب است'.
How can I fix this problem?
Your assistance would be greatly appreciated.
I found a DevTicks post by Ricardo Heck which solved my problem; see that post for a more detailed description.
I changed my mapping settings like this:
"mappings": {
"_doc": {
"properties": {
"title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "autocomplete_search",
"fields": {
"ngram": {
"type": "text",
"analyzer": "autocomplete"
}
}
}
}
}
}
And now I get the doc "آلمان خوب است" when searching for the term "آلمانی".
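The subfield approach is usually paired with a query that also targets title.ngram; the exact query is not shown above, so the multi_match form below is an assumption:
GET /test/_search
{
  "query": {
    "multi_match": {
      "query": "آلمانی",
      "fields": ["title", "title.ngram"]
    }
  }
}
Because title.ngram is analyzed with the edge-ngram autocomplete analyzer at search time as well, the shorter grams of the query (آ through آلمان) overlap with the indexed grams of آلمان, so the document is found.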

With html_strip as the search query analyzer, searches still match HTML markup

In short: can html_strip be used with an analyzer that is only applied at query time?
I have a very simple test index with the following settings:
POST test
{
  "settings": {
    "analysis": {
      "filter": {
        "synonym": {
          "synonyms_path": "/usr/share/wordnet-prolog/wn_s.pl",
          "ignore_case": "true",
          "type": "synonym",
          "format": "wordnet"
        }
      },
      "analyzer": {
        "synonym_analyzer": {
          "char_filter": "html_strip",
          "filter": [
            "asciifolding",
            "snowball",
            "synonym"
          ],
          "type": "custom",
          "tokenizer": "lowercase"
        }
      }
    }
  },
  "mappings": {
    "child": {
      "_parent": {
        "type": "parent"
      }
    }
  }
}
And some sample data:
PUT test/parent/1
{
  "type": "flying stuff"
}
PUT test/child/1?parent=1
{
  "name": "butterfly"
}
PUT test/child/2?parent=1
{
  "name": "<strong>tire</strong>"
}
On the corresponding child/parent searches I get results matching the tag name "strong", for example:
GET test/parent/_search
{
  "query": {
    "has_child": {
      "type": "child",
      "query": {
        "match": {
          "name": {
            "query": "strong",
            "analyzer": "synonym_analyzer"
          }
        }
      }
    }
  }
}
GET test/child/_search
{
  "query": {
    "match": {
      "name": {
        "query": "strong",
        "analyzer": "synonym_analyzer"
      }
    }
  }
}
Interestingly, when I test the tokenizer with http://localhost:9200/test/_analyze?text=%3Cstrong%3Edemo%3C/strong%3E&analyzer=synonym_analyzer&pretty=true the data is interpreted correctly (no "strong" and related synonyms):
{
  "tokens" : [ {
    "token" : "demonstration",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  }, {
    "token" : "demo",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  }, {
    "token" : "show",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  }, {
    "token" : "exhibit",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  }, {
    "token" : "present",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  }, {
    "token" : "demonstrate",
    "start_offset" : 8,
    "end_offset" : 21,
    "type" : "SYNONYM",
    "position" : 1
  } ]
}
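A hedged note on what is likely happening, offered as an educated guess rather than a confirmed answer: the mapping above never assigns synonym_analyzer to the name field, so the documents are indexed with the default standard analyzer and the literal token strong ends up in the inverted index; a query-time html_strip can only clean the query string, it cannot remove terms that are already indexed. One possible fix, assuming the intent is to strip markup at index time as well, is to map name explicitly with the custom analyzer and reindex (the analysis block is elided here and stays exactly as above):
POST test
{
  "settings": {
    "analysis": { ... }
  },
  "mappings": {
    "child": {
      "_parent": {
        "type": "parent"
      },
      "properties": {
        "name": {
          "type": "string",
          "analyzer": "synonym_analyzer"
        }
      }
    }
  }
}
After reindexing, "<strong>tire</strong>" is stored without the markup tokens, so a search for strong should no longer match that child document.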
