I am looking for a way to get results from Elasticsearch by matching exact whole words. This is for "EQ" ("=") operations from the UI.
{
"_index": "docs",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"DocId": 1,
"DocDate": "2020-07-24T10:16:44.0000000Z",
"Conversation": "I just need to know how frequently I should remind you"
}
},
{
"_index": "docs",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"DocId": 2,
"DocDate": "2020-07-25T10:16:45.0000000Z",
"Conversation": "Building a work culture in your firm"
}
}
Here, when querying the Conversation field with "I just need to know how frequently I should remind you", ES should return only the DocId 1 document.
Even if the query is "I just need to know how frequently I should remind", it should return empty.
I tried these ES queries, but I was not able to figure it out.
GET docs/_search
{
"query": {
"bool": {
"must": [
{"match_phrase": {
"Conversation": "just need to know"
}}
]
}
}
}
GET docs/_search
{
"query": {
"query_string": {
"default_field": "Conversation",
"query": "\"just need to know\""
}
}
}
GET docs/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"Conversation": {"query": "just need to know",
"operator": "and"
}
}
}
]
}
}
}
You need to add .keyword to the Conversation field. This uses the keyword analyzer instead of the standard analyzer (note the .keyword suffix on the field name).
When using standard analyzer
GET /_analyze
{
"analyzer" : "standard",
"text" : "I just need to know how frequently I should remind you"
}
The following tokens are generated:
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "just",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "need",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "to",
"start_offset": 12,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "know",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 4
},
{
"token": "how",
"start_offset": 20,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 5
},
{
"token": "frequently",
"start_offset": 24,
"end_offset": 34,
"type": "<ALPHANUM>",
"position": 6
},
{
"token": "i",
"start_offset": 35,
"end_offset": 36,
"type": "<ALPHANUM>",
"position": 7
},
{
"token": "should",
"start_offset": 37,
"end_offset": 43,
"type": "<ALPHANUM>",
"position": 8
},
{
"token": "remind",
"start_offset": 44,
"end_offset": 50,
"type": "<ALPHANUM>",
"position": 9
},
{
"token": "you",
"start_offset": 51,
"end_offset": 54,
"type": "<ALPHANUM>",
"position": 10
}
]
}
Whereas the keyword analyzer returns the entire input string as a single token.
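You can verify this with the _analyze API; the keyword analyzer emits the whole string as one token:
GET /_analyze
{
  "analyzer" : "keyword",
  "text" : "I just need to know how frequently I should remind you"
}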
If you have not defined any explicit mapping, the default dynamic mapping already provides this sub-field, and your modified search query will be:
{
"query": {
"match": {
"Conversation.keyword": "I just need to know how frequently I should remind you"
}
}
}
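Equivalently, since keyword fields are not analyzed, a term query expresses the exact-equality ("EQ") intent more directly. A sketch, assuming the default dynamic mapping created the Conversation.keyword sub-field:
GET docs/_search
{
  "query": {
    "term": {
      "Conversation.keyword": "I just need to know how frequently I should remind you"
    }
  }
}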
You can even change your index mapping so the field itself is of type keyword (note that the field then no longer supports full-text matching):
{
"mappings": {
"properties": {
"Conversation": {
"type": "keyword"
}
}
}
}
I have to search documents where the text field "Body" includes "Balance for subscriber with SAN" and excludes "was not found after invoking reip-adapter". I created this KQL request in Kibana:
Body : "Balance for subscriber with SAN" and not Body : "was not found after invoking reip-adapter"
But the results include documents containing both phrases. Why do my results contain both "Balance for subscriber with SAN" AND "was not found after invoking reip-adapter"?
Inspecting the KQL request shows:
"query": {
"bool": {
"must": [],
"filter": [
{
"bool": {
"filter": [
{
"bool": {
"should": [
{
"match_phrase": {
"Body": "Balance for subscriber with SAN"
}
}
],
"minimum_should_match": 1
}
},
{
"bool": {
"must_not": {
"bool": {
"should": [
{
"match_phrase": {
"Body": "was not found after invoking reip-adapter"
}
}
],
"minimum_should_match": 1
}
}
}
}
]
}
},
{
"range": {
"Timestamp": {
"format": "strict_date_optional_time",
"gte": "2020-08-29T08:24:55.067Z",
"lte": "2020-08-29T10:24:55.067Z"
}
}
}
],
"should": [],
"must_not": []
}
}
"and not" condition don`t working, Response:
-----omitted--------
"_source": {
"prospector": {},
"Severity": "INFO",
"uuid": "e71b207a-42a6-4b2c-98d1-b1094c578776",
"Body": "Balance for subscriber with SAN=0400043102was not found after invoking reip-adapter.",
"tags": [
"iptv",
"beats_input_codec_plain_applied"
],
"source": "/applogs/Iptv/app.log",
"host": {
"name": "e38"
},
"offset": 23097554,
"pid": "2473",
"Configuration": "IptvFacadeBean",
"Timestamp": "2020-08-29T10:24:50.040Z",
"#timestamp": "2020-08-29T10:24:50.446Z",
"input": {}
}
-----omitted--------
The data you are indexing into the Body field is:
"Body": "Balance for subscriber with SAN=0400043102was not found after
invoking reip-adapter."
There is no gap between the number and "was" (0400043102was), so the tokens generated are:
POST /_analyze
{
"analyzer" : "standard",
"text" : "Balance for subscriber with SAN=0400043102was not found after invoking reip-adapter."
}
The tokens are:
{
"tokens": [
{
"token": "balance",
"start_offset": 0,
"end_offset": 7,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "for",
"start_offset": 8,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "subscriber",
"start_offset": 12,
"end_offset": 22,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "with",
"start_offset": 23,
"end_offset": 27,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "san",
"start_offset": 28,
"end_offset": 31,
"type": "<ALPHANUM>",
"position": 4
},
{
"token": "0400043102was", <-- note this
"start_offset": 32,
"end_offset": 45,
"type": "<ALPHANUM>",
"position": 5
},
{
"token": "not",
"start_offset": 46,
"end_offset": 49,
"type": "<ALPHANUM>",
"position": 6
},
{
"token": "found",
"start_offset": 50,
"end_offset": 55,
"type": "<ALPHANUM>",
"position": 7
},
{
"token": "after",
"start_offset": 56,
"end_offset": 61,
"type": "<ALPHANUM>",
"position": 8
},
{
"token": "invoking",
"start_offset": 62,
"end_offset": 70,
"type": "<ALPHANUM>",
"position": 9
},
{
"token": "reip",
"start_offset": 71,
"end_offset": 75,
"type": "<ALPHANUM>",
"position": 10
},
{
"token": "adapter",
"start_offset": 76,
"end_offset": 83,
"type": "<ALPHANUM>",
"position": 11
}
]
}
Therefore, when you try a match_phrase like this:
"should": [
{
"match_phrase": {
"Body": "was not found after invoking reip-adapter"
}
}
]
No standalone token "was" is generated, so the phrase "was not found after invoking reip-adapter" never matches, the must_not clause excludes nothing, and the document is returned.
Index Data:
{ "Body":"Balance for subscriber with SAN=0400043102" }
{ "Body":"Balance for subscriber with SAN=0400043102was not found after invoking reip-adapter." }
Search query (note the must_not phrase now starts at "not", which is an actual token):
{
"query": {
"bool": {
"must": {
"match_phrase": {
"Body": "Balance for subscriber with SAN"
}
},
"must_not": {
"match_phrase": {
"Body": "not found after invoking reip-adapter"
}
}
}
}
}
Search Result:
"hits": [
{
"_index": "my_index",
"_type": "_doc",
"_id": "2",
"_score": 1.055546,
"_source": {
"Body": "Balance for subscriber with SAN=0400043102"
}
}
]
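If you also need to exclude documents where the phrase is glued to the preceding number (as in 0400043102was), one option is a wildcard clause in must_not. This is only a sketch, assuming Body also has a keyword sub-field named Body.keyword; note that wildcard queries with a leading * are expensive:
GET my_index/_search
{
  "query": {
    "bool": {
      "must": {
        "match_phrase": {
          "Body": "Balance for subscriber with SAN"
        }
      },
      "must_not": {
        "wildcard": {
          "Body.keyword": "*was not found after invoking reip-adapter*"
        }
      }
    }
  }
}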
Below is the Elasticsearch mapping with one field called hostname and another field called catch_all, which is basically a copy_to field (many more fields will copy values into it):
{
"settings": {
"analysis": {
"filter": {
"myNGramFilter": {
"type": "edgeNGram",
"min_gram": 1,
"max_gram": 40
}},
"analyzer": {
"myNGramAnalyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "myNGramFilter"]
}
}
}
},
"mappings": {
"test": {
"properties": {
"catch_all": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"store": true,
"ignore_above": 256
},
"grams": {
"type": "text",
"store": true,
"analyzer": "myNGramAnalyzer"
}
}
},
"hostname": {
"type": "text",
"copy_to": "catch_all"
}
}
}
}
}
When I run the following _analyze request:
GET index/_analyze
{
"analyzer": "myNGramAnalyzer",
"text": "Dell PowerEdge R630"
}
I get these tokens:
{
"tokens": [
{
"token": "d",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "de",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "del",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "dell",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "p",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "po",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "pow",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powe",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "power",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powere",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "powered",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "poweredg",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "poweredge",
"start_offset": 5,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "r",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r6",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r63",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "r630",
"start_offset": 15,
"end_offset": 19,
"type": "<ALPHANUM>",
"position": 2
}
]
}
There is a token called "poweredge".
Right now we search with the query below:
{
"query": {
"multi_match": {
"fields": ["catch_all.grams"],
"query": "poweredge",
"operator": "and"
}
}
}
When we query with "poweredge" we get 1 result, but when we search for just "edge" there are no results.
Even a plain match query does not yield results for the search word "edge".
Can somebody help here?
I suggest not querying with the multi_match API for your use case; use a match query instead. The edge_ngram filter works like this: it emits n-grams from the tokens produced by the tokenizer, anchored to the start of each token. As written in the documentation - read here:
The edge_ngram tokenizer first breaks text down into words whenever it
encounters one of a list of specified characters, then it emits
N-grams of each word where the start of the N-gram is anchored to the
beginning of the word.
As your call to the _analyze API shows, it does not produce "edge" as an n-gram of "poweredge", because n-grams are generated from the beginning of each word - look at the output of your _analyze call. Take a look here: https://www.elastic.co/guide/en/elasticsearch/guide/master/ngrams-compound-words.html
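The match form of your query would be (same analysis, so this alone will not make "edge" match; it just drops the multi_match machinery you don't need for a single field):
{
  "query": {
    "match": {
      "catch_all.grams": {
        "query": "poweredge",
        "operator": "and"
      }
    }
  }
}
If matching infixes such as "edge" inside "poweredge" is a real requirement, a sketch of an alternative is a plain ngram filter instead of edgeNGram, which emits grams starting at every position in each token, at the cost of a much larger index (the gram lengths below are illustrative, and recent versions also limit max_gram - min_gram via index.max_ngram_diff):
"filter": {
  "myNGramFilter": {
    "type": "ngram",
    "min_gram": 3,
    "max_gram": 4
  }
}
With this filter, "poweredge" produces interior grams such as "edg" and "edge", so a match query on catch_all.grams for "edge" can find the document.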
How do you see the analyzed data that is stored after you index something?
I know you can just do a search to see the documents, like this:
http://localhost:9200/local_products_fr/fields/_search/
But what I want to see is the actual indexed tokens, not the _source,
something like what you get when you call _analyze:
http://localhost:9200/local_products_fr/_analyze?text=<p>my <b>super</b> text</p>&analyzer=analyzer_fr
{
"tokens": [
{
"token": "my",
"start_offset": 3,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "b",
"start_offset": 7,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "sup",
"start_offset": 9,
"end_offset": 14,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "b",
"start_offset": 16,
"end_offset": 17,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "text",
"start_offset": 19,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 4
}
]
}
I use this to get the inverted index terms for a field, per document:
{
"query": {
"bool": {
"must": [{
"term": {
"_id": {
"value": "2"
}
}
}]
}
},
"script_fields": {
"terms": {
"script": "doc['product.name'].values"
}
}
}
Hope this works for you
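Another way to inspect the tokens actually indexed for a single document is the _termvectors API; this is a sketch (on older versions that still use mapping types, the type goes in the path before the document id):
GET local_products_fr/_termvectors/2
{
  "fields": ["product.name"],
  "term_statistics": true
}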
I use this search query:
GET videosearch/_search
{
"query": {
"match": {
"tags": "logs"
}
}
}
in order to return all documents that contain "logs" in the tags field.
The tags field has this mapping:
"tags": {
"type": "string",
"analyzer": "english",
"fields": {
"verbatim": {
"type": "string",
"index": "not_analyzed"
}
}
}
The query returns good results like this one:
{
"_index": "videosearch",
"_type": "videos",
"_id": "10",
"_score": 0.792282,
"_source": {
"id": "10",
"url": "https://www.youtube.com/watch?v=yDLtyLi6Ny8",
"title": "#bbuzz: Radu Gheorghe JSON Logging with Elasticsearch",
"uploaded_by": "newthinking communications",
"upload_date": "2013-06-19",
"views": 370,
"likes": 0,
"tags": [
"elasticsearch",
"logs",
"logstash",
"rsyslog",
"json"
]
}
}
but it also returns bad results like this one:
{
"_index": "videosearch",
"_type": "videos",
"_id": "15",
"_score": 0.9054651,
"_source": {
"id": "15",
"url": "https://www.youtube.com/watch?v=4L1DjY90Whk",
"title": "Tuning Solr for Logs, by Radu Gheorghe",
"uploaded_by": "Lucidworks",
"upload_date": "2015-01-07",
"views": 280,
"likes": 2,
"tags": [
"logging",
"solr",
"tuning",
"performance"
]
}
}
I consider the last one a "bad" result because it does not contain the string "logs" in the tags field. I also notice that even though it is a "bad" result, it has a higher score than the "good" result: 0.9054651 vs 0.792282.
What is happening - am I missing something?
After more research, I read about analyzers, which Elasticsearch uses to break text into tokens.
The english analyzer uses stemming to construct the tokens.
In the example below I break some words into search tokens using the english analyzer:
GET _analyze?pretty
{
"analyzer": "english",
"text": ["hair dryer", "introduction", "stars", "Introspective", "fishing", "logging"]
}
This results in the following tokens:
{
"tokens": [
{
"token": "hair",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "dryer",
"start_offset": 5,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "introduct",
"start_offset": 11,
"end_offset": 23,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "star",
"start_offset": 24,
"end_offset": 29,
"type": "<ALPHANUM>",
"position": 3
},
{
"token": "introspect",
"start_offset": 30,
"end_offset": 43,
"type": "<ALPHANUM>",
"position": 4
},
{
"token": "fish",
"start_offset": 44,
"end_offset": 51,
"type": "<ALPHANUM>",
"position": 5
},
{
"token": "log",
"start_offset": 52,
"end_offset": 59,
"type": "<ALPHANUM>",
"position": 6
}
]
}
You can notice that the tokens are in fact the word stems of each analyzed word.
In conclusion, the words log, logs, and logging all have the same stem, log, so all three of them are candidate search results.
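Given that the mapping above already defines a not_analyzed verbatim sub-field, one way to get exact matches is to query that sub-field, which bypasses stemming entirely - a sketch:
GET videosearch/_search
{
  "query": {
    "term": {
      "tags.verbatim": "logs"
    }
  }
}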
When we pass a query containing special characters, Elasticsearch splits the text.
E.g. if we pass "test-test" in a query, how can we make Elasticsearch treat it as a single word and not split it up?
The analyzer used on the field we are searching:
"text_search_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
},
"standard_stop_filter": {
"type": "stop",
"stopwords": "_english_"
}
},
"analyzer": {
"text_search_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"text_search_filter"
]
}
}
And the search query:
"query": {
"multi_match": {
"query": "test-test",
"type": "cross_fields",
"fields": [
"FIELD_NAME"
],
}
}
Analyzing the quoted text "'test-test'" with this analyzer yields the following tokens:
{
"tokens": [
{
"token": "'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-t",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-te",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-tes",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
},
{
"token": "'test-test'",
"start_offset": 0,
"end_offset": 11,
"type": "word",
"position": 1
}
]
}
In my code I catch all words which contain "-" and add quotes around them.
Example:
joe-doe -> "joe-doe"
Java code for this:
import java.util.Arrays;
import java.util.stream.Collectors;

static String placeWordsWithDashInQuote(String value) {
    // Split on whitespace, wrap every token containing a dash in double
    // quotes (unless it is already quoted), then re-join with spaces.
    return Arrays.stream(value.split("\\s"))
            .filter(v -> !v.isEmpty())
            .map(v -> v.contains("-") && !v.startsWith("\"") ? "\"" + v + "\"" : v)
            .collect(Collectors.joining(" "));
}
After this, the example query looks like:
{
"query": {
"bool": {
"must": [
{
"query_string": {
"fields": [
"lastName",
"firstName"
],
"query": "\"joe-doe\"",
"default_operator": "AND"
}
}
]
}
},
"sort": [],
"from": 0,
"size": 10 }