Elastic synonym usage in aggregations - elasticsearch

Situation:
Elasticsearch version used: 2.3.1
I have an Elasticsearch index configured like so:
PUT /my_index
{
"settings": {
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"british,english",
"queen,monarch"
]
}
},
"analyzer": {
"my_synonyms": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
}
}
}
}
Which is great: when I query with the term "english" or "queen" I get all documents matching british and monarch. But when I use a synonym term in a filter aggregation it doesn't work. For example:
In my index I have 5 documents: 3 of them have monarch, 2 of them have queen.
POST /my_index/_search
{
"size": 0,
"query" : {
"match" : {
"status.synonym":{
"query": "queen",
"operator": "and"
}
}
},
"aggs" : {
"status_terms" : {
"terms" : { "field" : "status.synonym" }
},
"monarch_filter" : {
"filter" : { "term": { "status.synonym": "monarch" } }
}
},
"explain" : 0
}
The result produces:
Total hits: 5 doc count (as expected, great!)
Status terms: 5 doc count for queen (as expected, great!)
Monarch filter: 0 doc count
I have tried different synonym filter configurations:
queen,monarch
queen,monarch => queen
queen,monarch => queen,monarch
But none of the above changed the results. I wanted to conclude that maybe synonyms can only be used at query time, but then if the terms aggregation is working, why shouldn't the filter be? Hence I think it's my synonym filter configuration that is wrong. A more extensive synonym filter example can be found here.
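For reference, a quick way to see which tokens a given configuration actually emits is the _analyze API (assuming the analyzer above is registered on my_index as my_synonyms):
GET /my_index/_analyze
{
"analyzer": "my_synonyms",
"text": "queen"
}
With the expanded form queen,monarch this should return both a queen and a monarch token at the same position; with a contracted form such as queen,monarch => queen only the queen token ends up in the index.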
QUESTION:
How to use/configure synonyms in filter aggregation?
Example to replicate the case above:
1. Create and configure index:
PUT /my_index
{
"settings": {
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"wlh,wellhead=>wellwell"
]
}
},
"analyzer": {
"my_synonyms": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
}
}
}
}
PUT my_index/_mapping/job
{
"properties": {
"title":{
"type": "string",
"analyzer": "my_synonyms"
}
}
}
2. Put two documents:
PUT my_index/job/1
{
"title":"wellhead smth else"
}
PUT my_index/job/2
{
"title":"wlh other stuff"
}
3. Execute a search on wlh, which should return 2 documents; the terms aggregation should have a doc count of 2 for wellwell, and the filter shouldn't have a 0 count:
POST my_index/_search
{
"size": 0,
"query" : {
"match" : {
"title":{
"query": "wlh",
"operator": "and"
}
}
},
"aggs" : {
"wlhAggs" : {
"terms" : { "field" : "title" }
},
"wlhFilter" : {
"filter" : { "term": { "title": "wlh" } }
}
},
"explain" : 0
}
The result of this query is:
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"wlhAggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "wellwell",
"doc_count": 2
},
{
"key": "else",
"doc_count": 1
},
{
"key": "other",
"doc_count": 1
},
{
"key": "smth",
"doc_count": 1
},
{
"key": "stuff",
"doc_count": 1
}
]
},
"wlhFilter": {
"doc_count": 0
}
}
}
And that's my problem: the wlhFilter should have at least 1 doc count in it.

I'm short on time, so if needed I can elaborate a bit more later today/tomorrow. But the following should work:
DELETE /my_index
PUT /my_index
{
"settings": {
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"british,english",
"queen,monarch"
]
}
},
"analyzer": {
"my_synonyms": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"title": {
"type": "text",
"analyzer": "my_synonyms",
"fielddata": true
}
}
}
}
}
POST my_index/test/1
{
"title" : "the british monarch"
}
GET my_index/_search
{
"query": {
"match": {
"title": "queen"
}
}
}
GET my_index/_search
{
"query": {
"match": {
"title": "queen"
}
},
"aggs": {
"queen_filter": {
"filter": {
"term": {
"title": "queen"
}
}
},
"monarch_filter": {
"filter": {
"term": {
"title": "monarch"
}
}
}
}
}
Could you share the mapping you have defined for your status.synonym field?
EDIT: V2
The reason your filter's output is 0 is that a term filter in Elasticsearch never goes through the analysis phase; it's meant for exact matches.
The token 'wlh' in your filter is not translated to 'wellwell', and so it doesn't occur in the inverted index, because at index time 'wlh' was translated into 'wellwell'.
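You can verify this with the _analyze API (assuming the analyzer is registered on my_index as my_synonyms, as in the example above):
GET /my_index/_analyze
{
"analyzer": "my_synonyms",
"text": "wlh"
}
This should return a single wellwell token, which is what actually sits in the inverted index, while the term filter is still looking for the literal string wlh.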
In order to achieve what you want, you will have to index the data into a separate field and adjust your filter accordingly.
You could try something like:
DELETE my_index
PUT /my_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"wlh,wellhead=>wellwell"
]
}
},
"analyzer": {
"my_synonyms": {
"tokenizer": "standard",
"filter": [
"lowercase",
"my_synonym_filter"
]
}
}
}
},
"mappings": {
"job": {
"properties": {
"title": {
"type": "string",
"fields": {
"synonym": {
"type": "string",
"analyzer": "my_synonyms"
}
}
}
}
}
}
}
PUT my_index/job/1
{
"title":"wellhead smth else"
}
PUT my_index/job/2
{
"title":"wlh other stuff"
}
POST my_index/_search
{
"size": 0,
"query": {
"match": {
"title.synonym": {
"query": "wlh",
"operator": "and"
}
}
},
"aggs": {
"wlhAggs": {
"terms": {
"field": "title.synonym"
}
},
"wlhFilter": {
"filter": {
"term": {
"title": "wlh"
}
}
}
}
}
Output:
{
"aggregations": {
"wlhAggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "wellwell",
"doc_count": 2
},
{
"key": "else",
"doc_count": 1
},
{
"key": "other",
"doc_count": 1
},
{
"key": "smth",
"doc_count": 1
},
{
"key": "stuff",
"doc_count": 1
}
]
},
"wlhFilter": {
"doc_count": 1
}
}
}
Hope this helps!!

So with the help of @Byron Voorbach above and his comments, this is my solution:
I have created a separate field to which I apply the synonym analyser, as opposed to having a sub-field (mainfield.property).
And most importantly, the problem was that my synonyms were contracted! I had, for example, british,english => uk. Changing that to british,english,uk solved my issue, and the filter aggregation now returns the right number of documents.
Hope this helps someone, or at least points in the right direction.
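For illustration, the synonym filter that ended up working is simply the expanded (non-contracted) form, roughly like this (names reused from the example at the top):
"filter": {
"my_synonym_filter": {
"type": "synonym",
"synonyms": [
"british,english,uk"
]
}
}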
Edit:
Oh lord, praise the documentation! I completely fixed my issue with the filters (plural!) aggregation (link here). In the filters configuration I specified a match type of query and it worked! I ended up with something like this:
"aggs" : {
"messages" : {
"filters" : {
"filters" : {
"status" : { "match" : { "cats.saurus" : "monarch" }},
"country" : { "match" : { "cats.saurus" : "british" }}
}
}
}
}

Related

how to exclude search words in synonyms filter in elasticsearch

While adding table and tables as a synonym filter in Elasticsearch, I need to filter out the results for table fan. How can I achieve this in Elasticsearch?
Could we build a taxonomy of inclusion and exclusion list filters in the settings rather than in run-time queries in Elasticsearch?
GET <indexName>/_search
{
"query": {
"bool": {
"must_not": [
{
"match": {
"<fieldName>": {
"query": "table fan", // <======= Below operator will applied b/w table(&synonyms) And fan(&synonyms)
"operator": "AND"
}
}
}
]
}
}
}
You can use the above query to exclude all the documents containing both 'table' and 'fan' (and their corresponding synonyms).
OR:
If you want to play with multiple logical operators, e.g. give me all the documents that contain neither "table fan" nor "ac", you can use simple_query_string:
GET <indexName>/_search
{
"query": {
"bool": {
"must_not": [
{
"simple_query_string": {
"query": "(table + fan) | ac", // <=== '+'='and', '|'='or', '-'='not'
"fields": [
"<fieldName>" // <==== use multiple field names, wildcard also supported
]
}
}
]
}
}
}
Adding a working example with index data, mapping, search query and search result
Index Mapping:
{
"settings": {
"index": {
"analysis": {
"filter": {
"synonym_filter": {
"type": "synonym",
"synonyms": [
"table, tables"
]
}
},
"analyzer": {
"synonym_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"synonym_filter"
]
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "synonym_analyzer",
"search_analyzer": "standard"
}
}
}
}
Analyze API
POST /<indexName>/_analyze
{
"analyzer" : "synonym_analyzer",
"text" : "table fan"
}
The following tokens are generated:
{
"tokens": [
{
"token": "table",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "tables",
"start_offset": 0,
"end_offset": 5,
"type": "SYNONYM",
"position": 0
},
{
"token": "fan",
"start_offset": 6,
"end_offset": 9,
"type": "<ALPHANUM>",
"position": 1
}
]
}
Index Data:
{ "title": "table and fan" }
{ "title": "tables and fan" }
{ "title": "table fan" }
{ "title": "tables fan" }
{ "title": "table chair" }
Search Query:
{
"query": {
"bool": {
"must": {
"match": {
"title": "table"
}
},
"filter": {
"bool": {
"must_not": [
{
"match_phrase": {
"title": "table fan"
}
},
{
"match_phrase": {
"title": "table and fan"
}
}
]
}
}
}
}
}
You can also use match query in place of match_phrase query
{
"query": {
"bool": {
"must": {
"match": {
"title": "table"
}
},
"filter": {
"bool": {
"must_not": [
{
"match": {
"title": {
"query": "table fan",
"operator": "AND"
}
}
}
]
}
}
}
}
}
Search Result:
"hits": [
{
"_index": "synonym",
"_type": "_doc",
"_id": "2",
"_score": 0.06783115,
"_source": {
"title": "table chair"
}
}
]
Update 1:
Could we build a taxonomy of inclusion and exclusion list filters in the settings rather than in run-time queries in Elasticsearch?
Mapping is the process of defining how a document, and the fields it contains, are stored and indexed. Refer to the ES documentation on mapping to understand what mapping is used to define.
Please refer to the documentation on dynamic templates, which allow you to define custom mappings that can be applied to dynamically added fields.
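For example, a dynamic template along these lines (a sketch, assuming you reuse the synonym_analyzer from the settings above and want every newly added string field to pick it up; the template name strings_with_synonyms is just a placeholder) would bake the synonym handling into the index definition rather than into each query:
PUT <indexName>
{
"mappings": {
"dynamic_templates": [
{
"strings_with_synonyms": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
"analyzer": "synonym_analyzer",
"search_analyzer": "standard"
}
}
}
]
}
}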

Distinct values from array-field matching filter in Elasticsearch 2.4

In short: I want to look up distinct values in some field of the document, BUT only those matching some filter. The problem is with array fields.
Imagine there are the following documents in ES 2.4:
[
{
"states": [
"Washington (US-WA)",
"California (US-CA)"
]
},
{
"states": [
"Washington (US-WA)"
]
}
]
I'd like my users to be able to look up all possible states via typeahead, so I have the following query for the "wa" user request:
{
"query": {
"wildcard": {
"states.raw": "*wa*"
}
},
"aggregations": {
"typed": {
"terms": {
"field": "states.raw"
},
"aggregations": {
"typed_hits": {
"top_hits": {
"_source": { "includes": ["states"] }
}
}
}
}
}
}
states.raw is a sub-field with the not_analyzed option.
This query works pretty well unless I have an array of values like in the example: it returns both Washington and California. I do understand why it happens (the query and aggregations work on top of the document, and the document contains both, even though only one option matched the filter), but I really only want to see Washington and don't want to add another layer of filtering on the application side for the ES results.
Is there a way to do so via single ES 2.4 request?
You could use the "Filtering Values" feature (see https://www.elastic.co/guide/en/elasticsearch/reference/2.4/search-aggregations-bucket-terms-aggregation.html#_filtering_values_2).
So, your request could look like:
POST /index/collection/_search?size=0
{
"aggregations": {
"typed": {
"terms": {
"field": "states.raw",
"include": ".*wa.*" // You need to carefully quote the "wa" string because it'll be used as part of RegExp
},
"aggregations": {
"typed_hits": {
"top_hits": {
"_source": { "includes": ["states"] }
}
}
}
}
}
}
I can't hold myself back, though, from telling you that a wildcard query with a leading wildcard is not the best solution. Please, please consider using ngrams for this:
PUT states
{
"settings": {
"analysis": {
"filter": {
"ngrams": {
"type": "nGram",
"min_gram": "2",
"max_gram": "20"
}
},
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"filter": [
"standard",
"lowercase",
"ngrams"
],
"tokenizer": "standard"
}
}
}
},
"mappings": {
"doc": {
"properties": {
"location": {
"properties": {
"states": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
},
"ngrams": {
"type": "string",
"analyzer": "ngram_analyzer"
}
}
}
}
}
}
}
}
}
POST states/doc/1
{
"text":"bla1",
"location": [
{
"states": [
"Washington (US-WA)",
"California (US-CA)"
]
},
{
"states": [
"Washington (US-WA)"
]
}
]
}
POST states/doc/2
{
"text":"bla2",
"location": [
{
"states": [
"Washington (US-WA)",
"California (US-CA)"
]
}
]
}
POST states/doc/3
{
"text":"bla3",
"location": [
{
"states": [
"California (US-CA)"
]
},
{
"states": [
"Illinois (US-IL)"
]
}
]
}
And the final query:
GET states/_search
{
"query": {
"term": {
"location.states.ngrams": {
"value": "sh"
}
}
},
"aggregations": {
"filtering_states": {
"terms": {
"field": "location.states.raw",
"include": ".*sh.*"
},
"aggs": {
"typed_hits": {
"top_hits": {
"_source": {
"includes": [
"location.states"
]
}
}
}
}
}
}
}

ElasticSearch - Fuzzy and strict match with multiple fields

We want to leverage Elasticsearch to find similar objects for us.
Let's say I have an object with 4 fields:
product_name, seller_name, seller_phone, platform_id.
Similar products can have different product names and seller names across different platforms (fuzzy match).
The phone, however, is strict, and a single variation might yield a wrong record (strict match).
What we're trying to create is a query that will:
Take into account all the fields we have for the current record and OR between them.
Mandate that platform_id is the one I specifically want to look at (AND).
Fuzzy-match the product_name and seller_name.
Strictly match the phone number or ignore it in the OR between the fields.
If I were to write it in pseudo code, I would write something like:
((product_name like 'some_product_name') OR (seller_name like
'some_seller_name') OR (seller_phone = 'some_phone')) AND (platform_id
= 123)
To do an exact match on seller_phone, I am indexing this field without the ngram analyzer, and using fuzzy queries for product_name and seller_name.
Mapping
PUT index111
{
"settings": {
"analysis": {
"analyzer": {
"edge_n_gram_analyzer": {
"tokenizer": "whitespace",
"filter" : ["lowercase", "ednge_gram_filter"]
}
},
"filter": {
"ednge_gram_filter" : {
"type" : "NGram",
"min_gram" : 2,
"max_gram": 10
}
}
}
},
"mappings": {
"document_type" : {
"properties": {
"product_name" : {
"type": "text",
"analyzer": "edge_n_gram_analyzer"
},
"seller_name" : {
"type": "text",
"analyzer": "edge_n_gram_analyzer"
},
"seller_phone" : {
"type": "text"
},
"platform_id" : {
"type": "text"
}
}
}
}
}
Index documents
POST index111/document_type
{
"product_name":"macbok",
"seller_name":"apple",
"seller_phone":"9988",
"platform_id":"123"
}
For the following pseudo-SQL query
((product_name like 'some_product_name') OR (seller_name like 'some_seller_name') OR (seller_phone = 'some_phone')) AND (platform_id = 123)
Elastic Query
POST index111/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"platform_id": {
"value": "123"
}
}
},
{
"bool": {
"should": [{
"fuzzy": {
"product_name": {
"value": "macbouk",
"boost": 1.0,
"fuzziness": 2,
"prefix_length": 0,
"max_expansions": 100
}
}
},
{
"fuzzy": {
"seller_name": {
"value": "apdle",
"boost": 1.0,
"fuzziness": 2,
"prefix_length": 0,
"max_expansions": 100
}
}
},
{
"term": {
"seller_phone": {
"value": "9988"
}
}
}
]
}
}]
}
}
}
Hope this helps

Elasticsearch fielddata - should I use it?

Given an index with documents that have a brand property, we need to create a terms aggregation that is case-insensitive.
Index definition
Please note the use of fielddata in the mapping below.
PUT demo_products
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"product": {
"properties": {
"brand": {
"type": "text",
"analyzer": "my_custom_analyzer",
"fielddata": true,
}
}
}
}
}
Data
POST demo_products/product
{
"brand": "New York Jets"
}
POST demo_products/product
{
"brand": "new york jets"
}
POST demo_products/product
{
"brand": "Washington Redskins"
}
Query
GET demo_products/product/_search
{
"size": 0,
"aggs": {
"brand_facet": {
"terms": {
"field": "brand"
}
}
}
}
Result
"aggregations": {
"brand_facet": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new york jets",
"doc_count": 2
},
{
"key": "washington redskins",
"doc_count": 1
}
]
}
}
If we use keyword instead of text, we end up with 2 buckets for New York Jets because of the differences in casing.
We're concerned about the performance implications of using fielddata. However, if fielddata is disabled we get the dreaded "Fielddata is disabled on text fields by default." error.
Any other tips to resolve this, or should we not be so concerned about fielddata?
Starting with ES 5.2 (out today), you can use normalizers with keyword fields in order to (e.g.) lowercase the value.
Normalizers play a role a bit like analyzers for text fields, though what you can do with them is more restricted; that should help with the issue you're facing.
You'd create the index like this:
PUT demo_products
{
"settings": {
"analysis": {
"normalizer": {
"my_normalizer": {
"type": "custom",
"filter": [ "lowercase" ]
}
}
}
},
"mappings": {
"product": {
"properties": {
"brand": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
And your query would return this:
"aggregations" : {
"brand_facet" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "new york jets",
"doc_count" : 2
},
{
"key" : "washington redskins",
"doc_count" : 1
}
]
}
}
Best of both worlds!
You can lowercase the aggregation at query time if you use a script. It won't perform as well as a normalized keyword field, but is still quite fast in my experience. For example, your query would be:
GET demo_products/product/_search
{
"size": 0,
"aggs": {
"brand_facet": {
"terms": {
"script": "doc['brand'].value.toLowerCase()"
}
}
}
}

Nested filtering in elasticsearch with more than one term of the same nested type

I'm new to Elasticsearch, so maybe my approach is plain wrong, but I want to make an index of recipes and allow the user to filter it down using the aggregated ingredients that are still found in the subset.
Maybe I'm using the wrong language to explain, so perhaps this example will clarify. I would like to search for recipes with the term salt, which results in three recipes:
with ingredients: salt, flour, water
with ingredients: salt, pepper, egg
with ingredients: water, flour, egg, salt
The aggregate on the results' ingredients returns salt, flour, water, pepper, egg. When I filter with flour I only want recipes 1 and 3 to appear in the search results (and the aggregate on ingredients should only return salt, flour, water, egg). When I add another filter, egg, I want only recipe 3 to appear (and the aggregate should only return water, flour, egg, salt).
I can't make the latter work: one filter next to the default query does narrow down the results as desired, but when adding the other term (egg) to the terms filter the results again start to include recipe 2 as well, as if it were an OR filter. Adding "and" to the filter execution, however, results in NO results... what am I doing wrong?
My mapping:
{
"recipe": {
"properties": {
"title": {
"analyzer": "dutch",
"type": "string"
},
"ingredients": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"analyzer": "dutch",
"include_in_parent": true,
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
My query:
{
"query": {
"filtered": {
"query": {
"bool": {
"should": [
{
"match": {
"_all": "salt"
}
}
]
}
},
"filter": {
"nested": {
"path": "ingredients",
"filter": {
"terms": {
"ingredients.name": [
"flour",
"egg"
],
"execution": "and"
}
}
}
}
}
},
"size": 50,
"aggregations": {
"ingredients": {
"nested": {
"path": "ingredients"
},
"aggregations": {
"count": {
"terms": {
"field": "ingredients.name.raw"
}
}
}
}
}
}
Why are you using a nested mapping here? Its main purpose is to keep relations between the sub-object attributes, but your ingredients field has just one attribute and can be modeled simply as a string field. This also explains why the "and" execution returned no results: each nested ingredient is indexed as its own hidden document with a single name, so no single nested document can ever contain both flour and egg.
So, if you update your mapping like this:
POST recipes
{
"mappings": {
"recipe": {
"properties": {
"title": {
"type": "string"
},
"ingredients": {
"name": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
You can still index your recipes as:
{
"title":"recipe b",
"ingredients":["salt","pepper","egg"]
}
And this query gives you the result you are waiting for:
POST recipes/recipe/_search
{
"query": {
"filtered": {
"query": {
"match": {
"_all": "salt"
}
},
"filter": {
"terms": {
"ingredients": [
"flour",
"egg"
],
"execution": "and"
}
}
}
},
"size": 50,
"aggregations": {
"ingredients": {
"terms": {
"field": "ingredients"
}
}
}
}
which is:
{
...
"hits": {
"total": 1,
"max_score": 0.22295055,
"hits": [
{
"_index": "recipes",
"_type": "recipe",
"_id": "PP195TTsSOy-5OweArNsvA",
"_score": 0.22295055,
"_source": {
"title": "recipe c",
"ingredients": [
"salt",
"flour",
"egg",
"water"
]
}
}
]
},
"aggregations": {
"ingredients": {
"buckets": [
{
"key": "egg",
"doc_count": 1
},
{
"key": "flour",
"doc_count": 1
},
{
"key": "salt",
"doc_count": 1
},
{
"key": "water",
"doc_count": 1
}
]
}
}
}
Hope this helps.
