Elasticsearch: Wrong facet terms result - filter

I want to implement facet filters on my products list page using elasticsearch.
Basically my product document index contains a number of products, each of which contains a number of variations.
The variations are defined as "nested" objects in order to make sure only products are returned where a variation matches all filter criteria.
Filtering the variations seems to work correctly. However, the facet results are not what I would expect from a facet filter.
For example my query below returns the terms "oriental" and "citrus" for the facet "f_attribute_scent". However I only want to get back the term that matched my filter which would be "citrus".
I have tried out a lot of different things with facet filters and everything but I just can't get it to work correctly.
My mapping looks like this:
curl -XPOST localhost:9200/products -d '
{
"mappings": {
"de": {
"properties": {
"variants": {
"type": "nested",
"include_in_parent": true
}
}
}
}
}
'
Here is my test data:
curl -XPUT localhost:9200/products/de/12 -d '
{
"id": "12",
"categories": [
{
"id": "12345",
"sort": "1"
},
{
"id": "23456",
"sort": "2"
},
{
"id": "34567",
"sort": "3"
}
],
"variants": [
{
"id": "12.1.1",
"brand": "guerlain",
"collection": "emporio",
"rating": 4,
"color": "green",
"price": 31,
"scent": "fruity"
},
{
"id": "12.1.2",
"brand": "guerlain",
"collection": "emporio",
"rating": 2,
"color": "blue",
"price": 49.99,
"scent": "flowery"
}
]
}'
curl -XPUT localhost:9200/products/de/15 -d '
{
"id": "15",
"categories": [
{
"id": "12345",
"sort": "1"
},
{
"id": "23456",
"sort": "2"
},
{
"id": "34567",
"sort": "3"
}
],
"variants": [
{
"id": "15.1.1",
"brand": "dior",
"collection": "foobar",
"rating": 4,
"color": "green",
"price": 48.00,
"scent": "oriental"
},
{
"id": "15.1.2",
"brand": "dior",
"collection": "foobar",
"rating": 2,
"color": "red",
"price": 52,
"scent": "citrus"
}
]
}'
This is the query:
curl -XGET localhost:9200/products/de/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "variants",
"filter": {
"bool": {
"must": [
{
"terms": {
"variants.color": [
"green",
"red"
]
}
},
{
"term": {
"variants.scent": "citrus"
}
}
]
}
}
}
}
}
},
"facets": {
"f_attribute_color": {
"terms": {
"all_terms": true,
"field": "variants.color"
}
},
"f_attribute_scent": {
"terms": {
"field": "variants.scent"
}
}
}
}
... And the result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "products",
"_type": "de",
"_id": "15",
"_score": 1,
"_source": {
"id": "15",
"categories": [
{
"id": "12345",
"sort": "1"
},
{
"id": "23456",
"sort": "2"
},
{
"id": "34567",
"sort": "3"
}
],
"variants": [
{
"id": "15.1.1",
"brand": "dior",
"collection": "foobar",
"rating": 4,
"color": "green",
"price": 48,
"scent": "oriental"
},
{
"id": "15.1.2",
"brand": "dior",
"collection": "foobar",
"rating": 2,
"color": "red",
"price": 52,
"scent": "citrus"
}
]
}
}
]
},
"facets": {
"f_attribute_color": {
"_type": "terms",
"missing": 0,
"total": 2,
"other": 0,
"terms": [
{
"term": "red",
"count": 1
},
{
"term": "green",
"count": 1
}
]
},
"f_attribute_scent": {
"_type": "terms",
"missing": 0,
"total": 2,
"other": 0,
"terms": [
{
"term": "oriental",
"count": 1
},
{
"term": "citrus",
"count": 1
}
]
}
}
}

Based on your data examples above that is being indexed, you are seeing both citrus and oriental as terms facets results because your documents have variants as an array and both of those terms are valid for the document that matched your query.
From the Elasticsearch Facets Documentation:
There’s one important distinction to keep in mind. While search queries restrict both the returned documents and facet counts, search filters restrict only returned documents — but not facet counts.
If you need to restrict both the documents and facets, and you’re not willing or able to use a query, you may use a facet filter.
Based on the documentation and the desired results that you are asking for, you may want to look into using a Filter Facet instead.

Your nested docs are being indexed in two ways:
(1) as independent documents, one for each element in the variants array, and
(2) in the top level de document as if you had set the variants field to be type object
The reason for (2) above is that you set include_in_parent to true. So actually, the top level doc looks like:
{
"id": "12",
"variants.id": [ "12.1.1","12.1.2"],
"variants.brand": [ "guerlain", "guerlain"],
"variants.color": [ "green", "blue"]
... etc ...
}
Your query uses the nested filter correctly, which identifies the top level documents which match, but then you facet on the top-level doc, not the nested docs, which is why you are getting all of the results.
To fix it, all you need to do is to change your facets to use the nested docs instead, and to
add the same nested filter that you used in your main query as a facet_filter:
"facets": {
"f_attribute_color": {
"terms": {
"field": "variants.color"
},
"nested": "variants",
"facet_filter": {
"bool": {
"must": [
{
"terms": {
"variants.color": [
"green",
"red"
]
}
},
{
"term": {
"variants.scent": "citrus"
}
}
]
}
}
},
"f_attribute_scent": {
"terms": {
"field": "variants.scent"
},
"nested": "variants",
"facet_filter": {
"bool": {
"must": [
{
"terms": {
"variants.color": [
"green",
"red"
]
}
},
{
"term": {
"variants.scent": "citrus"
}
}
]
}
}
}
}

You are correct: If I use your filter facets I only get returned "citrus" for the "scent" facet.
However, if I want to filter by brand name "dior" I have got the same problem again. The facet result returns "dior" with a count of "2". The reason being that now both variations have the same brand name:
GET /products/de/_search
{
"filter": {
"nested": {
"path": "variants",
"filter": {
"bool": {
"must": [
{
"term": {
"variants.brand": "dior"
}
}
]
}
}
}
},
"facets": {
"f_attribute_brand": {
"nested": "variants",
"facet_filter": {
"bool": {
"must": [
{
"term": {
"variants.brand": "dior"
}
}
]
}
},
"terms": {
"field": "variants.brand"
}
}
}
}
And the result:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "products",
"_type": "de",
"_id": "15",
"_score": 1
}
]
},
"facets": {
"f_attribute_brand": {
"_type": "terms",
"missing": 0,
"total": 2,
"other": 0,
"terms": [
{
"term": "dior",
"count": 2
}
]
}
}
}

Related

Elastic Search: Aggregation sum on a particular field

I am new to elastic search and requesting some help.
Basically I have some 2 million documents in my elastic search and the documents look like below:
{
"_index": "flipkart",
"_type": "PSAD_ThirdParty",
"_id": "430001_MAM_2016-02-04",
"_version": 1,
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 206,
"bookedNumbers": 15000,
"ut": 1454567400000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
I want to write an aggregation query which satisfies below conditions:
1) First query based on "_index", "_type" and "pliId".
2) Do aggregation sum on metrics.value based on metrics.id = "Metric1".
Basically I need to query records based on some fields and aggregate sum on a particular metrics value based on metrics id.
Please can you help me in getting my query right.
Your metrics field needs to be of type nested:
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
}
}
}
If you want Metric1 to match exactly, including the upper-case letter, then as you see above the id field needs to be not_analyzed.
Then, if you only want metrics.id = "Metric1" aggregations, you need something like this:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"pliId": 206
}
}
]
}
}
}
},
"aggs": {
"by_metrics": {
"nested": {
"path": "metrics"
},
"aggs": {
"metric1_only": {
"filter": {
"bool": {
"must": [
{
"term": {
"metrics.id": {
"value": "Metric1"
}
}
}
]
}
},
"aggs": {
"by_metric_id": {
"terms": {
"field": "metrics.id"
},
"aggs": {
"total_delivery": {
"sum": {
"field": "metrics.value"
}
}
}
}
}
}
}
}
}
}
Created new index:
Method : PUT ,
URL : http://localhost:9200/google/
Body:
{
"mappings": {
"PSAD_Primary": {
"properties": {
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
},
"value": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then I inserted some 200 thousand documents and then ran the query and it worked.
Response:
{
"took": 34,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "google",
"_type": "PSAD_Primary",
"_id": "383701291_MAM_2016-01-06",
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 221244,
"bookedNumbers": 15000,
"ut": 1452061800000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
]
},
"aggregations": {
"by_metrics": {
"doc_count": 3,
"metric1_only": {
"doc_count": 1,
"by_metric_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Metric1",
"doc_count": 1,
"total_delivery": {
"value": 70
}
}
]
}
}
}
}
}

Configuring elasticsearch to search and filter with has many / belongs to relationship

I have a Product model where each product has many skus.
I need to be able to search and filter via elasticsearch across both models, but not quite sure how to go about it. I'm currently uploading to elasticsearch in this format:
[{
id: 1
title: 'Product 1'
image: 'image1.jpg'
skus: [{
id: 1
material: 'cotton'
quantity: 4
},{
id: 2
material: 'polyester'
quantity: 22
}]
},{
...
}]
I can search the title just fine, but I am unsure as to how I could do something like
Search for title 'foobar' and filter by material 'cotton' and quantity > 5
Is this possible with elasticsearch?
Edit
I am open to uploading in a different format or using multiple indices.
I think the parent/child relationship is what you're looking for.
As a quick example, I can set up an index with a parent type and child type like this:
PUT /test_index
{
"mappings": {
"product": {
"properties": {
"id": {
"type": "long"
},
"image": {
"type": "string"
},
"title": {
"type": "string"
}
}
},
"sku": {
"_parent": {
"type": "product"
},
"properties": {
"id": {
"type": "long"
},
"material": {
"type": "string"
},
"quantity": {
"type": "long"
}
}
}
}
}
Then add a parent document and two child documents:
POST /test_index/_bulk
{"index":{"_type":"product","_id":1}}
{"id": 1,"title": "Product1","image": "image1.jpg"}
{"index":{"_type":"sku", "_id":1,"_parent":1}}
{"id": 1,"material": "cotton","quantity": 4}
{"index":{"_type":"sku","_id":2,"_parent":1}}
{"id": 2,"material": "polyester","quantity": 22}
Now if I search for a "product" with "title": "Product1" that has a child "sku" with "material": "cotton" and "quantity" greater than 5, I won't find one:
POST /test_index/product/_search
{
"query": {
"filtered": {
"query": {
"match": {
"title": "Product1"
}
},
"filter": {
"has_child": {
"type": "sku",
"filter": {
"bool": {
"must": [
{
"term": {
"material": "cotton"
}
},
{
"range": {
"quantity": {
"gt": 5
}
}
}
]
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
But if I search for a "product" with "title": "Product1" that has a child "sku" with "material": "polyester" and "quantity" greater than 5, I will find one:
POST /test_index/product/_search
{
"query": {
"filtered": {
"query": {
"match": {
"title": "Product1"
}
},
"filter": {
"has_child": {
"type": "sku",
"filter": {
"bool": {
"must": [
{
"term": {
"material": "polyester"
}
},
{
"range": {
"quantity": {
"gt": 5
}
}
}
]
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.4054651,
"hits": [
{
"_index": "test_index",
"_type": "product",
"_id": "1",
"_score": 1.4054651,
"_source": {
"id": 1,
"title": "Product1",
"image": "image1.jpg"
}
}
]
}
}
Here is some code I used for testing:
http://sense.qbox.io/gist/d1989a28372ac9daae335d585601c11818b2fa11

elasticsearch retrieving nested objects - not individual fields

When I use the "fields" option of a query I get a separate array for each field. Is it possible to get back the "complete" nested objects rather than just the field?
In the following example if I try to do "fields": ["cast"] it tells me that cast is not a leaf node. And if I do "fields": ["cast.firstName", "cast.middleName", "cast.lastName"] it returns 3 arrays.
Is there another way of retrieving just a partial amount of the document? Or is there a way to "reassemble" the separate fields into a complete "cast" object?
Example Index and Data:
POST /movies
{
"mappings": {
"movie": {
"properties": {
"cast": {
"type": "nested"
}
}
}
}
}
POST /movies/movie
{
"title": "The Matrix",
"cast": [
{
"firstName": "Keanu",
"lastName": "Reeves",
"address": {
"street": "somewhere",
"city": "LA"
}
},
{
"firstName": "Laurence",
"middleName": "John",
"lastName": "Fishburne",
"address": {
"street": "somewhere else",
"city": "NYC"
}
}
]
}
Example Query:
GET /movies/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "cast",
"filter": {
"bool": {
"must": [
{ "term": { "firstName": "laurence"} },
{ "term": { "lastName": "fishburne"} }
]
}
}
}
}
}
},
"fields": [
"cast.address.city",
"cast.firstName",
"cast.middleName",
"cast.lastName"
]
}
Result of example query:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "movies",
"_type": "movie",
"_id": "AU1JeyBseLgwMCOuOLsZ",
"_score": 1,
"fields": {
"cast.firstName": [
"Keanu",
"Laurence"
],
"cast.lastName": [
"Reeves",
"Fishburne"
],
"cast.address.city": [
"LA",
"NYC"
],
"cast.middleName": [
"John"
]
}
}
]
}
}
I think this is what you're looking for:
POST /movies/_search
{
"_source": {
"include": [
"cast.address.city",
"cast.firstName",
"cast.middleName",
"cast.lastName"
]
},
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "cast",
"filter": {
"bool": {
"must": [
{
"term": {
"firstName": "laurence"
}
},
{
"term": {
"lastName": "fishburne"
}
}
]
}
}
}
}
}
}
}
Result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "movies",
"_type": "movie",
"_id": "AU1PIJgBA_0Cyshym7-m",
"_score": 1,
"_source": {
"cast": [
{
"lastName": "Reeves",
"address": {
"city": "LA"
},
"firstName": "Keanu"
},
{
"middleName": "John",
"lastName": "Fishburne",
"address": {
"city": "NYC"
},
"firstName": "Laurence"
}
]
}
}
]
}
}
You can also choose to exclude fields instead of including or both, see documentation here: http://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html

Filter ElasticSearch result whose array contains at least 1 tag

I query against elasticsearch with following DSL.
{
"query": {
"filtered": {
"query": {
"multi_match": {
"query": "Next",
"type": "phrase_prefix",
"fields": [
"defaultContent"
]
}
},
"filter": {
"bool": {
"must_not": {
"term": {
"_deleted": true
}
},
"should": [
{
"term": {
"site": "xxx"
}
},
{
"term": {
"site": "base"
}
}
]
}
}
}
}
}
And it works and returns 1 match.
{
"took": 42,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 2.733073,
"hits": [
{
"_index": "cms",
"_type": "content",
"_id": "base>3453fm9lxkmyy_17",
"_score": 2.733073,
"_source": {
"tags": [
"tag1",
"tag2"
],
"site": "base",
"_rev": "1-3b6eb2b3c3d5554bb3ef3f16a299160c",
"defaultContent": "Next action to be settled",
"_id": "base>3453fm9lxkmyy_17",
"type": "content",
"key": "3453fm9lxkmyy_17"
}
}
]
}
}
Now I want to modify the DSL, and add a new condition -- only return those whose tags contain tag1 or tag8
{
"query": {
"filtered": {
"query": {
"multi_match": {
"query": "Next",
"type": "phrase_prefix",
"fields": [
"defaultContent"
]
}
},
"filter": {
"bool": {
"must": {
"term" : {
"tags" : ["tag1", "tag8"],
"minimum_should_match" : 1
}
},
"must_not": {
"term": {
"_deleted": true
}
},
"should": [
{
"term": {
"site": "xxx"
}
},
{
"term": {
"site": "base"
}
}
]
}
}
}
}
}
And then, I get nothing.
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
Am I doing something wrong? It should return 1 match because it contains tag1
The "term" filter is used when you want to match on a single term. Kind of like SQL column1 = 'foo'
You want to use the "terms" filter which is the equivalent of SQL column1 IN ('foo', 'bar')

term and range filters together in an elasticsearch query

In my elasticsearch index, I have two documents indexed as below:
POST dyn-props/item
{
"name": "bar foo",
"properties": [
{
"type": "foo",
"value": 1.45
},
{
"type": "bar",
"value": 256.34
},
{
"type": "foobar",
"value": 43.43
}
]
}
POST dyn-props/item
{
"name": "foo bar",
"properties": [
{
"type": "foo",
"value": 33.34
},
{
"type": "bar",
"value": 22.23
}
]
}
On this item type, I would like to query for items which have a foo property whose value is greater than 10. I can filter the results down to items that have a property whose type is foo with the query below:
POST dyn-props/item/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"term": {
"properties.type": "foo"
}
}
}
}
}
but I am not sure how I can apply the range filter for value. Any idea?
Edit:
As expected, issuing the query below gives me the wrong results:
POST dyn-props/item/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"term": {
"properties.type": "foo"
}
},
{
"range": {
"properties.value": {
"gte" : 10
}
}
}
]
}
}
}
}
}
The result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "dyn-props",
"_type": "item",
"_id": "PetPVxwARLOcZqlv28xjpw",
"_score": 1,
"_source": {
"name": "bar foo",
"properties": [
{
"type": "foo",
"value": 1.45
},
{
"type": "bar",
"value": 256.34
},
{
"type": "foobar",
"value": 43.43
}
]
}
},
{
"_index": "dyn-props",
"_type": "item",
"_id": "KqOTXcC9RG6FzPsDDDs8Hw",
"_score": 1,
"_source": {
"name": "foo bar",
"properties": [
{
"type": "foo",
"value": 33.34
},
{
"type": "bar",
"value": 22.23
}
]
}
}
]
}
}
Found the answer. This post helped a lot: ElasticSearch – nested mappings and filters
Changed the mapping of the type:
PUT dyn-props
{
"mappings": {
"item": {
"properties": {
"name": {
"type": "string"
},
"properties": {
"type": "nested"
}
}
}
}
}
By making the properties as nested type, I was able to maintain the association between type and value fields.
Finally, I was able to issue the nested query for this:
POST dyn-props/item/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "properties",
"filter": {
"bool": {
"must": [
{
"term": {
"type": "foo"
}
},
{
"range": {
"value": {
"gte": 10
}
}
}
]
}
}
}
}
}
}
}
This got me the correct result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "dyn-props",
"_type": "item",
"_id": "CzTL4sseR2GVYtvf-0slVQ",
"_score": 1,
"_source": {
"name": "foo bar",
"properties": [
{
"type": "foo",
"value": 33.34
},
{
"type": "bar",
"value": 22.23
}
]
}
}
]
}
}
You have got to change the mapping of the index and change the type of the properties to nested.
This case has been explained in the docs:
http://www.elasticsearch.org/blog/managing-relations-inside-elasticsearch/

Resources