Nested ElasticSearch query results in too many items - elasticsearch

The nested ElasticSearch query below returns some results it should not hit. A lot of results do not contain the requested order number but are listed nevertheless. I'm not getting all documents though so the query is definitely reducing the result set on some level.
{
"query": {
"nested": {
"path": "orders",
"query": {
"match": {
"orderNumber": "242347"
}
}
}
}
}
The query result (truncated):
{
"took":0,
"timed_out":false,
"_shards": {
"total":1,
"successful":1,
"failed":0
},
"hits": {
"total":60,
"max_score":9.656103,
"hits":[
{
"_index": "index1",
"_type":"documenttype1",
"_id":"mUmudQrVSC6rn68ujDJ8iA",
"_score":9.656103,
"_source" : {
"documentId": 12093894,
"orders": [
{
"customerId": 129048669,
"orderNumber": "242347", // <-- CORRECT HIT ON ORDER
},
{
"customerId": 229405848,
"orderNumber": "431962"
}
]
}
},
{
"_index":"index1",
"_type":"documenttype1",
"_id":"9iO5QBCpT_6kmH3CoBTdWw",
"_score":9.656103,
"_source" : {
"documentId": 43390283,
// <-- ORDER ISN'T HERE BUT THE DOCUMENT IS HIT NEVERTHELESS!
"orders": [
{
"customerId": 229405848,
"orderNumber": "431962"
},
{
"customerId": 129408979,
"orderNumber": "142701"
}
]
}
}
// Left out 58 more results most of which do not contain
// the requested order number.
]
}
}
As you can see, there is a hit (actually, there are quite a few of them) that shouldn't be there because none of the orders contain the requested order number.
This is the mapping for documenttype1:
{
"index1":{
"properties":{
"documentId":{
"type":"integer"
},
"orders":{
"type":"nested",
"properties":{
"customerId":{
"type":"integer"
},
"orderNumber":{
"type":"string",
"analyzer":"custom_internal_code"
}
}
}
}
}
}
Finally, here are the settings to clarify the custom_internal_code analyzer as referred to in the mapping shown above:
{
"index1":{
"settings":{
"index.analysis.analyzer.custom_internal_code.filter.1":"asciifolding",
"index.analysis.analyzer.custom_internal_code.type":"custom",
"index.analysis.analyzer.custom_internal_code.filter.0":"lowercase",
"index.analysis.analyzer.custom_internal_code.tokenizer":"keyword",
}
}
}

For an exact search, use a term query [1] and make orderNumber not_analyzed [2].
[1]
http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-term-query.html#query-dsl-term-query
[2]
http://www.elasticsearch.org/guide/en/elasticsearch/guide/current/mapping-intro.html#_literal_index_literal

It seems that you should use a bool query instead of match.
However, if you just want to filter your records, you should use a nested filter instead of a query. It is faster because no scores have to be calculated.
http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-nested-filter.html
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "orders",
"filter": {
"bool": {
"must": [
{
"term": {
"orderNumber": "242347"
}
}
]
}
},
"_cache": true
}
}
}
}
}

Related

Elastic: How i can filter aggregation buckets by string key

i have some data from one provider - very big structured JSON data:
"mappings": {
"properties": {
"field_a": { .. },
"field_b": { .. },
"field_c": { .. },
"field_d": {
"properties": {
"subfield_a": {...},
"subfield_b": {...},
"subfield_c": {...},
"subfield_d": {...},
"subfield_e": {
"properties": {
"myfield": {
"type": "keyword"
},
"another_a": {...},
"another_b": {...},
}
}
}
}
}
}
subfield_e is an array of objects containing many fields, including the one I am interested in, "myfield".
I need an aggregation over only those "myfield" values that contain some string.
So, this is what I do now, which gives a wrong (but logical) result:
GET /index/_search
{
"query": {
"wildcard": {
"field_d.subfield_e.myfield": "*string*"
}
},
"aggs": {
"interest": {
"terms": {
"field": "field_d.subfield_e.myfield",
"size": 10
}
}
},
"size": 0
}
The problem with this query is that it selects all documents where the array of objects "subfield_e" contains an object whose myfield contains the string, and the aggregation is then computed over all of those documents. So in the end I get results with all "myfield" values in these documents, not only the myfield values containing the string.
I tried adding a bucket_selector aggregation after my main aggregation, but I got the error: "buckets_path must reference either a number value or a single value numeric metric aggregation, got: [String] at aggregation [_key]"
My code is inspired by: Filter Elasticsearch Aggregation by Bucket Key Value and now looks like this:
GET /index/_search
{
"query": {
"wildcard": {
"field_d.subfield_e.myfield": "*string*"
}
},
"aggs": {
"interest": {
"terms": {
"field": "field_d.subfield_e.myfield",
"size": 10
}
},
"aggs": {
"buckets": {
"bucket_selector": {
"buckets_path": {
"key": "_key"
},
"script": "params.key.contains('string')"
}
}
}
}
},
"size": 0
}
So, how can I filter aggregation buckets (terms aggs) by their string key?
I solved it by switching subfield_e to a nested object instead of a plain (non-nested) array of objects, and I reimported all data into this new mapping.
The current mapping looks like this:
"mappings": {
"properties": {
"field_a": { .. },
"field_b": { .. },
"field_c": { .. },
"field_d": {
"properties": {
"subfield_a": {...},
"subfield_b": {...},
"subfield_c": {...},
"subfield_d": {...},
"subfield_e": {
"type": "nested" <======= This line added
"properties": {
"myfield": {
"type": "keyword"
},
"another_a": {...},
"another_b": {...},
}
}
}
}
}
}
And final working query is:
GET /index/_search
{
"query": {
"nested": {
"path": "field_d.subfield_e",
"query": {
"wildcard": {
"field_d.subfield_e.myfield": {
"value": "*string*"
}
}
}
}
},
"aggs": {
"agg": {
"nested": {
"path": "field_d.subfield_e"
},
"aggs": {
"inner": {
"filter": {
"wildcard": {
"field_d.subfield_e.myfield": "*string*"
}
}, "aggs": {
"interest": {
"terms": {
"field": "field_d.subfield_e.myfield",
"size": 10
}
}
}
}
}
}
},
"size": 0
}
In my case, this query is much faster than using include/exclude in the terms aggregation.

Filter and sort based on attributes in Terms lookup document in Elastic Search

I have some documents in my index:
POST "/index/thing/_bulk" -s -d'
{ "index":{ "_id": 1 } }
{ "title":"One thing"}
{ "index":{ "_id": 2 } }
{ "title":"Second thing"}
{ "index":{ "_id": 3 } }
{ "title":"Three things"}
{ "index":{ "_id": 4 } }
{ "title":"And so fourth"}
{ "index":{ "_id": 5 } }
{ "title":"Five things"}
'
I also have documents which contain a user's collection, which are linked to the other documents (things) through the documents' id attribute, like so:
PUT /index/collection/1
{
"items": [
{"id": 1, "time_added": "2017-08-07T09:07:15.000Z", "condition": "fair"},
{"id": 3, "time_added": "2019-08-07T09:07:15.000Z", "condition": "good"},
{"id": 4, "time_added": "2016-08-07T09:07:15.000Z", "condition": "poor"}
]
}
I then use a terms lookup to get all the things in a user's collection like so:
GET /documents/_search
{
"query" : {
"terms" : {
"_id" : {
"index" : "index",
"type" : "collection",
"id" : 1,
"path" : "items.id"
}
}
}
}
This works fine. I get the three documents in the collection and can search, sort and use aggregations like I want.
But is there a way to aggregate, filter and sort those documents based on the attributes (time_added or condition in this case) in the collection document? Say I wanted to sort based on time_added or filter for condition=="good" from the collection?
Maybe a script that can be applied to collection to sort or filter the items in there? It feels like this is getting pretty close to sql like left-join, so maybe Elastic Search is the wrong tool?
It looks like you need the nested data type
Taking your data as an example:
Without nested type:
POST collection/_bulk?filter_path=_
{"index":{}}
{"items":[{"id":11,"time_added":"2017-08-07T09:07:15.000Z","condition":"fair"},{"id":13,"time_added":"2019-08-07T09:07:15.000Z","condition":"good"},{"id":14,"time_added":"2016-08-07T09:07:15.000Z","condition":"poor"}]}
{"index":{}}
{"items":[{"id":21,"time_added":"2017-09-07T09:07:15.000Z","condition":"fair"},{"id":23,"time_added":"2019-09-07T09:07:15.000Z","condition":"good"},{"id":24,"time_added":"2016-09-07T09:07:15.000Z","condition":"poor"}]}
{"index":{}}
{"items":[{"id":31,"time_added":"2017-10-07T09:07:15.000Z","condition":"fair"},{"id":33,"time_added":"2019-10-07T09:07:15.000Z","condition":"good"},{"id":34,"time_added":"2016-10-07T09:07:15.000Z","condition":"poor"}]}
{"index":{}}
{"items":[{"id":41,"time_added":"2017-11-07T09:07:15.000Z","condition":"fair"},{"id":43,"time_added":"2019-11-07T09:07:15.000Z","condition":"good"},{"id":44,"time_added":"2016-11-07T09:07:15.000Z","condition":"poor"}]}
{"index":{}}
{"items":[{"id":51,"time_added":"2017-12-07T09:07:15.000Z","condition":"fair"},{"id":53,"time_added":"2019-12-07T09:07:15.000Z","condition":"good"},{"id":54,"time_added":"2016-12-07T09:07:15.000Z","condition":"poor"}]}
Query (you'd get incorrect results - expected one, got five):
GET collection/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"items.condition": {
"value": "good"
}
}
},
{
"range": {
"items.time_added": {
"lte": "2019-09-01"
}
}
}
]
}
}
}
Aggregation (incorrect results - look at the first bucket "2016-08-01T00:00:00.000Z" - it contains 3 CONDITION sub-buckets, one for every condition type)
GET collection/_search
{
"size": 0,
"aggs": {
"DATE": {
"date_histogram": {
"field": "items.time_added",
"calendar_interval": "month"
},
"aggs": {
"CONDITION": {
"terms": {
"field": "items.condition.keyword",
"size": 10
}
}
}
}
}
}
With nested type
DELETE collection
PUT collection
{
"mappings": {
"properties": {
"items": {
"type": "nested"
}
}
}
}
# and POST the same data from above
Query (returns just one result)
GET collection/_search
{
"query": {
"nested": {
"path": "items",
"query": {
"bool": {
"must": [
{
"term": {
"items.condition": {
"value": "good"
}
}
},
{
"range": {
"items.time_added": {
"lte": "2019-09-01"
}
}
}
]
}
}
}
}
}
Aggregation (the first date bucket contains just one CONDITION sub-bucket)
GET collection/_search
{
"size": 0,
"aggs": {
"ITEMS": {
"nested": {
"path": "items"
},
"aggs": {
"DATE": {
"date_histogram": {
"field": "items.time_added",
"calendar_interval": "month"
},
"aggs": {
"CONDITION": {
"terms": {
"field": "items.condition.keyword",
"size": 10
}
}
}
}
}
}
}
}
Hope that helps :)

Unable to create nested date aggregation query

I am trying to create an ElasticSearch aggregation query which can generate sum or average of value in all my ingested documents.
The documents are of the format -
{
"weather":"cold",
"date_1":"2017/07/05",
"feedback":[
{
"date_2":"2017/08/07",
"value":28,
"comment":"not cold"
},{
"date_2":"2017/08/09",
"value":48,
"comment":"a bit chilly"
},{
"date_2":"2017/09/07",
"value":18,
"comment":"very cold"
}, ...
]
}
I am able to create a sum aggregation of all "feedback.value" using "date_1" by using the following request -
GET _search
{
"query": {
"query_string": {
"query": "cold"
}
},
"size": 0,
"aggs": {
"temperature": {
"date_histogram":{
"field" : "date_1",
"interval" : "month"
},
"aggs":{
"temperature_agg":{
"terms": {
"field": "feedback.value"
}
}
}
}
}
}
However, I need to generate the same query across all documents aggregate based on "feedback.date_2". I am not sure if ElasticSearch can resolve such aggregation or how to approach it. Any guidance would be helpful
[EDIT]
Mapping file (I only define the nested items; ES identifies the other fields on its own)
{
"mappings": {
"catalog_item": {
"properties": {
"feedback":{
"type":"nested",
"properties":{
"date_2":{
"type": "date",
"format":"YYYY-MM-DD"
},
"value": {
"type": "float"
},
"comment": {
"type": "text"
}
}
}
}
}
}
}
You would need to make use of nested documents and sum aggregation.
Here's a working example:
Sample Mapping:
PUT test
{
"mappings": {
"doc": {
"properties": {
"feedback": {
"type": "nested"
}
}
}
}
}
Add Sample document:
PUT test/doc/1
{
"date_1": "2017/08/07",
"feedback": [
{
"date_2": "2017/08/07",
"value": 28,
"comment": "not cold"
},
{
"date_2": "2017/08/09",
"value": 48,
"comment": "a bit chilly"
},
{
"date_2": "2017/09/07",
"value": 18,
"comment": "very cold"
}
]
}
Calculate both the sum and average based on date_2.
GET test/_search
{
"size": 0,
"aggs": {
"temperature_aggregation": {
"nested": {
"path": "feedback"
},
"aggs": {
"temperature": {
"date_histogram": {
"field": "feedback.date_2",
"interval": "month"
},
"aggs": {
"sum": {
"sum": {
"field": "feedback.value"
}
},
"avg": {
"avg": {
"field": "feedback.value"
}
}
}
}
}
}
}
}

Elasticsearch - Applying multi level filter on nested aggregation bucket?

I'm trying to get distinct nested objects by applying multiple filters.
Basically in Elasticsearch I have cities as top level document and inside I have nested citizens documents, which have another nested pets documents.
I am trying to get all citizens that have certain conditions applied on all of these 3 levels (cities, citizens and pets):
Give me all distinct citizens
that have age:"40",
that have pets "name":"Casper",
from cities with office_type="secondary"
I know that to filter 1st level I can use query condition, and then if I need to filter the nested citizens I can add a filter in the aggregation level.
I am using this article as an example: https://iridakos.com/tutorials/2018/10/22/elasticsearch-bucket-aggregations.html
Query working so far:
GET city_offices/_search
{
"size" : 10,
"query": {
"term" : { "office_type" : "secondary" }
},
"aggs": {
"citizens": {
"nested": {
"path": "citizens"
},
"aggs": {
"inner_agg": {
"filter": {
"term": { "citizens.age": "40" }
} ,
"aggs": {
"occupations": {
"terms": {
"field": "citizens.occupation"
}
}
}
}
}
}
}
}
BUT: How can I add the "pets" nested filter condition?
Mapping:
PUT city_offices
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"city": {
"type": "keyword"
},
"office_type": {
"type": "keyword"
},
"citizens": {
"type": "nested",
"properties": {
"occupation": {
"type": "keyword"
},
"age": {
"type": "integer"
},
"pets": {
"type": "nested",
"properties": {
"kind": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"age": {
"type": "integer"
}
}
}
}
}
}
}
}
}
Index data:
PUT /city_offices/doc/1
{
"city":"Athens",
"office_type":"secondary",
"citizens":[
{
"occupation":"Statistician",
"age":30,
"pets":[
{
"kind":"Cat",
"name":"Phoebe",
"age":14
}
]
},
{
"occupation":"Librarian",
"age":30,
"pets":[
{
"kind":"Rabbit",
"name":"Nino",
"age":13
}
]
},
{
"occupation":"Librarian",
"age":40,
"pets":[
{
"kind":"Rabbit",
"name":"Nino",
"age":13
}
]
},
{
"occupation":"Statistician",
"age":40,
"pets":[
{
"kind":"Rabbit",
"name":"Casper",
"age":2
},
{
"kind":"Rabbit",
"name":"Nino",
"age":13
},
{
"kind":"Dog",
"name":"Nino",
"age":15
}
]
}
]
}
So I found a solution for this.
Basically I apply top level filters in the query section and then apply rest of conditions in the aggregations.
First I apply citizens level filter aggregation, then I go inside nested pets and apply the filter and then I need to get back up to citizens level (using reverse_nested: citizens) and then set the term that will generate the final bucket.
Query looks like this:
GET city_offices/_search
{
"size" : 10,
"query": {
"term" : { "office_type" : "secondary" }
},
"aggs": {
"citizens": {
"nested": {
"path": "citizens"
},
"aggs": {
"inner": {
"filter": {
"term": { "citizens.age": "40" }
} ,
"aggs": {
"occupations": {
"nested": {
"path": "citizens.pets"
},
"aggs": {
"inner_pets": {
"filter": {
"term": { "citizens.pets.name": "Casper" }
} ,
"aggs": {
"lll": {
"reverse_nested": {
"path": "citizens"
},
"aggs": {
"xxx": {
"terms": {
"field": "citizens.occupation",
"size": 10
}
}
}
}
}
}
}
}
}
}
}
}
}
}
The response bucket looks like this:
"xxx": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Librarian",
"doc_count": 1
},
{
"key": "Statistician",
"doc_count": 1
}
]
}
Any other suggestions?

Create keyword string type with custom analyzer in 5.3.0

I have a string I'd like to index as keyword type but with a special comma analyzer:
For example:
"San Francisco, Boston, New York" -> "San Francisco", "Boston", "New York"
should be both indexed and aggregatable at the same time so that I can split it up by buckets. In pre 5.0.0 the following worked:
Index settings:
{
'settings': {
'analysis': {
'tokenizer': {
'comma': {
'type': 'pattern',
'pattern': ','
}
},
'analyzer': {
'comma': {
'type': 'custom',
'tokenizer': 'comma'
}
}
},
},
}
with the following mapping:
{
'city': {
'type': 'string',
'analyzer': 'comma'
},
}
Now in 5.3.0 and above the analyzer is no longer a valid property for the keyword type, and my understanding is that I want a keyword type here. How do I specify an aggregatable, indexed, searchable text type with custom analyzer?
You can use multi-fields to index the same field in two different ways, one for searching and the other for aggregations.
Also, I suggest adding trim and lowercase filters to the tokens produced, to help you with better search.
Mappings
PUT commaindex2
{
"settings": {
"analysis": {
"tokenizer": {
"comma": {
"type": "pattern",
"pattern": ","
}
},
"analyzer": {
"comma": {
"type": "custom",
"tokenizer": "comma",
"filter": ["lowercase", "trim"]
}
}
}
},
"mappings": {
"city_document": {
"properties": {
"city": {
"type": "keyword",
"fields": {
"city_custom_analyzed": {
"type": "text",
"analyzer": "comma",
"fielddata": true
}
}
}
}
}
}
}
Index Document
POST commaindex2/city_document
{
"city" : "san fransisco, new york, london"
}
Search Query
POST commaindex2/city_document/_search
{
"query": {
"bool": {
"must": [{
"term": {
"city.city_custom_analyzed": {
"value": "new york"
}
}
}]
}
},
"aggs": {
"terms_agg": {
"terms": {
"field": "city",
"size": 10
}
}
}
}
Note
In case you want to run aggs on indexed fields, like you want to count for each city in buckets, you can run terms aggregation on city.city_custom_analyzed field.
POST commaindex2/city_document/_search
{
"query": {
"bool": {
"must": [{
"term": {
"city.city_custom_analyzed": {
"value": "new york"
}
}
}]
}
},
"aggs": {
"terms_agg": {
"terms": {
"field": "city.city_custom_analyzed",
"size": 10
}
}
}
}
Hope this helps
Since you're using ES 5.3, I suggest a different approach, using an ingest pipeline to split your field at indexing time.
PUT _ingest/pipeline/city-splitter
{
"description": "City splitter",
"processors": [
{
"split": {
"field": "city",
"separator": ","
}
},
{
"foreach": {
"field": "city",
"processor": {
"trim": {
"field": "_ingest._value"
}
}
}
}
]
}
Then you can index a new document:
PUT cities/city/1?pipeline=city-splitter
{ "city" : "San Francisco, Boston, New York" }
And finally you can search/sort on city and run an aggregation on the field city.keyword as if the cities had been split in your client application:
POST cities/_search
{
"query": {
"match": {
"city": "boston"
}
},
"aggs": {
"cities": {
"terms": {
"field": "city.keyword"
}
}
}
}

Resources