Exclude results based on key in elasticsearch

Exclude results based on key in elasticsearch - elasticsearch

I have the following mapping for a type in Elasticsearch:
"properties": {
"userid": {
"type": "integer"
},
"engid": {
"type": "short"
},
"score": {
"type": "short",
},
"name": {
"type": "string",
"index": "not_analyzed"
},
"submitTime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
}
}
And my search query is:
{
"size": 10,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"range": {
"submitTime": {
"gt": "now-18d"
}
}
}
}
},
"aggs": {
"name": {
"terms": {
"field": "name",
"order": {
"_term": "asc"
}
},
"aggs": {
"score": {
"terms": {
"field": "score"
}
}
}
}
}
}
This gives me the expected result:
"aggregations": {
"name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "---",
"doc_count": 169529,
"score": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 0,
"doc_count": 160133
},
{
"key": 5,
"doc_count": 9395
},
{
"key": 4,
"doc_count": 1
}
]
}
},
{
"key": "John",
"doc_count": 1,
"score": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 5,
"doc_count": 1
}
]
}
}
Now I want to remove the bucket where name='---' from my results. I tried using 'not', but it didn't work. Any hint will be appreciated.
PS: I am new to elasticsearch, and just trying to expand my knowledge.

You need to exclude the --- value in your query, for example with a bool filter that has a must_not clause on name:
{
"size": 10,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"range": {
"submitTime": {
"gt": "now-18d"
}
}
}
],
"must_not": [
{
"term": {
"name": "---"
}
}
]
}
}
}
},
"aggs": {
"name": {
"terms": {
"field": "name",
"order": {
"_term": "asc"
}
},
"aggs": {
"score": {
"terms": {
"field": "score"
}
}
}
}
}
}

Related

How to sort nested aggregation field based on parent document field in elasticsearch?

I have index of stores at various location. With each store I have a nested list of discount coupon.
Now I need a query to get the list of all unique coupons within a radius of x km, sorted by the distance of the nearest applicable coupon from a given location.
Database :: Elasticsearch
Index Mapping ::
{
"mappings": {
"car_stores": {
"properties": {
"location": {
"type": "geo_point"
},
"discount_coupons": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
}
}
}
}
}
Sample Doc ::
{
"_index": "stores",
"_type": "car_stores",
"_id": "1258c81d-b6f2-400f-a448-bd728f524b55",
"_score": 1.0,
"_source": {
"location": {
"lat": 36.053757,
"lon": 139.525482
},
"discount_coupons": [
{
"name": "c1"
},
{
"name": "c2"
}
]
}
}
Old Query to get unique discount coupon names in x km area for given location ::
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"_key": "asc"
},
"size": 200
}
}
}
}
}
}
Updated Response ::
{
"took": 60,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 245328,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"coupon": {
"doc_count": 657442,
"name": {
"doc_count_error_upper_bound": -1,
"sum_other_doc_count": 641189,
"buckets": [
{
"key": "local20210211",
"doc_count": 1611,
"back_to_base": {
"doc_count": 1611,
"distance_script": {
"value": 160.61034409639765
}
}
},
{
"key": "local20210117",
"doc_count": 1621,
"back_to_base": {
"doc_count": 1621,
"distance_script": {
"value": 77.51459886447356
}
}
},
{
"key": "local20201220",
"doc_count": 1622,
"back_to_base": {
"doc_count": 1622,
"distance_script": {
"value": 84.15734462544432
}
}
},
{
"key": "kisekae1",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 88.23770888201268
}
}
},
{
"key": "local20210206",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 86.78376012847237
}
}
},
{
"key": "local20210106",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 384.12156408078397
}
}
},
{
"key": "local20210113",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 153.61681676703674
}
}
},
{
"key": "local20",
"doc_count": 1629,
"back_to_base": {
"doc_count": 1629,
"distance_script": {
"value": 168.74132991524073
}
}
},
{
"key": "local20210213",
"doc_count": 1630,
"back_to_base": {
"doc_count": 1630,
"distance_script": {
"value": 155.8335679860034
}
}
},
{
"key": "local20210208",
"doc_count": 1632,
"back_to_base": {
"doc_count": 1632,
"distance_script": {
"value": 99.58790590445102
}
}
}
]
}
}
}
}
Now the above query will return the first 200 discount coupons, sorted by count by default, but I want to return the coupons sorted by distance from the given location, i.e. the nearest applicable coupon should come first.
Is there any way to sort nested aggregations based on a parent key or can I solve this use case using a different data model?
Updated Query ::
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"geo_distance": {
"distance": "100km",
"location": {
"lat": 35.699104,
"lon": 139.825211
}
}
},
{
"nested": {
"path": "discount_coupons",
"query": {
"bool": {
"filter": {
"exists": {
"field": "discount_coupons"
}
}
}
}
}
}
]
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 10
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(35.699104, 139.825211)"
}
}
}
}
}
}
}
}
}
}
}

Interesting question. You can always order a terms aggregation by the result of a numeric sub-aggregation. The trick here is to escape the nested context via a reverse_nested aggregation and then calculate the distance from the pivot using a script:
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 200
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(40, -70)"
}
}
}
}
}
}
}
}
}
}
}

Complex Elastic Search Query

I have the following documents in the elastic search index.
[{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_en-us",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "hi-in",
"user": "aadab2f531206e9d",
"version": 1
},
{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:36:55.514Z",
"locale": "en-us",
"user": "aadab2f531206e9d",
"version": 1
}
],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1",
"locale": "en-us",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:36:43.037Z",
"fields": []
}
},
{
"_index": "ten2",
"_type": "documents",
"_id": "c323c2244a4a4c22_mr-in",
"_source": {
"publish_details": [{
"environment": "603fe91adbdcff66",
"time": "2020-06-24T13:37:26.205Z",
"locale": "mr-in",
"user": "aadab2f531206e9d",
"version": 1
}],
"created_at": "2020-06-24T13:36:43.037Z",
"_in_progress": false,
"title": "Entry 1 marathi",
"locale": "mr-in",
"url": "/entry-1",
"tags": [],
"uid": "c323c2244a4a4c22",
"updated_at": "2020-06-24T13:37:20.092Z",
"fields": []
}
}
]
I want a blank result ([]) from this because, as we can see, the uid of both documents is the same. I am using the following query to get the result:
{
"query": {
"bool": {
"must": [{
"bool": {
"must_not": [{
"bool": {
"must": [{
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.environment": "603fe91adbdcff66"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "en-us"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "hi-in"
}
}
}
}, {
"nested": {
"path": "publish_details",
"query": {
"term": {
"publish_details.locale": "mr-in"
}
}
}
}]
}
}]
}
}]
}
}
}
But the above query returns both documents, whereas I want a blank result, because the uid is common to both documents and, taken together, that uid has all three locales in its publish details. Is there a way to get the correct result? Is there an aggregation query that would help here? This is just a sample; I have many more documents to filter. Kindly help me here.

{
"aggs": {
"agg1": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"agg2": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
This query groups the documents by uid first and then by publish_details.locale.
It produces results like the following:
"aggregations": {
"agg1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"agg2": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"agg2": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
}
}
}
]
I have three docs, two of which share the same uid while the third has a different one.
I will update the query further to remove the first result, where you have 3 buckets. Alternatively, you can handle that filtering in your application code.
You can do that. 10k documents is fine, but when you have millions of documents you need enough resources to execute this.
{
"size" : 0,
"query":{
"bool" :{
"must_not":{
"match":{
"publish_details.environment":"603fe91adbdcff66"
}
}
}
},
"aggs": {
"uids": {
"terms": {
"field": "uid.raw"
},
"aggs": {
"details": {
"nested": {
"path": "publish_details"
},
"aggs": {
"locales": {
"terms": {
"field": "publish_details.locale"
}
},
"unique_locales": {
"value_count": {
"field": "publish_details.locale"
}
}
}
}
}
}
}
}
Result:
"aggregations": {
"uids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "c323c2244a4a4c22",
"doc_count": 2,
"details": {
"doc_count": 3,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
},
{
"key": "mr-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 3
}
}
},
{
"key": "c323c2244rrffa4a4c22",
"doc_count": 1,
"details": {
"doc_count": 2,
"locales": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "en-us",
"doc_count": 1
},
{
"key": "hi-in",
"doc_count": 1
}
]
},
"unique_locales": {
"value": 2
}
}
}
]

Terms aggregation not returning other buckets

I'm not able to get the other buckets from a terms aggregation when combining it with a filter aggregation. Is there any way to do this in Elasticsearch?
Mapping: customer with nested address. address with nested properties.
I've tried the following,
{
"size": 0,
"aggs": {
"address": {
"nested": {
"path": "address"
},
"aggs": {
"shipping_to_address": {
"aggs": {
"city": {
"terms": {
"field": "address.city.name.keyword",
"size": 10,
"missing": "others"
}
}
},
"filter": {
"bool": {
"must": [
{
"nested": {
"path": "address.properties",
"query": {
"bool": {
"filter": [
{
"term": {
"address.properties.type": "shipping_to"
}
}
]
}
}
}
}
]
}
}
}
}
}
}
}
The above only returns the buckets matching the filter.
{
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"address": {
"doc_count": 3,
"shipping_to_address": {
"doc_count": 1,
"city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new york",
"doc_count": 1
}
]
}
}
}
}
}
I would like to see the other buckets as below:
"buckets": [
{
"key": "new york",
"doc_count": 1
},
{
"key": "others",
"doc_count": 2
}
]

You need to add "min_doc_count": 0 to the terms aggregation; it will then return empty buckets as well.
Link for reference
{
"size": 0,
"aggs": {
"address": {
"nested": {
"path": "address"
},
"aggs": {
"shipping_to_address": {
"aggs": {
"city": {
"terms": {
"field": "address.city.name.keyword",
"size": 10,
"min_doc_count":0,
"missing": "others"
}
}
},
"filter": {
"bool": {
"must": [
{
"nested": {
"path": "address.properties",
"query": {
"bool": {
"filter": [
{
"term": {
"address.properties.type": "shipping_to"
}
}
]
}
}
}
}
]
}
}
}
}
}
}
}

Elasticsearch: filter aggregation using bucket value

Not sure how to formulate the question.
I'm using Elasticsearch 2.2.
Let's start with an example of the dataset, made of 4 documents:
[
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": {},
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": { "uuid": "b" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 0
}
}
]
called_entity always has a uuid.
coverage_entity can be empty, or have a uuid.
I use a script to aggregate on either called_entity.uuid or coverage_entity.uuid:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
}
}
}
}
}
So now, the aggregation has generated terms from either header.called_entity.uuid, or header.coverage_entity.uuid.
How can I filter my aggregation using the value of the aggregation key? For example, I want to count, for each bucket, how many documents have their uuid taken from header.called_entity.uuid only. Something like this:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
},
"from_called_entity": {
"filter": {
"term": { "header.called_entity.uuid": BUCKET_KEY }
}
}
}
}
}
}

Not sure this is possible. The key itself is only available as a sorting option.
Could you use something like this:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script": "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.sucessful_transfers"
}
}
}
},
"called_entity_source": {
"terms": {
"field": "header.called_entity.uuid",
"size": 10
}
},
"coverage_entity_source": {
"terms": {
"field": "header.coverage_entity.uuid",
"size": 10
}
}
}
}
And the output will be something like this:
"called_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 2
}
]
},
"coverage_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 1
}
]
},
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 4,
"successful_transfers": {
"value": 3
}
},
{
"key": "b",
"doc_count": 3,
"successful_transfers": {
"value": 2
}
}
]
}
If you really need the JSON in that specific shape, add a final step in your application that post-processes the result. The result above does contain the info you need, but the keys from coverage_entity_source and called_entity_source are not nested under the dim1 aggregation.

Elasticsearch sum_bucket aggregation to sum the values contained in resulting buckets

I have a query as follows:
{
"size": 0,
"query": {
"filtered": {
"query": {
"bool": {
"must": [
{
"match": {
"_type": "grx-ipx"
}
},
{
"range": {
"#timestamp": {
"gte": "2015-09-08T15:00:00.000Z",
"lte": "2015-09-08T15:10:00.000Z"
}
}
}
]
}
},
"filter": {
"and": [
{
"terms": {
"inSightCustID": [
"ASD001",
"ZXC049"
]
}
},
{
"terms": {
"reportFamily": [
"GRXoIPX",
"LTEoIPX"
]
}
}
]
}
}
},
"_source": [
"inSightCustID",
"fiveMinuteIn",
"reportFamily",
"#timestamp"
],
"aggs": {
"timestamp": {
"terms": {
"field": "#timestamp",
"size": 5
},
"aggs": {
"reportFamily": {
"terms": {
"field": "reportFamily"
},
"aggs": {
"averageFiveMinute": {
"avg": {
"field": "fiveMinuteIn"
}
}
}
}
}
},
"distinct_timestamps": {
"cardinality": {
"field": "#timestamp"
}
}
}
}
The result of this query looks like:
...
"aggregations": {
"distinct_timestamps": {
"value": 3,
"value_as_string": "1970-01-01T00:00:00.003Z"
},
"timestamp": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1441724700000,
"key_as_string": "2015-09-08T15:05:00.000Z",
"doc_count": 10,
"reportFamily": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "GRXoIPX",
"doc_count": 5,
"averageFiveMinute": {
"value": 1687.6
}
},
{
"key": "LTEoIPX",
"doc_count": 5,
"averageFiveMinute": {
"value": 56710.6
}
}
]
}
},
...
What I want to do is, for each reportFamily aggregation, show the sum of the averageFiveMinute values of its buckets. So for instance, in the example above, I would also like to show the sum of 1687.6 and 56710.6 (i.e. 58398.2). I want to do this for all reportFamily aggregations.
Here is what I have tried:
{
"size": 0,
"query": {
"filtered": {
"query": {
"bool": {
"must": [
{
"match": {
"_type": "grx-ipx"
}
},
{
"range": {
"#timestamp": {
"gte": "2015-09-08T15:00:00.000Z",
"lte": "2015-09-08T15:10:00.000Z"
}
}
}
]
}
},
"filter": {
"and": [
{
"terms": {
"inSightCustID": [
"ASD001",
"ZXC049"
]
}
},
{
"terms": {
"reportFamily": [
"GRXoIPX",
"LTEoIPX"
]
}
}
]
}
}
},
"_source": [
"inSightCustID",
"fiveMinuteIn",
"reportFamily",
"#timestamp"
],
"aggs": {
"timestamp": {
"terms": {
"field": "#timestamp",
"size": 5
},
"aggs": {
"reportFamily": {
"terms": {
"field": "reportFamily"
},
"aggs": {
"averageFiveMinute": {
"avg": {
"field": "fiveMinuteIn"
}
}
}
},
"sum_AvgFiveMinute": {
"sum_bucket": {
"buckets_path": "reportFamily>averageFiveMinute"
}
}
}
},
"distinct_timestamps": {
"cardinality": {
"field": "#timestamp"
}
}
}
}
I have added:
"sum_AvgFiveMinute": {
"sum_bucket": {
"buckets_path": "reportFamily>averageFiveMinute"
}
}
But unfortunately, this triggers an exception: Parse Failure [Could not find aggregator type [sum_bucket] in [sum_AvgFiveMinute]]
I expected the results to be something like:
...
"aggregations": {
"distinct_timestamps": {
"value": 3,
"value_as_string": "1970-01-01T00:00:00.003Z"
},
"timestamp": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1441724700000,
"key_as_string": "2015-09-08T15:05:00.000Z",
"doc_count": 10,
"reportFamily": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "GRXoIPX",
"doc_count": 5,
"averageFiveMinute": {
"value": 1687.6
}
},
{
"key": "LTEoIPX",
"doc_count": 5,
"averageFiveMinute": {
"value": 56710.6
}
}
]
},
"sum_AvgFiveMinute": {
"value": 58398.2
}
},
...
What is wrong with this query and how can I achieve the expected result?
Here is a link to the sum bucket aggregation docs.
Many thanks for the help.

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Exclude results based on key in elasticsearch - elasticsearch

Related

How to sort nested aggregation field based on parent document field in elasticsearch?

Complex Elastic Search Query

Terms aggregation not returning other buckets

Elasticsearch: filter aggregation using bucket value

Elasticsearch sum_bucket aggregation to sum the values contained in resulting buckets

Categories

Resources