I'm currently running the following query to aggregate all points within a certain precision in Elasticsearch:
{
"aggs": {
"coordinates": {
"geohash_grid": {
"field": "properties.Geometry.geo_point",
"precision": 12
},
"aggs": {
"centroid": {
"geo_centroid": {
"field": "properties.Geometry.geo_point"
}
}
}
}
}
}
and the response is
"aggregations": {
"coordinates": {
"buckets": [
{
"key": "s00000000000",
"doc_count": 82571,
"centroid": {
"location": {
"lat": 0,
"lon": 0
},
"count": 82571
}
},
{
"key": "6gyf4bf8m0uh",
"doc_count": 58587,
"centroid": {
"location": {
"lat": -23.55052001774311,
"lon": -46.633309721946716
},
"count": 58587
}
},
{
"key": "7h2y8hz76g8m",
"doc_count": 14551,
"centroid": {
"location": {
"lat": -19.924501832574606,
"lon": -43.93523778766394
},
"count": 14551
}
}
]
}
}
I need to get all buckets that have a count greater than a certain number. How can I do that?
You can add a range query, for example:
GET _search
{
"query": {
"range" : {
"age" : {
"gte" : 5000,
}
}
}
}
as described here in the documentation:
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
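(Note that a range query filters the documents that feed the aggregation, not the resulting buckets; filtering buckets by their doc count requires a pipeline aggregation, as the solution below shows.)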
Solved, I used the following filter:
{
"aggs": {
"coordinates": {
"geohash_grid": {
"field": "properties.Geometry.geo_point",
"precision": 12
},
"aggs": {
"sales_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"doc_count": "centroid.count"
},
"script": "params.doc_count > 500"
}
},
"centroid": {
"geo_centroid": {
"field": "properties.Geometry.geo_point"
}
}
}
}
}
}
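For reference, the bucket_selector can also reference each bucket's own document count through the special _count path, which removes the dependency on the centroid sub-aggregation. A minimal sketch of just the filter sub-aggregation:
{
  "sales_bucket_filter": {
    "bucket_selector": {
      "buckets_path": {
        "doc_count": "_count"
      },
      "script": "params.doc_count > 500"
    }
  }
}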
Related
I need to order the results of a composite aggregation, but the value to order by is the sum of a specific field (my index is very large, so I need the composite aggregation to paginate the results).
When I send this GET:
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "date": { "date_histogram": { "field": "timestamp", "interval": "1d"} } },
{ "product": { "terms": {"field": "product" } } }
]
},
"aggregations": {
"the_sum": {
"sum": { "field": "price" } <--- i want order by this field aggregation
}
}
}
}
}
How can I get this response (ordered by the sum of each price)?
{
...
"aggregations": {
"my_buckets": {
"after_key": {
"date": 1494374400000,
"product": "mad max"
},
"buckets": [
{
"key": {
"date": 1494460800000,
"product": "apocalypse now"
},
"doc_count": 1,
"the_sum": {
"value": 10.0
}
},
{
"key": {
"date": 1494288000000,
"product" : "mad max"
},
"doc_count": 2,
"the_sum": {
"value": 22.5
}
},
{
"key": {
"date": 1494374400000,
"product": "mad max"
},
"doc_count": 1,
"the_sum": {
"value": 290.0
}
}
]
}
}
}
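Note: the composite aggregation returns its buckets ordered by the values of the sources, and it does not support ordering by a sub-aggregation, so each page generally has to be re-sorted client-side. If pagination is not strictly required, a plain terms aggregation can be ordered by the sum directly. A minimal sketch under that assumption (product dimension only, reusing the field names from the question):
{
  "aggs": {
    "products": {
      "terms": {
        "field": "product",
        "order": { "the_sum": "desc" }
      },
      "aggs": {
        "the_sum": {
          "sum": { "field": "price" }
        }
      }
    }
  }
}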
I would like to build a daily/monthly aggregation query.
Is the only solution to create ranges -> from... to... for each day/month? I can generate the ranges, but it seems there should be another way to achieve this.
Also, how can I format the from... to... epoch millis as yyyy-MM-dd in each result?
{
"aggs": {
"aggs_sum_amount": {
"filters": {
"filters": {
"Amount1": {
...
},
"Amount2": {
...
}
}
},
"aggs": {
"range": {
"date_range": {
"field": "dateField",
"ranges": [
{
"from": "1613347200000",
"to": "1613433600000"
},
{
"from": "1613433600000",
"to": "1613520000000"
}
...
]
},
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
}
}
}
}
}
}
}
Example response
{
"aggregations": {
"aggs_sum_amountPLN": {
"buckets": {
"Amount1": {
"doc_count": 26,
"range": {
"buckets": [
{
"key": "1613347200000-1613433600000",
"from": 1.6133472E12,
"from_as_string": "1613347200000",
"to": 1.6134336E12,
"to_as_string": "1613433600000",
"doc_count": 0,
"sum_amount": {
"value": 0.0
}
},
{
...
}
]
}
},
"Amount2": {
...
}
}
}
}
}
You can use the date_histogram aggregation.
It lets you specify an interval for which the different buckets are generated, plus a format for the bucket keys.
The example in the date_histogram documentation is quite self-explanatory.
I've updated it to match your use case:
POST /sales/_search?size=0
{
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "date",
"calendar_interval": "1M",
"format": "yyyy-MM-dd"
},
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
}
}
}
}
}
Response should be something like this:
{
...
"aggregations": {
"sales_over_time": {
"buckets": [
{
"key_as_string": "2015-01-01",
"key": 1420070400000,
"doc_count": 3,
"sum_amount": {
"value": 15.0
}
},
{
"key_as_string": "2015-02-01",
"key": 1422748800000,
"doc_count": 2,
"sum_amount": {
"value": 10.0
}
},
{
"key_as_string": "2015-03-01",
"key": 1425168000000,
"doc_count": 2,
"sum_amount": {
"value": 25.0
}
}
]
}
}
}
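Two notes on adapting this: for daily buckets, change "calendar_interval" to "1d". And if you would rather keep the date_range approach from the question, that aggregation also accepts a "format" parameter (e.g. "format": "yyyy-MM-dd"), which controls how from_as_string and to_as_string are rendered in each result.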
I have an Elasticsearch index structured like this
{
"mappings": {
"properties": {
"content": {
"type": "text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":20
}
}
},
"result_nums":{
"type":"integer"
}
}
}
}
and all the documents in the index look like this
{
"content": "this",
"result_nums": 40
},
{
"content": "this",
"result_nums": 40
},
{
"content": "that",
"result_nums": 40
},
{
"content": "what",
"result_nums": 50
},
{
"content": "what",
"result_nums": 50
},
{
"content": "but",
"result_nums": 100
},
{
"content": "like",
"result_nums": 20
}
I need to get the data sorted by result_nums DESC, with duplicate "content" values removed. For example, I used a query like this to get the first two results:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"size": 2
},
"aggs": {
"res_nums": {
"avg": {
"field": "result_nums"
}
},
"res_sort": {
"bucket_sort": {
"sort": [
{
"res_nums": "desc"
}
]
}
}
}
}
}
}
The data I expect to get is
{
"key": "but",
"doc_count": 1,
"res_nums": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
}
But what I actually get is
{
"key": "what",
"doc_count": 2,
"res_nums": {
"value": 50.0
}
},
{
"key": "this",
"doc_count": 2,
"res_nums": {
"value": 40.0
}
}
So I think ES needs to sort before aggregating; right now it only sorts after the aggregation, which is why the results did not match my expectations.
I tried using sort before the aggregation, but it had no effect:
{
"size": 0,
"sort": [
{
"result_nums": "desc"
}
],
"aggs": {
...
}
...
}
So how can I sort before aggregating?
You need to use a max aggregation and order the terms aggregation by it; that gets you the data sorted by result_nums DESC with duplicate "content" values removed.
Adding a working example
Search Query:
{
"size": 0,
"aggs": {
"content": {
"terms": {
"field": "content.keyword",
"order": {
"max_num": "desc"
},
"size":2
},
"aggs": {
"max_num": {
"max": {
"field": "result_nums"
}
}
}
}
}
}
Search Result:
"aggregations": {
"content": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 4,
"buckets": [
{
"key": "but",
"doc_count": 1,
"max_num": {
"value": 100.0
}
},
{
"key": "what",
"doc_count": 2,
"max_num": {
"value": 50.0
}
}
]
}
}
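The key difference from the bucket_sort attempt in the question: the order option of the terms aggregation is applied while Elasticsearch selects which top buckets to keep, whereas bucket_sort only reorders and truncates buckets that have already been selected, which is why the "but" bucket was dropped before it could ever be sorted.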
I have an index of stores at various locations. With each store I have a nested list of discount coupons.
Now I need a query to get a list of all unique coupons within an x km radius of a given location, sorted by the distance to the nearest store where each coupon applies.
Database :: Elasticsearch
Index Mapping ::
{
"mappings": {
"car_stores": {
"properties": {
"location": {
"type": "geo_point"
},
"discount_coupons": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
}
}
}
}
}
Sample Doc ::
{
"_index": "stores",
"_type": "car_stores",
"_id": "1258c81d-b6f2-400f-a448-bd728f524b55",
"_score": 1.0,
"_source": {
"location": {
"lat": 36.053757,
"lon": 139.525482
},
"discount_coupons": [
{
"name": "c1"
},
{
"name": "c2"
}
]
}
}
Old query to get unique discount coupon names within an x km area of a given location ::
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"_key": "asc"
},
"size": 200
}
}
}
}
}
}
Updated Response ::
{
"took": 60,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 245328,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"coupon": {
"doc_count": 657442,
"name": {
"doc_count_error_upper_bound": -1,
"sum_other_doc_count": 641189,
"buckets": [
{
"key": "local20210211",
"doc_count": 1611,
"back_to_base": {
"doc_count": 1611,
"distance_script": {
"value": 160.61034409639765
}
}
},
{
"key": "local20210117",
"doc_count": 1621,
"back_to_base": {
"doc_count": 1621,
"distance_script": {
"value": 77.51459886447356
}
}
},
{
"key": "local20201220",
"doc_count": 1622,
"back_to_base": {
"doc_count": 1622,
"distance_script": {
"value": 84.15734462544432
}
}
},
{
"key": "kisekae1",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 88.23770888201268
}
}
},
{
"key": "local20210206",
"doc_count": 1626,
"back_to_base": {
"doc_count": 1626,
"distance_script": {
"value": 86.78376012847237
}
}
},
{
"key": "local20210106",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 384.12156408078397
}
}
},
{
"key": "local20210113",
"doc_count": 1628,
"back_to_base": {
"doc_count": 1628,
"distance_script": {
"value": 153.61681676703674
}
}
},
{
"key": "local20",
"doc_count": 1629,
"back_to_base": {
"doc_count": 1629,
"distance_script": {
"value": 168.74132991524073
}
}
},
{
"key": "local20210213",
"doc_count": 1630,
"back_to_base": {
"doc_count": 1630,
"distance_script": {
"value": 155.8335679860034
}
}
},
{
"key": "local20210208",
"doc_count": 1632,
"back_to_base": {
"doc_count": 1632,
"distance_script": {
"value": 99.58790590445102
}
}
}
]
}
}
}
}
Now, the above query returns the first 200 discount coupons, sorted by count by default, but I want the coupons sorted by distance to the given location, i.e. the nearest applicable coupon should come first.
Is there any way to sort nested aggregations based on a parent key or can I solve this use case using a different data model?
Updated Query ::
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"geo_distance": {
"distance": "100km",
"location": {
"lat": 35.699104,
"lon": 139.825211
}
}
},
{
"nested": {
"path": "discount_coupons",
"query": {
"bool": {
"filter": {
"exists": {
"field": "discount_coupons"
}
}
}
}
}
}
]
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 10
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(35.699104, 139.825211)"
}
}
}
}
}
}
}
}
}
}
}
Interesting question. You can always order a terms aggregation by the result of a numeric sub-aggregation. The trick here is to escape the nested context via a reverse_nested aggregation and then calculate the distance from the pivot using a script:
{
"size": 0,
"query": {
"bool": {
"must": {
"match_all": {}
},
"filter": {
"geo_distance": {
"distance": "100km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
},
"aggs": {
"coupon": {
"nested": {
"path": "discount_coupons"
},
"aggs": {
"name": {
"terms": {
"field": "discount_coupons.name",
"order": {
"back_to_base": "asc"
},
"size": 200
},
"aggs": {
"back_to_base": {
"reverse_nested": {},
"aggs": {
"distance_script": {
"min": {
"script": {
"source": "doc['location'].arcDistance(40, -70)"
}
}
}
}
}
}
}
}
}
}
}
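One detail worth noting: arcDistance returns the distance in meters, so each coupon bucket is ordered by the distance in meters from the pivot to the nearest store carrying that coupon; divide by 1000 in the script if you prefer kilometers.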
I currently have an ES query which gives me a list of "geo buckets" using geohash_grid and date_histogram:
"aggregations": {
"zoomedInView": {
"filter": {
"geo_bounding_box": {
"location": {
"top_left": "-37, 140",
"bottom_right": "-38, 146"
}
}
},
"aggregations": {
"zoom1": {
"geohash_grid": {
"field": "location",
"precision": 6
},
"aggs": {
"ts": {
"date_histogram": {
"min_doc_count" : 1,
"field": "dateTime",
"interval": "1m",
"format": "DDD HH:mm"
}
},
"map_zoom": {
"geo_bounds": {
"field": "location"
}
}
}
}
}
}
}
which gives me results looking like:
{
"key": "r1r0fu",
"map_zoom": {
"bounds": {
"top_left": {
"lat": -38.81073913909495,
"lon": 124.96536672115326
},
"bottom_right": {
"lat": -38.81329075805843,
"lon": 124.96823584660888
}
}
},
"ts": {
"buckets": [
{
"key_as_string": "136 20:15",
"key": 1463354100000,
},
{
"key_as_string": "137 04:30",
"key": 1463365800000,
"doc_count": 1
},
....
{
"key": "r1r0gx",
"map_zoom": {
"bounds": {
"top_left": {
"lat": -38.798130828887224,
"lon": 124.99871227890253
},
"bottom_right": {
"lat": -38.79820383526385,
"lon": 124.99872468411922
}
}
},
"ts": {
"buckets": [
{
"key_as_string": "136 23:21",
"key": 1463354460000,
},
{
"key_as_string": "137 02:30",
"key": 1463365800000,
},
{
"key_as_string": "137 03:31",
"key": 1463369460000,
}
]
}
},
In the above example the results are ordered by the geo buckets r1r0fu and r1r0gx, and within each bucket by the time of the events (in day-of-year HH:mm format), together with their counts.
What I'd really like is:
1) The results ordered by time, which may mean the same buckets will appear multiple times.
2) Only the minimum and maximum time shown within each bucket (if possible)
So the results above would ideally look like:
{
"key": "r1r0fu",
"map_zoom": {
"bounds": {
"top_left": {
"lat": -38.81073913909495,
"lon": 124.96536672115326
},
"bottom_right": {
"lat": -38.81329075805843,
"lon": 124.96823584660888
}
}
},
"ts": {
"buckets": [
{
"key_as_string": "136 20:15",
"key": 1463354100000,
},
]
}
},
{
"key": "r1r0gx",
"map_zoom": {
"bounds": {
"top_left": {
"lat": -38.798130828887224,
"lon": 124.99871227890253
},
"bottom_right": {
"lat": -38.79820383526385,
"lon": 124.99872468411922
}
}
},
"ts": {
"buckets": [
{
"key_as_string": "136 23:21",
"key": 1463354460000,
},
{
"key_as_string": "137 03:31",
"key": 1463369460000,
}
]
}
},
{
"key": "r1r0fu",
"map_zoom": {
"bounds": {
"top_left": {
"lat": -38.81073913909495,
"lon": 124.96536672115326
},
"bottom_right": {
"lat": -38.81329075805843,
"lon": 124.96823584660888
}
}
},
"ts": {
"buckets": [
{
"key_as_string": "137 04:30",
"key": 1463365800000,
}
]
}
},
...
Here the results are ordered by time, so the bucket r1r0fu appears twice in this case. The event with "key_as_string": "137 02:30" has been hidden because it is neither the minimum nor the maximum date.
Is this at all possible?
Many thanks!
If you want the results ordered by time, maybe it'd be better to swap the date_histogram aggregation with the geohash_grid one, like this:
{
"aggregations": {
"zoomedInView": {
"filter": {
"geo_bounding_box": {
"location": {
"top_left": "-37, 140",
"bottom_right": "-38, 146"
}
}
},
"aggregations": {
"ts": {
"date_histogram": {
"min_doc_count": 1,
"field": "dateTime",
"interval": "1m",
"format": "DDD HH:mm"
},
"aggs": {
"zoom1": {
"geohash_grid": {
"field": "location",
"precision": 6
}
},
"map_zoom": {
"geo_bounds": {
"field": "location"
}
}
}
}
}
}
}
}
That would take care of question 1). However, since each main bucket will now be a time bucket, you won't be able to get the min and max time per geo bucket anymore. Try it out and see if it works for your needs.
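For question 2), one option is to keep the original geohash-first structure and replace the date_histogram with min and max metric aggregations on the dateTime field, so each geo bucket carries only its boundary timestamps. A sketch under that assumption (the first_seen/last_seen names are made up here):
{
  "aggregations": {
    "zoom1": {
      "geohash_grid": {
        "field": "location",
        "precision": 6
      },
      "aggs": {
        "first_seen": {
          "min": { "field": "dateTime", "format": "DDD HH:mm" }
        },
        "last_seen": {
          "max": { "field": "dateTime", "format": "DDD HH:mm" }
        },
        "map_zoom": {
          "geo_bounds": { "field": "location" }
        }
      }
    }
  }
}
This keeps one row per geo bucket, though, so it does not give the "bucket appears multiple times" ordering from question 1); the two requirements pull in opposite directions.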