How to filter by aggregation bucket? - elasticsearch

I need a query that returns only the results that have exactly 1 bucket.
The query below returns the access data of each visitor, grouped by day.
{
"size" : 0,
"query" : {
"filtered" : {
"filter" : {
"bool" : {
"must" : [
{
"range" : {
"start_time" : {
"gte" : "2019-02-06 00:00:00",
"lte" : "2019-02-11 23:59:59"
}
}
}
]
}
}
}
},
"aggs" : {
"UNIQUE" : {
"terms" : {
"size" : 0,
"field" : "username"
},
"aggs" : {
"visits" : {
"date_histogram" : {
"field" : "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
}
}
}
}
}
I need to know which visitors came on only one day in the period. When a visitor has only 1 bucket, it is ONE; if the visitor has visited on more than one day (buckets > 1), it is RECURRENT.

If I understand correctly, you want a list of users who visited on only one distinct date within a particular time frame, and you want both the date and the username to appear in the aggregation.
I've created a sample mapping, sample documents, an aggregation query, and the response it produces.
Mapping:
PUT mytest
{
"mappings": {
"mydocs": {
"properties": {
"username": {
"type": "keyword"
},
"start_time": {
"type": "date",
"format": "yyyy-MM-dd"
}
}
}
}
}
Sample Documents:
You can see that I've created 6 documents: John visited twice on the same date, Jack visited the site on two different dates, and Jane and Rob each visited only once in the time frame for which I will write an aggregation.
POST mytest/mydocs/1
{
"username": "john",
"start_time": "2018-08-01"
}
POST mytest/mydocs/2
{
"username": "john",
"start_time": "2018-08-01"
}
POST mytest/mydocs/3
{
"username": "jane",
"start_time": "2018-08-01"
}
POST mytest/mydocs/4
{
"username": "rob",
"start_time": "2018-08-01"
}
POST mytest/mydocs/5
{
"username": "jack",
"start_time": "2018-08-01"
}
POST mytest/mydocs/6
{
"username": "jack",
"start_time": "2018-08-02"
}
Updated Aggregation Request
Note I've added two more documents with username Jack who visits the site on two different dates, username John visits the site twice on the same date.
POST mytest/_search
{
"size": 0,
"query": {
"range": {
"start_time": {
"gte": "2017-08-01",
"lte": "2019-08-01"
}
}
},
"aggs": {
"myterms": {
"terms": {
"size": 100,
"field": "username"
},
"aggs": {
"visit_date": {
"date_histogram": {
"field": "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
},
"count": {
"cardinality": {
"field": "start_time"
}
},
"equal_one":{
"bucket_selector":{
"buckets_path":{
"count":"count.value"
},
"script":"params.count == 1"
}
}
}
}
}
}
Response
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"myterms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "john",
"doc_count": 2,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 2
}
]
}
},
{
"key": "jane",
"doc_count": 1,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 1
}
]
}
},
{
"key": "rob",
"doc_count": 1,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 1
}
]
}
}
]
}
}
}
You can see that John now appears in the result as expected, even though he visited the site multiple times on the same date.
Let me know if you have any queries.

Solution found was:
{
"size" : 0,
"query" : {
{
"range" : {
"start_time" : {
"gte" : "2019-02-11 00:00:00",
"lte" : "2019-02-11 23:59:59"
}
}
}
},
"aggs" : {
"UNIQUE" : {
"terms" : {
"size" : 0,
"field" : "username"
},
"aggs":{
"visit_date": {
"date_histogram": {
"field" : "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
},
"count": {
"cardinality": {
"script": "new Date(doc['start_time'].value).format('yyyy-MM-dd')"
}
},
"equal_one":{
"bucket_selector":{
"buckets_path":{
"count":"count.value"
},
"script":"count == 1"
}
}
}
}
}
}
But performance remains a problem: in an environment with about 1 million records, this query does not perform well.
Maybe a query using a scripted metric aggregation would solve this, but that requires more analysis (doc: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-scripted-metric-aggregation.html)

Related

Elasticsearch sorting based on multiple aggregations

I am trying to get my data using several aggregation criteria, and afterwards I want to order it by one of those criteria. In this specific case I want the data sorted in descending order by the "Monthly_Income/ SUM" criterion.
I searched and tried lots of things, but none of them worked for me. Could you give me the answer? I am new to Elasticsearch.
What I have searched so far without solving the problem:
"ordering_by_a_sub_aggregation,
Sorting Based on "Deep" Metrics,
search-aggregations-bucket-terms-aggregation-script,
search-aggregations-bucket-multi-terms-aggregation
To visualize the problem: I always get the result below. I have tried lots of methods, but I could not get the desired result.
undesired result
desired result
Request
`
{
"query": {
"bool": {
"must": [],
"must_not": []
}
},
"size": 0,
"aggs": {
"GENDER": {
"terms": {
"field": "GENDER.keyword",
"size": 10000000,
"missing": "N/A"
// ,"order": {"MARTIAL_STATUS>Monthly_Income_0.max" : "desc" }
},
"aggs": {
"MARTIAL_STATUS": {
"terms": {
"field": "MARTIAL_STATUS.keyword",
"size": 10000000,
"missing": "N/A"
// ,"order": {"Monthly_Income_0.value" : "desc" }
},
"aggs": {
"Monthly_Income_0": {
"sum": {
"field": "Monthly_Income"
}
},
"Monthly_Income_1": {
"value_count": {
"field": "Monthly_Income"
}
},
"SALE_PRICE_2": {
"sum": {
"field": "SALE_PRICE"
}
}
// ,"sort_by_percentage": {
// "bucket_sort": {
// "sort": [ { "Monthly_Income_0.value": { "order": "desc" } } ]
// }
// }
}
}
}
}
}
}
`
Response
`
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"GENDER": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Male",
"doc_count": 40959,
"MARTIAL_STATUS": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Married",
"doc_count": 35559,
"SALE_PRICE_2": {
"value": 2.530239767013672E9
},
"Monthly_Income_0": {
"value": 3.59618565E8
},
"Monthly_Income_1": {
"value": 35559
}
},
{
"key": "Single",
"doc_count": 5399,
"SALE_PRICE_2": {
"value": 3.7742297754296875E8
},
"Monthly_Income_0": {
"value": 5.3465554E7
},
"Monthly_Income_1": {
"value": 5399
}
},
{
"key": "N/A",
"doc_count": 1,
"SALE_PRICE_2": {
"value": 87344.203125
},
"Monthly_Income_0": {
"value": 40000.0
},
"Monthly_Income_1": {
"value": 1
}
}
]
}
},
{
"key": "Female",
"doc_count": 7777,
"MARTIAL_STATUS": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Married",
"doc_count": 5299,
"SALE_PRICE_2": {
"value": 3.9976638293359375E8
},
"Monthly_Income_0": {
"value": 4.4994796E7
},
"Monthly_Income_1": {
"value": 5299
}
},
{
"key": "Single",
"doc_count": 2477,
"SALE_PRICE_2": {
"value": 1.8698677312695312E8
},
"Monthly_Income_0": {
"value": 1.8793502E7
},
"Monthly_Income_1": {
"value": 2477
}
},
{
"key": "N/A",
"doc_count": 1,
"SALE_PRICE_2": {
"value": 101006.8203125
},
"Monthly_Income_0": {
"value": 10000.0
},
"Monthly_Income_1": {
"value": 1
}
}
]
}
}
]
}
}
}
`
I tried to order by an aggregated column, but I was not able to achieve it.
My understanding of your issue is that you want to group by the combination of gender and marital status.
I have used a runtime mapping to concatenate the "gender" and "marital status" fields, used a terms aggregation to group by the runtime field, and sorted the groups by the sum.
{
"size": 0,
"runtime_mappings": {
"gender-maritalstatus": {
"type": "keyword",
"script": {
"source": """
def gender='NA';
def maritalstatus='NA';
if(doc['Gender.keyword'].size()!=0)
gender= doc['Gender.keyword'].value;
if(doc['Marital_Status.keyword'].size()!=0)
maritalstatus= doc['Marital_Status.keyword'].value;
emit(gender+'-'+maritalstatus);
"""
}
}
},
"aggs": {
"gender-marital-grouping": {
"terms": {
"field": "gender-maritalstatus",
"order": {
"monthly_income": "desc"
},
"size": 10
},
"aggs": {
"monthly_income": {
"sum": {
"field": "Monthly_Income"
}
}
}
}
}
}
Result
"buckets" : [
{
"key" : "Female-Single",
"doc_count" : 2,
"monthly_income" : {
"value" : 300.0
}
},
{
"key" : "Male-Married",
"doc_count" : 2,
"monthly_income" : {
"value" : 200.0
}
},
{
"key" : "Female-NA",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
},
{
"key" : "Male-NA",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
},
{
"key" : "Male-Single",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
}
]

Sort aggregation in Elasticsearch?

I have a use case where I need to get all unique user ids from Elasticsearch, sorted by timestamp.
What I'm currently using is a composite terms aggregation with a sub-aggregation that returns the latest timestamp.
(I can't sort on the client side, as that slows down the script.)
Sample data in elastic search
{
"_index": "logstash-2020.10.29",
"_type": "doc",
"_id": "L0Urc3UBttS_uoEtubDk",
"_version": 1,
"_score": null,
"_source": {
"#version": "1",
"#timestamp": "2020-10-29T06:56:00.000Z",
"timestamp_string": "1603954560",
"search_query": "example 3",
"user_uuid": "asdfrghcwehf",
"browsing_url": "https://www.google.com/search?q=example+3",
},
"fields": {
"#timestamp": [
"2020-10-29T06:56:00.000Z"
]
},
"sort": [
1603954560000
]
}
Expected Output:
[
{
"key" : "bjvexyducsls",
"doc_count" : 846,
"1" : {
"value" : 1.603948557E12,
"value_as_string" : "2020-10-29T05:15:57.000Z"
}
},
{
"key" : "lhmsbq2osski",
"doc_count" : 420,
"1" : {
"value" : 1.6039476E12,
"value_as_string" : "2020-10-29T05:00:00.000Z"
}
},
{
"key" : "m2wiaufcbvvi",
"doc_count" : 1,
"1" : {
"value" : 1.603893635E12,
"value_as_string" : "2020-10-28T14:00:35.000Z"
}
},
{
"key" : "rrm3vd5ovqwg",
"doc_count" : 1,
"1" : {
"value" : 1.60389362E12,
"value_as_string" : "2020-10-28T14:00:20.000Z"
}
},
{
"key" : "x42lk4t3frfc",
"doc_count" : 72,
"1" : {
"value" : 1.60389318E12,
"value_as_string" : "2020-10-28T13:53:00.000Z"
}
}
]
Adding a working example with index data, mapping, search query, and search result
Index Mapping:
{
"mappings":{
"properties":{
"user":{
"type":"keyword"
},
"date":{
"type":"date"
}
}
}
}
Index Data:
{
"date": "2015-01-01",
"user": "user1"
}
{
"date": "2014-01-01",
"user": "user2"
}
{
"date": "2015-01-11",
"user": "user3"
}
Search Query:
{
"size": 0,
"aggs": {
"user_id": {
"terms": {
"field": "user",
"order": {
"sort_user": "asc"
}
},
"aggs": {
"sort_user": {
"min": {
"field": "date"
}
}
}
}
}
}
Search Result:
"aggregations": {
"user_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user2",
"doc_count": 1,
"sort_user": {
"value": 1.3885344E12,
"value_as_string": "2014-01-01T00:00:00.000Z"
}
},
{
"key": "user1",
"doc_count": 1,
"sort_user": {
"value": 1.4200704E12,
"value_as_string": "2015-01-01T00:00:00.000Z"
}
},
{
"key": "user3",
"doc_count": 1,
"sort_user": {
"value": 1.4209344E12,
"value_as_string": "2015-01-11T00:00:00.000Z"
}
}
]
}

Elasticsearch aggregation by nested object

I'm trying to build a product search with facet filtering for an eCommerce app. For the product brand I have the following structure:
"brand": {
"type": "nested",
"properties": {
"name": {
"type": "text"
},
"id": {
"type": "integer"
}
}
}
I want to make an aggregation by brand id and return the whole object and the count of the documents. Something like this:
"brands" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : {
"name": "Apple",
"id": 1
},
"doc_count" : 34
},
{
"key" : {
"name": "Samsung",
"id": 2
},
"doc_count" : 23
}
]
}
Currently I'm writing the aggregation like this:
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
}
}
}
},
}
and the result looks like this:
"aggregations" : {
"brands" : {
"doc_count" : 15,
"brandIds" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 4
},
{
"key" : 2,
"doc_count" : 2
}
]
}
}
}
You can use a Terms Aggregation within a Terms Aggregation like this:
GET {index_name}/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
},
"aggs": {
"by name": {
"terms": {
"field": "brand.name.keyword",
"size": 10
}
}
}
}
}
}
}
}
This would result in something like this:
"aggregations": {
"brands": {
"doc_count": 68,
"brandIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 46,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Apple",
"doc_count": 46
}
]
}
},
{
"key": 2,
"doc_count": 22,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Samsung",
"doc_count": 22
}
]
}
}
]
}
}
}
Hope this helps!!

Obtaining the sum of only the latest Elasticsearch docs, aggregated/grouped by some field

I have a requirement to calculate the sum of fields in documents, grouped by some other field, but only the entries for which some third field is the latest.
for example, for these docs:
{
"time": "2019-08-21T13:00:00",
"session_id": "1",
"byte_count": 200,
"ip": "1.1.1.1"
}
{
"time": "2019-08-21T12:00:00",
"session_id": "1",
"byte_count": 100,
"ip": "1.1.1.1"
}
{
"time": "2019-08-21T12:00:00",
"session_id": "2",
"byte_count": 123,
"ip": "2.2.2.2"
}
{
"time": "2019-08-21T14:00:00",
"session_id": "3",
"byte_count": 100,
"ip": "1.1.1.1"
}
These need to be grouped by session_id, keeping only the latest entry for each session; those results must then be grouped by ip and summed on byte_count.
I have seen one can do grouping with collapse, and it works. I’ve also seen one can do sum with aggregation (aggs), but when I use the two together, it looks like the aggs operate on the actual docs, not on the result of the collapse. This search
{
"collapse": {
"field": "session_id",
"inner_hits": {
"name": "most_recent",
"size": 1,
"sort": [
{
"time": "desc"
}
]
}
},
"aggs": {
"by_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
}
}
}
}
}
Has the correct three hits, but the aggregation output has these buckets:
"buckets": [
{
"key": "1.1.1.1",
"doc_count": 3,
"total_bytes": {
"value": 400
}
},
{
"key": "2.2.2.2",
"doc_count": 1,
"total_bytes": {
"value": 123
}
}
]
I’ve also seen one can group with aggs top_hits, but when I try to do an aggs sum on the result as such:
{
"size": 0,
"aggs": {
"by_session": {
"terms": {
"field": "session_id"
},
"aggs": {
"per_session": {
"top_hits": {
"sort": [
{
"time": "desc"
}
],
"size": 1
},
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
}
}
}
}
}
}
}
}
}
I get error:
Aggregator [per_session] of type [top_hits] cannot accept sub-aggregations
How can I update the search params to have the expected result of:
{
"key": "1.1.1.1",
"doc_count": 2,
"total_bytes": {
"value": 300
}
},
{
"key": "2.2.2.2",
"doc_count": 1,
"total_bytes": {
"value": 123
}
}
?
Test data used:
PUT test
POST test/_doc
{
"time": "2019-08-21T13:00:00",
"session_id": 1,
"byte_count": 200,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T13:00:00",
"session_id": 1,
"byte_count": 700,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T12:00:00",
"session_id": 1,
"byte_count": 100,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T12:00:00",
"session_id": 2,
"byte_count": 123,
"ip": "2.2.2.2"
}
POST test/_doc
{
"time": "2019-08-21T14:00:00",
"session_id": 3,
"byte_count": 100,
"ip": "1.1.1.1"
}
I am not sure if I understand correctly. You said "the latest entry":
"need to be grouped by session_id, but only have the latest entry, and those results must be grouped by ip, and summed on byte_count"
Does this mean the latest "time"?
From your input data don't you expect to get for "ip": "1.1.1.1" the following result?:
{
"time": "2019-08-21T14:00:00",
"session_id": "3",
"byte_count": 100,
"ip": "1.1.1.1"
}
Because this has a newer "time" compared with the other documents with ip:1.1.1.1?
Anyway, here is a query that groups by IP and then groups by session_id. The session_id buckets are then sorted by the newest "time".
{
"size": 0,
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"per_Session": {
"terms": {
"field": "session_id",
"order" : { "my_max_date" : "desc" }
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
},
"my_max_date" : { "max" : { "field" : "time" } }
}
}
}
}
}
}
If you want to get just the first bucket, just add "size":1 after the order.
Then you need to extract the documents from your aggregations.
This is what I got:
"per_ip" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.1.1.1",
"doc_count" : 3,
"per_Session" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 3,
"doc_count" : 1,
"total_bytes" : {
"value" : 100.0
},
"my_max_date" : {
"value" : 1.566396E12,
"value_as_string" : "2019-08-21T14:00:00.000Z"
}
},
{
"key" : 1,
"doc_count" : 2,
"total_bytes" : {
"value" : 300.0
},
"my_max_date" : {
"value" : 1.5663924E12,
"value_as_string" : "2019-08-21T13:00:00.000Z"
}
}
]
}
},
{
"key" : "2.2.2.2",
"doc_count" : 1,
"per_Session" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2,
"doc_count" : 1,
"total_bytes" : {
"value" : 123.0
},
"my_max_date" : {
"value" : 1.5663888E12,
"value_as_string" : "2019-08-21T12:00:00.000Z"
}
}
]
}
}
]
}
}
UPDATE:
After the discussion in the first 2 comments, I added the following query, which adds "time" field aggregation buckets to the session_id buckets. This allows selecting the newest entry. The results still need to be grouped, but this can perhaps be done on the client side by reading the buckets and summing them, or it may be possible with a "Sum Bucket Aggregation".
{
"size": 5,
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"per_Session": {
"terms": {
"field": "session_id"
},
"aggs": {
"my_max_date" : {
"terms": {
"field": "time",
"order": [
{
"_key": "desc"
}
],
"size":1
},
"aggs" :
{
"total_bytes": {
"terms": {
"field": "byte_count",
"size":2
}
}
}
}
}
}
}
}
}}

Not able to aggregate on nested fields in elasticsearch

I have set a field to nested, and now I am not able to aggregate on it.
Sample document -
{
"attributes" : [
{ "name" : "snake" , "type" : "reptile" },
{ "name" : "cow" , "type" : "mamal" }
]
}
The attributes field is nested.
The following terms aggregation is not working on it:
{
"aggs" : {
"terms" : { "field" : "attributes.name" }
}
}
How can I do the aggregation in elasticsearch?
Use a nested aggregation.
As a simple example, I created an index with a nested property matching what you posted:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"attributes": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
Then added your document:
PUT /test_index/doc/1
{
"attributes": [
{ "name": "snake", "type": "reptile" },
{ "name": "cow", "type": "mammal" }
]
}
Now I can get "attributes.name" terms as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_attributes": {
"nested": {
"path": "attributes"
},
"aggs": {
"name_terms": {
"terms": {
"field": "attributes.name"
}
}
}
}
}
}
...
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_attributes": {
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cow",
"doc_count": 1
},
{
"key": "snake",
"doc_count": 1
}
]
}
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/0e3ed9c700f240e523be08a27551707d4448a9df

Resources