Elasticsearch aggregation by nested object - elasticsearch

I'm trying to build a product search with facet filtering for an eCommerce app. For the product brand I have the following structure:
"brand": {
"type": "nested",
"properties": {
"name": {
"type": "text"
},
"id": {
"type": "integer"
}
}
}
I want to make an aggregation by brand id and return the whole object and the count of the documents. Something like this:
"brands" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : {
"name": "Apple",
"id": 1
},
"doc_count" : 34
},
{
"key" : {
"name": "Samsung",
"id": 2
},
"doc_count" : 23
}
]
}
Currently I'm writing the aggregation like this:
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
}
}
}
},
}
and the result looks like this:
"aggregations" : {
"brands" : {
"doc_count" : 15,
"brandIds" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 1,
"doc_count" : 4
},
{
"key" : 2,
"doc_count" : 2
}
]
}
}
}

You can use a Terms aggregation nested within another Terms aggregation, like this:
GET {index_name}/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"brands": {
"nested": {
"path": "brand"
},
"aggs": {
"brandIds": {
"terms": {
"field": "brand.id"
},
"aggs": {
"by name": {
"terms": {
"field": "brand.name.keyword",
"size": 10
}
}
}
}
}
}
}
}
This would result in something like this:
"aggregations": {
"brands": {
"doc_count": 68,
"brandIds": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 46,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Apple",
"doc_count": 46
}
]
}
},
{
"key": 2,
"doc_count": 22,
"by name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Samsung",
"doc_count": 22
}
]
}
}
]
}
}
}
Hope this helps!!

Related

multiple field aggregation on documents with multiple elements gives unexpected result

I have documents with the following structure (very much simplified for the example):
"documents": [
{
"name": "Document 1",
"collections" : [
{
"id": 30,
"title" : "Research"
},
{
"id": 45,
"title" : "Events"
},
{
"id" : 52,
"title" : "International"
}
]
},
{
"name": "Document 2",
"collections" : [
{
"id": 45,
"title" : "Events"
},
{
"id" : 63,
"title" : "Development"
}
]
}
]
I want an aggregation of the collection. It works fine when I do it like this:
"aggs": {
"collections": {
"terms": {
"field": "collections.title",
"size": 30
}
}
}
I get a nice result as expected:
"aggregations" : {
"collections" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Research",
"doc_count" : 18
},
{
"key" : "Events",
"doc_count" : 14
},
{
"key" : "International",
"doc_count" : 13
},
{
"key" : "Development",
"doc_count" : 8
}
]
}
}
However, I want the id included as well. So I tried this:
"aggs": {
"collections": {
"terms": {
"field": "collections.title",
"size": 30
}
},
"aggs": {
"id": {
"terms": {
"field": "collections.id",
"size": 1
}
}
}
}
This is the result:
"aggregations" : {
"collections" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Research",
"doc_count" : 18,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "30",
"doc_count" : 1
}
]
}
},
{
"key" : "Events",
"doc_count" : 14,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "45",
"doc_count" : 1
}
]
}
},
{
"key" : "International",
"doc_count" : 13,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "52",
"doc_count" : 1
}
]
}
},
{
"key" : "Development",
"doc_count" : 8,
"id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "45",
"doc_count" : 1
}
]
}
}
]
}
}
At first glance it looks good. But take a closer look at the last element, Development (scroll down): the id should be 63, but it is 45.
I have a vague idea why this is, but I cannot find a solution for it. I also tried multi_terms, but it gives a similar result. I think the issue has to do with the fact that there are multiple collections within the document.
Does anyone know the correct solution to solve this issue?
The reason is that in an object type mapping there is no relation between "title" and "id"; everything is flattened by Elasticsearch under the hood, so:
"collections" : [
{
"id": 30,
"title" : "Research"
},
{
"id": 45,
"title" : "Events"
},
{
"id" : 52,
"title" : "International"
}
]
Becomes:
"collections.id": [30,45,52],
"collections.title": [Research, Events, International]
Elasticsearch doesn't know id 30 belongs to Research, or id 45 to Events.
You must use "nested" type to keep the relation between nested properties.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Solution: Use nested field type
Mappings
PUT test_nestedaggs
{
"mappings": {
"properties": {
"name": {
"type": "text"
},
"collections": {
"type": "nested",
"properties": {
"title": {
"type": "keyword"
},
"id": {
"type": "keyword"
}
}
}
}
}
}
Documents
POST test_nestedaggs/_doc
{
"name": "Document 1",
"collections": [
{
"id": 30,
"title": "Research"
},
{
"id": 45,
"title": "Events"
},
{
"id": 52,
"title": "International"
}
]
}
POST test_nestedaggs/_doc
{
"name": "Document 2",
"collections": [
{
"id": 45,
"title": "Events"
},
{
"id": 63,
"title": "Development"
}
]
}
Query
POST test_nestedaggs/_search?size=0
{
"aggs": {
"nested_collections": {
"nested": {
"path": "collections"
},
"aggs": {
"collections": {
"terms": {
"field": "collections.title"
},
"aggs": {
"ids": {
"terms": {
"field": "collections.id"
}
}
}
}
}
}
}
}
Results
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"nested_collections": {
"doc_count": 5,
"collections": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Events",
"doc_count": 2,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "45",
"doc_count": 2
}
]
}
},
{
"key": "Development",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "63",
"doc_count": 1
}
]
}
},
{
"key": "International",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "52",
"doc_count": 1
}
]
}
},
{
"key": "Research",
"doc_count": 1,
"ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "30",
"doc_count": 1
}
]
}
}
]
}
}
}
}
You can read an article I wrote about that for details:
https://opster.com/guides/elasticsearch/data-architecture/elasticsearch-nested-field-object-field/
NOTE: If the number of child documents is large and you perform a lot of updates, consider changing the data model. Each child document is stored as an independent document in the index, and every update to a child document causes the whole structure to be reindexed, which may affect performance. There are also limits on the maximum number of nested documents you can add. If the number is small, as in this example, then it's fine.

Show aggregations of same category after filter

I would like to create a filter, and aggregations applied after the filter, for products in Elasticsearch.
I have base aggregations for all products:
"size": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 87,
"buckets": [
{
"key": "6",
"doc_count": 89
},
{
"key": "5,5",
"doc_count": 60
}
]
}
},
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 87,
"buckets": [
{
"key": "Apple",
"doc_count": 89
},
{
"key": "Samsung",
"doc_count": 60
},
{
"key": "Xiaomi",
"doc_count": 48
},
{
"key": "Huawei",
"doc_count": 33
}
]
}
}
After that I run a query for one of those brands and a size, like:
"query": {
"bool": {
"filter": [
"term": {
"brand": "Samsung"
},
"term": {
"size": "6"
}
]
}
}
I am getting back aggregations only for the selected brand. But I still want to see all the other brands with the same size in the aggregations.
Is this possible with ES?
Thank you so much for all answers.
I'm not sure I understand what you are looking for.
You could try this query (terms aggregation on brand and then sub-aggregations of size):
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"byBrand": {
"terms": {
"field": "brand"
},
"aggs": {
"bySize": {
"terms": {
"field": "size"
}
}
}
}
}
}
Otherwise, you could filter by size first and then aggregate by brand:
{
"query": {
"bool": {
"filter": {
"term": {
"size": "6"
}
}
}
},
"size": 0,
"aggs": {
"byBrand": {
"terms": {
"field": "brand"
}
}
}
}
If this does not help you, please add your expected output to the question.
There are multiple ways to do it
Using global and filter aggregation
Global aggregation - considers all documents, even those excluded by the query.
Query
{
"query": {
"bool": {
"filter": [
{
"term": {
"brand.keyword": "Samsung"
}
},
{
"term": {
"size": "6"
}
}
]
}
},
"aggs": {
"all_brands": {
"global": {}, --> refer all documents
"aggs": {
"size_filter": { --> filter size 6
"filter": {
"term": {
"size": 6
}
},
"aggs": {
"all_brands_terms": {
"terms": {
"field": "brand.keyword",
"size": 10
}
}
}
}
}
}
}
}
Result
hits" : [
{
"_index" : "index67",
"_type" : "_doc",
"_id" : "FyERPYQBJutE-yZcbYF4",
"_score" : 0.0,
"_source" : {
"brand" : "Samsung",
"size" : 6
}
}
]
},
"aggregations" : {
"all_documents" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Apple",
"doc_count" : 1
},
{
"key" : "Samsung",
"doc_count" : 1
}
]
}
}
Post_filter
When you use the post_filter parameter to filter search results, the search hits are filtered after the aggregations are calculated. A post filter has no impact on the aggregation results.
Query
{
"query": {
"bool": {
"filter": [
{
"term": {
"size": "6"
}
}
]
}
},
"aggs": {
"all_documents": {
"terms": {
"field": "brand.keyword",
"size": 10
}
}
},
"post_filter": {
"term": {
"brand.keyword": "Samsung"
}
}
}
Result
hits" : [
{
"_index" : "index67",
"_type" : "_doc",
"_id" : "FyERPYQBJutE-yZcbYF4",
"_score" : 0.0,
"_source" : {
"brand" : "Samsung",
"size" : 6
}
}
]
},
"aggregations" : {
"all_documents" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Apple",
"doc_count" : 1
},
{
"key" : "Samsung",
"doc_count" : 1
}
]
}
}

Elasticsearch sorting based on multiple aggregations

I am trying to get my data with different aggregation criteria, and afterwards I want to order it based on one of those criteria. In this specific case I want my data ordered in descending order by the "Monthly_Income / SUM" criterion.
I searched and tried lots of things but none of them worked for me. Could you give me the answer? I am new to Elasticsearch.
What I have searched so far without solving the problem:
ordering_by_a_sub_aggregation,
Sorting Based on "Deep" Metrics,
search-aggregations-bucket-terms-aggregation-script,
search-aggregations-bucket-multi-terms-aggregation
To visualize the problem: I always get the result below; I have tried lots of methods, but I couldn't get the desired result.
undesired result
desired result
Request
`
{
"query": {
"bool": {
"must": [],
"must_not": []
}
},
"size": 0,
"aggs": {
"GENDER": {
"terms": {
"field": "GENDER.keyword",
"size": 10000000,
"missing": "N/A"
// ,"order": {"MARTIAL_STATUS>Monthly_Income_0.max" : "desc" }
},
"aggs": {
"MARTIAL_STATUS": {
"terms": {
"field": "MARTIAL_STATUS.keyword",
"size": 10000000,
"missing": "N/A"
// ,"order": {"Monthly_Income_0.value" : "desc" }
},
"aggs": {
"Monthly_Income_0": {
"sum": {
"field": "Monthly_Income"
}
},
"Monthly_Income_1": {
"value_count": {
"field": "Monthly_Income"
}
},
"SALE_PRICE_2": {
"sum": {
"field": "SALE_PRICE"
}
}
// ,"sort_by_percentage": {
// "bucket_sort": {
// "sort": [ { "Monthly_Income_0.value": { "order": "desc" } } ]
// }
// }
}
}
}
}
}
}
`
Response
`
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"GENDER": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Male",
"doc_count": 40959,
"MARTIAL_STATUS": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Married",
"doc_count": 35559,
"SALE_PRICE_2": {
"value": 2.530239767013672E9
},
"Monthly_Income_0": {
"value": 3.59618565E8
},
"Monthly_Income_1": {
"value": 35559
}
},
{
"key": "Single",
"doc_count": 5399,
"SALE_PRICE_2": {
"value": 3.7742297754296875E8
},
"Monthly_Income_0": {
"value": 5.3465554E7
},
"Monthly_Income_1": {
"value": 5399
}
},
{
"key": "N/A",
"doc_count": 1,
"SALE_PRICE_2": {
"value": 87344.203125
},
"Monthly_Income_0": {
"value": 40000.0
},
"Monthly_Income_1": {
"value": 1
}
}
]
}
},
{
"key": "Female",
"doc_count": 7777,
"MARTIAL_STATUS": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Married",
"doc_count": 5299,
"SALE_PRICE_2": {
"value": 3.9976638293359375E8
},
"Monthly_Income_0": {
"value": 4.4994796E7
},
"Monthly_Income_1": {
"value": 5299
}
},
{
"key": "Single",
"doc_count": 2477,
"SALE_PRICE_2": {
"value": 1.8698677312695312E8
},
"Monthly_Income_0": {
"value": 1.8793502E7
},
"Monthly_Income_1": {
"value": 2477
}
},
{
"key": "N/A",
"doc_count": 1,
"SALE_PRICE_2": {
"value": 101006.8203125
},
"Monthly_Income_0": {
"value": 10000.0
},
"Monthly_Income_1": {
"value": 1
}
}
]
}
}
]
}
}
}
`
I am trying to order based on an aggregated column, but I haven't been able to achieve it.
My understanding of your issue is that you want to group by the combination of gender and marital status.
I have used a runtime mapping to concatenate the "gender" and "marital status" fields, used a terms aggregation to group by the runtime field, and sorted the groups by the sum.
{
"size": 0,
"runtime_mappings": {
"gender-maritalstatus": {
"type": "keyword",
"script": {
"source": """
def gender='NA';
def maritalstatus='NA';
if(doc['Gender.keyword'].size()!=0)
gender= doc['Gender.keyword'].value;
if(doc['Marital_Status.keyword'].size()!=0)
maritalstatus= doc['Marital_Status.keyword'].value;
emit(gender+'-'+maritalstatus);
"""
}
}
},
"aggs": {
"gender-marital-grouping": {
"terms": {
"field": "gender-maritalstatus",
"order": {
"monthly_income": "desc"
},
"size": 10
},
"aggs": {
"monthly_income": {
"sum": {
"field": "Monthly_Income"
}
}
}
}
}
}
Result
"buckets" : [
{
"key" : "Female-Single",
"doc_count" : 2,
"monthly_income" : {
"value" : 300.0
}
},
{
"key" : "Male-Married",
"doc_count" : 2,
"monthly_income" : {
"value" : 200.0
}
},
{
"key" : "Female-NA",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
},
{
"key" : "Male-NA",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
},
{
"key" : "Male-Single",
"doc_count" : 1,
"monthly_income" : {
"value" : 100.0
}
}
]

Obtaining the sum of only the latest Elasticsearch docs, aggregated/grouped by some field

I have a requirement to calculate the sum of fields in documents, grouped by some other field, but only the entries for which some third field is the latest.
for example, for these docs:
{
"time": "2019-08-21T13:00:00",
"session_id": "1",
"byte_count": 200,
"ip": "1.1.1.1"
}
{
"time": "2019-08-21T12:00:00",
"session_id": "1",
"byte_count": 100,
"ip": "1.1.1.1"
}
{
"time": "2019-08-21T12:00:00",
"session_id": "2",
"byte_count": 123,
"ip": "2.2.2.2"
}
{
"time": "2019-08-21T14:00:00",
"session_id": "3",
"byte_count": 100,
"ip": "1.1.1.1"
}
need to be grouped by session_id, but only have the latest entry, and those results must be grouped by ip, and summed on byte_count
I have seen one can do grouping with collapse, and it works. I’ve also seen one can do sum with aggregation (aggs), but when I use the two together, it looks like the aggs operates on the actual docs, not the result of the collapse. This search
{
"collapse": {
"field": "session_id",
"inner_hits": {
"name": "most_recent",
"size": 1,
"sort": [
{
"time": "desc"
}
]
}
},
"aggs": {
"by_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
}
}
}
}
}
Has the correct three hits, but the aggregation output has these buckets:
buckets": [
{
"key": "1.1.1.1",
"doc_count": 3,
"total_bytes": {
"value": 400
}
},
{
"key": "2.2.2.2",
"doc_count": 1,
"total_bytes": {
"value": 123
}
}
]
I’ve also seen one can group with aggs top_hits, but when I try to do an aggs sum on the result as such:
{
"size": 0,
"aggs": {
"by_session": {
"terms": {
"field": "session_id"
},
"aggs": {
"per_session": {
"top_hits": {
"sort": [
{
"time": "desc"
}
],
"size": 1
},
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
}
}
}
}
}
}
}
}
}
I get error:
Aggregator [per_session] of type [top_hits] cannot accept sub-aggregations
How can I update the search params to have the expected result of:
{
"key": "1.1.1.1",
"doc_count": 2,
"total_bytes": {
"value": 300
}
},
{
"key": "2.2.2.2",
"doc_count": 1,
"total_bytes": {
"value": 123
}
}
?
Test data used:
PUT test
POST test/_doc
{
"time": "2019-08-21T13:00:00",
"session_id": 1,
"byte_count": 200,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T13:00:00",
"session_id": 1,
"byte_count": 700,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T12:00:00",
"session_id": 1,
"byte_count": 100,
"ip": "1.1.1.1"
}
POST test/_doc
{
"time": "2019-08-21T12:00:00",
"session_id": 2,
"byte_count": 123,
"ip": "2.2.2.2"
}
POST test/_doc
{
"time": "2019-08-21T14:00:00",
"session_id": 3,
"byte_count": 100,
"ip": "1.1.1.1"
}
I am not sure if I understand correctly. You said "the latest entry":
"need to be grouped by session_id, but only have the latest entry, and those results must be grouped by ip, and summed on byte_count"
Does this mean the latest "time"?
From your input data don't you expect to get for "ip": "1.1.1.1" the following result?:
{
"time": "2019-08-21T14:00:00",
"session_id": "3",
"byte_count": 100,
"ip": "1.1.1.1"
}
Because this has a newer "time" compared with the other documents with ip:1.1.1.1?
Anyway, here is a query that groups by IP and then groups by session_id. The session_id buckets are then sorted by the newest "time":
{
"size": 0,
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"per_Session": {
"terms": {
"field": "session_id",
"order" : { "my_max_date" : "desc" }
},
"aggs": {
"total_bytes": {
"sum": {
"field": "byte_count"
}
},
"my_max_date" : { "max" : { "field" : "time" } }
}
}
}
}
}
}
If you want to get just the first bucket, just add "size":1 after the order.
Then you need to extract the documents from your aggregations.
This is what I got:
"per_ip" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1.1.1.1",
"doc_count" : 3,
"per_Session" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 3,
"doc_count" : 1,
"total_bytes" : {
"value" : 100.0
},
"my_max_date" : {
"value" : 1.566396E12,
"value_as_string" : "2019-08-21T14:00:00.000Z"
}
},
{
"key" : 1,
"doc_count" : 2,
"total_bytes" : {
"value" : 300.0
},
"my_max_date" : {
"value" : 1.5663924E12,
"value_as_string" : "2019-08-21T13:00:00.000Z"
}
}
]
}
},
{
"key" : "2.2.2.2",
"doc_count" : 1,
"per_Session" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2,
"doc_count" : 1,
"total_bytes" : {
"value" : 123.0
},
"my_max_date" : {
"value" : 1.5663888E12,
"value_as_string" : "2019-08-21T12:00:00.000Z"
}
}
]
}
}
]
}
}
UPDATE:
After the discussion in the first two comments, I added the following query, which adds "time" field aggregation buckets to the session_id buckets. This allows selecting the newest entry. The results still need to be grouped, but this can maybe be done on the client side by reading the buckets and doing a sum, or it may be possible with a "Sum Bucket Aggregation".
{
"size": 5,
"aggs": {
"per_ip": {
"terms": {
"field": "ip"
},
"aggs": {
"per_Session": {
"terms": {
"field": "session_id"
},
"aggs": {
"my_max_date" : {
"terms": {
"field": "time",
"order": [
{
"_key": "desc"
}
],
"size":1
},
"aggs" :
{
"total_bytes": {
"terms": {
"field": "byte_count",
"size":2
}
}
}
}
}
}
}
}
}}

How to filter by aggregation bucket?

I need a query that returns only result that has 1 bucket.
The query below returns me the access data of a visitor grouped by day.
{
"size" : 0,
"query" : {
"filtered" : {
"filter" : {
"bool" : {
"must" : [
{
"range" : {
"start_time" : {
"gte" : "2019-02-06 00:00:00",
"lte" : "2019-02-11 23:59:59"
}
}
}
]
}
}
}
},
"aggs" : {
"UNIQUE" : {
"terms" : {
"size" : 0,
"field" : "username"
},
"aggs" : {
"visits" : {
"date_histogram" : {
"field" : "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
}
}
}
}
}
I need to know which users visited only once in the period. So when a user has only 1 bucket, it's ONE. And if the user has visited on more than one day (buckets > 1), then it is RECURRENT.
If I understand it correctly, you want a list of users who have a single unique visit date, i.e. who visited on only one day in a particular time frame, and you want both details, the date and the username, to be in the aggregation.
I've created a sample mapping, sample documents, aggregation query and how it would appear in the response
Mapping:
PUT mytest
{
"mappings": {
"mydocs": {
"properties": {
"username": {
"type": "keyword"
},
"start_time": {
"type": "date",
"format": "yyyy-MM-dd"
}
}
}
}
}
Sample Documents:
You can see that I've created 6 documents where John has visited twice on same date, Jack visits site on two different dates, while Jane and Rob visited only once in the time-frame for which I will write an aggregation.
POST mytest/mydocs/1
{
"username": "john",
"start_time": "2018-08-01"
}
POST mytest/mydocs/2
{
"username": "john",
"start_time": "2018-08-01"
}
POST mytest/mydocs/3
{
"username": "jane",
"start_time": "2018-08-01"
}
POST mytest/mydocs/4
{
"username": "rob",
"start_time": "2018-08-01"
}
POST mytest/mydocs/5
{
"username": "jack",
"start_time": "2018-08-01"
}
POST mytest/mydocs/6
{
"username": "jack",
"start_time": "2018-08-02"
}
Updated Aggregation Request
Note I've added two more documents with username Jack who visits the site on two different dates, username John visits the site twice on the same date.
POST mytest/_search
{
"size": 0,
"query": {
"range": {
"start_time": {
"gte": "2017-08-01",
"lte": "2019-08-01"
}
}
},
"aggs": {
"myterms": {
"terms": {
"size": 100,
"field": "username"
},
"aggs": {
"visit_date": {
"date_histogram": {
"field": "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
},
"count": {
"cardinality": {
"field": "start_time"
}
},
"equal_one":{
"bucket_selector":{
"buckets_path":{
"count":"count.value"
},
"script":"params.count == 1"
}
}
}
}
}
}
Response
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"myterms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "john",
"doc_count": 2,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 2
}
]
}
},
{
"key": "jane",
"doc_count": 1,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 1
}
]
}
},
{
"key": "rob",
"doc_count": 1,
"count": {
"value": 1
},
"visit_date": {
"buckets": [
{
"key_as_string": "2018-08-01",
"key": 1533081600000,
"doc_count": 1
}
]
}
}
]
}
}
}
You can see that John now appears in the result as expected even though he has visited the site multiple times on the same date.
Let me know if you have any queries.
Solution found was:
{
"size" : 0,
"query" : {
{
"range" : {
"start_time" : {
"gte" : "2019-02-11 00:00:00",
"lte" : "2019-02-11 23:59:59"
}
}
}
},
"aggs" : {
"UNIQUE" : {
"terms" : {
"size" : 0,
"field" : "username"
},
"aggs":{
"visit_date": {
"date_histogram": {
"field" : "start_time",
"interval" : "day",
"format" : "yyyy-MM-dd"
}
},
"count": {
"cardinality": {
"script": "new Date(doc['start_time'].value).format('yyyy-MM-dd')"
}
},
"equal_one":{
"bucket_selector":{
"buckets_path":{
"count":"count.value"
},
"script":"count == 1"
}
}
}
}
}
}
But performance remains a problem. In an environment with about 1 million records this query does not perform very well.
Maybe a query using Scripted Metrics would solve it, but that demands more analysis (doc: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-scripted-metric-aggregation.html)

Resources