Elasticsearch Histogram of visits - elasticsearch

I'm quite new to Elasticsearch and I fail to build a histogram based on ranges of visits. I am not even sure that it's possible to create this kind of chart by using a single query in Elasticsearch, but I have the feeling that it could be possible with a pipeline aggregation or maybe a scripted aggregation.
Here is a test dataset with which I'm working:
PUT /test_histo
{ "settings": { "number_of_shards": 1 }}
PUT /test_histo/_mapping/visit
{
"properties": {
"user": {"type": "string" },
"datevisit": {"type": "date"},
"page": {"type": "string"}
}
}
POST test_histo/visit/_bulk
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Jean","page":"productXX.hmtl","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Robert","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Mary","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"Mary","page":"media_center.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"home.html","datevisit":"2015-11-25"}
{"index":{"_index":"test_histo","_type":"visit"}}
{"user":"John","page":"media_center.html","datevisit":"2015-11-26"}
If we consider the ranges [1,2[, [2,3[, [3, inf.[
The expected result should be :
[1,2[ = 2
[2,3[ = 1
[3, inf.[ = 1
All my efforts to find the histogram showing a customer visit frequency remained to date unsuccessful. I would be pleased to have a few tips, tricks or ideas to get a response to my problem.

There are two ways you can do it.
First is doing it in ElasticSearch which will require Scripted Metric Aggregation. You can read more about it here.
Your query would look like this
{
  "size": 0,
  "aggs": {
    "visitors_over_time": {
      "date_histogram": {
        "field": "datevisit",
        "interval": "week"
      },
      "aggs": {
        "no_of_visits": {
          "scripted_metric": {
            "init_script": "_agg['values'] = new java.util.HashMap();",
            "map_script": "if (_agg.values[doc['user'].value]==null) {_agg.values[doc['user'].value]=1} else {_agg.values[doc['user'].value]+=1;}",
            "combine_script": "someHashMap = new java.util.HashMap();for(x in _agg.values.keySet()) {value=_agg.values[x];if(value<3){key='[' + value +',' + (value + 1) + '[';}else{key='[3,inf[';}; if(someHashMap[key]==null){someHashMap[key] = 1}else{someHashMap[key] += 1}}; return someHashMap;"
          }
        }
      }
    }
  }
}
You can change the period of time in the date_histogram object by setting the interval field to values like day, week, or month.
Your response would look like this
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"visitors_over_time": {
"buckets": [
{
"key_as_string": "2015-11-23T00:00:00.000Z",
"key": 1448236800000,
"doc_count": 7,
"no_of_visits": {
"value": [
{
"[2,3[": 1,
"[3,inf[": 1,
"[1,2[": 2
}
]
}
}
]
}
}
}
The second method is to do the work of the scripted_metric on the client side. You can use the result of a Terms Aggregation. You can read more about it here.
Your query will look like this
GET test_histo/visit/_search
{
"size": 0,
"aggs": {
"visitors_over_time": {
"date_histogram": {
"field": "datevisit",
"interval": "week"
},
"aggs": {
"no_of_visits": {
"terms": {
"field": "user",
"size": 10
}
}
}
}
}
}
and the response will be
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 7,
"max_score": 0,
"hits": []
},
"aggregations": {
"visitors_over_time": {
"buckets": [
{
"key_as_string": "2015-11-23T00:00:00.000Z",
"key": 1448236800000,
"doc_count": 7,
"no_of_visits": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "john",
"doc_count": 3
},
{
"key": "mary",
"doc_count": 2
},
{
"key": "jean",
"doc_count": 1
},
{
"key": "robert",
"doc_count": 1
}
]
}
}
]
}
}
}
From this response you can count, for each period, how many users fall into each doc_count range.

Have a look at:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-datehistogram-aggregation.html
If you want to show it in a fancy, ready-made UI, use Kibana.
A query like this:
GET _search
{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "visits": {
      "date_histogram": {
        "field": "datevisit",
        "interval": "month"
      }
    }
  }
}
This should give you a histogram. I don't have Elasticsearch at hand at the moment, so there might be some fat-fingered typos.
Then you could add query terms to only show the histogram for a specific page, or you could have an outer aggregation bucket which aggregates per page or per user.
Something like this:
GET _search
{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "users": {
      "terms": {
        "field": "user"
      },
      "aggs": {
        "visits": {
          "date_histogram": {
            "field": "datevisit",
            "interval": "month"
          }
        }
      }
    }
  }
}

Have a look at this solution:
{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "periods": {
      "filters": {
        "filters": {
          "1-2": {
            "range": {
              "datevisit": {
                "gte": "2015-11-25",
                "lt": "2015-11-26"
              }
            }
          },
          "2-3": {
            "range": {
              "datevisit": {
                "gte": "2015-11-26",
                "lt": "2015-11-27"
              }
            }
          },
          "3-": {
            "range": {
              "datevisit": {
                "gte": "2015-11-27"
              }
            }
          }
        }
      },
      "aggs": {
        "users": {
          "terms": {
            "field": "user"
          }
        }
      }
    }
  }
}
Step by step:
Filter aggregation: You can define ranged values for the next aggregation, in this case we define 3 periods based on date range filter
Nested Users aggregation: This aggregation returns as many results as filters you'd defined. So, in this case, you'll get 3 values using range date filtering
You'll get a result like this:
{
...
"aggregations" : {
"periods" : {
"buckets" : {
"1-2" : {
"users" : {
"buckets" : [
{"key" : XXX,"doc_count" : NNN},
{"key" : YYY,"doc_count" : NNN},
]
}
},
"2-3" : {
"users" : {
"buckets" : [
{"key" : XXX1,"doc_count" : NNN1},
{"key" : YYY1,"doc_count" : NNN1},
]
}
},
"3-" : {
"users" : {
"buckets" : [
{"key" : XXX2,"doc_count" : NNN2},
{"key" : YYY2,"doc_count" : NNN2},
]
}
},
}
}
}
}
Try it, and tell if it works

Related

Elastic Search return object with sum aggregation

I am trying to get a list of the top 100 guests by revenue generated with Elastic Search. To do this I am using a terms and a sum aggregation. Although it does return the correct values, I want to return the entire guest object with the aggregation.
This is my query:
GET reservations/_search
{
  "size": 0,
  "aggs": {
    "top_revenue": {
      "terms": {
        "field": "total",
        "size": 100,
        "order": {
          "top_revenue_hits": "desc"
        }
      },
      "aggs": {
        "top_revenue_hits": {
          "sum": {
            "field": "total"
          }
        }
      }
    }
  }
}
This returns a list of the top 100 guests but only the amount they spent:
{
"aggregations" : {
"top_revenue" : {
"doc_count_error_upper_bound" : -1,
"sum_other_doc_count" : 498,
"buckets" : [
{
"key" : 934.9500122070312,
"doc_count" : 8,
"top_revenue_hits" : {
"value" : 7479.60009765625
}
},
{
"key" : 922.0,
"doc_count" : 6,
"top_revenue_hits" : {
"value" : 5532.0
}
},
...
]
}
}
}
How can I get the query to return the entire guests object, not only the sum amount.
When I run GET reservations/_search it returns:
{
"hits": [
{
"_index": "reservations",
"_id": "1334620",
"_score": 1.0,
"_source": {
"id": "1334620",
"total": 110.8,
"payment": "unpaid",
"contact": {
"name": "John Doe",
"email": "john#mail.com"
}
}
},
... other reservations
]
}
I want to get this to return with the sum aggregation.
I have tried to use a top_hits aggregation, using _source it does return the entire guest object but it does not show the total amount spent. And when adding _source to the sum aggregation it gives an error.
Can I return the entire guest object with a sum aggregation or is this not the correct way?
I assumed that contact.name is keyword in the mapping. Following query should work for you.
{
  "size": 0,
  "aggs": {
    "guests": {
      "terms": {
        "field": "contact.name",
        "size": 100,
        "order": {
          "sum_total": "desc"
        }
      },
      "aggs": {
        "sum_total": {
          "sum": {
            "field": "total"
          }
        },
        "guest": {
          "top_hits": {
            "size": 1
          }
        }
      }
    }
  }
}

Count number of inner elements of array property (Including repeated values)

Given I have the following records.
[
{
"profile": "123",
"inner": [
{
"name": "John"
}
]
},
{
"profile": "456",
"inner": [
{
"name": "John"
},
{
"name": "John"
},
{
"name": "James"
}
]
}
]
I want to get something like:
"aggregations": {
"name": {
"buckets": [
{
"key": "John",
"doc_count": 3
},
{
"key": "James",
"doc_count": 1
}
]
}
}
I'm a beginner using Elasticsearch, and this seems to be a pretty simple operation to do, but I can't find how to achieve this.
If I try a simple aggs using term, it returns 2 for John, instead of 3.
Example request I'm trying:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
}
}
}
}
How can I possibly achieve this?
Additional Info: It will be used on Kibana later.
I can change mapping to whatever I want, but AFAIK Kibana doesn't like the "Nested" type. :(
You need to do a value_count aggregation, by default terms only does a doc_count, but the value_count aggregation will count the number of times a given field exists.
So, for your purposes:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
},
"aggs": {
"total": {
"value_count": {
"field": "inner.name"
}
}
}
}
}
}
Which returns:
"aggregations" : {
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "John",
"doc_count" : 2,
"total" : {
"value" : 3
}
},
{
"key" : "James",
"doc_count" : 1,
"total" : {
"value" : 2
}
}
]
}
}

Filter on TOP aggregation - elasticsearch 5.6

Note: This kind of query has been asked previously 2 or 3 times years ago but did not have any satisfactory answer. I am posting my specific problem here. Hope, someone suggests some good solution.
I am facing a challenge fetching desired records from elasticsearch. We strictly need filtering on the results returned by TOP aggregation. Anyway, below is my scenario:
Given: We have an entity named "service" which have attributes like below:
{
"id": "servicer-id-1",
"status": "OPEN", // These may be CLOSED, RESOLVED
"timeRaised": "2019-03-21T15:09:17.015Z",
"timeChanged": "2019-03-21T15:09:17.015Z"
}
I have an elastic index where any change in the above service is stored as a whole service document (a kind of history of the service). There can be more than one service document with the same id. We update the timeChanged attribute every time.
There are millions of service documents in the index.
Problem Statement: We need particular services which were the latest state during a given time frame(timeChanged) and status OPEN.
What I did:
I used the query below with the scroll API, with a batch size of 10000, to resolve our problem:
{
  "size" : 1000, //given by user
  "query" : {
    "constant_score" : {
      "filter" : {
        "bool" : {
          "must" : [
            {
              "range" : {
                "timeChanged" : {
                  "from" : 1552940830000,
                  "to" : 1553498830000,
                  "include_lower" : true,
                  "include_upper" : true,
                  "boost" : 1.0
                }
              }
            }
          ],
          "disable_coord" : false,
          "adjust_pure_negative" : true,
          "boost" : 1.0
        }
      },
      "boost" : 1.0
    }
  },
  "post_filter": {
    "bool": {
      "must": [
        {
          "constant_score": {
            "filter": {
              "terms": {
                "status": ["OPEN"],
                "boost": 1.0
              }
            },
            "boost": 1.0
          }
        }
      ],
      "disable_coord" : false,
      "adjust_pure_negative" : true,
      "boost" : 1.0
    }
  },
  "_source" : false,
  "aggregations" : {
    "by_serviceId" : {
      "terms" : {
        "field" : "id",
        "size" : 50000, // we set it with total number of services exist
        "min_doc_count" : 1,
        "shard_min_doc_count" : 0,
        "show_term_doc_count_error" : false,
        "order" : [
          {
            "_count" : "desc"
          },
          {
            "_term" : "asc"
          }
        ]
      },
      "aggregations" : {
        "top" : {
          "top_hits" : {
            "from" : 0,
            "size" : 1,
            "version" : false,
            "explain" : false,
            "sort" : [
              {
                "timeChanged" : {
                  "order" : "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}
From above query, we are getting aggregation from first hit of scroll which are the list of latest state of service in aggregation. And by Post filter we are fetching OPEN service in batches of 10000 and try to match the ids(by java code) with aggregation list to find out our candidate.
It is taking too much time to return the desired output. Around 8 mins for 4.4M records in the index.
This problem can be solved if you suggest a way to put filter on returned aggregated data. But after searching so many places, I found out that it is not supported in elastic. Is it so?
Ref of same problem:
Elasticsearch: filter top hits aggregation
Elasticsearch exclude top hit on field value
Please help and suggest better way to fulfill the scenario.
Thanks.
Disclaimer: Please do not suggest to apply query and then aggregation because it won't solve the problem. e.g. If I filter on OPEN status first and then aggregate so, for a given date I always get OPEN service but in reality for a given day, service might be RESOLVED.
Here is my attempt to fulfill your need. I have a proof-of-concept aggregation, but it can't work with a string status. So we first need to translate the string status to a number (maybe an update by query could do the job for you).
In my example
OPEN => status_number = 1
CLOSED => status_number = 2
RESOLVED => status_number = 3
And here is my 50 cents request :D
POST <index>/doc/_search
{
"size": 0,
"query": {
"bool": {
"filter": {
"range": {
"timeChanged": {
"gte": "2019-03-21T15:09:17.015Z",
"lte": "2019-03-21T15:09:18.015Z"
}
}
}
}
},
"aggs": {
"service": {
"terms": {
"field": "id.keyword",
"size": 10
},
"aggs": {
"last_status": {
"terms": {
"field": "status_number",
"size": 1,
"order": {
"last_change": "desc" // order to keep the last status of the timespan with the size of 1
}
},
"aggs": {
"last_change": {
"max": {
"field": "timeChanged"
}
}
}
},
"min_status": {
"min_bucket": {
"buckets_path": "last_status._key" // used to transforms a bucket list in a single value for the bucket_selector beneath
}
},
"filtered": {
"bucket_selector": {
"buckets_path": {
"key": ">min_status"
},
"script": """
params.key == 1 // filter buckets where last status_number is 1 si status = OPEN
"""
}
}
}
}
}
}
The output is quite verbose :
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"service": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "servicer-id-4",
"doc_count": 1,
"last_status": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 1,
"last_change": {
"value": 1553180958015,
"value_as_string": "2019-03-21T15:09:18.015Z"
}
}
]
},
"min_status": {
"value": 1,
"keys": [
"1"
]
}
}
]
}
}
}
But you just need the aggregations.service.buckets.key values
I hope it can help you but of course without data i cant evaluate the performance of this query.

Pipeline aggregation with Date histogram doesn’t return expected result

I'm facing an issue when using a pipeline aggregation with a date histogram.
I need to filter data from: "2019-03-08T06:00:00Z" to "2019-03-09T10:00:00Z" and do histogram aggregation on that. Then calculate avg value after aggregating by cardinality agg.
{
"size": 0,
"query": {
"bool" : {
"filter": {
"range" : {
"recordTime" : {
"gte" : "2019-03-08T06:00:00Z",
"lte" : "2019-03-09T10:00:00Z"
}
}
}
}
},
"aggs" : {
"events_per_bucket" : {
"date_histogram" : {
"field" : "eventTime",
"interval" : "1h"
},
"aggs": {
"cards_per_bucket": {
"cardinality": {
"field": "KANBAN_PKKEY.keyword"
}
}
}
},
"avg_cards_per_bucket": {
"avg_bucket": {
"buckets_path": "events_per_bucket>cards_per_bucket.value"
}
}
}
}
Result:
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"events_per_bucket": {
"buckets": [
{
"key_as_string": "2019-03-08T06:00:00.000Z",
"key": 1552024800000,
"doc_count": 1,
"cards_per_bucket": {
**"value": 1**
}
},
{
"key_as_string": "2019-03-08T07:00:00.000Z",
"key": 1552028400000,
"doc_count": 0,
"cards_per_bucket": {
**"value": 0**
}
},
{
"key_as_string": "2019-03-08T08:00:00.000Z",
"key": 1552032000000,
"doc_count": 1,
"cards_per_bucket": {
**"value": 1**
}
}
]
},
"avg_cards_per_bucket": {
**"value": 1**
}
}
}
The problem is why avg value is "1"? It should be: 2/3 = 0.6666
Why 0 value cardinality bucket is ignored?
If i remove cardinality agg and do avg on doc_count (events_per_bucket>_count) then it works fine.
The same thing happens for MAX, MIN, SUM as well.
Any help would be appreciated!
Thank you.
you should tell the aggregation pipeline what to do in the case of gaps in your buckets, like your bucket with key 1552028400000. By default, gaps are ignored. You might want instead to replace the missing values with a zero. This can be done by adding the gap_policy parameter to your aggregation pipeline:
...
"avg_cards_per_bucket": {
"avg_bucket": {
"buckets_path": "events_per_bucket>cards_per_bucket.value",
"gap_policy": "insert_zeros"
}
}
...
More details in the Elastic documentation.

ElasticSearch 1x - aggregate on object conditions

I want to aggregate on data, which has inner objects. For example:
{
"_index": "product_index-en",
"_type": "elasticproductmodel",
"_id": "000001111",
"_score": 6.3316255,
"_source": {
"productId": "11111111111",
"productIdOnlyLetterAndDigit": "11111111111",
"productIdOnlyDigit": "11111111111",
"productNumber": "11111111111",
"name": "Glow Plug",
"nameOnlyLetterAndDigit": "glowplug",
"productImageLarge": "11111111111.jpg",
"itemGroupId": "11111",
"relatedProductIds": [],
"dataAreaCountries": [
"fra",
"pol",
"uk",
"sie",
"sve",
"atl",
"ita",
"hol",
"dk"
],
"oemItems": [
{
"manufactorName": "BERU",
"manufacType": "0"
},
{
"manufactorName": "LUCAS",
"manufacType": "0"
}
]
}
}
I need to be able to aggregate oemItems.manufactorName values, but only where oemItems.manufacType is "0". I have tried a number of examples, such as the accepted one here ( Elastic Search Aggregate into buckets on conditions ), but I just cannot seem to wrap my head around it.
I've tried the following, hoping it would aggregate on manufacType first, which it does, and then on manufactorName for each type, for which it seems to display the correct hit count. However, the buckets for manufactorName are empty:
GET /product_index-en/_search
{
"size": 0,
"aggs": {
"baked_goods": {
"nested": {
"path": "oemItems"
},
"aggs": {
"test1": {
"terms": {
"field": "oemItems.manufacType",
"size": 500
},
"aggs": {
"test2": {
"terms": {
"field": "oemItems.manufactorName",
"size": 500
}
}
}
}
}
}
}
}
And the result:
{
"took": 27,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 471214,
"max_score": 0,
"hits": []
},
"aggregations": {
"baked_goods": {
"doc_count": 677246,
"test1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "0",
"doc_count": 436557,
"test2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key": "1",
"doc_count": 240689,
"test2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
]
}
}
}
}
I have also tried to add a nested term filter, to only look at oemItems which have manufacType 1, with the following query. However, it returns objects where oemItems include manufacType 1, meaning the oemItems within those products still contain either manufacType 1 or 0. I don't see how doing an aggregation on this response will only return oemItems.manufactorName where oemItems.manufacType is 0.
GET /product_index-en/_search
{
"query" : { "match_all" : {} },
"filter" : {
"nested" : {
"path" : "oemItems",
"filter" : {
"bool" : {
"must" : [
{
"term" : {"oemItems.manufacType" : "1"}
}
]
}
}
}
}
}
Good start so far. Just try it like this:
POST /product_index-en/_search
{
  "size": 0,
  "query": {
    "nested": {
      "path": "oemItems",
      "query": {
        "term": {
          "oemItems.manufacType": "0"
        }
      }
    }
  },
  "aggs": {
    "baked_goods": {
      "nested": {
        "path": "oemItems"
      },
      "aggs": {
        "only_type_0": {
          "filter": {
            "term": {
              "oemItems.manufacType": "0"
            }
          },
          "aggs": {
            "test1": {
              "terms": {
                "field": "oemItems.manufactorName",
                "size": 500
              }
            }
          }
        }
      }
    }
  }
}

Resources