elasticsearch date_histogram offset and extended_bounds don't work together? - elasticsearch

I'm doing a date_histogram with week interval. i need my weeks to start on sundays rather than mondays, and i need the result to include weeks in which there are no docs (empty records).
to get that i use offset = -1d to change the start to sunday, and extended_bounds to get the empty records.
elasticsearch nicely figures out the first day of the interval, so if i supply a start date that's, say wednesday, i get a record for the week starting the previous sunday.
the problem is, if i set offset = -1d, i get an extra week. my hypothesis is that it calculates the first day of the interval without taking the offset into account.
in the example shown, i would not expect to get the 2017-09-24 record:
query:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"utility.utility_uuid.orig": "17245998142979832061"
}
},
{
"range": {
"user.date_created": {
"gte": "2017-10-01",
"lt": "2017-10-31"
}
}
}
]
}
},
"aggs": {
"eow_accounts_and_users": {
"date_histogram": {
"format": "yyyy-MM-dd",
"interval": "week",
"offset": "-1d",
"time_zone": "US/Pacific",
"field": "user.date_created",
"min_doc_count": 0,
"extended_bounds": {
"min": "2017-10-01",
"max": "2017-10-31"
}
}
}
}
}
result:
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 0,
"max_score": 0,
"hits": []
},
"aggregations": {
"eow_accounts_and_users": {
"buckets": [
{
"key_as_string": "2017-09-24",
"key": 1506236400000,
"doc_count": 0
},
{
"key_as_string": "2017-10-01",
"key": 1506841200000,
"doc_count": 0
},
{
"key_as_string": "2017-10-08",
"key": 1507446000000,
"doc_count": 0
},
{
"key_as_string": "2017-10-15",
"key": 1508050800000,
"doc_count": 0
},
{
"key_as_string": "2017-10-22",
"key": 1508655600000,
"doc_count": 0
},
{
"key_as_string": "2017-10-29",
"key": 1509260400000,
"doc_count": 0
}
]
}
}
}

Add an extra day to the extended bounds:
"extended_bounds": {
"min": dateParams.startTime+86400000, // an extra day
"max": dateParams.endTime
}

Related

Boosting elastic aggregation result

I have an elastic index for products, each product has Brand attribution and I "have to" create an aggregation that returns Brands of the products.
My Sample Query:
GET /products/product/_search
{
"size": 0,
"aggs": {
"myFancyFilter": {
"filter": {
"match_all": {}
},
"aggs": {
"inner": {
"terms": {
"field": "Brand",
"size": 3
}
}
}
}
},
"query": {
"match_all": {}
}
}
And the result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 236952,
"max_score": 0,
"hits": []
},
"aggregations": {
"myFancyFilter": {
"doc_count": 236952,
"inner": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 139267,
"buckets": [
{
"key": "Brand1",
"doc_count": 3144
},
{
"key": "Brand2",
"doc_count": 1759
},
{
"key": "Brand3",
"doc_count": 1737
}
]
}
}
}
}
It works perfect for me. Elastic sorts buckets according to doc_count, however I would like to manipulate the bucket order in result. For example, assume that I have Brand5 and I want to increment its order to #2. I want result coming in order Brand1, Brand5 and Brand3.
If it was not in an aggregation, but in a query, I could use function_score, but now, I don't have an idea. Any clues?
What you are looking for is to define your own sorting definition and that to be applied in aggregation in elasticsearch. I've been able to come up with a solution by renaming the aggregation terms in below manner:
Brand1 to a_Brand1
Brand5 to b_Brand5
Brand3 to c_Brand3
And then apply sorting on the terms so that sorting happens lexicographically.
Of course this may not be the exact or the best solution but I felt this can help.
Below is the query that I've used. Please note that my field name is brand and it is a multifield and I'm using the field brand.keyword.
POST testdataindex/_search
{
"size":0,
"query":{
"match_all":{
}
},
"aggs":{
"myFancyFilter":{
"filter":{
"match_all":{
}
},
"aggs":{
"inner":{
"terms":{
"script":{
"lang":"painless",
"inline":"if(params.newNames.containsKey(doc['brand.keyword'].value)) { return params.newNames[doc['brand.keyword'].value];} return null;",
"params":{
"newNames":{
"Brand1":"a_Brand1",
"Brand5":"b_Brand5",
"Brand3":"c_Brand3"
}
}
},
"order":{
"_term":"asc"
}
}
}
}
}
}
}
I've created a sample data with brand names Brand1, Brand3 and Brand5 and below how the results would appear. Note the change in the term names.
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 8,
"max_score": 0,
"hits": []
},
"aggregations": {
"myFancyFilter": {
"doc_count": 8,
"inner": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a_Brand1",
"doc_count": 2
},
{
"key": "b_Brand5",
"doc_count": 4
},
{
"key": "c_Brand3",
"doc_count": 2
}
]
}
}
}
}
Hope it helps!

Get Percentage of Values in Elasticsearch

I have some test documents that look like
"hits": {
...
"_source": {
"student": "DTWjkg",
"name": "My Name",
"grade": "A"
...
"student": "ggddee",
"name": "My Name2",
"grade": "B"
...
"student": "ggddee",
"name": "My Name3",
"grade": "A"
And I wanted to get the percentage of students that have a grade of B, the result would be "33%", assuming there were only 3 students.
How would I do this in Elasticsearch?
So far I have this aggregation, which I feel like is close:
"aggs": {
"gradeBPercent": {
"terms": {
"field" : "grade",
"script" : "_value == 'B'"
}
}
}
This returns:
"aggregations": {
"gradeBPercent": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "false",
"doc_count": 2
},
{
"key": "true",
"doc_count": 1
}
]
}
}
I'm not looking necessarily looking for an exact answer, perhaps what I could terms and keywords I could google. I've read over the elasticsearch docs and not found anything that could help.
First off, you shouldn't need a script for this aggregation. If you want to limit your results to everyone where `value == 'B' then you should do that using a filter, not a script.
ElasticSearch won't return you a percentage exactly, but you can easily calculate that using the result from a TERMS AGGREGATION.
Example:
GET devdev/audittrail/_search
{
"size": 0,
"aggs": {
"a1": {
"terms": {
"field": "uIDRequestID"
}
}
}
}
That returns:
{
"took": 12,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 25083,
"max_score": 0,
"hits": []
},
"aggregations": {
"a1": {
"doc_count_error_upper_bound": 9,
"sum_other_doc_count": 1300,
"buckets": [
{
"key": 556,
"doc_count": 34
},
{
"key": 393,
"doc_count": 28
},
{
"key": 528,
"doc_count": 15
}
]
}
}
}
So what does that return mean?
the hits.total field is the total number of records matching your query.
the doc_count is telling you how many items are in each bucket.
So for my example here: I could say that the key "556" shows up in 34 of 25083 documents, so it has a percentage of (34 / 25083) * 100

Elasticsearch: accuracy on a filter aggregation

I'm fairly new to Elasticsearch (using version 2.2).
To simplify my question, I have documents that have a field named termination, which can sometimes take the value transfer.
I currently do this request to aggregate by month the number of documents which have that termination :
{
"size": 0,
"sort": [{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}],
"query": { "match_all": {} },
"aggs": {
"report": {
"date_histogram": {
"field": "#timestamp",
"interval": "month",
"min_doc_count": 0
},
"aggs": {
"documents_with_termination_transfer": {
"filter": {
"term": {
"termination": "transfer"
}
}
}
}
}
}
}
Here is the response :
{
"_shards": {
"failed": 0,
"successful": 206,
"total": 206
},
"aggregations": {
"report": {
"buckets": [
{
"calls_with_termination_transfer": {
"doc_count": 209163
},
"doc_count": 278100,
"key": 1451606400000,
"key_as_string": "2016-01-01T00:00:00.000Z"
},
{
"calls_with_termination_transfer": {
"doc_count": 107244
},
"doc_count": 136597,
"key": 1454284800000,
"key_as_string": "2016-02-01T00:00:00.000Z"
}
]
}
},
"hits": {
"hits": [],
"max_score": 0.0,
"total": 414699
},
"timed_out": false,
"took": 90
}
Why is the number of hits (414699) greater than the total number of document counts (278100 + 136597 = 414697)? I had read about accuracy problems but it didn't seem to apply in the case of filters...
Is there also an accuracy problem if I sum the total numbers of documents with transfer termination ?
My guess is that some documents have a missing #timestamp.
You could verify this by running exists query on this field.

ElasticSearch count multiple fields grouped by

I have documents like
{"domain":"US", "zipcode":"11111", "eventType":"click", "id":"1", "time":100}
{"domain":"US", "zipcode":"22222", "eventType":"sell", "id":"2", "time":200}
{"domain":"US", "zipcode":"22222", "eventType":"click", "id":"3","time":150}
{"domain":"US", "zipcode":"11111", "eventType":"sell", "id":"4","time":350}
{"domain":"US", "zipcode":"33333", "eventType":"sell", "id":"5","time":225}
{"domain":"EU", "zipcode":"44444", "eventType":"click", "id":"5","time":120}
I want to filter these documents by eventType=sell and time between 125 and 400, group by domain followed by zipcode and count the documents in each bucket. So my output would be like (first and last docs would be ignored by the filters)
US, 11111,1
US, 22222,1
US, 33333,1
In SQL, this should have been straightforward. But I am not able to get this to work on ElasticSearch. Could someone please help me out here?
How do I write ElasticSearch query to accomplish the above?
This query seems to do what you want:
POST /test_index/_search
{
"size": 0,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"eventType": "sell"
}
},
{
"range": {
"time": {
"gte": 125,
"lte": 400
}
}
}
]
}
}
}
},
"aggs": {
"zipcode_terms": {
"terms": {
"field": "zipcode"
}
}
}
}
returning
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"zipcode_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "11111",
"doc_count": 1
},
{
"key": "22222",
"doc_count": 1
},
{
"key": "33333",
"doc_count": 1
}
]
}
}
}
(Note that there is only 1 "sell" at "22222", not 2).
Here is some code I used to test it:
http://sense.qbox.io/gist/1c4cb591ab72a6f3ae681df30fe023ddfca4225b
You might want to take a look at terms aggregations, the bool filter, and range filters.
EDIT: I just realized I left out the domain part, but it should be straightforward to add in a bucket aggregation on that as well if you need to.

Elasticsearch Terms or Cardinality Aggregation - Order by number of distinct values

Friends,
I am doing some analysis to find unique pairs from 100s of millions of documents. The mock example is as shown below:
doc field1 field2
AAA : BBB
AAA : CCC
PPP : QQQ
PPP : QQQ
XXX : YYY
XXX : YYY
MMM : NNN
90% of the document contains an unique pair as shown above in doc 3, 4, 5, 6 and 7 which I am not interested on my aggregation result. I am interested to aggregate doc 1 and 2.
Terms Aggregation Query:
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
Term Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "QQQ",
"doc_count": 2
}
]
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "YYY",
"doc_count": 2
}
]
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"buckets": [
{
"key": "BBB",
"doc_count": 1
},
{
"key": "CCC",
"doc_count": 1
}
]
}
}
]
}
}
I am interested only on key AAA to be in the aggregation result. What is the best way to filter the aggregation result containing distinct pairs?
I tried with cardinality aggregation which result unque value count. However I am not able to filter out what I am not interested from the aggregation results.
Cardinality Aggregation Query
"aggs": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggs": {
"f2": {
"cardinality": {
"field": "FIELD2"
}
}
}
}
}
Cardinality Aggregation Result
"aggregations": {
"f1": {
"buckets": [
{
"key": "PPP",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "XXX",
"doc_count": 2,
"f2": {
"value" : 1
}
},
{
"key": "AAA",
"doc_count": 2,
"f2": {
"value" : 2
}
}
]
}
}
Atleast if I could sort by cardinal value, that would be help me to find some workarounds. Please help me in this regard.
P.S: Writing a spark/mapreduce program to post process/filter the aggregation result is not expected solution for this issue.
I suggest to use filter query along with aggregations, since you are only interested in field1=AAA.
I have a similar example here.
For example, I have an index of all patients in my hospital. I store their drug use in a nested object DRUG. Each patient could take different drugs, and each could take a single drug for multiple times.
Now if I wanted to find the number of patients who took aspirin at least once, the query could be:
{
"size": 0,
"_source": false,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"nested": {
"path": "DRUG",
"filter": {
"bool": {
"must": [{ "term": { "DRUG.NAME": "aspirin" } }]
}}}}}},
"aggs": {
"DRUG_FACETS": {
"nested": {
"path": "DRUG"
},
"aggs": {
"DRUG_NAME_FACETS": {
"terms": { "field": "DRUG.NAME", "size": 0 },
"aggs": {
"DISTINCT": { "cardinality": { "field": "DRUG.PATIENT" } }
}
}}}}
}
Sample result:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"DRUG_FACETS": {
"doc_count": 11,
"DRUG_NAME_FACETS": {
"buckets": [
{
"key": "aspirin",
"doc_count": 6,
"DISTINCT": {
"value": 6
}
},
{
"key": "vitamin-b",
"doc_count": 3,
"DISTINCT": {
"value": 2
}
},
{
"key": "vitamin-c",
"doc_count": 2,
"DISTINCT": {
"value": 2
}
}
]
}
}
}
}
The first one in the buckets would be aspirin. But you can see other 2 patients had also taken vitamin-b when they took aspirin.
If you change the field value of DRUG.NAME to another drug name for example "vitamin-b", I suppose you would get vitamin-b in the first position of the buckets.
Hopefully this is helpful to your question.
A bit late, hope it would help for others.
A simple approach is to filter only 'AAA' records in top aggregation:
{
"size": 0,
"aggregations": {
"filterAAA": {
"filter": {
"term": {
"FIELD1": "AAA"
}
},
"aggregations": {
"f1": {
"terms": {
"field": "FIELD1",
"min_doc_count": 2
},
"aggregations": {
"f2": {
"terms": {
"field": "FIELD2"
}
}
}
}
}
}
}
}

Resources