I'm executing a query in elasticsearch. I need to have the number of hits for my attribute "end_date_ut" (type is Date and format is dateOptionalTime) for each month represented in the index.
For that, I'm using a date_histogram aggregation.
My query is just below:
GET inc/_search
{
"size": 0,
"aggs": {
"appli": {
"date_histogram": {
"field": "end_date_ut",
"interval": "month"
}
}
}
}
And here is a part of the result:
"hits": {
"total": 517478,
"max_score": 0,
"hits": []
},
"aggregations": {
"appli": {
"buckets": [
{
"key_as_string": "2009-08-01T00:00:00.000Z",
"key": 1249084800000,
"doc_count": 0
},
{
"key_as_string": "2009-09-01T00:00:00.000Z",
"key": 1251763200000,
"doc_count": 1
},
{
"key_as_string": "2009-10-01T00:00:00.000Z",
"key": 1254355200000,
"doc_count": 2362
},
{
"key_as_string": "2009-11-01T00:00:00.000Z",
"key": 1257033600000,
"doc_count": 5336
},
{
"key_as_string": "2009-12-01T00:00:00.000Z",
"key": 1259625600000,
"doc_count": 7536
},
{
"key_as_string": "2010-01-01T00:00:00.000Z",
"key": 1262304000000,
"doc_count": 8864
}
The problem is that I have too many buckets (results). When I'm using "terms aggregation", I don't have any problems because I can set a size, but with "date_histogram aggregation" I can't find a way to put a limit on my query result.
{
"size": 0,
"aggs": {
"by_minute": {
"date_histogram": {
"field": "createTime",
"interval": "1m",
"order": {
"_count": "desc"
}
},
"aggs": {
"top2": {
"bucket_sort": {
"sort": [],
"size": 2
}
}
}
}
}
}
{
"took": 28,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 999999,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"by_minute": {
"buckets": [
{
"key_as_string": "2019-12-21T16:13:00.000Z",
"key": 1576944780000,
"doc_count": 6374
},
{
"key_as_string": "2019-12-21T16:10:00.000Z",
"key": 1576944600000,
"doc_count": 6327
}
]
}
}
}
I suggest using min_doc_count to include only buckets that contain data, i.e. buckets with 0 documents will not be returned in the response.
GET inc/_search
{
"size": 0,
"aggs": {
"appli": {
"date_histogram": {
"field": "end_date_ut",
"interval": "month",
"min_doc_count": 1 <--- add this
}
}
}
}
If you can, you can also add a range query to restrict the time interval over which the aggregation is run.
Related
So I know my total hits are 182 documents
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
Then I run an aggregation to find out how many documents have the source instagram or twitter, and it returns:
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
}
Is it possible to get the percentage of documents that have source twitter and instagram?
So the percentage of documents that have source instagram is 89 % and twitter 11%.
My aggregation code looks like this:
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
}
}
Let me know if this is possible.
Thank you
Sure, it is possible using the 'Bucket Script Aggregation'.
An example query might look like this:
{
"size": 0,
"aggs": {
"filters_agg": {
"filters": {
"filters": {
"sourceCount": {
"match_all": {}
}
}
},
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
},
"instagram_count_percentage": {
"bucket_script": {
"buckets_path": {
"instagram_count": "bySource['instagram']>_count",
"total_count": "_count"
},
"script": "Math.round((params.instagram_count * 100)/params.total_count)"
}
},
"twitter_count_percentage": {
"bucket_script": {
"buckets_path": {
"twitter_count": "bySource['twitter']>_count",
"total_count": "_count"
},
"script": "Math.round((params.twitter_count * 100)/params.total_count)"
}
}
}
}
}
}
And the response could be something like this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"filters_agg": {
"buckets": {
"sourceCount": {
"doc_count": 182,
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
},
"instagram_count_percentage": {
"value": 89
},
"twitter_count_percentage": {
"value": 11
}
}
}
}
}
}
Try to adjust it or get inspired depending on your case and your mapping.
I sent a request like this (I tweaked some of the terms to make it easier to understand):
{
"size": 0,
"aggs": {
"byMonth": {
"date_histogram": {
"field": "date_time",
"order": {
"_key": "desc"
},
"interval": "month",
"format": "yyyy-MM",
"extended_bounds": {
"max": "2022-02",
"min": "2022-01"
}
},
"aggs": {
"byTest": {
"terms": {
"field": "test_cate_m",
"size": 100,
"order": {
"_count": "desc"
}
}
}
}
}
}
}
and response is :
{
"took": 15,
"timed_out": false,
"_shards": {
"total": 183,
"successful": 183,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": [
]
},
"aggregations": {
"byMonth": {
"buckets": [
{
"key_as_string": "2022-02",
"key": 1643673600000,
"doc_count": 600,
"byTest": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "test1",
"doc_count": 100
},
{
"key": "test2",
"doc_count": 200
},
{
"key": "test3",
"doc_count": 300
}
]
}
},
{
"key_as_string": "2022-01",
"key": 1640995200000,
"doc_count": 100,
"byTest": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "test3",
"doc_count": 100
}
]
}
}
]
}
}
}
In the nested buckets for '2022-01', there are no 'test1' and 'test2' entries. I'd like to get 'test1' and 'test2' in the buckets of both months so they can be compared, even if there is no data.
Also, if possible, can I do calculations on both results within the query? That is, I'd like to compare each key's doc_count across the two months in one query, not just retrieve the data. Can I do this?
If you can help me out, it would be a huge help :)
Sorry if this has been asked already, but I've been looking around SO and couldn't find anything that suits my needs.
Basically, what I'm trying to achieve in my first quick tries with ES is to add further counters within a Terms Aggregation.
Giving it a quick try I'm sending the following request to ES.
POST http://localhost:9200/people/_search
{
"size": 0,
"aggs": {
"agg_by_name": {
"terms": { "field": "name"}
}
}
}
And what I'm getting right now is just what the sample shows in the docs.
{
"took": 89,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"agg_by_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 9837,
"buckets": [
{
"key": "James",
"doc_count": 437
},
{
"key": "Eduard",
"doc_count": 367
},
{
"key": "Leonardo",
"doc_count": 235
},
{
"key": "George",
"doc_count": 209
},
{
"key": "Harrison",
"doc_count": 180
}, ...
However, I can't really get how to include further inner aggregations in the bucket. Something that would result in a document like this.
{
"key": "Harrison",
"doc_count": 180,
"lives_in_NY": 40,
"lives_in_CA": 140,
"distinct_surnames": [ ... ]
}
How should I structure my aggregation so that those are included bucket-wise?
You could try something like this:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheSurnames": {
"terms": {
"field": "surname",
"size": 100
}
}
}
}
}
}
For the city of residence, it could be something like:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheCities": {
"terms": {
"field": "city",
"size": 100
}
}
}
}
}
}
I need a query that would return data from the last year, grouped by day. So far I have written a query that returns data for the entire year (I hope it's correct), but I don't know how to group the data by day.
"query" : {
"range" : {
"timestamp" : {
"gt" : "2017-01-01 00:00:00",
"lt" : "2018-01-01 00:00:00"
}
}
}
Any help would be much appreciated.
I am using Elasticsearch 6.2.2.
You can check date_histogram aggregation
POST my_index/my_type/_search
{
"size": 0,
"aggs": {
"bucketName": {
"date_histogram": {
"field": "timestamp",
"interval": "day",
"min_doc_count": 1,
"format": "yyyy-MM-dd",
"order": {"_key": "desc"}
}
}
}
}
It will return you something like this
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"failed": 0
},
"hits": {
"total": 112233,
"max_score": 0,
"hits": []
},
"aggregations": {
"bucketName": {
"buckets": [
{
"key_as_string": "2018-03-07",
"key": 1520380800000,
"doc_count": 1
},
{
"key_as_string": "2018-03-06",
"key": 1520294400000,
"doc_count": 93
},
{
"key_as_string": "2018-03-05",
"key": 1520208000000,
"doc_count": 99
},
{
"key_as_string": "2018-03-04",
"key": 1520121600000,
"doc_count": 33
},
{
"key_as_string": "2018-03-03",
"key": 1520035200000,
"doc_count": 29
}
]
}
}
}
We're using ElasticSearch to find offers based on 5 fields, such as some 'free text', offer state and client name. We also need to aggregate on two fields: client name and offer state. So when someone enters some free text and we find, say, 10 docs with state closed and 8 with state open, the 'state filter' should contain closed(10) and open(8).
Now the problem is, when I select the state 'closed' to be included in the filter, the aggregation result for open changes to 0. I want this to remain 8. So how can I prevent the filter on the aggregations to influence the aggregation itself?
Here is the first query, searching for 'java':
{
"query": {
"bool": {
"filter": [
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
The result is this:
{
"hits": {
"total": 960,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "40542",
"fields": {
"offer_id_ft": [
"40542"
],
"offer_state_f": [
"REJECTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 778
},
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "WITHDRAWN",
"doc_count": 13
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 2
}
As you can see, the sum of the offer_state_f buckets is equal to the total hits (960). Now, I include one of the states in the query, say 'ACCEPTED'. So my query becomes:
{
"query": {
"bool": {
"filter": [
{
"bool": {
"should": [
{
"term": {
"offer_state_f": "ACCEPTED"
}
}
]
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
What I want is 130 hits, with the offer_state_f buckets still summing up to 960. But what I got is this:
{
"hits": {
"total": 130,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "16884",
"fields": {
"offer_id_ft": [
"16884"
],
"offer_state_f": [
"ACCEPTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 0
},
{
"key": "INTAKE",
"doc_count": 0
},
{
"key": "LONGLIST",
"doc_count": 0
},
{
"key": "REJECTED",
"doc_count": 0
},
{
"key": "SHORTLIST",
"doc_count": 0
},
{
"key": "WITHDRAWN",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 10
}
As you can see, only the ACCEPTED bucket is filled, all the others are 0.
You need to move your filters into the post_filter section instead of the query section.
That way, the filtering will be applied after the aggregations are computed, so you'll be able to aggregate over the whole set of data but only get result hits matching your filters.
Ok, I found the answer with the help of a colleague, and the thing is, Val is right. +1 for him. What I did was place ALL of my query filters in the post_filter, and that was the problem. I only have to place the filters for the fields on which I want to aggregate in the post_filter. Thus:
{
"query": {
"bool": {
"filter": [
{
"term": {
"broker_f": "false"
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"post_filter" : {
"bool": {
"should": [
{
"term": {
"offer_state_f": "SHORTLIST"
}
}
]
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
And now the result is correct:
{
"hits": {
"total": 5,
"max_score": 0.76667790000000002,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "24454",
"fields": {
"offer_id_ft": [
"24454"
],
"offer_state_f": [
"SHORTLIST"
]
},
"_score": 0.76667790000000002
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 777
},
{
"key": "ACCEPTED",
"doc_count": 52
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "WITHDRAWN",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 4
}