Showing the selected value in a pipeline aggregation - elasticsearch

I'm running an aggregation on the hash of the docs in my set.
Within each bucket I select the oldest and most recent.
I want an overview:
total number of docs
most recent
oldest
I have managed to get the total to work but am struggling with the oldest and most recent.
My query (limited to 2 results in the aggregation until I get it right):
{
"size": 0,
"query": {
"bool": {
"must_not": [
{
"term": {
"Text_SHA2": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
}
}
]
}
},
"aggs": {
"overall_Total": {
"sum_bucket": {
"buckets_path": "by_SHA2>_count"
}
},
"overall_MostRecent": {
"max_bucket": {
"buckets_path": "by_SHA2>the_MostRecent"
}
},
"by_SHA2": {
"terms": {
"field": "Text_SHA2",
"size": 2
},
"aggs": {
"the_MostRecent": {
"max": {
"field": "ReceivedDateUTC"
}
},
"the_Oldest": {
"min": {
"field": "ReceivedDateUTC"
}
}
}
}
}
}
What I get back:
{
"took": 341,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1163611,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"by_SHA2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1163388,
"buckets": [
{
"key": "0683dcdcd26c16315292ecf02307e9d819a08522b35dff933b406688d8d3edb9",
"doc_count": 119,
"the_Oldest": {
"value": 1.54284803E12,
"value_as_string": "2018-11-22T00:53:50.000"
},
"the_MostRecent": {
"value": 1.572209574E12,
"value_as_string": "2019-10-27T20:52:54.000"
}
},
{
"key": "e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378",
"doc_count": 104,
"the_Oldest": {
"value": 1.545930842E12,
"value_as_string": "2018-12-27T17:14:02.000"
},
"the_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
]
},
"overall_Total": {
"value": 223.0
},
"overall_MostRecent": {
"value": 1.572340576E12,
"keys": [
"e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378"
]
}
}
}
What I'd like to get back (please see difference in "overall_MostRecent" at the end):
{
"took": 341,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1163611,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"by_SHA2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1163388,
"buckets": [
{
"key": "0683dcdcd26c16315292ecf02307e9d819a08522b35dff933b406688d8d3edb9",
"doc_count": 119,
"the_Oldest": {
"value": 1.54284803E12,
"value_as_string": "2018-11-22T00:53:50.000"
},
"the_MostRecent": {
"value": 1.572209574E12,
"value_as_string": "2019-10-27T20:52:54.000"
}
},
{
"key": "e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378",
"doc_count": 104,
"the_Oldest": {
"value": 1.545930842E12,
"value_as_string": "2018-12-27T17:14:02.000"
},
"the_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
]
},
"overall_Total": {
"value": 223.0
},
"overall_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
}
There's obviously something wrong with my "overall_MostRecent" section of the query. If anyone could point that out to me I'd be much obliged.

Related

Aggs percentage doc_count

So I know my total hits are 182 documents
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
And then I run an aggregation to find out how many documents have the source instagram or twitter, and it returns:
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
}
Is it possible to get the percentage of documents that have source twitter and instagram?
So the percentage of documents that have source instagram is 89 % and twitter 11%.
My aggregation code looks like this:
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
}
}
Let me know if this is possible.
Thank you
Sure, it is possible using the 'Bucket Script Aggregation'.
An example query might look like this:
{
"size": 0,
"aggs": {
"filters_agg": {
"filters": {
"filters": {
"sourceCount": {
"match_all": {}
}
}
},
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
},
"instagram_count_percentage": {
"bucket_script": {
"buckets_path": {
"instagram_count": "bySource['instagram']>_count",
"total_count": "_count"
},
"script": "Math.round((params.instagram_count * 100)/params.total_count)"
}
},
"twitter_count_percentage": {
"bucket_script": {
"buckets_path": {
"twitter_count": "bySource['twitter']>_count",
"total_count": "_count"
},
"script": "Math.round((params.twitter_count * 100)/params.total_count)"
}
}
}
}
}
}
And the response could be something like this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"filters_agg": {
"buckets": {
"sourceCount": {
"doc_count": 182,
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
},
"instagram_count_percentage": {
"value": 89
},
"twitter_count_percentage": {
"value": 11
}
}
}
}
}
}
Try to adjust it or get inspired depending on your case and your mapping.

Elasticsearch - aggregate over filtered data

I have a query that returns a set of documents (100). Over these I want to apply an aggregation, because these are most relevant. When I try to aggregate, that returns aggregations over all results, not over the first 100.
Query:
{
"size": 100,
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"from": 0,
"query": {
.......
},
"aggregations": {
"category.category_id": {
"nested": {
"path": "category"
},
"aggregations": {
"category.category_id": {
"terms": {
"field": "category.category_id",
"size": 2,
"order": {
"_count": "desc"
}
}
}
}
}
}
Result:
{
"took": 33,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1042,
"max_score": 60,
"hits": [...100 hits...]
},
"aggregations": {
"category.category_id": {
"doc_count": 5186,
"category.category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 196,
"buckets": [
{
"key": 2,
"doc_count": 1042
},
{
"key": 2764,
"doc_count": 272
}
....
]
}
}
}
Expected:
{
"took": 33,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1042,
"max_score": 60,
"hits": [...100 hits...]
},
"aggregations": {
"category.category_id": {
"doc_count": 100,
"category.category_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": x,
"buckets": [
{
"key": 2,
"doc_count": x (x< 100) (eg 37)
},
{
"key": 2764,
"doc_count": y (y <= 100 -x) (eg 10)
}
....
]
}
}
}
Is it possible to aggregate over filtered data? Or how can I aggregate over only the most relevant data?
You can use a filter aggregation as described by elasticsearch documentation
{
"aggs" : {
"agg_name" : {
"filter" : { //Add your query },
"aggs" : {
"terms": {
"field": "category.category_id",
"size": 2,
"order": {
"_count": "desc"
}
}
}
}
}
If you want you can add one more aggregation inside the 2nd aggs

Appending further aggregations within Terms Aggregation

Sorry if this has been asked already but been lurking around SO and couldn't find anything which suits my needs.
Basically, what I'm trying to achieve in my first quick tries with ES is to add further counters within a Terms Aggregation.
Giving it a quick try I'm sending the following request to ES.
POST http://localhost:9200/people/_search
{
"size": 0,
"aggs": {
"agg_by_name": {
"terms": { "field": "name"}
}
}
}
And what I'm getting right now is just what the sample shows in the docs.
{
"took": 89,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"agg_by_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 9837,
"buckets": [
{
"key": "James",
"doc_count": 437
},
{
"key": "Eduard",
"doc_count": 367
},
{
"key": "Leonardo",
"doc_count": 235
},
{
"key": "George",
"doc_count": 209
},
{
"key": "Harrison",
"doc_count": 180
}, ...
However, I can't really get how to include further inner aggregations in the bucket. Something that would result in a document like this.
{
"key": "Harrison",
"doc_count": 180,
"lives_in_NY": 40,
"lives_in_CA": 140,
"distinct_surnames": [ ... ]
}
How should I structure my aggregation so that those are included bucket-wise?
You could try something like this:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheSurnames": {
"terms": {
"field": "surname",
"size": 100
}
}
}
}
}
}
For the city of residence, it could be something like:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheCities": {
"terms": {
"field": "city",
"size": 100
}
}
}
}
}
}

Get count of particular field in a document using Elasticsearch

Requirement:
I want to find the count of aID for a particular category ID.
(i.e., for categoryID 2532 I want the count to be 2, which means it is assigned to two aIDs).
I tried with aggregations, but with those I am only able to get the doc count rather than the field count.
Mappings
"List": {
"properties": {
"aId": {
"type": "long"
},
"CategoryList": {
"properties": {
"categoryId": {
"type": "long"
},
"categoryName": {
"type": "string"
}
}
}
}
}
Sample Document:
"List": [
{
"aId": 33074,
"CategoryList": [
{
"categoryId": 2532,
"categoryName": "VODAFONE"
}
]
},
{
"aId": 12074,
"CategoryList": [
{
"categoryId": 2532,
"categoryName": "VODAFONE"
}
]
},
{
"aId": 120755,
"CategoryList": [
{
"categoryId": 1234,
"categoryName": "SMPLKE"
}
]
}
]
Using a cardinality aggregation will not help you get the desired result. A cardinality aggregation returns the count of distinct values for the field, whereas you want to find the number of times a value appears in a field.
You can use the following query. Here you first filter the documents on CategoryList.categoryId and then run a simple terms aggregation on this field:
POST index_name1111/_search
{
"query": {
"bool": {
"must": [{
"term": {
"CategoryList.categoryId": {
"value": 2532
}
}
}]
}
},
"aggs": {
"count_is": {
"terms": {
"field": "CategoryList.categoryId",
"size": 10
}
}
}
}
Response of above query -
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"count_is": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2532,
"doc_count": 2
}
]
}
}
}
Or you can drop the filter; running only the aggregation will return all categoryId values with their counts of appearance.
POST index_name1111/_search
{
size: 0,
"aggs": {
"count_is": {
"terms": {
"field": "CategoryList.categoryId",
"size": 10
}
}
}
}
Response of above query
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"count_is": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2532,
"doc_count": 2
},
{
"key": 1234,
"doc_count": 1
}
]
}
}
}
Using cardinality aggregation you will get the following response with following query
POST index_name1111/_search
{
"size": 0,
"query": {
"bool": {
"must": [{
"term": {
"CategoryList.categoryId": {
"value": 2532
}
}
}]
}
},
"aggs": {
"id_count": {
"cardinality": {
"field": "CategoryList.categoryId"
}
}
}
}
Response of the above query, which doesn't give you the desired result: both matched documents have categoryId 2532, so the count of distinct values is 1.
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"id_count": {
"value": 1
}
}
}
Hope this helps
Thanks

Elasticsearch, Nested Aggregations

I'm writing dynamic query generation that allows aggregating by any combination of fields in the mapping. As the (truncated) mapping below shows, some fields are of nested type. E.g., aggregate by [activities.activity, duration], or [activities.activity, activities.duration], or [applicationName, duration].
Mapping:
{
nested: {
properties: {
#timestamp: {
type: "date",
format: "dateOptionalTime"
},
activities: {
type: "nested",
include_in_parent: true,
properties: {
activity: {
type: "string",
index: "not_analyzed"
},
duration: {
type: "long"
},
entry: {
properties: {
blockName: {
type: "string",
index: "not_analyzed"
},
blockid: {
type: "string"
},
time: {
type: "date",
format: "dateOptionalTime"
}
}
},
exit: {
properties: {
blockName: {
type: "string",
index: "not_analyzed"
},
blockid: {
type: "string"
},
time: {
type: "date",
format: "dateOptionalTime"
}
}
},
seq: {
type: "integer"
}
}
},
applicationName: {
type: "string",
index: "not_analyzed"
},
duration: {
type: "long"
}
}
}}
Sample document:
{
"#timestamp": "2015-09-15T17:35:24.020Z",
"duration": "37616",
"applicationName": "my application name",
"activities": [{
"duration": "20362",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:24.493Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:35:44.855Z",
"blockName": "My Self Service"
},
"seq": 1,
"activity": "Prompter v2.3"
}, {
"duration": "96",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:45.268Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:35:45.364Z",
"blockName": "My Self Service"
},
"seq": 2,
"activity": "Start v2.5"
}, {
"duration": "15931",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:45.669Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:36:01.600Z",
"blockName": "My Self Service"
},
"seq": 3,
"activity": "System v2.3"
}]}
Sample query:
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse":{
"reverse_nested":{},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}
}
}}
Question:
As you can see in the query, when averaging on a root-level field under a nested aggregation, reverse_nested must be included so that the root-level field "duration" is visible. That means that when generating the query, we need to check the combination of fields to see whether they are nested, nested under the same path, or at the root level, and then generate the proper query. This may get more complicated when aggregating on more fields, for example, aggregating by [applicationName, activities.duration, duration, activities.activity]. Does anyone know a more elegant way to do that? The logic might be simpler if we could specify an absolute path.
Not really an answer to my question, but I'm adding more examples as they may help others understand nested aggregations better.
aggs field average field
case1 yes yes
case2 yes no
case3 no yes
case4 no no
yes->nested type, no->not nested type
Case1 with same path
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}
}
}}
Result:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": 20362.0
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": 96.0
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": 15931.0
}
}]
}
}
}}
Case1, both fields are nested, but with reverse_nested, which yields the same average over all the "activities.duration" values in every bucket
query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse1":{
"reverse_nested":{
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}
}
}
}
}}
result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}, {
"key": "System v2.3",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}]
}
}
}}
Case3
Query
{
"size": 0,
"aggs": {
"dim1": {
"terms": {
"field": "applicationName"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "my application name",
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}]
}
}}
Case2 includes reverse_nested to go back to the root level
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse":{
"reverse_nested":{},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}
}
}}
Result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}, {
"key": "System v2.3",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}]
}
}
}}
Case2, without specifying the nested path
Query
{
"size": 0,
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}}
Result (the average values are identical to the previous case):
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}]
}
}
}
Case2, without specifying reverse_nested, "duration" at the root level is not seen
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": null
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": null
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": null
}
}]
}
}
}}

Resources