Elasticsearch, Nested Aggregations - elasticsearch

Im writing dynamic query generation which allows to aggregate by any fields combination in the mapping. As the mapping(truncated) below, there are fields in nested type. e.g aggregate by [activities.activity,duration], or [activities.activity, activities.duration] or [applicationName, duration]
Mapping:
{
nested: {
properties: {
#timestamp: {
type: "date",
format: "dateOptionalTime"
},
activities: {
type: "nested",
include_in_parent: true,
properties: {
activity: {
type: "string",
index: "not_analyzed"
},
duration: {
type: "long"
},
entry: {
properties: {
blockName: {
type: "string",
index: "not_analyzed"
},
blockid: {
type: "string"
},
time: {
type: "date",
format: "dateOptionalTime"
}
}
},
exit: {
properties: {
blockName: {
type: "string",
index: "not_analyzed"
},
blockid: {
type: "string"
},
time: {
type: "date",
format: "dateOptionalTime"
}
}
},
seq: {
type: "integer"
}
}
},
applicationName: {
type: "string",
index: "not_analyzed"
},
duration: {
type: "long"
}
}
}}
Sample document:
{
"#timestamp": "2015-09-15T17:35:24.020Z",
"duration": "37616",
"applicationName": "my application name",
"activities": [{
"duration": "20362",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:24.493Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:35:44.855Z",
"blockName": "My Self Service"
},
"seq": 1,
"activity": "Prompter v2.3"
}, {
"duration": "96",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:45.268Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:35:45.364Z",
"blockName": "My Self Service"
},
"seq": 2,
"activity": "Start v2.5"
}, {
"duration": "15931",
"entry": {
"blockid": "2",
"time": "2015-09-15T17:35:45.669Z",
"blockName": "My Self Service"
},
"exit": {
"blockid": "2",
"time": "2015-09-15T17:36:01.600Z",
"blockName": "My Self Service"
},
"seq": 3,
"activity": "System v2.3"
}]}
Sample query:
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse":{
"reverse_nested":{},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}
}
}}
Question,
as you can see in the query, when averaging on a root level field under a nested field. reverse_nested must be included so that the root level field "duration" can be seen. That means when generating the query, we need to check the combination of fields to see if the parent/child fields are the cases of fields are nested, nested under the same path or at the root level, then generate the proper query. This may be more complicated when aggregating on more fields, for example, aggregate by [applicationName, activities.duration, duration,activities.activity]. Does anyone know more elegant way to do that? the logic may be simpler if we can specify absolute path

Not real an answer to my question but adding more examples as it may help others to understand nested aggregation better.
aggs field average field
case1 yes yes
case2 yes no
case3 no yes
case4 no no
yes->nested type, no->not nested type
Case1 with same path
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}
}
}}
Result:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": 20362.0
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": 96.0
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": 15931.0
}
}]
}
}
}}
case1, both fields are nested, but reverse_nested to have the same average value on all the "activities.duration"
query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse1":{
"reverse_nested":{
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}
}
}
}
}}
result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}, {
"key": "System v2.3",
"doc_count": 1,
"dim_reverse1": {
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}
}]
}
}
}}
Case3
Query
{
"size": 0,
"aggs": {
"dim1": {
"terms": {
"field": "applicationName"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "activities.duration"
}
}
}
}
}}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "my application name",
"doc_count": 1,
"avg_duration": {
"value": 12129.666666666666
}
}]
}
}}
Case2 includes reserver_nested to back to the root level
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs": {
"dim_reverse":{
"reverse_nested":{},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}
}
}}
Result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}, {
"key": "System v2.3",
"doc_count": 1,
"dim_reverse": {
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}
}]
}
}
}}
Case2, without specify the nested path
Query
{
"size": 0,
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}}
Result The result is identical to the previous one
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": 37616.0
}
}]
}
}
}
Case2, without specifying reserver_nested, "duration" at the root level is not seen
Query
{
"size": 0,
"aggs": {
"dim0": {
"nested" : {
"path": "activities"
},
"aggs": {
"dim1": {
"terms": {
"field": "activities.activity"
},
"aggs":{
"avg_duration": {
"avg": {
"field": "duration"
}
}
}
}
}
}
}}
Result
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"dim0": {
"doc_count": 3,
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Prompter v2.3",
"doc_count": 1,
"avg_duration": {
"value": null
}
}, {
"key": "Start v2.5",
"doc_count": 1,
"avg_duration": {
"value": null
}
}, {
"key": "System v2.3",
"doc_count": 1,
"avg_duration": {
"value": null
}
}]
}
}
}}

Related

Elasticsearch - Count number of occurrence perd field per document

Is it possible to calculate the number of occurence of distinct values in a list field.
For example, let the following data:
[
{
"page":1,
"colors":[
{
"color": red
},
{
"color": white
},
{
"color": red
}
]
},
{
"page":2,
"colors":[
{
"color": yellow
},
{
"color": yellow
}
]
}
]
Is it possible to get a result as the follwing:
{
"page":1,
"colors_count":[
{
"Key": red,
"Count": 2
},
{
"Key": white,
"Count": 1
},
]
},
{
"page":2,
"colors_count":[
{
"Key": yellow,
"Count": 2
}
]
}
I tried using term aggregation but I got the number of distinct values, so for page:1 i got red:1 and white:1.
Yes, you can do it. you will have to use nested_field type and nested_Agg
Mapping:
PUT colors
{
"mappings": {
"properties": {
"page" : { "type": "keyword" },
"colors": {
"type": "nested",
"properties": {
"color": {
"type": "keyword"
}
}
}
}
}
}
Insert Documents:
PUT colors/_doc/1
{
"page": 1,
"colors": [
{
"color": "red"
},
{
"color": "white"
},
{
"color": "red"
}
]
}
PUT colors/_doc/2
{
"page": 2,
"colors": [
{
"color": "yellow"
},
{
"color": "yellow"
}
]
}
Query:
GET colors/_search
{
"size" :0,
"aggs": {
"groupByPage": {
"terms": {
"field": "page"
},
"aggs": {
"colors": {
"nested": {
"path": "colors"
},
"aggs": {
"genres": {
"terms": {
"field": "colors.color"
}
}
}
}
}
}
}
}
Output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"groupByPage": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", // page field value
"doc_count": 1,
"colors": {
"doc_count": 3,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "red",
"doc_count": 2
},
{
"key": "white",
"doc_count": 1
}
]
}
}
},
{
"key": "2", // page field value
"doc_count": 1,
"colors": {
"doc_count": 2,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "yellow",
"doc_count": 2
}
]
}
}
}
]
}
}
}

Showing the selected value in a pipeline aggregation

I'm running an aggregation on the hash of the docs in my set.
Within each bucket I select the oldest and most recent.
I want an overview:
total number of docs
most recent
oldest
I have managed to get the total to work but am struggling with the oldest and most recent.
My query (limited to 2 results in the aggregation until I get it right):
{
"size": 0,
"query": {
"bool": {
"must_not": [
{
"term": {
"Text_SHA2": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
}
}
]
}
},
"aggs": {
"overall_Total": {
"sum_bucket": {
"buckets_path": "by_SHA2>_count"
}
},
"overall_MostRecent": {
"max_bucket": {
"buckets_path": "by_SHA2>the_MostRecent"
}
},
"by_SHA2": {
"terms": {
"field": "Text_SHA2",
"size": 2
},
"aggs": {
"the_MostRecent": {
"max": {
"field": "ReceivedDateUTC"
}
},
"the_Oldest": {
"min": {
"field": "ReceivedDateUTC"
}
}
}
}
}
}
What I get back:
{
"took": 341,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1163611,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"by_SHA2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1163388,
"buckets": [
{
"key": "0683dcdcd26c16315292ecf02307e9d819a08522b35dff933b406688d8d3edb9",
"doc_count": 119,
"the_Oldest": {
"value": 1.54284803E12,
"value_as_string": "2018-11-22T00:53:50.000"
},
"the_MostRecent": {
"value": 1.572209574E12,
"value_as_string": "2019-10-27T20:52:54.000"
}
},
{
"key": "e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378",
"doc_count": 104,
"the_Oldest": {
"value": 1.545930842E12,
"value_as_string": "2018-12-27T17:14:02.000"
},
"the_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
]
},
"overall_Total": {
"value": 223.0
},
"overall_MostRecent": {
"value": 1.572340576E12,
"keys": [
"e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378"
]
}
}
}
What I'd like to get back (please see difference in "overall_MostRecent" at the end):
{
"took": 341,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1163611,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"by_SHA2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1163388,
"buckets": [
{
"key": "0683dcdcd26c16315292ecf02307e9d819a08522b35dff933b406688d8d3edb9",
"doc_count": 119,
"the_Oldest": {
"value": 1.54284803E12,
"value_as_string": "2018-11-22T00:53:50.000"
},
"the_MostRecent": {
"value": 1.572209574E12,
"value_as_string": "2019-10-27T20:52:54.000"
}
},
{
"key": "e757c30feeea67425ba02d8821295954d23bb9f6bf979fb8113d2cdf8f79b378",
"doc_count": 104,
"the_Oldest": {
"value": 1.545930842E12,
"value_as_string": "2018-12-27T17:14:02.000"
},
"the_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
]
},
"overall_Total": {
"value": 223.0
},
"overall_MostRecent": {
"value": 1.572340576E12,
"value_as_string": "2019-10-29T09:16:16.000"
}
}
}
There's obviously something wrong with my "overall_MostRecent" section of the query. If anyone could point that out to me I'd be much obliged.

Elasticsearch - Count duplicated and unique values for a nested field

Elasticsearch - Count duplicated and unique values
I need also same kind of count but that field is in nested properties as
[{
"firstname": "john",
"lastname": "doe",
"addressList": [{
"addressId": 39640,
"txt": "sdf",
}, {
"addressId": 39641,
"txt": "NEW",
}, {
"addressId": 39640,
"txt": "sdf",
}, {
"addressId": 39641,
"txt": "NEW"
}
]
}, {
"firstname": "jane",
"lastname": "smith",
"addressList": [{
"addressId": 39644,
"txt": "sdf",
}, {
"addressId": 39642,
"txt": "NEW",
}, {
"addressId": 39644,
"txt": "sdf",
}, {
"addressId": 39642,
"txt": "NEW"
}
]
}
]
what would be the query for addressId duplicate counts ? Need you help on this user:3838328
I got the answer for nested field duplicate counts as
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"prop_counts": {
"nested": {
"path": "addressList"
},
"aggs": {
"duplicate_aggs": {
"terms": {
"field": "addressList.addressId",
"min_doc_count": 2,
"size": 100 <----- Note this
}
},
"duplicate_bucketcount": {
"stats_bucket": {
"buckets_path": "duplicate_aggs._count"
}
},
"nonduplicate_aggs": {
"terms": {
"field": "addressList.addressId",
"size": 100 <---- Note this
},
"aggs": {
"equal_one": {
"bucket_selector": {
"buckets_path": {
"count": "_count"
},
"script": "params.count == 1"
}
}
}
},
"nonduplicate_bucketcount": {
"sum_bucket": {
"buckets_path": "nonduplicate_aggs._count"
}
}
}
}
}
}
Response as
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop_counts": {
"doc_count": 8,
"duplicate_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": 39640,
"doc_count": 2
}, {
"key": 39641,
"doc_count": 2
}, {
"key": 39644,
"doc_count": 2
}, {
"key": 39642,
"doc_count": 2
}
]
},
"nonduplicate_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"duplicate_bucketcount": {
"count": 4,
"min": 2,
"max": 2,
"avg": 2,
"sum": 8
},
"nonduplicate_bucketcount": {
"value": 0
}
}
}
}

Convert sql query to elasticsearch

I need to convert this query into elastic search, but I am facing the problem that in elastic search (having) is not supported yet.
Select sum(count) as count,prop1
from
(
SELECT Count(*) as count,prop1 FROM [table1] group by prop1,prop2
having count = 1
)
group by prop1
order by count desc limit 10
I try this query in elastic search:
`GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term":
{
"field": "test"
}
}
]
}
},
"aggs": {
"aggregation": {
"terms": {
"field": "prop1"
},
"aggs": {
"subaggregation": {
"terms": {
"field": "prop2",
"order": {
"_count": "desc"
}
}
},
"test":{
"bucket_selector": {
"buckets_path":
{
"test1": "_count"
},
"script":"params.test1 == 1"
}
}
}
}
}
}`
Here is the mapping that I use:
PUT /index
{
"mappings" : {
"timeline" : {
"properties" : {
"prop1" : {
"type" : "keyword"
},
"prop2" : {
"type" : "keyword"
}
}
}
}
}
but I cannot get the sub-aggregation buckets who have count == 1
Here is the output of the suggested answer :
{
"took": 344,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 852146,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 646,
"sum_other_doc_count": 37299,
"buckets": [
{
"key": "porp1-key",
"doc_count": 348178,
"prop2": {
"doc_count_error_upper_bound": 130,
"sum_other_doc_count": 345325,
"buckets": [
{
"key": "e1552d2d-da84-4588-9b65-16c33848bb94_1",
"doc_count": 558,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "04b1a8eb-f876-459b-af9b-855493318dca_426",
"doc_count": 383,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "b165d2c7-6a23-4a4d-adbb-3b2a79d4c627_80",
"doc_count": 344,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c4ea55dc-c3b3-492b-98a2-1ad004212c3d_99",
"doc_count": 297,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "dfc1ae22-5c7f-49ab-8488-207661b43716_294",
"doc_count": 264,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "28815490-e7ce-420b-bab8-57a6ffc3f56a_572",
"doc_count": 239,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c3c56ec8-e0ff-46ea-841d-cc22b2dc65f6_574",
"doc_count": 217,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "473289b8-fb73-4cbb-b8d7-a5386846745f_34",
"doc_count": 187,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "670cb862-7976-4fd5-ba3f-3f8b7c03d615_11",
"doc_count": 185,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "41870755-96dd-4a00-ab76-632a1dfaecb5_341",
"doc_count": 179,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
}
]
},
"final": {
"value": 0
}
} ]
}
}
}
Try this. Aggregation final will give you the desired output.
GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"field": "test"
}
}
]
}
},
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final": {
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Working code :
PUT prop
{
"mappings": {
"prop": {
"properties": {
"prop1": {
"type": "keyword"
},
"prop2": {
"type": "keyword"
}
}
}
}
}
POST _bulk
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q1"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q5"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q6"}
GET prop/prop/_search
{
"size": 0,
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final":{
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Output :
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "p1",
"doc_count": 3,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q2",
"doc_count": 2,
"prop2_count": {
"value": 2
},
"prop2_check": {
"value": 0
}
},
{
"key": "q1",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 1
}
},
{
"key": "p2",
"doc_count": 2,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q5",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
},
{
"key": "q6",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 2
}
}
]
}
}
}

query for elasticsearch returning count

I am struggling to create the query/rule that will help me create an alerting script. I want to query the elasticsearch API for counts on a specific index so that I can get alerted when the count reaches a certain threshold.
The following query is an attempt as I have no experience with this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"analyze_wildcard": true,
"query": "*"
}
},
"filter": {
"bool": {
"must": [
{
"query": {
"match": {
"PStream": {
"query": "*",
"type": "phrase"
}
}
}
},
{
"range": {
"#timestamp": {
"gte": 1447789445320,
"lte": 1447793045320
}
}
}
],
"must_not": []
}
}
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
},
"size": 500,
"sort": [
{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}
],
"aggs": {
"2": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"pre_zone": "-05:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 0,
"extended_bounds": {
"min": 1447789445317,
"max": 1447793045317
}
}
}
},
The field PStream is the field that I am focused on
EDIT:
An example of the data going to the index:
{
"_index": "logstash-2015.11.17",
"_type": "logs",
"_id": "AVEXMKu2YVnF1NOjr9YT",
"_score": null,
"_source": {
"authorUrl": "",
"postUrl": "",
"pubDate": "2015-11-17T15:18:24",
"scrapeDate": "2015-11-17T15:44:03",
"clientId": "136902834",
"query": "Jenny Balatsinou",
"PType": "post",
"tLatency": 1539,
"PLang": "en",
"PStream": "864321",
"PName": "xStackOverflow",
"#version": "1",
"#timestamp": "2015-11-17T20:44:03.400Z"
},
"fields": {
"#timestamp": [
1447793043400
],
"pubDate": [
1447773504000
],
"scrapeDate": [
1447775043000
]
},
"sort": [
1447793043400
]
there are about 20 million of these messages getting indexed daily into Elasticsearch. I have created a dashboard in Kibana where I view this data and stats. I would like to write the proper query that I can use in a java program that periodically runs and checks this index using this query. It should return the hourly total count grouped by the PStream variable which has multiple values. So anytime the value is 0 it will send an alert.
Eg. Output:
"result": {
"total": 74,
"successful": 63,
"failed": 11,
{
{
"index": "logstash-2015.11.08",
"PStream": "37647338933",
"Count": 1234532
},
{
"index": "logstash-2015.11.08",
"PStream": "45345343566",
"Count": 156532
},
As a quick example (per comments above), I just set up a trivial index:
DELETE /test_index
PUT /test_index
added some (simplified) data:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":2}}
{"PStream": "864321","#timestamp": "2015-11-17T21:44:03.400Z"}
{"index":{"_id":3}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":4}}
{"PStream": "864322","#timestamp": "2015-11-17T21:44:03.400Z"}
And now I can get the "PStream" terms inside an hour histogram:
POST /test_index/_search
{
"size": 0,
"aggs" : {
"timestamp_histogram" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
}
}
}
}
}
}
...
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 1
},
{
"key": "864322",
"doc_count": 1
}
]
}
}
]
}
}
}
or the other way around:
POST /test_index/_search
{
"size": 0,
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
},
"aggs": {
"timestamp_histogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "hour"
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 3,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
},
{
"key": "864322",
"doc_count": 1,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/6c0c30db1cf0fb8529bcfec21c0ce5c02a5ae94c

Resources