query for elasticsearch returning count - elasticsearch

I am struggling to create the query/rule that will help me create an alerting script. I want to query the elasticsearch API for counts on a specific index so that I can get alerted when the count reaches a certain threshold.
The following query is an attempt as I have no experience with this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"analyze_wildcard": true,
"query": "*"
}
},
"filter": {
"bool": {
"must": [
{
"query": {
"match": {
"PStream": {
"query": "*",
"type": "phrase"
}
}
}
},
{
"range": {
"#timestamp": {
"gte": 1447789445320,
"lte": 1447793045320
}
}
}
],
"must_not": []
}
}
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
},
"size": 500,
"sort": [
{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}
],
"aggs": {
"2": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"pre_zone": "-05:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 0,
"extended_bounds": {
"min": 1447789445317,
"max": 1447793045317
}
}
}
},
The field PStream is the field that I am focused on
EDIT:
An example of the data going to the index:
{
"_index": "logstash-2015.11.17",
"_type": "logs",
"_id": "AVEXMKu2YVnF1NOjr9YT",
"_score": null,
"_source": {
"authorUrl": "",
"postUrl": "",
"pubDate": "2015-11-17T15:18:24",
"scrapeDate": "2015-11-17T15:44:03",
"clientId": "136902834",
"query": "Jenny Balatsinou",
"PType": "post",
"tLatency": 1539,
"PLang": "en",
"PStream": "864321",
"PName": "xStackOverflow",
"#version": "1",
"#timestamp": "2015-11-17T20:44:03.400Z"
},
"fields": {
"#timestamp": [
1447793043400
],
"pubDate": [
1447773504000
],
"scrapeDate": [
1447775043000
]
},
"sort": [
1447793043400
]
There are about 20 million of these messages getting indexed daily into Elasticsearch. I have created a dashboard in Kibana where I view this data and stats. I would like to write the proper query that I can use in a Java program that periodically runs and checks this index using this query. It should return the hourly total count grouped by the PStream variable, which has multiple values. Any time the count is 0, it should send an alert.
Eg. Output:
"result": {
"total": 74,
"successful": 63,
"failed": 11,
{
{
"index": "logstash-2015.11.08",
"PStream": "37647338933",
"Count": 1234532
},
{
"index": "logstash-2015.11.08",
"PStream": "45345343566",
"Count": 156532
},

As a quick example (per comments above), I just set up a trivial index:
DELETE /test_index
PUT /test_index
added some (simplified) data:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":2}}
{"PStream": "864321","#timestamp": "2015-11-17T21:44:03.400Z"}
{"index":{"_id":3}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":4}}
{"PStream": "864322","#timestamp": "2015-11-17T21:44:03.400Z"}
And now I can get the "PStream" terms inside an hour histogram:
POST /test_index/_search
{
"size": 0,
"aggs" : {
"timestamp_histogram" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
}
}
}
}
}
}
...
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 1
},
{
"key": "864322",
"doc_count": 1
}
]
}
}
]
}
}
}
or the other way around:
POST /test_index/_search
{
"size": 0,
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
},
"aggs": {
"timestamp_histogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "hour"
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 3,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
},
{
"key": "864322",
"doc_count": 1,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/6c0c30db1cf0fb8529bcfec21c0ce5c02a5ae94c

Related

How to count number of values per group?

I have an index with the following mapping:
"my_index":{
"mapping": {
"properties": {
"rec_values": {
"type": "nested",
"properties": {
"name": {
"type:" "keyword"
},
"schm_p": {
"type:" "keyword"
},
"tbl_p": {
"type:" "keyword"
},
I want to count the number of values for each schm_p,
something like:
select count(*)
from my_index
group by rec_values.schm_p
How can I do it?
You need to do a Composite Aggregation, like this:
{
"size": 0,
"aggs": {
"parameters": {
"nested": {
"path": "rec_values"
},
"aggs": {
"group": {
"composite": {
"size": 100, // your size
"sources": [{
"count_schm_p": {
"terms": {
"field": "rec_values.schm_p"
}
}
}]
}
}
}
}
}
}
You need to use a terms aggregation for this query, something like this:
GET my_index/_search
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"count_schm_p": {
"terms": {
"field": "rec_values.schm_p.keyword",
"size": 100
}
}
}
}
This query would return a response like this:
{
"took": 561,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"count_schm_p": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 829099,
"buckets": [
{
"key": "type_a",
"doc_count": 1704640
},
{
"key": "type_b",
"doc_count": 1454079
},
{
"key": "type_c",
"doc_count": 894678
},
{
"key": "type_d",
"doc_count": 208489
}
]
}
}
}
The count of each schm_p is the doc_count of its bucket (the bucket key is the schm_p value).
Note: the size in your query needs to be at least the number of distinct schm_p values you have.

ElasticSearch - aggregations on nested fields to return additional field in buckets

I don't know if it is possible to return additional fields in the response for each bucket.
The current request returns correct results, but I'm missing additional field information required for later processing.
{
"query": {
"bool": {
"must": {
"match_all": {}
}
}
},
"track_total_hits": true,
"from": 0,
"size": 0,
"aggs": {
"strings": {
"nested": {
"path": "filter_data.string_facet"
},
"aggs": {
"names": {
"terms": {
"field": "filter_data.string_facet.facet-name"
},
"aggs": {
"values": {
"terms": {
"field": "filter_data.string_facet.facet-value"
}
}
}
}
}
}
}
Here is the result. Note the data in field filter_data how nested fields are structured.
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1,
"hits": [{
"_index": "my_index",
"_type": "_doc",
"_id": "7000043",
"_score": 1,
"_source": {
"item_data": {
"doc_id": 7000043,
"id": 7000043,
"live_state": 1,
"item_sku": "7000043",
"manufacturer_id": 1394
},
"filter_data": {
"string_facet": [{
"facet-name": "Thread size",
"facet-value": "G1/2",
"facet-name-id": 12,
"facet-value-id": 34
}]
}
}
}]
},
"aggregations": {
"strings": {
"doc_count": 5,
"names": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Thread size",
"doc_count": 2,
"values": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "G1 1/4",
"doc_count": 1
}, {
"key": "G1/2",
"doc_count": 1
}]
}
}]
}
}
}
Is it possible to add additional fields to each bucket? It would be ideal to have such a format in the response. Basically, add the fields facet-name-id and facet-value-id to each bucket.
....
"buckets": [{
"key": "Thread size",
"doc_count": 2,
"values": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "G1 1/4",
"facet-name-id": 12,
"facet-value-id": 34
"doc_count": 1
}, {
"key": "G1/2",
"facet-name-id": 12,
"facet-value-id": 35
"doc_count": 1
}]
}
}]
...
If this is not possible, what would you recommend?
Thanx.
Sure, you can use top_hits as a sub-aggregation of your deepest facet-value aggregation:
POST my_index/_search?filter_path=aggregations.*.*.buckets.key,aggregations.*.*.buckets.values.buckets.key,aggregations.*.*.buckets.values.buckets.*.hits.hits._source
{
"query": {
"bool": {
"must": {
"match_all": {}
}
}
},
"track_total_hits": true,
"from": 0,
"size": 0,
"aggs": {
"strings": {
"nested": {
"path": "filter_data.string_facet"
},
"aggs": {
"names": {
"terms": {
"field": "filter_data.string_facet.facet-name"
},
"aggs": {
"values": {
"terms": {
"field": "filter_data.string_facet.facet-value"
},
"aggs": {
"my_top_hits": {
"top_hits": {
"size": 10,
"_source": ["filter_data.string_facet"]
}
}
}
}
}
}
}
}
}
}
which'd yield:
{
"aggregations" : {
"strings" : {
"names" : {
"buckets" : [
{
"key" : "Thread size",
"values" : {
"buckets" : [
{
"key" : "G1/2",
"my_top_hits" : {
"hits" : {
"hits" : [
{
"_source" : {
"facet-value" : "G1/2",
"facet-name" : "Thread size",
"facet-value-id" : 34,
"facet-name-id" : 12
}
}
]
}
}
}
]
}
}
]
}
}
}
}
Notice that my_top_hits is an array of string_facet objects instead of an object as you requested. That's because although you're already 2 facets deep (facet-name and then facet-value), there may still be multiple different facet-value-id and facet-name-id combinations covered by a given facet-value bucket.
Having said that, you can of course limit the top_hits count with the size parameter, but then you wouldn't be able to say with certainty whether or not the first top hit's facets are representative of the whole bucket.

Buckets size filter in Elasticsearch

Here is my query result
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 502,
"max_score": 0,
"hits": []
},
"aggregations": {
"HIGH_RISK_USERS": {
"doc_count": 1004,
"USERS_COUNT": {
"doc_count_error_upper_bound": 5,
"sum_other_doc_count": 437,
"buckets": [
{
"key": "49",
"doc_count": 502,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key": "02122219455#53.205.223.157",
"doc_count": 44,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 42
},
{
"key": "CallFrom",
"doc_count": 2
}
]
}
},
{
"key": "+02129916178#53.205.223.157",
"doc_count": 2,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 2
}
]
}
}
]
}
}
}
}
Here is my query
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "x_nova_extensions.entities",
"query": {
"bool": {
"filter": [
{
"match": {
"x_nova_extensions.entities.text": "49"
}
},
{
"terms": {
"x_nova_extensions.entities.type": [
"sourceCountryCode",
"CallerIPCountryCode",
"CallerIPCountryName",
"CallerIPCountryCode",
"CallerPhoneCountryName"
]
}
}
]
}
}
}
}
]
}
},
"aggs": {
"HIGH_RISK_USERS": {
"nested": {
"path": "x_nova_extensions.entities"
},
"aggs": {
"USERS_COUNT": {
"terms": {
"field": "x_nova_extensions.entities.text",
"size": 10,
"order": {
"_count": "desc"
}
},
"aggs": {
"NAME": {
"terms": {
"field": "x_nova_extensions.entities.type",
"include": [
"caller",
"callee",
"CallFrom",
"CallTo"
]
}
}
}
}
}
}
}
}
I want my query to return only bucket[].size > 0
I searched the internet and couldn't find any specific keyword or anything else. I am not even sure whether Elasticsearch supports this, and I would like to be sure that it does.
Is there a keyword for this, or how else can I handle it?
Thanks
I think what you are looking for is a pipeline aggregation.
That way, you can access the bucket count and filter the results accordingly.
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"nameCount": "NAME._bucket_count"
},
"script": {
"source": "params.nameCount != 0"
}
}
}
}
}
But please pay attention to the Elasticsearch version: how it is applied can differ between versions.

ElasticSearch filter on aggregations without affecting aggregation counts

We're using ElasticSearch to find offers based on 5 fields, such as some 'free text', offer state and client name. We also need to aggregate on two fields: client name and offer state. So when someone enters some free text and we find, say, 10 docs with state closed and 8 with state open, the 'state filter' should contain closed(10) and open(8).
Now the problem is, when I select the state 'closed' to be included in the filter, the aggregation result for open changes to 0. I want this to remain 8. So how can I prevent the filter on the aggregations to influence the aggregation itself?
Here is the first query, searching for 'java':
{
"query": {
"bool": {
"filter": [
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
The result is this:
{
"hits": {
"total": 960,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "40542",
"fields": {
"offer_id_ft": [
"40542"
],
"offer_state_f": [
"REJECTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 778
},
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "WITHDRAWN",
"doc_count": 13
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 2
}
As you see, the sum of the client_state_f buckets is equal to the total hits (960). Now, I include one of the states in the query, say 'ACCEPTED'. So my query becomes:
{
"query": {
"bool": {
"filter": [
{
"bool": {
"should": [
{
"term": {
"offer_state_f": "ACCEPTED"
}
}
]
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
What I want is 130 results, but with the client_state_f buckets still summing up to 960. But what I got is this:
{
"hits": {
"total": 130,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "16884",
"fields": {
"offer_id_ft": [
"16884"
],
"offer_state_f": [
"ACCEPTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 0
},
{
"key": "INTAKE",
"doc_count": 0
},
{
"key": "LONGLIST",
"doc_count": 0
},
{
"key": "REJECTED",
"doc_count": 0
},
{
"key": "SHORTLIST",
"doc_count": 0
},
{
"key": "WITHDRAWN",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 10
}
As you can see, only the ACCEPTED bucket is filled, all the others are 0.
You need to move your filters into the post_filter section instead of the query section.
That way, the filtering will be applied after the aggregations are computed and you'll be able to aggregate the whole set of data, but only get result hits matching your filters.
Ok, I found the answer with the help of a colleague, and the thing is, Val is right. +1 for him. What I did was place ALL of my query filters in the post_filter, and that was the problem. I only have to place the filters for the fields on which I want to aggregate in the post_filter. Thus:
{
"query": {
"bool": {
"filter": [
{
"term": {
"broker_f": "false"
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"post_filter" : {
"bool": {
"should": [
{
"term": {
"offer_state_f": "SHORTLIST"
}
}
]
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
And now the result is correct:
{
"hits": {
"total": 5,
"max_score": 0.76667790000000002,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "24454",
"fields": {
"offer_id_ft": [
"24454"
],
"offer_state_f": [
"SHORTLIST"
]
},
"_score": 0.76667790000000002
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 777
},
{
"key": "ACCEPTED",
"doc_count": 52
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "WITHDRAWN",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 4
}

Elasticsearch include field in result set of aggregation

How can a field of type string be included in the result set of an aggregation?
For example given the following mapping:
{
"sport": {
"mappings": {
"runners": {
"properties": {
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"region": {
"type": "string"
},
"sport": {
"type": "string"
}
}
}
}
}
}
Sample data:
curl -XPOST "http://localhost:9200/sport/_bulk" -d'
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Gary", "city":"New York","region":"A","sport":"Soccer"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Bob", "city":"New York","region":"A","sport":"Tennis"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Mike", "city":"Atlanta","region":"B","sport":"Soccer"}
'
How can the field name be included in the result set of the aggregation:
{
"size": 0,
"aggregations": {
"agg": {
"terms": {
"field": "city"}
}
}
}
This seems to do what you want, if I'm understanding you correctly:
POST /sport/_search
{
"size": 0,
"aggregations": {
"city_terms": {
"terms": {
"field": "city"
},
"aggs": {
"name_terms": {
"terms": {
"field": "name"
}
}
}
}
}
}
With the data you provided, it returns:
{
"took": 43,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"city_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "york",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "atlanta",
"doc_count": 1,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mike",
"doc_count": 1
}
]
}
}
]
}
}
}
(You may want to add "index":"not_analyzed" to one or both fields in your mapping, if these results are not what you were expecting.)
Here's the code I used to test it:
http://sense.qbox.io/gist/07735aadc082c1c60409931c279f3fd85a340dbb

Resources