I have two indexes and am running the same query against both, but I get different results: the second index appears to be missing data. Why?
Query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"term": {
"g_cst": {
"value": "73198483380633600",
"boost": 1
}
}
}
]
}
},
"aggs": {
"aggr_per_connection_type": {
"terms": {
"field": "tunnel_type",
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggs": {
"aggr_per_broker": {
"terms": {
"field": "g_brk",
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggs": {
"date_histogram": {
"date_histogram": {
"field": "time",
"fixed_interval": "3600m",
"offset": 0,
"order": {
"_key": "asc"
},
"keyed": false,
"min_doc_count": 0
},
"aggs": {
"app_rtt_us": {
"max": {
"field": "app_rtt_us",
"missing": -1
}
},
"tcp_rtt_us": {
"max": {
"field": "tcp_rtt_us",
"missing": 0
}
}
}
}
}
}
}
}
}
}
I'm getting a weird result from the second index: values are missing from the metric buckets even though doc_count > 0.
{
"took": 53,
"timed_out": false,
"_shards": { "total": 56, "successful": 56, "skipped": 0, "failed": 0 },
"hits": {
"total": { "value": 10000, "relation": "gte" },
"max_score": null,
"hits": []
},
"aggregations": {
"aggr_per_connection_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "TUNNEL_LOG",
"doc_count": 16327,
"aggr_per_broker": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "72057594037937044",
"doc_count": 11902,
"date_histogram": {
"buckets": [
{
"key_as_string": "20211211T12:00:00.000Z",
"key": 1639224000000,
"doc_count": 363,
"app_rtt_us": {
"value": 1
},
"tcp_rtt_us": {
"value": 0
}
},
{
"key_as_string": "20211214T00:00:00.000Z",
"key": 1639440000000,
"doc_count": 1398,
"app_rtt_us": {
"value": 1
},
"tcp_rtt_us": {
"value": 0
}
}
]
}
}
]
}
}
]
}
}
}
I'm not even sure where to look.
Mappings for the value fields must be created before data ingestion, so that the values are
indexed!
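If that's the issue, the first step is to compare the field mappings of the two indexes. A minimal sketch, assuming the indexes are called first_index and second_index (placeholder names) and that the metric fields are numeric:
GET /first_index/_mapping/field/app_rtt_us,tcp_rtt_us,time
GET /second_index/_mapping/field/app_rtt_us,tcp_rtt_us,time

PUT /second_index_fixed
{
  "mappings": {
    "properties": {
      "g_cst":       { "type": "keyword" },
      "g_brk":       { "type": "keyword" },
      "tunnel_type": { "type": "keyword" },
      "time":        { "type": "date" },
      "app_rtt_us":  { "type": "long" },
      "tcp_rtt_us":  { "type": "long" }
    }
  }
}
If a field was never mapped on the second index, its documents are still counted (hence doc_count > 0), but there are no indexed values for the max sub-aggregations to read.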
I need to exclude some keys in a composite aggregation.
Here is one document from my index as an example:
{
"end_date": 1230314400000,
"parameter_codes": [28, 35, 30],
"platform_code": "41012",
"start_date": 1230314400000,
"station_id": 7833246
}
I perform a search request that gives me a result for each platform_code/parameter_codes pair, plus the corresponding station_id, with paging over the buckets.
Here is the request:
{
"size": 0,
"query": {
"match_all": {
"boost": 1.0
}
},
"_source": false,
"aggregations": {
"compositeAgg": {
"composite": {
"size": 10,
"sources": [{
"platform_code": {
"terms": {
"field": "platform_code",
"missing_bucket": false,
"order": "asc"
}
}
}, {
"parameter_codes": {
"terms": {
"field": "parameter_codes",
"missing_bucket": false,
"order": "asc"
}
}
}]
},
"aggregations": {
"aggstation_id": {
"terms": {
"field": "station_id",
"size": 2147483647,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": {
"_key": "asc"
}
}
},
"pipe": {
"bucket_sort": {
"sort": [{
"_key": {
"order": "asc"
}
}],
"from": 0,
"size": 10,
"gap_policy": "SKIP"
}
}
}
}
}
}
This request gives me the following results:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 8,
"successful": 8,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"composite#compositeAgg": {
"after_key": {
"platform_code": "41012",
"parameter_codes": 60
},
"buckets": [{
"key": {
"platform_code": "41012",
"parameter_codes": 28
},
"doc_count": 1,
"lterms#aggstation_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": 7833246,
"doc_count": 1
}]
}
}, {
"key": {
"platform_code": "41012",
"parameter_codes": 30
},
"doc_count": 2,
"lterms#aggstation_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": 7833246,
"doc_count": 1
}, {
"key": 12787501,
"doc_count": 1
}]
}
}, {
"key": {
"platform_code": "41012",
"parameter_codes": 35
},
"doc_count": 2,
"lterms#aggstation_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": 7833246,
"doc_count": 1
}, {
"key": 12787501,
"doc_count": 1
}]
}
}]
}
}
}
This works very well, but I need to exclude one or more parameter_codes.
For example, by excluding '35', I want only the keys:
{
"platform_code": "41012",
"parameter_codes": 28
}
and
{
"platform_code": "41012",
"parameter_codes": 30
}
I have tried many options but cannot get this to work.
Does anybody know how I can do that?
A script can be used in the composite source to return only specific values of the array.
{
"size": 0,
"query": {
"match_all": {
"boost": 1
}
},
"_source": false,
"aggregations": {
"compositeAgg": {
"composite": {
"size": 10,
"sources": [
{
"platform_code": {
"terms": {
"field": "platform_code.keyword",
"missing_bucket": false,
"order": "asc"
}
}
},
{
"parameter_codes": {
"terms": {
"script": {
"source": """
def arr = [];
for (item in doc['parameter_codes']) {
  if (item != 35) {
    arr.add(item);
  }
}
return arr"""
}
}
}
}
]
},
"aggregations": {
"aggstation_id": {
"terms": {
"field": "station_id",
"size": 2147483647,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": {
"_key": "asc"
}
}
},
"pipe": {
"bucket_sort": {
"sort": [
{
"_key": {
"order": "asc"
}
}
],
"from": 0,
"size": 10,
"gap_policy": "SKIP"
}
}
}
}
}
}
Alternatively, you can exclude parameter_codes=35 in the query itself. Note that this removes every document whose parameter_codes array contains 35, not just the 35 bucket:
{
"query": {
"bool": {
"must_not": [
{
"term": {
"parameter_codes": {
"value": "35"
}
}
}
]
}
}
}
Sorry if this has been asked already, but I've been lurking around SO and couldn't find anything that suits my needs.
Basically, what I'm trying to achieve in my first quick tries with ES is to add further counters within a Terms Aggregation.
Giving it a quick try, I'm sending the following request to ES.
POST http://localhost:9200/people/_search
{
"size": 0,
"aggs": {
"agg_by_name": {
"terms": { "field": "name"}
}
}
}
And what I'm getting right now is just what the sample shows in the docs.
{
"took": 89,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 10000,
"relation": "gte"
},
"max_score": null,
"hits": []
},
"aggregations": {
"agg_by_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 9837,
"buckets": [
{
"key": "James",
"doc_count": 437
},
{
"key": "Eduard",
"doc_count": 367
},
{
"key": "Leonardo",
"doc_count": 235
},
{
"key": "George",
"doc_count": 209
},
{
"key": "Harrison",
"doc_count": 180
}, ...
However, I can't figure out how to include further inner aggregations in each bucket. Something that would result in a document like this:
{
"key": "Harrison",
"doc_count": 180,
"lives_in_NY": 40,
"lives_in_CA": 140,
"distinct_surnames": [ ... ]
}
How should I structure my aggregation so that those are included bucket-wise?
You could try something like this:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheSurnames": {
"terms": {
"field": "surname",
"size": 100
}
}
}
}
}
}
For the city of residence, it could be something like:
{
"size": 0,
"aggs": {
"getAllTheNames": {
"terms": {
"field": "name",
"size": 100
},
"aggs": {
"getAllTheCities": {
"terms": {
"field": "city",
"size": 100
}
}
}
}
}
}
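If you specifically want named counters such as lives_in_NY and lives_in_CA inside each bucket, rather than one sub-bucket per city, a filters sub-aggregation is one way to sketch it, assuming a keyword city field with values like "NY" and "CA":
{
  "size": 0,
  "aggs": {
    "agg_by_name": {
      "terms": { "field": "name", "size": 100 },
      "aggs": {
        "lives_in": {
          "filters": {
            "filters": {
              "lives_in_NY": { "term": { "city": "NY" } },
              "lives_in_CA": { "term": { "city": "CA" } }
            }
          }
        },
        "distinct_surnames": {
          "terms": { "field": "surname", "size": 100 }
        }
      }
    }
  }
}
Each named filter comes back as its own bucket with a doc_count, which maps directly onto the desired output above.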
I would like to execute this kind of operation in Painless:
if (_value >= 'c') {
return _value
} else {
return '__BAD__'
}
The value is a string and I would like the following behaviour:
If the value is foo, I want to replace it with __BAD__; if the value is bar, I want to keep bar. Only values alphabetically after 'c' should be set to __BAD__.
I got this exception:
"lang": "painless",
"caused_by": {
"type": "class_cast_exception",
"reason": "Cannot apply [>] operation to types [java.lang.String] and [java.lang.String]."
}
Is there a way to perform alphabetical comparison between strings in Painless?
My documents look like this:
{
"id": "doca",
"categoryId": "aaa",
"parentNames": "a$aa$aaa"
},
{
"id": "docb",
"categoryId": "bbb",
"parentNames": "a$aa$bbb"
},
{
"id": "docz",
"categoryId": "zzz",
"parentNames": "a$aa$zzz"
}
and my query is:
{
"query": {
"bool": {
"filter": []
}
},
"size": 0,
"aggs": {
"catNames": {
"terms": {
"size": 10000,
"order": {
"_key": "asc"
},
"script": {
"source": "if(doc['parentNames'].value < 'a$aa$ccc') {return doc['parentNames'].value} return '__BAD__'",
"lang": "painless"
}
},
"aggs": {
"sort": {
"bucket_sort": {
"size": 2
}
},
"catId": {
"terms": {
"field": "categoryId",
"size": 1
}
}
}
}
}
}
I am expecting this result:
{
"took": 29,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"catNames": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "__BAD__",
"doc_count": 1,
"catId": {
"buckets": [
{
"key": "aaa",
"doc_count": 1
}
]
}
},
{
"key": "a$aa$bbb",
"doc_count": 1,
"catId": {
"buckets": [
{
"key": "bbb",
"doc_count": 1
}
]
}
},
{
"key": "a$aa$zzz",
"doc_count": 1,
"catId": {
"buckets": [
{
"key": "zzz",
"doc_count": 1
}
]
}
}
]
}
}
}
In fact, I can use the compareTo method of java.lang.String.
if (_value.compareTo('c') > 0) {
return _value
} else {
return '__BAD__'
}
My query becomes:
{
"query": {
"bool": {
"filter": []
}
},
"size": 0,
"aggs": {
"catNames": {
"terms": {
"size": 10000,
"order": {
"_key": "asc"
},
"script": {
"source": "if(doc['parentNames'].value.compareTo('a$aa$ccc')) {return doc['parentNames'].value} return '__BAD__'",
"lang": "painless"
}
},
"aggs": {
"sort": {
"bucket_sort": {
"size": 2
}
},
"catId": {
"terms": {
"field": "categoryId",
"size": 1
}
}
}
}
}
}
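One detail about the query above: compareTo returns a negative, zero, or positive integer rather than a boolean, so the result has to be compared against 0 inside the if, e.g.:
if (doc['parentNames'].value.compareTo('a$aa$ccc') > 0) {
  return doc['parentNames'].value;
}
return '__BAD__';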
Here is my query result
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 502,
"max_score": 0,
"hits": []
},
"aggregations": {
"HIGH_RISK_USERS": {
"doc_count": 1004,
"USERS_COUNT": {
"doc_count_error_upper_bound": 5,
"sum_other_doc_count": 437,
"buckets": [
{
"key": "49",
"doc_count": 502,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key": "02122219455#53.205.223.157",
"doc_count": 44,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 42
},
{
"key": "CallFrom",
"doc_count": 2
}
]
}
},
{
"key": "+02129916178#53.205.223.157",
"doc_count": 2,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 2
}
]
}
}
]
}
}
}
}
Here is my query
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "x_nova_extensions.entities",
"query": {
"bool": {
"filter": [
{
"match": {
"x_nova_extensions.entities.text": "49"
}
},
{
"terms": {
"x_nova_extensions.entities.type": [
"sourceCountryCode",
"CallerIPCountryCode",
"CallerIPCountryName",
"CallerIPCountryCode",
"CallerPhoneCountryName"
]
}
}
]
}
}
}
}
]
}
},
"aggs": {
"HIGH_RISK_USERS": {
"nested": {
"path": "x_nova_extensions.entities"
},
"aggs": {
"USERS_COUNT": {
"terms": {
"field": "x_nova_extensions.entities.text",
"size": 10,
"order": {
"_count": "desc"
}
},
"aggs": {
"NAME": {
"terms": {
"field": "x_nova_extensions.entities.type",
"include": [
"caller",
"callee",
"CallFrom",
"CallTo"
]
}
}
}
}
}
}
}
}
I want my query to return only the buckets where buckets[].size > 0.
I searched the internet and couldn't find a specific keyword for this; I'm not even sure whether Elasticsearch supports it, and I'd like to confirm that it does.
Is there a keyword for this, or how else can I handle it?
Thanks
I think what you are looking for is a pipeline aggregation, specifically bucket_selector.
That way, you can access the bucket count and filter the results accordingly.
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"nameCount": "NAME._bucket_count"
},
"script": {
"source": "params.nameCount != 0"
}
}
}
}
}
But please pay attention to your Elasticsearch version; how this is applied can differ between versions.
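Placed in context, the selector sits next to NAME inside the aggs of USERS_COUNT; a sketch based on the query above:
"USERS_COUNT": {
  "terms": {
    "field": "x_nova_extensions.entities.text",
    "size": 10
  },
  "aggs": {
    "NAME": {
      "terms": {
        "field": "x_nova_extensions.entities.type",
        "include": ["caller", "callee", "CallFrom", "CallTo"]
      }
    },
    "min_bucket_selector": {
      "bucket_selector": {
        "buckets_path": {
          "nameCount": "NAME._bucket_count"
        },
        "script": {
          "source": "params.nameCount != 0"
        }
      }
    }
  }
}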
I am struggling to create the query/rule that will help me build an alerting script. I want to query the Elasticsearch API for counts on a specific index so that I can get alerted when the count reaches a certain threshold.
The following query is an attempt, as I have no experience with this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"analyze_wildcard": true,
"query": "*"
}
},
"filter": {
"bool": {
"must": [
{
"query": {
"match": {
"PStream": {
"query": "*",
"type": "phrase"
}
}
}
},
{
"range": {
"#timestamp": {
"gte": 1447789445320,
"lte": 1447793045320
}
}
}
],
"must_not": []
}
}
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
},
"size": 500,
"sort": [
{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}
],
"aggs": {
"2": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"pre_zone": "-05:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 0,
"extended_bounds": {
"min": 1447789445317,
"max": 1447793045317
}
}
}
}
}
PStream is the field that I am focused on.
EDIT:
An example of the data going to the index:
{
"_index": "logstash-2015.11.17",
"_type": "logs",
"_id": "AVEXMKu2YVnF1NOjr9YT",
"_score": null,
"_source": {
"authorUrl": "",
"postUrl": "",
"pubDate": "2015-11-17T15:18:24",
"scrapeDate": "2015-11-17T15:44:03",
"clientId": "136902834",
"query": "Jenny Balatsinou",
"PType": "post",
"tLatency": 1539,
"PLang": "en",
"PStream": "864321",
"PName": "xStackOverflow",
"#version": "1",
"#timestamp": "2015-11-17T20:44:03.400Z"
},
"fields": {
"#timestamp": [
1447793043400
],
"pubDate": [
1447773504000
],
"scrapeDate": [
1447775043000
]
},
"sort": [
1447793043400
]
There are about 20 million of these messages indexed into Elasticsearch daily. I have created a dashboard in Kibana where I view this data and its stats. I would like to write the proper query to use in a Java program that runs periodically and checks this index. It should return the hourly total count grouped by the PStream field, which has multiple values, so that an alert is sent anytime a count is 0.
E.g. output:
"result": {
"total": 74,
"successful": 63,
"failed": 11,
{
"index": "logstash-2015.11.08",
"PStream": "37647338933",
"Count": 1234532
},
{
"index": "logstash-2015.11.08",
"PStream": "45345343566",
"Count": 156532
},
As a quick example (per comments above), I just set up a trivial index:
DELETE /test_index
PUT /test_index
added some (simplified) data:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":2}}
{"PStream": "864321","#timestamp": "2015-11-17T21:44:03.400Z"}
{"index":{"_id":3}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":4}}
{"PStream": "864322","#timestamp": "2015-11-17T21:44:03.400Z"}
And now I can get the "PStream" terms inside an hour histogram:
POST /test_index/_search
{
"size": 0,
"aggs" : {
"timestamp_histogram" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
}
}
}
}
}
}
...
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 1
},
{
"key": "864322",
"doc_count": 1
}
]
}
}
]
}
}
}
or the other way around:
POST /test_index/_search
{
"size": 0,
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
},
"aggs": {
"timestamp_histogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "hour"
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 3,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
},
{
"key": "864322",
"doc_count": 1,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/6c0c30db1cf0fb8529bcfec21c0ce5c02a5ae94c
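If the end goal is alerting whenever a count drops to 0, note that empty buckets are omitted by default; min_doc_count: 0 together with extended_bounds makes the empty hours show up explicitly. A sketch against the same test index (the bounds are placeholders):
POST /test_index/_search
{
  "size": 0,
  "aggs": {
    "pstream_terms": {
      "terms": { "field": "PStream" },
      "aggs": {
        "timestamp_histogram": {
          "date_histogram": {
            "field": "@timestamp",
            "interval": "hour",
            "min_doc_count": 0,
            "extended_bounds": {
              "min": "2015-11-17T18:00:00.000Z",
              "max": "2015-11-17T22:00:00.000Z"
            }
          }
        }
      }
    }
  }
}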