Elasticsearch: Alert on significant data change - elasticsearch

I have an index containing a log of exchange rate changes. Documents in the index look like this:
{
"sourceId": "gh-ghs",
"targetCountry": "gh",
"targetCurrency": "ghs",
"rate": 2.3,
"modified": "2021-04-07T12:00:57.2760000Z",
},
{
"sourceId": "gh-ghs",
"targetCountry": "gh",
"targetCurrency": "ghs",
"rate": 2.5,
"modified": "2021-04-06T12:00:57.2760000Z",
},
{
"sourceId": "mx-mxn",
"targetCountry": "mx",
"targetCurrency": "mxn",
"rate": 20.3,
"modified": "2021-04-08T12:00:57.2760000Z",
},
{
"sourceId": "mx-mxn",
"targetCountry": "mx",
"targetCurrency": "mxn",
"rate": 2.2,
"modified": "2021-04-07T12:00:57.2760000Z",
},
{
"sourceId": "mx-mxn",
"targetCountry": "mx",
"targetCurrency": "mxn",
"rate": 2.23,
"modified": "2021-04-06T12:00:57.2760000Z",
}
As you can see, we record exchange rate changes per destination (country + currency). Take a look at the mx-mxn documents: the last change was from 2.2 to 20.3. This is probably a human mistake, and we want to be alerted about such cases.
I tried to create the following query to find significant changes:
{
"query": {
"bool": {
"must": [],
"filter": [
{
"range": {
"modified": {
"format": "strict_date_optional_time",
"gte": "now",
"lte": "now - 5h"
}
}
}
],
"should": [],
"must_not": []
}
},
"aggs": {
"group": {
"terms": {
"field": "sourceId",
"size": 1000
},
"aggs": {
"2-metric": {
"top_metrics": {
"metrics": {
"field": "rate"
},
"size": 2,
"sort": {
"modified": "desc"
}
}
}
}
}
}
}
Using this query I managed to get the latest changes per destination. The response looks like this:
{
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mx-mxn",
"doc_count": 25,
"2-metric": {
"top": [
{
"sort": [
"2022-10-16T15:01:57.098Z"
],
"metrics": {
"rate": 12
}
},
{
"sort": [
"2022-10-16T13:01:57.098Z"
],
"metrics": {
"rate": 150
}
}
]
}
},
{
"key": "gh-ghs",
"doc_count": 18,
"2-metric": {
"top": [
{
"sort": [
"2022-10-14T11:28:38.995Z"
],
"metrics": {
"rate": 11.25
}
},
{
"sort": [
"2022-10-13T11:37:09.945Z"
],
"metrics": {
"rate": 10.9609375
}
}
]
}
}
]
}
}
}
So I managed to get the two latest changes for each destination. But I want to set up an alert for all buckets where the value changed by more than 10 percent. In this case that is mx-mxn. How can I do this in Elasticsearch and Kibana?

Tldr;
To gather the "detection" data:
anomaly detection jobs
derivative aggregation
To perform the alert:
Kibana alerting with machine learning job.
Watcher with elasticsearch query input.

Related

Is it possible to add data in Elastic Search from a filter?

I have an API backed by Elasticsearch. Depending on the login/password, a different filter is applied automatically.
Elastic search index contains:
"organisation.id"
"organisation.name"
"organisation.country"
"shop.id"
"shop.name"
"shop.address"
"creationdatetime"
This would be a sample filter:
{
"_source":{
"includes":[
"organisation.id",
"organisation.name",
"shop.id",
"shop.name",
"creationdatetime"
],
"excludes": [
"shop.address",
"organisation.country"
]
},
"from":"0",
"size":"500",
"sort":{"creationdatetime":"asc"},
"query":{
"bool":{
"must":{
"match":{
"shop.sharedwith":"client1"
}
},
"filter":{
"range":{
"creationdatetime":{
"gte":"2020-01-01"
}
}
}
}
}
}
Output would be
{
"total": 2,
"from": "0",
"size": "10",
"hops": [
{
"organisation": [
{
"name": "A1",
"id": "0001-A1"
}
],
"shop": [
{
"name": "A1Shop",
"id": "0001-0001-A1"
}
]
}
]
}
I would like to add a "version" and "filtername" to the output... coming from the filter itself.
Exactly this:
{
"total": 2,
"from": "0",
"size": "10",
"version": "1.0.0.0", // -------------------------------NEW FIELD
"filtername": "filter01", // -------------------------------NEW FIELD
"hops": [
{
"organisation": [
{
"name": "A1",
"id": "0001-A1"
}
],
"shop": [
{
"name": "A1Shop",
"id": "0001-0001-A1"
}
]
}
]
}
Is it possible to add those two extra outputs from the filter itself?
This is not directly possible but there's a workaround using a top_hits aggregation in combination with agg metadata:
GET _search
{
"size": 0, // no need for the standard hits b/c of our `top_hits`
"query": {
"match_all": {} // your actual query
},
"aggs": {
"my_hits": {
"top_hits": {
"size": 10,
"_source": {
"includes": [
"organisation.id",
"organisation.name",
"shop.id",
"shop.name",
"creationdatetime"
],
"excludes": [
"shop.address",
"organisation.country"
]
}
},
"meta": { // custom key-value pairs
"version": "1.0.0.0",
"filtername": "filter01"
}
}
}
}
resulting in
{
...
"aggregations": {
"my_hits": {
"meta": {
"version": "1.0.0.0",
"filtername": "filter01"
},
"hits": {
... // the actual docs
}
}
}
}
It's also worth looking at named queries although their use here is very loosely applicable.

Kibana and Vega with nested aggregations

Given the Vega JSON below in Kibana, how can I make it dynamic so that when a user searches for something in the search box, it applies to both the query and the pos_filter aggregation below? For example, if a user types 'face*' in the search box, I want the values marked with "<----- this to change" below to change dynamically.
{
"$schema": "https://vega.github.io/schema/vega/v4.3.0.json",
"autosize": "fit",
"title": "POS COUNT",
"data": [
{
"name": "data_table",
"url": {
"index": "sa_test_index_data",
"body": {
"size": 0,
"query": {
"nested": {
"path": "xforms.sentence.tokens",
"query": {
"bool": {
"should": [
{
"wildcard": {
"xforms.sentence.tokens.value.keyword": "24*" <----- this to change
}
}
]
}
}
}
},
"aggs": {
"sentence": {
"nested": {"path": "xforms.sentence.tokens"},
"aggs": {
"pos_filter": {
"filter": {
"wildcard": {"xforms.sentence.tokens.value.keyword": "24*"} <----- this to change
},
"aggs": {
"pos": {
"terms": {"field": "xforms.sentence.tokens.tag.keyword"}
}
}
}
}
}
}
}
},
"format": {"property": "aggregations.sentence.pos_filter.pos.buckets"},
"transform": [
{
"type": "collect",
"sort": {"field": ["doc_count"], "order": ["descending"]}
}
]
},
{
"name": "data_table_pie_inner",
"source": "data_table",
"transform": [
{
"type": "aggregate",
"groupby": ["key"],
"fields": ["doc_count"],
"ops": ["sum"],
"as": ["ff_sum_count"]
},
{
"type": "pie",
"field": "ff_sum_count",
"as": ["ff_inner_startAngle", "ff_inner_endAngle"]
}
]
}
],
"scales": [
{
"name": "scale_color",
"type": "ordinal",
"range": {"scheme": "category10"},
"domain": {"data": "data_table", "field": "key"}
}
],
"marks": [
{
"name": "mark_inner_ring",
"type": "arc",
"from": {"data": "data_table_pie_inner"},
"encode": {
"enter": {
"x": {"signal": "width / 2"},
"y": {"signal": "height / 2"},
"fill": {"scale": "scale_color", "field": "key"},
"fillOpacity": {"value": 0.8},
"stroke": {"value": "white"},
"startAngle": {"field": "ff_inner_startAngle"},
"endAngle": {"field": "ff_inner_endAngle"},
"innerRadius": {"value": 0},
"outerRadius": {"value": 100},
"tooltip": {
"signal": "datum['key'] + ': count ' + datum['ff_sum_count']"
}
}
}
}
],
"legends": [
{
"fill": "scale_color",
"title": "POS",
"orient": "right",
"encode": {
"symbols": {"enter": {"fillOpacity": {"value": 0.5}}},
"labels": {"update": {"text": {"field": "value"}}}
}
}
]
}
I'm slightly confused by the question, but I figure you are looking to incorporate the dashboard filters into the query body as well. Make sure to use the %...% parameters in your query, as in the example below.
I noticed in the comment you mention it going into the aggs. I don't think this is doable, maybe try using the %...% parameters in there?
{
body: {
query: {
bool: {
must: [
// This string will be replaced
// with the auto-generated "MUST" clause
"%dashboard_context-must_clause%"
{
range: {
// apply timefilter (upper right corner)
// to the @timestamp variable
@timestamp: {
// "%timefilter%" will be replaced with
// the current values of the time filter
// (from the upper right corner)
"%timefilter%": true
// Only work with %timefilter%
// Shift current timefilter by 10 units back
shift: 10
// week, day (default), hour, minute, second
unit: minute
}
}
}
]
must_not: [
// This string will be replaced with
// the auto-generated "MUST-NOT" clause
"%dashboard_context-must_not_clause%"
]
filter: [
// This string will be replaced
// with the auto-generated "FILTER" clause
"%dashboard_context-filter_clause%"
]
}
}
}
}

Post Filtering Date histogram aggregation bucket results not working

I have an aggregation query where I am trying to calculate the max standard deviation of the number of destination ips per IP Address for a certain time range. As everyone knows the common problem with the moving function std_dev aggregation function, the first 2 days' std dev values will always be null and 0 respectively due to no data being taken into account previously.
Here is my aggregation query:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"exists": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
}
]
}
},
"aggs": {
"range": {
"date_range": {
"field": "Source Time",
"ranges": [
{
"from": "2018-04-25",
"to": "2018-05-02"
}
]
},
"aggs": {
"by ip": {
"terms": {
"field": "IP Address.keyword",
"size": 500
},
"aggs": {
"datehisto": {
"date_histogram": {
"field": "Source Time",
"interval": "day"
},
"aggs": {
"max_dest_ips": {
"sum": {
"field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
}
},
"max_dest_ips_std_dev": {
"moving_fn": {
"buckets_path": "max_dest_ips",
"window": 3,
"script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))"
}
}
}
}
}
}
}
}
},
"post_filter": {
"range": {
"Source Time": {
"gte": "2018-05-01"
}
}
}
}
Here is a snippet of the response:
{
"key": "192.168.0.1",
"doc_count": 6,
"datehisto": {
"buckets": [
{
"key_as_string": "2018-04-25T00:00:00.000Z",
"key": 1524614400000,
"doc_count": 1,
"max_dest_ips": {
"value": 309
},
"max_dest_ips_std_dev": {
"value": null
}
},
{
"key_as_string": "2018-04-26T00:00:00.000Z",
"key": 1524700800000,
"doc_count": 1,
"max_dest_ips": {
"value": 529
},
"max_dest_ips_std_dev": {
"value": 0
}
},
{
"key_as_string": "2018-04-27T00:00:00.000Z",
"key": 1524787200000,
"doc_count": 1,
"max_dest_ips": {
"value": 408
},
"max_dest_ips_std_dev": {
"value": 110
}
},
{
"key_as_string": "2018-04-28T00:00:00.000Z",
"key": 1524873600000,
"doc_count": 1,
"max_dest_ips": {
"value": 187
},
"max_dest_ips_std_dev": {
"value": 89.96419040682551
}
}
]
}
}
What I want is for the first 2 days' bucket data (25th and 26th) to be filtered and removed from the above bucket results. I have tried the post filter above and the normal query filter below:
"filter": {
"range": {
"Source Time": {
"gte": "2018-04-27"
}
}
}
The Post Filter does nothing and doesn't work. The above filter range query makes the buckets start from the 27th but also makes the standard deviation calculations start on 27th as well (resulting in 27th being null and 28th being 0) when I want it to start from the 25th instead.
Any other alternative solutions? Help is greatly appreciated!

Multiple key aggregation in ElasticSearch

I am new to Elastic Search and was exploring aggregation query. The documents I have are in the format -
{"name":"A",
"class":"10th",
"subjects":{
"S1":92,
"S2":92,
"S3":92,
}
}
We have about 40k such documents in our ES with the Subjects varying from student to student. The query to the system can be to aggregate all subject-wise scores for a given class. We tried to create a bucket aggregation query as explained in this guide here, however, this generates a single bucket per document and in our understanding requires an explicit mention of every subject.
We want the system to generate subject-wise aggregates for the data by executing a single aggregation query. The problem I face is that in our data the subjects can vary from student to student and we don't have a global list of subject keys.
We wrote the following script but this only works if we know all possible subjects.
GET student_data_v1_1/_search
{ "query" :
{"match" :
{ "class" : "' + query + '" }},
"aggs" : { "my_buckets" : { "terms" :
{ "field" : "subjects", "size":10000 },
"aggregations": {"the_avg":
{"avg": { "field": "subjects.value" }}} }},
"size" : 0 }'
However, this query only works for the following document structure, and does not work when multiple subjects are defined whose keys we may not know in advance -
{"name":"A",
"class":"10th",
"subjects":{
"value":93
}
}
An alternative form of the document is one where subjects is a list of dictionaries -
{"name":"A",
"class":"10th",
"subjects":[
{"S1":92},
{"S2":92},
{"S3":92},
]
}
Having an aggregation query to solve either of the 2 document formats would be helpful.
======EDITS======
After updating the document to hold weights for each subject -
{
"class": "10th",
"subject": [
{
"name": "s1",
"marks": 90,
"weight":30
},
{
"name": "s2",
"marks": 80,
"weight":70
}
]}
I have updated the query to be -
{
"query": {
"match": {
"class": "10th"
}
},
"aggs": {
"subjects": {
"nested": {
"path": "scores"
},
"aggs": {
"subjects": {
"terms": {
"field": "subject.name"
},
"aggs" : { "weighted_grade": { "weighted_avg": { "value": { "field": "subjects.score" }, "weight": { "field": "subjects.weight" } } } }
}
}
}
}
},
"size": 0
}
but it throws the error-
{u'error': {u'col': 312,
u'line': 1,
u'reason': u'Unknown BaseAggregationBuilder [weighted_avg]',
u'root_cause': [{u'col': 312,
u'line': 1,
u'reason': u'Unknown BaseAggregationBuilder [weighted_avg]',
u'type': u'unknown_named_object_exception'}],
u'type': u'unknown_named_object_exception'},
u'status': 400}
To achieve the required result I would suggest you to keep your index mapping as follows:
{
"properties": {
"class": {
"type": "keyword"
},
"subject": {
"type": "nested",
"properties": {
"marks": {
"type": "integer"
},
"name": {
"type": "keyword"
}
}
}
}
}
In the mapping above I have created subject as nested type with two properties, name to hold subject name and marks to hold marks in the subject.
Sample doc:
{
"class": "10th",
"subject": [
{
"name": "s1",
"marks": 90
},
{
"name": "s2",
"marks": 80
}
]
}
Now you can use nested aggregation and multilevel aggregation (i.e. aggregation inside aggregation). I used nested aggregation with terms aggregation for subject.name to get bucket containing all the available subjects. Then to get avg for each subject we add a child aggregation of avg to the subjects aggregation as below:
{
"query": {
"match": {
"class": "10th"
}
},
"aggs": {
"subjects": {
"nested": {
"path": "subject"
},
"aggs": {
"subjects": {
"terms": {
"field": "subject.name"
},
"aggs": {
"avg_score": {
"avg": {
"field": "subject.marks"
}
}
}
}
}
}
},
"size": 0
}
NOTE: I have added "size" : 0 so that elastic doesn't return matching docs in the result. To include or exclude it depends totally on your use case.
Sample result:
{
"took": 25,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"subjects": {
"doc_count": 6,
"subjects": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "s1",
"doc_count": 3,
"avg_score": {
"value": 80
}
},
{
"key": "s2",
"doc_count": 2,
"avg_score": {
"value": 75
}
},
{
"key": "s3",
"doc_count": 1,
"avg_score": {
"value": 80
}
}
]
}
}
}
}
As you can see the result contains buckets with key as subject name and avg_score.value as the avg of marks.
UPDATE to include weighted_avg:
{
"query": {
"match": {
"class": "10th"
}
},
"aggs": {
"subjects": {
"nested": {
"path": "subject"
},
"aggs": {
"subjects": {
"terms": {
"field": "subject.name"
},
"aggs": {
"avg_score": {
"avg": {
"field": "subject.marks"
}
},
"weighted_grade": {
"weighted_avg": {
"value": {
"field": "subject.marks"
},
"weight": {
"field": "subject.weight"
}
}
}
}
}
}
}
},
"size": 0
}

Elasticsearch aggregations for faceted search excluding some fields

I have a shop which uses Elasticsearch 2.4 for faceted search.
But at the moment the existing filters (product attributes) are taken from MySQL. I want to do this using Elasticsearch aggregations instead.
But I ran into a problem: I do not need to aggregate all the attributes.
What I have:
Part of Mapping:
...
'is_active' => [
'type' => 'long',
'index' => 'not_analyzed',
],
'category_id' => [
'type' => 'long',
'index' => 'not_analyzed',
],
'attrs' => [
'properties' => [
'attr_name' => ['type' => 'string', 'index' => 'not_analyzed'],
'value' => [
'type' => 'string',
'index' => 'analyzed',
'analyzer' => 'attrs_analizer',
],
]
],
...
Example of data:
{
"id": 1,
"is_active": "1",
"category_id": 189,
...
"price": "48.00",
"attrs": [
{
"attr_name": "Brand",
"value": "TP-Link"
},
{
"attr_name": "Model",
"value": "TL-1"
},
{
"attr_name": "Other",
"value": "<div>Some text of 'Other' property<br><img src......><ul><li>......</ul></div>"
}
]
},
{
"id": 2,
"is_active": "1",
"category_id": 242,
...
"price": "12.00",
"attrs": [
{
"attr_name": "Brand",
"value": "Lenovo"
},
{
"attr_name": "Model",
"value": "B570"
},
{
"attr_name": "OS",
"value": "Linux"
},
{
"attr_name": "Other",
"value": "<div>Some text of 'Other' property<br><img src......><ul><li>......</ul></div>"
}
]
},
{
"id": 3,
"is_active": "1",
"category_id": 242,
...
"price": "24.00",
"attrs": [
{
"attr_name": "Brand",
"value": "Asus"
},
{
"attr_name": "Model",
"value": "QZ85"
},
{
"attr_name": "OS",
"value": "Windows"
},
{
"attr_name": "Other",
"value": "<div>Some text of 'Other' property<br><img src......><ul><li>......</ul></div>"
}
]
}
Attributes such as "Model" and "Other" are not used when filtering products, they are only displayed on the product page. On the other attributes (Brand, OS, and others ...) I want to receive aggregations.
When I try to aggregate the attrs.value field, of course I get aggregations for all data (including the large "Other" fields, in which there can be a lot of HTML).
"aggs": {
"facet_value": {
"terms": {
"field": "attrs.value",
"size": 0
}
}
}
How to exclude "attrs.attr_name": ["Model", "Other"]?
Changing the mapping is a bad solution for me, but if it is inevitable, can you tell me how to do it? I guess I'll need to make "attrs" nested?
UPD:
I want to receive:
1. All the attributes that the products have in a certain category, except for those that I indicate in the settings of my system (in this example I will exclude "Model" and "Other").
2. Number of products near each value.
It should look like this:
For category "Laptops":
Brand:
Lenovo (18)
Asus (19)
.....
OS:
Windows (19)
Linux (5)
...
For "computer monitors":
Brand:
Samsung (18)
LG (19)
.....
Resolution:
1360x768 (19)
1920x1080 (22)
....
It's Terms Aggregation , I use this for the number of products for each category. And I try it for attrs.value, but I do not know how to exclude "attrs.value", which refer to "attrs.attr_name": "Model" & "attrs.attr_name": "Other".
UPD2:
In my case, if I map attrs as a nested type, the size of the index increases by 30%,
from 2700Mi to 3510Mi.
If there is no other option, I'll have to put up with it.
You have to first map attrs as a nested type and then use nested aggregations.
PUT no_play
{
"mappings": {
"document_type" : {
"properties": {
"is_active" : {
"type": "long"
},
"category_id" : {
"type": "long"
},
"attrs" : {
"type": "nested",
"properties": {
"attr_name" : {
"type" : "keyword"
},
"value" : {
"type" : "keyword"
}
}
}
}
}
}
}
POST no_play/document_type
{
"id": 3,
"is_active": "1",
"category_id": 242,
"price": "24.00",
"attrs": [
{
"attr_name": "Brand",
"value": "Asus"
},
{
"attr_name": "Model",
"value": "QZ85"
},
{
"attr_name": "OS",
"value": "<div>Some text of 'Other' property<br><img src......><ul><li>......</ul></div>"
},
{
"attr_name": "Other",
"value": "<div>Some text of 'Other' property<br><img src......><ul><li>......</ul></div>"
}
]
}
Since you didn't mention how you want to aggregate, both cases are shown below.
Case 1) If you want to count the attrs as individual. This metric gives you count of term occurrences.
POST no_play/_search
{
"size": 0,
"aggs": {
"nested_aggregation_value": {
"nested": {
"path": "attrs"
},
"aggs": {
"value_term": {
"terms": {
"field": "attrs.value",
"size": 10
}
}
}
}
}
}
POST no_play/_search
{
"size": 0,
"aggs": {
"nested_aggregation_value": {
"nested": {
"path": "attrs"
},
"aggs": {
"value_term": {
"terms": {
"field": "attrs.value",
"size": 10
},
"aggs": {
"reverse_back_to_roots": {
"reverse_nested": {
}
}
}
}
}
}
}
}
Now to get count of root document with attrs value you will need to hook a reverse nested aggregation to move the aggregator a level up to the level of root document.
Think of the following document.
{
"id": 3,
"is_active": "1",
"category_id": 242,
"price": "24.00",
"attrs": [
{
"attr_name": "Brand",
"value": "Asus"
},
{
"attr_name": "Model",
"value": "QZ85"
},
{
"attr_name": "OS",
"value": "repeated value"
},
{
"attr_name": "Other",
"value": "repeated value"
}
]
}
For the first query the count for 'repeated value' will be 2 (each nested attrs entry is counted), and for the second query the reverse_nested count will be 1 (the root document is counted once).
Note
here is how you can do filtering to exclude
POST no_play/_search
{
"size": 0,
"aggs": {
"nested_aggregation_value": {
"nested": {
"path": "attrs"
},
"aggs": {
"filtered_results": {
"filter": {
"bool": {
"must_not": [{
"terms": {
"attrs.attr_name": ["Model", "Brand"]
}
}]
}
},
"aggs": {
"value_term": {
"terms": {
"field": "attrs.value",
"size": 10
}
}
}
}
}
}
}
}
POST no_play/_search
{
"size": 0,
"aggs": {
"nested_aggregation_value": {
"nested": {
"path": "attrs"
},
"aggs": {
"filtered_results": {
"filter": {
"bool": {
"must_not": [{
"terms": {
"attrs.attr_name": ["Model", "Brand"]
}
}]
}
},
"aggs": {
"value_term": {
"terms": {
"field": "attrs.value",
"size": 10
},
"aggs": {
"reverse_back_to_roots": {
"reverse_nested": {}
}
}
}
}
}
}
}
}
}
Thanks

Resources