Using Metric Aggregation with Composite Aggregation - elasticsearch

I have the following mapping for an index:
{
"test5" : {
"mappings" : {
"dynamic" : "false",
"properties" : {
"messageType" : {
"type" : "keyword"
},
"groupId" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"request" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"data" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"chargingPeriods" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"endDateTime" : {
"type" : "date"
},
"power" : {
"type" : "double"
},
"startDateTime" : {
"type" : "date"
}
}
}
}
}
}
}
}
}
}
}
}
}
First use case: I want buckets in 2-minute intervals based on payload.request.data.chargingPeriods.startDateTime and groupId, with messageType as the filter criterion. Note that chargingPeriods is an array.
This query works for that use case:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Now I want metric aggregations done on these composite buckets returned and I tried this:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
According to the ES documentation (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html), this should work by computing the metric aggregation on each composite bucket.
But instead of the metric aggregation being computed per composite bucket, it is being computed across all the power fields in the chargingPeriods array of the entire document.
How I created the index:
PUT /test5
{
"settings": {
"number_of_shards": 1
},
"mappings" : {
"dynamic" : "false",
"properties" : {
"groupId" : {
"type" : "keyword"
},
"messageType" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root": true,
"properties": {
"request": {
"type":"nested",
"include_in_root":true,
"properties": {
"data": {
"type":"nested",
"include_in_root": true,
"properties": {
"chargingPeriods": {
"type": "nested",
"include_in_root": true,
"properties" : {
"endDateTime":{
"type": "date"
},
"power": {
"type": "double"
},
"startDateTime":{
"type": "date"
}
}
}
}
}
}
}
}
}
}
}
}
Test Data:
POST test5/_doc/testdocu1
{
"groupId": "563",
"messageType": "test",
"payload": {
"request": {
"data": {
"chargingPeriods": [
{
"endDateTime": "2022-10-13T17:42:25Z",
"power": 9.62857,
"startDateTime": "2022-10-13T17:41:55Z"
},
{
"endDateTime": "2022-10-13T17:42:55Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:25Z"
},
{
"endDateTime": "2022-10-13T17:43:25Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:55Z"
},
{
"endDateTime": "2022-10-13T17:43:55Z",
"power": 9.66963,
"startDateTime": "2022-10-13T17:43:25Z"
},
{
"endDateTime": "2022-10-13T17:44:25Z",
"power": 9.67128,
"startDateTime": "2022-10-13T17:43:55Z"
},
{
"endDateTime": "2022-10-13T17:44:55Z",
"power": 9.65079,
"startDateTime": "2022-10-13T17:44:25Z"
},
{
"endDateTime": "2022-10-13T17:45:25Z",
"power": 9.66492,
"startDateTime": "2022-10-13T17:44:55Z"
},
{
"endDateTime": "2022-10-13T17:45:55Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:25Z"
},
{
"endDateTime": "2022-10-13T17:46:25Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:55Z"
},
{
"endDateTime": "2022-10-13T17:46:55Z",
"power": 9.67434,
"startDateTime": "2022-10-13T17:46:25Z"
}
]
}
}
}
}
My output:
"aggregations" : {
"my_buckets" : {
"after_key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"buckets" : [
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:42:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:44:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
}
]
}
}
As you can see, it chose the max payload.request.data.chargingPeriods.power from all the elements, ignoring the composite buckets. For example:
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
metricAgg should have been 9.62857

It doesn't work the way you expect because the nested fields you are aggregating on are mapped with include_in_root. As a result, all the nested values are also added to the root document as if they were not nested, so the relation between each startDateTime and its power is lost.
The other issue is that your composite aggregation mixes nested (payload...) and non-nested (groupId) data, which won't work.
However, if you add the groupId field inside each element of your array, then you can make your query work like this:
GET test5/_search
{
"size": 0,
"aggs": {
"payload": {
"nested": {
"path": "payload"
},
"aggs": {
"request": {
"nested": {
"path": "payload.request"
},
"aggs": {
"data": {
"nested": {
"path": "payload.request.data"
},
"aggs": {
"charging": {
"nested": {
"path": "payload.request.data.chargingPeriods"
},
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"sessionId": {
"terms": {
"field": "payload.request.data.chargingPeriods.groupId"
}
}
},
{
"date": {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
}
}
}
}
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Results:
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.62857
}
},

Related

Find distinct values in elasticsearch

Elasticsearch 7.10.0
Dynamic Mapping:
{
"mappings": {
"dynamic_templates": [{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
]
}
}
Kibana shows following mapping of the index:
{
"mappings": {
"_doc": {
"dynamic_templates": [
{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"fields": {
"raw": {
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
....filtered out other properties....
"Registry": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
....filtered out other properties....
}
}
}
}
GET /iptree_index_base/_search?filter_path=hits.total.value,took,hits.hits._source.Registry
{
"aggs": {
"values": {
"terms": { "field": "Registry.raw" }
}
},
"sort" : [
{"Registry.raw" : {"order" : "asc"}}
]
}
Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 19
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
..Rest of duplicate results filtered out
]
}
}
Desired Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 2
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
}
]
}
}
Registry.raw is a keyword. What am I missing?
You're not interested in hits, but in aggregated buckets. So the query you're looking for is this one:
GET /iptree_index_base/_search?filter_path=hits.total.value,took,aggregations.values.buckets.key
{
"size": 0,
"aggs": {
"values": {
"terms": {
"field": "Registry.raw",
"order": {
"_key": "asc"
}
}
}
}
}

elasticsearch find documents where the given number items in array has the same property value

First of all, I would like to show a simplified structure of the document.
{
"_id": "413123123",
"_source": {
"description": {
"firstLine": "this is my description",
"secondLine": "some value"
},
"InsertDetails": {
"Timestamp": "2020-06-12T11:14:36+0000"
},
"Links": [
{
"LinkDetails": {
"linkId": 2342,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 321313,
"type": "Link",
"dateCreation": "2012-08-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 1231,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 32323,
"typeOfLink": "https"
}
},
{
"LinkDetails": {
"linkId": 53434,
"type": "Link",
"dateCreation": "2012-11-21T08:42:09+0000",
"typeId": 123231,
"typeOfLink": "wss"
}
}
]
}
}
I have a problem forming a query that finds documents where the following requirements are met:
two items in the Links array have typeOfLink equal to http
the description string contains the word "this"
the found items are sorted by date, descending
The version of elasticsearch is 2.3.2
I've tried a query like this:
{
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
],
"minimum_should_match": 2
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
The problem is that this query also returns documents which have only one item in the array with the given value. I've tried to modify this query in different ways, but without any luck.
Added mapping
{
"my_index": {
"mappings": {
"en": {
"properties": {
"InsertDetails": {
"properties": {
"Timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
},
"description": {
"properties": {
"firstLine": {
"type": "string"
},
"secondLine": {
"type": "string"
}
}
},
"Links": {
"properties": {
"LinkDetails": {
"properties": {
"linkId": {
"type": "long"
},
"type": {
"type": "string"
},
"dateCreation": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"typeOfLink": {
"type": "string"
},
"typeId": {
"type": "long"
}
}
}
}
}
}
}
}
}
}
First, you want to filter on a nested field (an array of objects).
To get coherent results, you have to map this field as a nested one.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Then, you will have to use aggregations.
What you want is to aggregate only the "http" values of typeOfLink, and return results only if the aggregation contains at least 2 of them (min_doc_count: 2).
Your query will be a little more complicated:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "Links",
"query": {
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"aggs": {
"links": {
"nested": {
"path": "Links"
},
"aggs": {
"http_only": {
"filter": {
"term": {
"Links.LinkDetails.typeOfLink.keyword": "http"
}
},
"aggs": {
"several_http": {
"terms": {
"field": "Links.LinkDetails.typeOfLink.keyword",
"min_doc_count": 2
}
,
"aggs": {
"complete_match": {
"top_hits": {
"size": 100
}
}
}
}
}
}
}
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
And your response will look like:
"aggregations" : {
"links" : {
"doc_count" : 4,
"http_only" : {
"doc_count" : 2,
"several_http" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "http",
"doc_count" : 2,
"complete_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.98082924,
"hits" : [
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 0
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 2342,
"type" : "Link",
"dateCreation" : "2012-09-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
},
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 1
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 321313,
"type" : "Link",
"dateCreation" : "2012-08-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
}
]
}
}
}
]
}
}
}
}
By playing with the given aggregation you should be able to do what you want.

How to filter nested aggregations in ElasticSearch?

For example, let's assume we have a product index with the following mapping:
{
"product": {
"mappings": {
"producttype": {
"properties": {
"id": {
"type": "keyword"
},
"productAttributes": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "text",
"analyzer": "keyword"
}
},
"analyzer": "standard"
}
}
}
}
}
}
I am trying to find how many products have specific product attributes using the following query (I am using a fuzzy query to allow some edit distance):
{
"size": 0,
"query": {
"nested": {
"query": {
"fuzzy": {
"productAttributes.name": {
"value": "SSD"
}
}
},
"path": "productAttributes"
}
},
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
But it returns all product attributes for each matched document, and here is the response I get:
"aggregations" : {
"product_attribute_nested_agg" : {
"doc_count" : 6,
"terms_nested_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "SSD",
"doc_count" : 3
},
{
"key" : "USB 2.0",
"doc_count" : 3
}
]
}
}
}
Could you please guide me on how to filter the buckets to only return matched attributes?
Edit:
Here are some document samples:
"hits" : {
"total" : 12,
"max_score" : 1.0,
"hits" : [
{
"_index" : "product",
"_type" : "producttype",
"_id" : "677d1164-c401-4d36-8a08-6aa14f7f32bb",
"_score" : 1.0,
"_source" : {
"title" : "Dell laptop",
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "4"
},
{
"name" : "SSD",
"value" : "250 GB"
}
]
}
},
{
"_index" : "product",
"_type" : "producttype",
"_id" : "2954935a-7f60-437a-8a54-00da2d71da46",
"_score" : 1.0,
"_source" : {
"productAttributes" : [
{
"name" : "USB 2.0",
"value" : "3"
},
{
"name" : "SSD",
"value" : "500 GB"
}
],
"title" : "HP laptop"
}
},
]
}
To return only specific attributes, you can use a filter aggregation.
Query:
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}
This is what does the trick:
"filter": {
"terms": {
"productAttributes.name": [
"SSD"
]
}
}
You need to make the filter part of the aggregation.
Output:
"aggregations": {
"product_attribute_nested_agg": {
"doc_count": 4,
"inner": {
"doc_count": 2,
"terms_nested_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SSD",
"doc_count": 2
}
]
}
}
}
}
Filtering using fuzziness:
GET /product/_search
{
"size": 0,
"aggs": {
"product_attribute_nested_agg": {
"nested": {
"path": "productAttributes"
},
"aggs": {
"inner": {
"filter": {
"fuzzy": {
"productAttributes.name": {
"value": "SSt",//here will match SSD
"fuzziness": 3//you can remove it to be as Auto
}
}
},
"aggs": {
"terms_nested_agg": {
"terms": {
"field": "productAttributes.name"
}
}
}
}
}
}
}
}

Elasticsearch: terms aggregations on doubly nested object

I am trying to do a doubly nested aggregation on a doubly nested object. That is, I have the root document, a child property, and a grand-child property. To be more precise, I have the following mapping:
{
"mappings": {
"root": {
"properties": {
"fields": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"selections": {
"type": "nested",
"properties": {
"value": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
I am trying to aggregate selection value counts per field, or in other words, to count the number of occurrences of each value for each field name, across all root objects.
I have this:
{
"query": {
...
},
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "selections"
},
"aggregations": {
"value": {
"terms": {
"field": "selections.value"
}
}
}
}
}
}
}
}
}
}
This gets the field names as I want, but for each of them I get no doc counts for the values.
What am I doing wrong?
You need to give the full path for the inner nested field: change "path":"selections" to "path":"fields.selections", and likewise "field":"selections.value" to "field":"fields.selections.value".
{
"size": 0,
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "fields.selections"
},
"aggregations": {
"value": {
"terms": {
"field": "fields.selections.value"
}
}
}
}
}
}
}
}
}
}
Result:
"aggregations" : {
"fields" : {
"doc_count" : 2,
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "abc",
"doc_count" : 2,
"values" : {
"doc_count" : 2,
"value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 2
}
]
}
}
}
]
}
}
}

Elasticsearch mapping document for epoch in seconds

As far as I understand, ES only supports epoch in ms. My data source is Couchbase, and the JSON documents in there have an insertEpoch that is stored in seconds. I have been struggling to make my mapping document do the seconds-to-ms conversion for me.
Here is my mapping doc:
{
"template" : "cb*",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_source" : {
"includes" : ["meta.*","doc.*"]
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "nested",
"include_in_all" : false,
"transform": {
"script": "ctx._source['insertEpoch'] = ctx._source['insertEpoch'] * 1000",
"params": {},
"lang": "groovy"
}
}
}
}
}
}
The transform isn't happening.
New mapping document:
{
"template" : "wheepl",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_timestamp" : {
"enabled" : true,
"store" : true
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "object",
"include_in_all" : false,
"updateEpoch" : {
"type" : "date",
"format" : "date_time",
"numeric_resolution" : "seconds"
}
}
}
}
}
}
I don't even see the _timestamp field that I should be seeing!
Here's a Kibana screenie:
Thanks
This works in 1.6, as per https://github.com/elastic/elasticsearch/pull/10420.
Even though the date itself is kept internally in milliseconds, you can index it in seconds and retrieve it in seconds, i.e. exactly as you indexed it.
I've tried out a simple test, to see this in action:
PUT /test_dates
{
"mappings": {
"test": {
"properties": {
"time_stamp": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}
Test data:
POST /test_dates/test/1
{
"time_stamp": "9231200"
}
Retrieving it:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_dates",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"time_stamp": "9231200"
}
}
]
}
To prove it works, running this aggregation:
GET /test_dates/test/_search?search_type=count
{
"aggs": {
"NAME": {
"date_histogram": {
"field": "time_stamp",
"interval": "second",
"format": "yyyy-MM-dd"
}
}
}
}
returns
"aggregations": {
"NAME": {
"buckets": [
{
"key_as_string": "1970-04-17",
"key": 9231200000,
"doc_count": 1
}
]
}
}
Also, your template is a bit wrong. It should be:
"couchbaseDocument": {
"_timestamp": {
"enabled": true,
"store": true
},
"properties": {
"meta": {
"type": "object",
"include_in_all": false
},
"doc": {
"type": "object",
"include_in_all": false,
"properties": {
"updateEpoch": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}

Resources