Find distinct values in elasticsearch - elasticsearch

Elasticsearch 7.10.0
Dynamic Mapping:
{
"mappings": {
"dynamic_templates": [{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
]
}
}
Kibana shows following mapping of the index:
{
"mappings": {
"_doc": {
"dynamic_templates": [
{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"fields": {
"raw": {
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
....filtered out other properties....
"Registry": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
....filtered out other properties....
}
}
}
}
GET /iptree_index_base/_search?filter_path=hits.total.value,took,hits.hits._source.Registry
{
"aggs": {
"values": {
"terms": { "field": "Registry.raw" }
}
},
"sort" : [
{"Registry.raw" : {"order" : "asc"}}
]
}
Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 19
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
..Rest of duplicate results filtered out
]
}
}
Desired Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 2
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
}
]
}
}
Registry.raw is a keyword. What am I missing?

You're not interested in hits, but in aggregated buckets. So the query you're looking for is this one:
GET /iptree_index_base/_search?filter_path=hits.total.value,took,aggregations.values.buckets.key
{
"size": 0,
"aggs": {
"values": {
"terms": {
"field": "Registry.raw",
"order": {
"_key": "asc"
}
}
}
}
}

Related

Using Metric Aggregation with Composite Aggregation

I have the following mapping for an index:
{
"test5" : {
"mappings" : {
"dynamic" : "false",
"properties" : {
"messageType" : {
"type" : "keyword"
},
"groupId" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"request" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"data" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"chargingPeriods" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"endDateTime" : {
"type" : "date"
},
"power" : {
"type" : "double"
},
"startDateTime" : {
"type" : "date"
}
}
}
}
}
}
}
}
}
}
}
}
}
First use case: I want buckets in 2-minute intervals based on payload.request.data.chargingPeriods.startDateTime and groupId, filtered by messageType. Note that chargingPeriods is an array.
This query works for that use case:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Now I want metric aggregations done on these composite buckets returned and I tried this:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
According to ES documentation https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html, this should work by doing metric aggregation on the composite bucket
But instead of the metric aggregation being computed on the composite bucket, it is being computed across all the power fields in chargingPeriods array in the entire given document.
How I created the index:
PUT /test5
{
"settings": {
"number_of_shards": 1
},
"mappings" : {
"dynamic" : "false",
"properties" : {
"groupId" : {
"type" : "keyword"
},
"messageType" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root": true,
"properties": {
"request": {
"type":"nested",
"include_in_root":true,
"properties": {
"data": {
"type":"nested",
"include_in_root": true,
"properties": {
"chargingPeriods": {
"type": "nested",
"include_in_root": true,
"properties" : {
"endDateTime":{
"type": "date"
},
"power": {
"type": "double"
},
"startDateTime":{
"type": "date"
}
}
}
}
}
}
}
}
}
}
}
}
Test Data:
POST test5/_doc/testdocu1
{
"groupId": "563",
"messageType": "test",
"payload": {
"request": {
"data": {
"chargingPeriods": [
{
"endDateTime": "2022-10-13T17:42:25Z",
"power": 9.62857,
"startDateTime": "2022-10-13T17:41:55Z"
},
{
"endDateTime": "2022-10-13T17:42:55Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:25Z"
},
{
"endDateTime": "2022-10-13T17:43:25Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:55Z"
},
{
"endDateTime": "2022-10-13T17:43:55Z",
"power": 9.66963,
"startDateTime": "2022-10-13T17:43:25Z"
},
{
"endDateTime": "2022-10-13T17:44:25Z",
"power": 9.67128,
"startDateTime": "2022-10-13T17:43:55Z"
},
{
"endDateTime": "2022-10-13T17:44:55Z",
"power": 9.65079,
"startDateTime": "2022-10-13T17:44:25Z"
},
{
"endDateTime": "2022-10-13T17:45:25Z",
"power": 9.66492,
"startDateTime": "2022-10-13T17:44:55Z"
},
{
"endDateTime": "2022-10-13T17:45:55Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:25Z"
},
{
"endDateTime": "2022-10-13T17:46:25Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:55Z"
},
{
"endDateTime": "2022-10-13T17:46:55Z",
"power": 9.67434,
"startDateTime": "2022-10-13T17:46:25Z"
}
]
}
}
}
}
My output:
"aggregations" : {
"my_buckets" : {
"after_key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"buckets" : [
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:42:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:44:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
}
]
}
}
As you can see, it chose the max payload.request.data.chargingPeriods.power from all the elements, ignoring the composite buckets. For example
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
metricAgg should have been 9.62857
It doesn't work the way you expect because the nested data you are aggregating is mapped with include_in_root set to true. Hence, all the nested data also finds itself in the root document as if it was not nested, and so the relation between each startDateTime and its power is basically lost.
The other issue is that your composite aggregation mixes nested (payload...) and non-nested (groupId) data, which won't work.
However, if you add the groupId field inside each element of your array, then you can make your query work like this:
GET test5/_search
{
"size": 0,
"aggs": {
"payload": {
"nested": {
"path": "payload"
},
"aggs": {
"request": {
"nested": {
"path": "payload.request"
},
"aggs": {
"data": {
"nested": {
"path": "payload.request.data"
},
"aggs": {
"charging": {
"nested": {
"path": "payload.request.data.chargingPeriods"
},
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"sessionId": {
"terms": {
"field": "payload.request.data.chargingPeriods.groupId"
}
}
},
{
"date": {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
}
}
}
}
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Results:
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.62857
}
},

Improve performance of a nested term aggregation?

Is there a way to improve performance of a nested term aggregation without sampling?
Terms query:
GET <INDEX>/_search?pretty&request_cache=false
{
"_source": false,
"sort": [
"_doc"
],
"size": 0,
"track_total_hits": false,
"aggregations": {
"nested_suggestions": {
"nested": {
"path": "measurement"
},
"aggs": {
"suggestions": {
"terms": {
"field": "measurement.description.label",
"size": 1
}
}
}
}
}
}
...
{
"took" : 8239,
"timed_out" : false,
...
"aggregations" : {
"nested_suggestions" : {
"doc_count" : 226139234,
"suggestions" : {
"doc_count_error_upper_bound" : 7445607,
"sum_other_doc_count" : 214543500,
"buckets" : [
{
"key" : "xxx",
"doc_count" : 11635382
}
]
}
}
}
}
Cardinality query:
GET <INDEX>/_search?pretty&request_cache=false
{
"_source": false,
"sort": [
"_doc"
],
"size": 0,
"track_total_hits": false,
"aggregations": {
"nested_suggestions": {
"nested": {
"path": "measurement"
},
"aggs": {
"suggestions": {
"cardinality": {
"field": "measurement.description.label"
}
}
}
}
}
}
...
{
"took" : 5688,
"timed_out" : false,
...
"aggregations" : {
"nested_suggestions" : {
"doc_count" : 226139234,
"suggestions" : {
"value" : 1379
}
}
}
}
Minimal mapping:
{
"settings": {
"number_of_replicas": "0",
"number_of_shards": "10",
"analysis": {
"normalizer": {
"raw_clean": {
"type": "custom",
"filter": [
"asciifolding"
]
}
}
}
},
"mappings": {
"_doc": {
"dynamic": "strict",
"properties": {
"id": {
"type": "keyword"
},
"measurement": {
"type": "nested",
"dynamic": "strict",
"properties": {
"id": {
"type": "keyword"
},
"description": {
"type": "text",
"norms": false,
"fields": {
"label": {
"type": "keyword",
"normalizer": "raw_clean",
"ignore_above": 255,
"eager_global_ordinals": true
}
}
}
}
}
}
}
}
}
I've verified that the global ordinals have data via /_cat/fielddata?v.
Is this kind of performance expected with nested terms aggregations?
Environment:
elasticsearch 6.8.3
index size ~200GB (with the full mapping)
documents ~1million
nested documents ~225million
4CPU 16GB RAM 500GB SSD

elasticsearch find documents where the given number items in array has the same property value

First of all, I would like to show a simplified structure of the document.
{
"_id": "413123123",
"_source": {
"description": {
"firstLine": "this is my description",
"secondLine": "some value"
},
"InsertDetails": {
"Timestamp": "2020-06-12T11:14:36+0000"
},
"Links": [
{
"LinkDetails": {
"linkId": 2342,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 321313,
"type": "Link",
"dateCreation": "2012-08-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 1231,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 32323,
"typeOfLink": "https"
}
},
{
"LinkDetails": {
"linkId": 53434,
"type": "Link",
"dateCreation": "2012-11-21T08:42:09+0000",
"typeId": 123231,
"typeOfLink": "wss"
}
}
]
}
}
I am having trouble forming a query that finds documents meeting the following requirements:
two items in the Links array have typeOfLink equal to http
the description string contains the word "this"
the matching documents are sorted by date, descending
The version of elasticsearch is 2.3.2
I've tried with query such like this:
{
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
],
"minimum_should_match": 2
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
The problem is that this query also returns documents that have only one item in the array with the given value. I've tried modifying the query in different ways, but without any luck.
Added mapping
{
"my_index": {
"mappings": {
"en": {
"properties": {
"InsertDetails": {
"properties": {
"Timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
},
"description": {
"properties": {
"firstLine": {
"type": "string"
},
"secondLine": {
"type": "string"
}
}
},
"Links": {
"properties": {
"LinkDetails": {
"properties": {
"linkId": {
"type": "long"
},
"type": {
"type": "string"
},
"dateCreation": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"typeOfLink": {
"type": "string"
},
"typeId": {
"type": "long"
}
}
}
}
}
}
}
}
}
}
First, you want to filter on a nested field (an array of objects).
To get coherent results, you must map this field as a nested one.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Then, you will have to use aggregations.
What you want is to aggregate only the "http" values of typeOfLink, and return results only if the aggregation returns at least 2 results.
Your query will be a little more complicated:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "Links",
"query": {
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"aggs": {
"links": {
"nested": {
"path": "Links"
},
"aggs": {
"http_only": {
"filter": {
"term": {
"Links.LinkDetails.typeOfLink.keyword": "http"
}
},
"aggs": {
"several_http": {
"terms": {
"field": "Links.LinkDetails.typeOfLink.keyword",
"min_doc_count": 2
}
,
"aggs": {
"complete_match": {
"top_hits": {
"size": 100
}
}
}
}
}
}
}
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
And your response will look like:
"aggregations" : {
"links" : {
"doc_count" : 4,
"http_only" : {
"doc_count" : 2,
"several_http" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "http",
"doc_count" : 2,
"complete_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.98082924,
"hits" : [
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 0
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 2342,
"type" : "Link",
"dateCreation" : "2012-09-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
},
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 1
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 321313,
"type" : "Link",
"dateCreation" : "2012-08-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
}
]
}
}
}
]
}
}
}
}
By playing with the given aggregation you should be able to do what you want.

Counting search results in ElasticSearch by a nested property

Here is a schema with a nested property.
{
"dynamic": "strict",
"properties" : {
"Id" : {
"type": "integer"
},
"Name_en" : {
"type": "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"normalizer": "cloudbuy_normalizer_alphanumeric"
},
"text" : {
"type" : "text",
"analyzer": "english"
}
}
},
"Menus" : {
"type" : "nested",
"properties" : {
"Id" : {
"type" : "integer"
},
"Name" : {
"type" : "keyword",
"normalizer": "normalizer_alphanumeric"
},
"AncestorsIds" : {
"type" : "integer"
}
}
}
}
}
And here is a document.
{
"Id": 12781279,
"Name": "Thing of purpose made to fit",
"Menus": [
{
"Id": -571057,
"Name": "Top level menu",
"AncestorsIds": [
-571057
]
}
,
{
"Id": 1022313,
"Name": "Other",
"AncestorsIds": [
-571057
,
1022313
]
}
]
}
For any given query I need a list with two columns: the Menu.Id and the number of documents in the result set that have that Menu.Id in their Menus array.
How?
(Is there any documentation for aggs that isn't impenetrable?)
@Richard, does this query suit your needs?
POST yourindex/_search
{
"_source": "false",
"aggs":{
"menus": {
"nested": {
"path": "Menus"
},
"aggs":{
"menu_aggregation": {
"terms": {
"field": "Menus.Id",
"size": 10
}
}
}
}
}
}
Output :
"aggregations": {
"menus": {
"doc_count": 2,
"menu_aggregation": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": -571057,
"doc_count": 1
},
{
"key": 1022313,
"doc_count": 1
}
]
}
}
Here we specify a nested path and then aggregate on the menu Ids.
You can take a look at this documentation page : https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-nested-aggregation.html

Elasticsearch mapping document for epoch in seconds

As far as I understand, es only supports epoch in ms. My data source is couchbase and the json documents in there have an insertEpoch that is stored in seconds. I have been struggling to make my mapping document do the seconds to ms conversion for me.
Here is my mapping doc:
{
"template" : "cb*",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_source" : {
"includes" : ["meta.*","doc.*"]
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "nested",
"include_in_all" : false,
"transform": {
"script": "ctx._source['insertEpoch'] = ctx._source['insertEpoch'] * 1000",
"params": {},
"lang": "groovy"
}
}
}
}
}
}
The transform isn't happening.
New mapping document:
{
"template" : "wheepl",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_timestamp" : {
"enabled" : true,
"store" : true
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "object",
"include_in_all" : false,
"updateEpoch" : {
"type" : "date",
"format" : "date_time",
"numeric_resolution" : "seconds"
}
}
}
}
}
}
I don't even see the _timestamp field that I should be seeing!
Here's a Kibana screenie:
Thanks
This works in 1.6, as per https://github.com/elastic/elasticsearch/pull/10420.
Even though the date itself is kept internally in milliseconds, you can index it in seconds and retrieve it in seconds, i.e. exactly as you indexed it.
I've tried out a simple test, to see this in action:
PUT /test_dates
{
"mappings": {
"test": {
"properties": {
"time_stamp": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}
Test data:
POST /test_dates/test/1
{
"time_stamp": "9231200"
}
Retrieving it:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_dates",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"time_stamp": "9231200"
}
}
]
}
To prove it works, running this aggregation:
GET /test_dates/test/_search?search_type=count
{
"aggs": {
"NAME": {
"date_histogram": {
"field": "time_stamp",
"interval": "second",
"format": "yyyy-MM-dd"
}
}
}
}
returns
"aggregations": {
"NAME": {
"buckets": [
{
"key_as_string": "1970-04-17",
"key": 9231200000,
"doc_count": 1
}
]
}
}
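Also, your template is a bit wrong: the updateEpoch field definition has to be placed inside a "properties" object under doc, not directly under doc. It should be: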
"couchbaseDocument": {
"_timestamp": {
"enabled": true,
"store": true
},
"properties": {
"meta": {
"type": "object",
"include_in_all": false
},
"doc": {
"type": "object",
"include_in_all": false,
"properties": {
"updateEpoch": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}

Resources