Improve performance of a nested term aggregation? - elasticsearch

Is there a way to improve performance of a nested term aggregation without sampling?
Terms query:
GET <INDEX>/_search?pretty&request_cache=false
{
"_source": false,
"sort": [
"_doc"
],
"size": 0,
"track_total_hits": false,
"aggregations": {
"nested_suggestions": {
"nested": {
"path": "measurement"
},
"aggs": {
"suggestions": {
"terms": {
"field": "measurement.description.label",
"size": 1
}
}
}
}
}
}
...
{
"took" : 8239,
"timed_out" : false,
...
"aggregations" : {
"nested_suggestions" : {
"doc_count" : 226139234,
"suggestions" : {
"doc_count_error_upper_bound" : 7445607,
"sum_other_doc_count" : 214543500,
"buckets" : [
{
"key" : "xxx",
"doc_count" : 11635382
}
]
}
}
}
}
Cardinality query:
GET <INDEX>/_search?pretty&request_cache=false
{
"_source": false,
"sort": [
"_doc"
],
"size": 0,
"track_total_hits": false,
"aggregations": {
"nested_suggestions": {
"nested": {
"path": "measurement"
},
"aggs": {
"suggestions": {
"cardinality": {
"field": "measurement.description.label"
}
}
}
}
}
}
...
{
"took" : 5688,
"timed_out" : false,
...
"aggregations" : {
"nested_suggestions" : {
"doc_count" : 226139234,
"suggestions" : {
"value" : 1379
}
}
}
}
Minimal mapping:
{
"settings": {
"number_of_replicas": "0",
"number_of_shards": "10",
"analysis": {
"normalizer": {
"raw_clean": {
"type": "custom",
"filter": [
"asciifolding"
]
}
}
}
},
"mappings": {
"_doc": {
"dynamic": "strict",
"properties": {
"id": {
"type": "keyword"
},
"measurement": {
"type": "nested",
"dynamic": "strict",
"properties": {
"id": {
"type": "keyword"
},
"description": {
"type": "text",
"norms": false,
"fields": {
"label": {
"type": "keyword",
"normalizer": "raw_clean",
"ignore_above": 255,
"eager_global_ordinals": true
}
}
}
}
}
}
}
}
}
I've verified that the global ordinals have data via /_cat/fielddata?v.
Is this kind of performance expected with nested terms aggregations?
Environment:
elasticsearch 6.8.3
index size ~200GB (with the full mapping)
documents ~1million
nested documents ~225million
4CPU 16GB RAM 500GB SSD

Related

Find distinct values in elasticsearch

Elasticsearch 7.10.0
Dynamic Mapping:
{
"mappings": {
"dynamic_templates": [{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
]
}
}
Kibana shows following mapping of the index:
{
"mappings": {
"_doc": {
"dynamic_templates": [
{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"fields": {
"raw": {
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
....filtered out other properties....
"Registry": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
....filtered out other properties....
}
}
}
}
GET /iptree_index_base/_search?filter_path=hits.total.value,took,hits.hits._source.Registry
{
"aggs": {
"values": {
"terms": { "field": "Registry.raw" }
}
},
"sort" : [
{"Registry.raw" : {"order" : "asc"}}
]
}
Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 19
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
..Rest of duplicate results filtered out
]
}
}
Desired Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 2
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
}
]
}
}
Registry.raw is a keyword. What am I missing?
You're not interested in hits, but in aggregated buckets. So the query you're looking for is this one:
GET /iptree_index_base/_search?filter_path=hits.total.value,took,aggregations.values.buckets.key
{
"size": 0,
"aggs": {
"values": {
"terms": {
"field": "Registry.raw",
"order": {
"_key": "asc"
}
}
}
}
}

elasticsearch find documents where the given number items in array has the same property value

first of all I would like to show simplified structure of document.
{
"_id": "413123123",
"_source": {
"description": {
"firstLine": "this is my description",
"secondLine": "some value"
},
"InsertDetails": {
"Timestamp": "2020-06-12T11:14:36+0000"
},
"Links": [
{
"LinkDetails": {
"linkId": 2342,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 321313,
"type": "Link",
"dateCreation": "2012-08-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 1231,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 32323,
"typeOfLink": "https"
}
},
{
"LinkDetails": {
"linkId": 53434,
"type": "Link",
"dateCreation": "2012-11-21T08:42:09+0000",
"typeId": 123231,
"typeOfLink": "wss"
}
}
]
}
}
I have a problem with forming query, which would find documents, where the following requirements are met:
two items in Links arrays has typeOfLink equal to http
description string contains word "this"
found items will be sorted by date desc
The version of elasticsearch is 2.3.2
I've tried with query such like this:
{
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
],
"minimum_should_match": 2
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
The problem is that this query returns me also the documents, which has only one item in the array with the given value. I've tried to modify this query in different ways, but without any luck.
Added mapping
{
"my_index": {
"mappings": {
"en": {
"properties": {
"InsertDetails": {
"properties": {
"Timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
},
"description": {
"properties": {
"firstLine": {
"type": "string"
},
"secondLine": {
"type": "string"
}
}
},
"Links": {
"properties": {
"LinkDetails": {
"properties": {
"linkId": {
"type": "long"
},
"type": {
"type": "string"
},
"dateCreation": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"typeOfLink": {
"type": "string"
},
"typeId": {
"type": "long"
}
}
}
}
}
}
}
}
}
}
At first, you want to filter on a nested field. (array of object)
To have coherent result you must have to map this field as a nested one.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Then, you will have to use aggregations.
What you want is to aggregate only "http" values for type_of_link, and return results if the aggregation return more than 2 results.
You query will be a little more complicated:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "Links",
"query": {
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"aggs": {
"links": {
"nested": {
"path": "Links"
},
"aggs": {
"http_only": {
"filter": {
"term": {
"Links.LinkDetails.typeOfLink.keyword": "http"
}
},
"aggs": {
"several_http": {
"terms": {
"field": "Links.LinkDetails.typeOfLink.keyword",
"min_doc_count": 2
}
,
"aggs": {
"complete_match": {
"top_hits": {
"size": 100
}
}
}
}
}
}
}
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
And your response will looks like:
"aggregations" : {
"links" : {
"doc_count" : 4,
"http_only" : {
"doc_count" : 2,
"several_http" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "http",
"doc_count" : 2,
"complete_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.98082924,
"hits" : [
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 0
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 2342,
"type" : "Link",
"dateCreation" : "2012-09-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
},
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 1
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 321313,
"type" : "Link",
"dateCreation" : "2012-08-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
}
]
}
}
}
]
}
}
}
}
By playing with the given aggregation you should be able to do what you want.

Filter aggregation keys with non nested mapping in elasticsearch

I have following mapping:
{
"Country": {
"properties": {
"State": {
"properties": {
"Name": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"Code": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
"Lang": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
}
}
}
This is sample document:
{
"Country": {
"State": [
{
"Name": "California",
"Code": "CA",
"Lang": "EN"
},
{
"Name": "Alaska",
"Code": "AK",
"Lang": "EN"
},
{
"Name": "Texas",
"Code": "TX",
"Lang": "EN"
}
]
}
}
I am querying on this index to get aggregates of count of states by name. I am using following query:
{
"from": 0,
"size": 0,
"query": {
"query_string": {
"query": "Country.State.Name: *Ala*"
}
},
"aggs": {
"counts": {
"terms": {
"field": "Country.State.Name.raw",
"include": ".*Ala.*"
}
}
}
}
I am able to get only keys matching with query_string using include regex in terms aggregation but seems there is no way to make it case insensitive regex in include.
The result I want is:
{
"aggregations": {
"counts": {
"buckets": [
{
"key": "Alaska",
"doc_count": 1
}
]
}
}
}
Is there other solution available to get me only keys matching query_string without using nested mapping?
Use Normalizer for keyword datatype. Below is the sample mapping:
Mapping:
PUT country
{
"settings": {
"analysis": {
"normalizer": {
"my_normalizer": { <---- Note this
"type": "custom",
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"properties": {
"Country": {
"properties": {
"State": {
"properties": {
"Name": {
"type": "text",
"fields": {
"raw": {
"type": "keyword",
"normalizer": "my_normalizer" <---- Note this
}
}
},
"Code": {
"type": "text",
"fields": {
"raw": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
},
"Lang": {
"type": "text",
"fields": {
"raw": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
}
}
}
}
}
Document:
POST country/_doc/1
{
"Country": {
"State": [
{
"Name": "California",
"Code": "CA",
"Lang": "EN"
},
{
"Name": "Alaska",
"Code": "AK",
"Lang": "EN"
},
{
"Name": "Texas",
"Code": "TX",
"Lang": "EN"
}
]
}
}
Aggregation Query:
POST country/_search
{
"from": 0,
"size": 0,
"query": {
"query_string": {
"query": "Country.State.Name: *Ala*"
}
},
"aggs": {
"counts": {
"terms": {
"field": "Country.State.Name.raw",
"include": "ala.*"
}
}
}
}
Notice the query pattern in include. Basically all the values of the *.raw fields that you have, would be stored in lowercase letters due to the normalizer that I've applied.
Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"counts" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "alaska",
"doc_count" : 1
}
]
}
}
}
Hope this helps!
I was able to fix the problem by using inline script to filter the keys. (Still a dirty fix but it solves my use case for now and I can avoid mapping changes)
Here is how I am executing query.
{
"from": 0,
"size": 0,
"query": {
"query_string": {
"query": "Country.State.Name: *Ala*"
}
},
"aggs": {
"counts": {
"terms": {
"script": {
"source": "doc['Country.State.Name.raw'].value.toLowerCase().contains('ala') ? doc['Country.State.Name.raw'].value : null",
"lang": "painless"
}
}
}
}
}

Elasticsearch: terms aggregations on doubly nested object

I am trying to do a doubly nested aggregation on a doubly nested object. That is, I have the root document, a child property, and a grand-child property. To be more precise, I have the following mapping:
{
"mappings": {
"root": {
"properties": {
"fields": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
},
"selections": {
"type": "nested",
"properties": {
"value": {
"type": "keyword"
}
}
}
}
}
}
}
}
}
I am trying to aggregate selection value counts per field, or in other words, to count the number of occurrences of each value for each field name, accross all root objects.
I have this:
{
"query": {
...
},
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "selections"
},
"aggregations": {
"value": {
"terms": {
"field": "selections.value"
}
}
}
}
}
}
}
}
}
}
which gets the field names as I want but for each of them I get no doc counts for the values.
What am I doing wrong?
You need to give full name for inner nested field, Change "path":"selections" to "path":"fields.selections"
{
"size": 0,
"aggregations": {
"fields": {
"nested": {
"path": "fields"
},
"aggregations": {
"name": {
"terms": {
"field": "fields.name"
},
"aggregations": {
"values": {
"nested": {
"path": "fields.selections"
},
"aggregations": {
"value": {
"terms": {
"field": "fields.selections.value"
}
}
}
}
}
}
}
}
}
}
Result:
"aggregations" : {
"fields" : {
"doc_count" : 2,
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "abc",
"doc_count" : 2,
"values" : {
"doc_count" : 2,
"value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 2
}
]
}
}
}
]
}
}
}

Synonyms aggregation in elasticsearch 7 - term based

I am trying to aggregate fields, but fields are similar like Med and Medium. I don't want both to come in my aggregation results, only either of it should come. I tried with synonyms but it doesn't seem to work.
Question is: How can I concatenate or unify similar aggregation results when it is term based?
Below is my work.
Mapping and Setting
{
"settings": {
"index" : {
"analysis" : {
"filter" : {
"synonym_filter" : {
"type" : "synonym",
"synonyms" : [
"medium, m, med",
"large, l",
"extra small, xs, x small"
]
}
},
"analyzer" : {
"synonym_analyzer" : {
"tokenizer" : "standard",
"filter" : ["lowercase", "synonym_filter"]
}
}
}
}
},
"mappings": {
"properties": {
"skus": {
"type": "nested",
"properties": {
"labels": {
"dynamic": "true",
"properties": {
"Color": {
"type": "text",
"fields": {
"synonym": {
"analyzer": "synonym_analyzer",
"type": "text",
"fielddata":true
}
}
},
"Size": {
"type": "text",
"fields": {
"synonym": {
"analyzer": "synonym_analyzer",
"type": "text",
"fielddata":true
}
}
}
}
}
}
}
}
}}
Aggregation
{
"aggs":{
"sizesFilter": {
"aggs": {
"sizes": {
"terms": {
"field": "skus.labels.Size.synonym"
}
}
},
"nested": {
"path": "skus"
}
}
}}
With only one doc my aggregation result is
"aggregations": {
"sizesFilter": {
"doc_count": 1,
"sizes": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "m",
"doc_count": 1
},
{
"key": "med",
"doc_count": 1
},
{
"key": "medium",
"doc_count": 1
}
]
}
}
}
I got it by setting tokenizer in analyzer to "keyword"
{
"analyzer" : {
"synonym_analyzer" : {
"tokenizer" : "keyword",
"filter" : ["lowercase", "synonym_filter"]
}
}
}

Resources