Filter elasticsearch bucket aggregation based on term field

I have a list of products (deal entities) and I'm attempting to create a bucket aggregation by categories, ordered by the sum of available_stock.
This all works fine, but I want to exclude from the resulting aggregation any categories whose level is not 1 (in other words, I only want to keep buckets for categories where level IS 1).
I am aware that Elasticsearch provides "exclude" and "include" parameters, but these only work on the same field I'm aggregating on (deal.category.id in this case).
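For reference, include/exclude only filter on the bucket keys themselves, so keeping certain category ids is expressible, but a condition on level is not. A sketch (the exact-value list form shown here is from newer Elasticsearch releases; on 1.x these parameters are regex strings):
{
  "aggs": {
    "mainAggregation": {
      "terms": {
        "field": "deal.category.id",
        "include": [17574, 17598]
      }
    }
  }
}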
This is my sample deal document:
{
  "_source": {
    "id": 392745,
    "category": [
      {
        "id": 17575,
        "level": 2
      },
      {
        "id": 17574,
        "level": 1
      },
      {
        "id": 17572,
        "level": 0
      }
    ],
    "stats": {
      "available_stock": 500
    }
  }
}
And this would be the query:
{
  "query": {
    "filtered": {
      "query": {
        "match_all": {}
      }
    }
  },
  "aggs": {
    "mainAggregation": {
      "terms": {
        "field": "deal.category.id",
        "order": {
          "available_stock": "desc"
        },
        "size": 3
      },
      "aggs": {
        "available_stock": {
          "sum": {
            "field": "deal.stats.available_stock"
          }
        }
      }
    }
  },
  "size": 0
}
And this is my resulting aggregation, sadly including category 17572 with level 0:
{
  "aggregations": {
    "mainAggregation": {
      "buckets": [
        {
          "key": 17572,
          "doc_count": 30,
          "available_stock": {
            "value": 24000
          }
        },
        {
          "key": 17598,
          "doc_count": 10,
          "available_stock": {
            "value": 12000
          }
        },
        {
          "key": 17602,
          "doc_count": 8,
          "available_stock": {
            "value": 6000
          }
        }
      ]
    }
  }
}
P.S.: Currently on Elasticsearch 1.6
Update 1: Still stuck on the problem after various experiments with various combinations of subaggregations.

I have found this impossible to solve and decided to go with two separate queries.
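For later readers: if category can be mapped as a nested type, this should be expressible in one query with a nested aggregation, a filter sub-aggregation on category.level, and a reverse_nested step back to the root document for the stock sum. A sketch, not verified on 1.6, with field paths assumed to be unprefixed (adjust to deal.* if your mapping requires it):
{
  "size": 0,
  "aggs": {
    "categories": {
      "nested": { "path": "category" },
      "aggs": {
        "level1_only": {
          "filter": { "term": { "category.level": 1 } },
          "aggs": {
            "by_id": {
              "terms": {
                "field": "category.id",
                "order": { "stock>available_stock": "desc" },
                "size": 3
              },
              "aggs": {
                "stock": {
                  "reverse_nested": {},
                  "aggs": {
                    "available_stock": {
                      "sum": { "field": "stats.available_stock" }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}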

Related

Nested array of objects aggregation in Elasticsearch

Documents in Elasticsearch are indexed as follows:
Document 1
{
  "task_completed": 10,
  "tagged_object": [
    {
      "category": "cat",
      "count": 10
    },
    {
      "category": "cars",
      "count": 20
    }
  ]
}
Document 2
{
  "task_completed": 50,
  "tagged_object": [
    {
      "category": "cars",
      "count": 100
    },
    {
      "category": "dog",
      "count": 5
    }
  ]
}
As you can see, the value of the category key is dynamic. I want to perform an aggregation similar to a SQL GROUP BY on category, returning the sum of count for each category.
In the above example, the aggregation should return
cat: 10,
cars: 120 and
dog: 5
Wanted to know how to write this aggregation query in Elasticsearch if it is possible. Thanks in advance.
You can achieve your required result using nested, terms, and sum aggregations.
Below is a working example with index mapping, search query, and search result.
Index Mapping:
{
  "mappings": {
    "properties": {
      "tagged_object": {
        "type": "nested"
      }
    }
  }
}
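For completeness, the two sample documents can be indexed against this mapping with the bulk API (the index name my-index is a placeholder):
POST my-index/_bulk
{ "index": {} }
{ "task_completed": 10, "tagged_object": [ { "category": "cat", "count": 10 }, { "category": "cars", "count": 20 } ] }
{ "index": {} }
{ "task_completed": 50, "tagged_object": [ { "category": "cars", "count": 100 }, { "category": "dog", "count": 5 } ] }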
Search Query:
{
  "size": 0,
  "aggs": {
    "resellers": {
      "nested": {
        "path": "tagged_object"
      },
      "aggs": {
        "books": {
          "terms": {
            "field": "tagged_object.category.keyword"
          },
          "aggs": {
            "sum_of_count": {
              "sum": {
                "field": "tagged_object.count"
              }
            }
          }
        }
      }
    }
  }
}
Search Result:
"aggregations": {
"resellers": {
"doc_count": 4,
"books": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cars",
"doc_count": 2,
"sum_of_count": {
"value": 120.0
}
},
{
"key": "cat",
"doc_count": 1,
"sum_of_count": {
"value": 10.0
}
},
{
"key": "dog",
"doc_count": 1,
"sum_of_count": {
"value": 5.0
}
}
]
}
}
}

Post Filtering Date histogram aggregation bucket results not working

I have an aggregation query where I am trying to calculate the max standard deviation of the number of destination IPs per IP address for a certain time range. As is the well-known problem with the moving-function std_dev aggregation, the first two days' std dev values will always be null and 0 respectively, because no prior data is taken into account.
Here is my aggregation query:
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "exists": {
            "field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
          }
        }
      ]
    }
  },
  "aggs": {
    "range": {
      "date_range": {
        "field": "Source Time",
        "ranges": [
          {
            "from": "2018-04-25",
            "to": "2018-05-02"
          }
        ]
      },
      "aggs": {
        "by ip": {
          "terms": {
            "field": "IP Address.keyword",
            "size": 500
          },
          "aggs": {
            "datehisto": {
              "date_histogram": {
                "field": "Source Time",
                "interval": "day"
              },
              "aggs": {
                "max_dest_ips": {
                  "sum": {
                    "field": "aggregations.range.buckets.by ip.buckets.by date.buckets.max_dest_ips.value"
                  }
                },
                "max_dest_ips_std_dev": {
                  "moving_fn": {
                    "buckets_path": "max_dest_ips",
                    "window": 3,
                    "script": "MovingFunctions.stdDev(values, MovingFunctions.unweightedAvg(values))"
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "post_filter": {
    "range": {
      "Source Time": {
        "gte": "2018-05-01"
      }
    }
  }
}
Here is a snippet of the response:
{
  "key": "192.168.0.1",
  "doc_count": 6,
  "datehisto": {
    "buckets": [
      {
        "key_as_string": "2018-04-25T00:00:00.000Z",
        "key": 1524614400000,
        "doc_count": 1,
        "max_dest_ips": {
          "value": 309
        },
        "max_dest_ips_std_dev": {
          "value": null
        }
      },
      {
        "key_as_string": "2018-04-26T00:00:00.000Z",
        "key": 1524700800000,
        "doc_count": 1,
        "max_dest_ips": {
          "value": 529
        },
        "max_dest_ips_std_dev": {
          "value": 0
        }
      },
      {
        "key_as_string": "2018-04-27T00:00:00.000Z",
        "key": 1524787200000,
        "doc_count": 1,
        "max_dest_ips": {
          "value": 408
        },
        "max_dest_ips_std_dev": {
          "value": 110
        }
      },
      {
        "key_as_string": "2018-04-28T00:00:00.000Z",
        "key": 1524873600000,
        "doc_count": 1,
        "max_dest_ips": {
          "value": 187
        },
        "max_dest_ips_std_dev": {
          "value": 89.96419040682551
        }
      }
    ]
  }
}
What I want is for the first 2 days' bucket data (25th and 26th) to be filtered and removed from the above bucket results. I have tried the post filter above and the normal query filter below:
"filter": {
"range": {
"Source Time": {
"gte": "2018-04-27"
}
}
}
The post_filter does nothing. The plain filter range query above makes the buckets start from the 27th, but it also makes the standard deviation calculation start on the 27th (so the 27th becomes null and the 28th becomes 0), when I want the moving window to keep starting from the 25th.
Any other alternative solutions? Help is greatly appreciated!
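One avenue worth trying is a bucket_selector pipeline aggregation added as a sibling of max_dest_ips_std_dev inside datehisto, pruning buckets whose key (epoch millis) falls before the cutoff, after the moving window has already been computed over the full range. A sketch; the aggregation name datehisto_filter is invented, and the _key buckets_path (valid for date_histogram) plus the hard-coded cutoff 1524787200000 (2018-04-27 UTC) should be verified on your version:
"datehisto_filter": {
  "bucket_selector": {
    "buckets_path": {
      "key": "_key"
    },
    "script": "params.key >= 1524787200000L"
  }
}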

Multiple key aggregation in ElasticSearch

I am new to Elasticsearch and was exploring aggregation queries. The documents I have are in this format -
{"name":"A",
"class":"10th",
"subjects":{
"S1":92,
"S2":92,
"S3":92,
}
}
We have about 40k such documents in our ES, with the subjects varying from student to student. A query to the system may ask for subject-wise aggregate scores for a given class. We tried to create a bucket aggregation query as explained in this guide here; however, this generates a single bucket per document and, as we understand it, requires an explicit mention of every subject.
We want the system to generate a subject-wise aggregate for the data with a single aggregation query. The problem is that in our data the subjects vary from student to student, and we don't have a global list of subject keys.
We wrote the following script, but it only works if we know all possible subjects.
GET student_data_v1_1/_search
{
  "query": {
    "match": {
      "class": "' + query + '"
    }
  },
  "aggs": {
    "my_buckets": {
      "terms": {
        "field": "subjects",
        "size": 10000
      },
      "aggregations": {
        "the_avg": {
          "avg": {
            "field": "subjects.value"
          }
        }
      }
    }
  },
  "size": 0
}
But this query only works for the following document structure; it does not work when multiple subjects are defined and we may not know the key names -
{"name":"A",
"class":"10th",
"subjects":{
"value":93
}
}
An alternate form in which the document exists has subjects as a list of dictionaries -
{"name":"A",
"class":"10th",
"subjects":[
{"S1":92},
{"S2":92},
{"S3":92},
]
}
Having an aggregation query that works for either of the two document formats would be helpful.
======EDITS======
After updating the document to hold weights for each subject -
{
  "class": "10th",
  "subject": [
    {
      "name": "s1",
      "marks": 90,
      "weight": 30
    },
    {
      "name": "s2",
      "marks": 80,
      "weight": 70
    }
  ]
}
I have updated the query to be -
{
  "query": {
    "match": {
      "class": "10th"
    }
  },
  "aggs": {
    "subjects": {
      "nested": {
        "path": "scores"
      },
      "aggs": {
        "subjects": {
          "terms": {
            "field": "subject.name"
          },
          "aggs": {
            "weighted_grade": {
              "weighted_avg": {
                "value": {
                  "field": "subjects.score"
                },
                "weight": {
                  "field": "subjects.weight"
                }
              }
            }
          }
        }
      }
    }
  },
  "size": 0
}
but it throws the error-
{u'error': {u'col': 312,
u'line': 1,
u'reason': u'Unknown BaseAggregationBuilder [weighted_avg]',
u'root_cause': [{u'col': 312,
u'line': 1,
u'reason': u'Unknown BaseAggregationBuilder [weighted_avg]',
u'type': u'unknown_named_object_exception'}],
u'type': u'unknown_named_object_exception'},
u'status': 400}
To achieve the required result, I would suggest keeping your index mapping as follows:
{
  "properties": {
    "class": {
      "type": "keyword"
    },
    "subject": {
      "type": "nested",
      "properties": {
        "marks": {
          "type": "integer"
        },
        "name": {
          "type": "keyword"
        }
      }
    }
  }
}
In the mapping above I have created subject as a nested type with two properties: name to hold the subject name and marks to hold the marks in that subject.
Sample doc:
{
  "class": "10th",
  "subject": [
    {
      "name": "s1",
      "marks": 90
    },
    {
      "name": "s2",
      "marks": 80
    }
  ]
}
Now you can use a nested aggregation and multilevel aggregation (i.e. aggregation inside aggregation). I used a nested aggregation with a terms aggregation on subject.name to get a bucket for each available subject. Then, to get the avg for each subject, we add a child avg aggregation to the subjects aggregation, as below:
{
  "query": {
    "match": {
      "class": "10th"
    }
  },
  "aggs": {
    "subjects": {
      "nested": {
        "path": "subject"
      },
      "aggs": {
        "subjects": {
          "terms": {
            "field": "subject.name"
          },
          "aggs": {
            "avg_score": {
              "avg": {
                "field": "subject.marks"
              }
            }
          }
        }
      }
    }
  },
  "size": 0
}
NOTE: I have added "size": 0 so that Elasticsearch doesn't return the matching docs in the result. Whether to include it depends entirely on your use case.
Sample result:
{
  "took": 25,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "subjects": {
      "doc_count": 6,
      "subjects": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
          {
            "key": "s1",
            "doc_count": 3,
            "avg_score": {
              "value": 80
            }
          },
          {
            "key": "s2",
            "doc_count": 2,
            "avg_score": {
              "value": 75
            }
          },
          {
            "key": "s3",
            "doc_count": 1,
            "avg_score": {
              "value": 80
            }
          }
        ]
      }
    }
  }
}
As you can see, the result contains buckets with key as the subject name and avg_score.value as the average of the marks.
UPDATE to include weighted_avg:
{
  "query": {
    "match": {
      "class": "10th"
    }
  },
  "aggs": {
    "subjects": {
      "nested": {
        "path": "subject"
      },
      "aggs": {
        "subjects": {
          "terms": {
            "field": "subject.name"
          },
          "aggs": {
            "avg_score": {
              "avg": {
                "field": "subject.marks"
              }
            },
            "weighted_grade": {
              "weighted_avg": {
                "value": {
                  "field": "subject.marks"
                },
                "weight": {
                  "field": "subject.weight"
                }
              }
            }
          }
        }
      }
    }
  },
  "size": 0
}
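If the cluster is older than 6.4 (weighted_avg was only introduced in Elasticsearch 6.4, which is what the Unknown BaseAggregationBuilder error above suggests), the same figure can be derived from two sums and a bucket_script. A sketch, placed inside the subjects terms aggregation in place of weighted_grade, assuming subject.weight is added to the nested mapping:
"aggs": {
  "weighted_sum": {
    "sum": {
      "script": "doc['subject.marks'].value * doc['subject.weight'].value"
    }
  },
  "weight_sum": {
    "sum": {
      "field": "subject.weight"
    }
  },
  "weighted_grade": {
    "bucket_script": {
      "buckets_path": {
        "num": "weighted_sum",
        "den": "weight_sum"
      },
      "script": "params.num / params.den"
    }
  }
}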

Elasticsearch nested cardinality aggregation

I have a mapping with a nested schema, and I am trying to aggregate on a nested field, ordering by the distinct docid count.
SELECT name, COUNT(DISTINCT docid) AS uniqueid
FROM table
GROUP BY name
ORDER BY uniqueid DESC
The above is what I am trying to do.
{
  "size": 0,
  "aggs": {
    "samples": {
      "nested": {
        "path": "sample"
      },
      "aggs": {
        "sample": {
          "terms": {
            "field": "sample.name",
            "order": {
              "DocCounts": "desc"
            }
          },
          "aggs": {
            "DocCounts": {
              "cardinality": {
                "field": "docid"
              }
            }
          }
        }
      }
    }
  }
}
But in the result I am not getting the expected output.
result:
"buckets": [
{
"key": "xxxxx",
"doc_count": 173256,
"DocCounts": {
"value": 0
}
},
{
"key": "yyyyy",
"doc_count": 63,
"DocCounts": {
"value": 0
}
}
]
I am getting DocCounts = 0, which is not expected. What went wrong in my query?
I think your last nested aggregation is too much. Try to get rid of it:
{
  "size": 0,
  "aggs": {
    "samples": {
      "nested": {
        "path": "sample"
      },
      "aggs": {
        "sample": {
          "terms": {
            "field": "sample.name",
            "order": {
              "DocCounts": "desc"
            }
          },
          "DocCounts": {
            "cardinality": {
              "field": "docid"
            }
          }
        }
      }
    }
  }
}
In general, when aggregating on a nested type by a value from the upper scope, we observed that the value from the upper scope needs to be copied onto the nested type when storing the document.
Then, in your case, the aggregation would look like:
"aggs": {
"DocCounts": {
"cardinality": {
"field": "sample.docid"
}
}
}
This works, at least on Elasticsearch 1.7.
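In other words, each nested sample carries a copy of the parent's docid at index time; for example (values invented for illustration):
{
  "docid": 123,
  "sample": [
    { "name": "xxxxx", "docid": 123 },
    { "name": "yyyyy", "docid": 123 }
  ]
}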
You can use a reverse_nested aggregation on top of the cardinality aggregation on DocCounts. When a nested aggregation is applied, the query runs against the nested documents, so to access a field of the parent document from inside the nested context, a reverse_nested aggregation can be used. Check the ES reference for more info on this.
Your cardinality query will look like:
"aggs": {
"internal_DocCounts": {
"reverse_nested": { },
"DocCounts": {
"cardinality": {
"field": "docid"
}
}
}
}
The response will look like:
"buckets": [
{
"key": "xxxxx",
"doc_count": 173256,
"internal_DocCounts": {
"doc_count": 173256,
"DocCounts": {
"value": <some_value>
}
}
},
{
"key": "yyyyy",
"doc_count": 63,
"internal_DocCounts": {
"doc_count": 63,
"DocCounts": {
"value": <some_value>
}
}
},
.....
Check this similar thread
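Putting the reverse_nested variant together, the full request would look like the sketch below; ordering by the cardinality through the single-bucket reverse_nested step uses the > path syntax:
{
  "size": 0,
  "aggs": {
    "samples": {
      "nested": {
        "path": "sample"
      },
      "aggs": {
        "sample": {
          "terms": {
            "field": "sample.name",
            "order": {
              "internal_DocCounts>DocCounts": "desc"
            }
          },
          "aggs": {
            "internal_DocCounts": {
              "reverse_nested": {},
              "aggs": {
                "DocCounts": {
                  "cardinality": {
                    "field": "docid"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}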

elastic search embedded script optimization

Is there a way to simplify and optimize the following query:
{
  "query": {
    "filtered": {
      "filter": {
        "and": [
          {
            "range": {
              "ts": {
                "gte": "2014-12-18",
                "lte": "2014-12-18"
              }
            }
          }
        ]
      },
      "query": {
        "match": {
          "track_events.event": "render"
        }
      }
    }
  },
  "aggs": {
    "per_type": {
      "terms": {
        "field": "type",
        "order": {
          "_count": "desc"
        },
        "size": 0
      },
      "aggs": {
        "per_hour": {
          "terms": {
            "script": "(doc[\"track_events.ts\"].value - doc[\"ts\"].value)/(1000 * 3600)",
            "order": {
              "_count": "desc"
            },
            "size": 0
          }
        }
      }
    }
  }
}
The index in Elasticsearch contains documents with the fields track_events.ts and ts. The purpose is to count how many occurrences fall into each hourly interval between track_events.ts and ts.
Example response:
"buckets": [{
"key": "0",
"doc_count": 67736997
},
{
"key": "1",
"doc_count": 7193214
},
{
"key": "2",
"doc_count": 3406966
},
{
"key": "3",
"doc_count": 1988135
}]
}
which means that 67736997 documents were found with a time difference of less than 1 hour, 7193214 with a difference between 1 and 2 hours, etc.
The biggest performance gain would come from replacing the script. That is, instead of computing
(doc["track_events.ts"].value - doc["ts"].value)/(1000 * 3600)
at query time, pre-calculate this value when loading the data into Elasticsearch and store it in a separate field. Then run the terms aggregation on that field instead.
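For example, if each document is indexed with a precomputed field (hypothetical name render_delay_hours) holding floor((track_events.ts - ts) / 3600000), the scripted terms aggregation reduces to a plain field aggregation:
"per_hour": {
  "terms": {
    "field": "render_delay_hours",
    "order": {
      "_count": "desc"
    },
    "size": 0
  }
}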
