ElasticSearch: Aggregate Over a Collected Set of Results - elasticsearch

Let's say I have a set of... burgers...
For each burger, I have a set of images relating to each component of the burger.
Unfortunately, there isn't any consistency in the structure of these components (I didn't write it).
Here is an example of two documents:
{
"bunsResource": {
"image": {
"url": "./buns_1.png",
"who": "Sam"
},
"buns": [
{
"image": {
"url": "./top-bun_1.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_1.png",
"who": "Sarah"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
},
{
"bunsResource": {
"image": {
"url": "./buns_2.png",
"who": "Jim"
},
"buns": [
{
"image": {
"url": "./top-bun_2.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_2.png",
"who": "Kathy"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
}
What I need is a set of photographer / image count.
{
"who": "Sam",
"count": 1
},
{
"who": "Jim",
"count": 3
},
{
"who": "Sarah",
"count": 2
},
{
"who": "Kathy",
"count": 2
}
That is a UNIQUE image count, mind you!
I haven't been able to figure out how to achieve this...
I assume that I need to first resolve each burger to a unique set of url / who, then aggregate from there, but I can't figure out how to get the flattened list of url / who per burger.

It depends on whether the patties and buns arrays are nested or not. If they are not, then it's easy, you can simply run a terms aggregation using a script that gathers all the who fields from everywhere in the document:
POST not-nested/_search
{
"size": 0,
"aggs": {
"script": {
"terms": {
"script": {
"source": """
def list = new ArrayList();
list.addAll(doc['pattyResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.buns.image.who.keyword'].values);
list.addAll(doc['pattyResource.patties.image.who.keyword'].values);
return list;
"""
}
}
}
}
}
That will return this:
"aggregations" : {
"script" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 2
},
{
"key" : "Sam",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
}
However, if it's nested, things get more complicated as you'll need some client-side work to figure out the final counts, but we can simplify that client-side work with a few aggregations:
POST nested/_search
{
"size": 0,
"aggs": {
"bunsWho": {
"terms": {
"field": "bunsResource.image.who.keyword"
}
},
"bunsWhoNested": {
"nested": {
"path": "bunsResource.buns"
},
"aggs": {
"who": {
"terms": {
"field": "bunsResource.buns.image.who.keyword"
}
}
}
},
"pattiesWho": {
"terms": {
"field": "pattyResource.image.who.keyword"
}
},
"pattiesWhoNested": {
"nested": {
"path": "pattyResource.patties"
},
"aggs": {
"who": {
"terms": {
"field": "pattyResource.patties.image.who.keyword"
}
}
}
}
}
}
That will return this:
"aggregations" : {
"pattiesWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
},
"bunsWhoNested" : {
"doc_count" : 4,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
},
"pattiesWhoNested" : {
"doc_count" : 2,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
}
},
"bunsWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 1
},
{
"key" : "Sam",
"doc_count" : 1
}
]
}
}
And then you can simply create some client-side logic (here is some sample code in Node.js) that adds the numbers up:
var whos = {};
var recordWho = function(who, count) {
whos[who] = (whos[who] || 0) + count;
};
resp.aggregations.pattiesWho.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.pattiesWhoNested.who.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.bunsWho.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
resp.aggregations.bunsWhoNested.who.buckets.forEach(function(b) {recordWho(b.key, b.doc_count)});
console.log(whos);
=>
{ Kathy: 5, Jim: 3, Sam: 1, Sarah: 1 }

Related

Query filter for searching rollup index works with epoch time fails with date math

How do we query (filter) a rollup index?
For example, based on the query here
Request:
{
"size": 0,
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}
Response:
{
"took" : 93,
"timed_out" : false,
"terminated_early" : false,
"_shards" : ... ,
"hits" : {
"total" : {
"value": 0,
"relation": "eq"
},
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"timeline" : {
"buckets" : [
{
"key_as_string" : "2018-01-18T00:00:00.000Z",
"key" : 1516233600000,
"doc_count" : 6,
"nodes" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "a",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 5.1499998569488525
}
},
{
"key" : "b",
"doc_count" : 2,
"max_temperature" : {
"value" : 201.0
},
"avg_voltage" : {
"value" : 5.700000047683716
}
},
{
"key" : "c",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 4.099999904632568
}
}
]
}
}
]
}
}
}
How can I filter for, say, the last 3 days? Is that possible?
For a test case, I used a fixed_interval of 1m (one minute, and also 60 minutes) and tried the following query, but the error was that all query shards failed. Is it possible to apply a query filter to rollup aggregations?
Test Query for searching rollup index
{
"size": 0,
"query": {
"range": {
"timestamp": {
"gte": "now-3d/d",
"lt": "now/d"
}
}
}
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}

Sort Aggregated Buckets From Nested Object Array By Specific Field

I have indexed documents such as
// doc 1
{
...,
"list": [{
"value": "a",
"order": 1
}, {
"value": "b",
"order": 2
}]
,...
}
// doc 2
{
...,
"list": [{
"value": "b",
"order": 2
}, {
"value": "c",
"order": 3
}]
,...
}
If I use the aggregation on the list.value:
{
"aggs": {
"values": {
"terms": {
"field": "list.value.keyword"
}
}
}
}
I get buckets in order b, a, c:
"buckets" : [
{
"key" : "b",
"doc_count" : 2
},
{
"key" : "a",
"doc_count" : 1
},
{
"key" : "c",
"doc_count" : 1
}
]
because the keys are sorted by _count in descending order by default.
If I use the aggregation on the list.value with sub-aggregation for sorting in form of max(list.order):
{
"aggs": {
"values": {
"terms": {
"field": "list.value.keyword",
"order": { "max_order": "desc" }
},
"aggs": {
"max_order": { "max": { "field": "list.order" } }
}
}
}
}
I get buckets in order b, c, a
"buckets" : [
{
"key" : "b",
"doc_count" : 2,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "c",
"doc_count" : 1,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "a",
"doc_count" : 1,
"max_order" : {
"value" : 2.0
}
}
]
because both b and c appear in a document whose list has a maximum order of 3.
However, I want to write a query to get buckets in order c, b, a as their order is 3, 2, 1 respectively. How to achieve that?
You need to use a nested aggregation to get the buckets in the order c, b, a.
Below is a working example with the index mapping, index data, search query and search response.
Index Mapping
PUT testidx1
{
"mappings":{
"properties": {
"list":{
"type": "nested"
}
}
}
}
Index Data:
POST testidx1/_doc/1
{
"list": [
{
"value": "a",
"order": 1
},
{
"value": "b",
"order": 2
}
]
}
POST testidx1/_doc/2
{
"list": [
{
"value": "b",
"order": 2
},
{
"value": "c",
"order": 3
}
]
}
Search Query:
POST testidx1/_search
{
"size": 0,
"aggs": {
"resellers": {
"nested": {
"path": "list"
},
"aggs": {
"unique_values": {
"terms": {
"field": "list.value.keyword",
"order": {
"max_order": "desc"
}
},
"aggs": {
"max_order": {
"max": {
"field": "list.order"
}
}
}
}
}
}
}
}
Search Response:
"aggregations" : {
"resellers" : {
"doc_count" : 4,
"unique_values" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "c",
"doc_count" : 1,
"max_order" : {
"value" : 3.0
}
},
{
"key" : "b",
"doc_count" : 2,
"max_order" : {
"value" : 2.0
}
},
{
"key" : "a",
"doc_count" : 1,
"max_order" : {
"value" : 1.0
}
}
]
}
}
}
}

Elasticsearch sub-aggregation queries that check whether some bucket values meet a condition

Hi guys! I am trying to pull out some aggregations that need to perform some specific computation logic per bucket, and it is killing me.
So I have some data tracking who uses what application feature like this:
[
{
"event_key": "basic_search",
"user": {
"tenant_tier": "free"
},
"origin": {
"visitor_id": "xxxxxxx"
}
},
{
"event_key": "registration",
"user": {
"tenant_tier": "basic"
},
"origin": {
"visitor_id": "xxxxxxx"
}
},
{
"event_key": "advanced_search",
"user": {
"tenant_tier": "basic"
},
"origin": {
"visitor_id": "xxxxxxx"
}
}
]
The user can opt to trial the app features using free tier identity, then register to enjoy other features. The origin.visitor_id is calculated from a website user's IP addresses and User-Agent etc.
With this data, I am hoping to answer this question: "how many people used free trial features BEFORE registering".
I came up with a ES query template like below, but couldn't figure out how to write the sub-aggregations that seem to require some more complex scripting against values in the bucket... Any advice is very much appreciated!
{
"aggs": {
"origin": {
"terms": {
"field": "origin.id.keyword",
"size": 1000
},
"aggs": {
"user_started_out_free": {
# ??????
# need to return a boolean telling whether `user.tenant_tier` of the first document in the bucket is `free`
}
},
"then_registered": {
# ??????
# need to return a boolean telling whether any `event_type` in the bucket is `registration`
},
"is_trial_user_then_registered": {
"bucket_script": {
"buckets_path": {
"user_started_out_free": "user_started_out_free"
"then_registered": "then_registered"
},
"script": "user_started_out_free && then_registered"
}
}
}
},
"num_trial_then_registered": {
"sum_bucket": {
"buckets_path": "origin>is_trial_user_then_registered"
}
}
}
}
You can use a bucket selector aggregation to keep only the buckets where both "trial" and "registration" events exist. Then use a stats bucket aggregation to get the bucket count.
Query
{
"size": 0,
"aggs": {
"visitors": {
"terms": {
"field": "origin.visitor_id.keyword",
"size": 10
},
"aggs": {
"user_started_out_free": {
"filter": {
"term": {
"event_key.keyword": "basic_search"
}
}
},
"then_registered": {
"filter": {
"term": {
"event_key.keyword": "registration"
}
}
},
"user_first_free_then_registerd":{
"bucket_selector": {
"buckets_path": {
"free": "user_started_out_free._count",
"registered": "then_registered._count"
},
"script": "if(params.free>0 && params.registered>0) return true;"
}
}
}
},
"bucketcount":{
"stats_bucket":{
"buckets_path":"visitors._count"
}
}
}
}
Result
"visitors" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 4,
"then_registered" : {
"doc_count" : 3
},
"user_started_out_free" : {
"doc_count" : 1
}
},
{
"key" : "1",
"doc_count" : 3,
"then_registered" : {
"doc_count" : 1
},
"user_started_out_free" : {
"doc_count" : 1
}
},
{
"key" : "2",
"doc_count" : 2,
"then_registered" : {
"doc_count" : 1
},
"user_started_out_free" : {
"doc_count" : 1
}
}
]
},
"bucketcount" : {
"count" : 3,
"min" : 2.0,
"max" : 4.0,
"avg" : 3.0,
"sum" : 9.0
}

How do I compare two source IP from two different specific log in elastic search

In Elasticsearch I want to compare two logs (the NAT log and the gateway log) with a DSL query.
The NAT log contains srcip1 and the gateway log contains srcip2.
When the condition srcip1 === srcip2 is satisfied, I want "agent.id" to be displayed in the result.
I will add this on top of the correlated query that I have already written:
{
"query": {
"bool": {
"should": [
{
"match": {
"location": "\\Users\\Saad\\Desktop\\nat.log"
}
},
{
"match": {
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log"
}
}
],
"must": [
{
"term": {
"data.srcip": "1.1.1.1"
}
}
]
}
},
"fields": [
"data.srcip1"
],
"_source": false
}
I tried multiple things but did not succeed.
To display summaries of data you use aggregations. In case you want to compare the different agents depending on the log type for a certain ip the query will be this one:
Ingest data
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log",
"data": {
"srcip2": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_2"
}
}
Request
POST test_saad/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"term": {
"data.srcip1.keyword": "1.1.1.2"
}
},
{
"term": {
"data.srcip2.keyword": "1.1.1.2"
}
}
],
"minimum_should_match": 1
}
},
{
"bool": {
"should": [
{
"term": {
"location.keyword": """\Users\Saad\Desktop\nat.log"""
}
},
{
"term": {
"location.keyword": """\Users\Saad\Desktop\attendance-logs-with-ports.log"""
}
}
],
"minimum_should_match": 1
}
}
]
}
},
"aggs": {
"log_types": {
"terms": {
"field": "location.keyword",
"size": 10
},
"aggs": {
"agent_types": {
"terms": {
"field": "agent.id.keyword",
"size": 10
}
}
}
}
}
}
Response
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"log_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : """\Users\Saad\Desktop\nat.log""",
"doc_count" : 2,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
},
{
"key" : "agent_2",
"doc_count" : 1
}
]
}
},
{
"key" : """\Users\Saad\Desktop\attendance-logs-with-ports.log""",
"doc_count" : 1,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
}
]
}
}
]
}
}
}

ElasticSearch: Query to find max of count of objects based on field value

For the example document below in the index, I want to find the maximum count of each action per component name across all documents in the index. Could you please help me find a way to do this?
Expected result assuming only one document present in the Index:
comp1 -> action1 -> max 2 times
comp1 -> action2 -> max 1 time
comp2 -> action2 -> max 1 time
comp2 -> action3 -> max 1 time
Sample Document:
{
"id": "AC103902:A13A_AC140008:01BB_5FA2E8FA_1C08:0007",
"tokens": [
{
"name": "comp1",
"items": [
{
"action": "action1",
"attr": "value"
},
{
"action": "action1",
"attr": "value"
},
{
"action": "action2",
"attr": "value"
}
]
},
{
"name": "comp2",
"items": [
{
"action": "action2",
"attr": "value"
},
{
"action": "action3",
"attr": "value"
}
]
}
]
}
ElasticSearch Version: 7.9
I can loop through each document and calculate this on the client side, but I am curious to know whether there is already an ES query that can produce this kind of summary from the documents in the index.
You'll need to define both the tokens array and the tokens.items array as nested in order to get the correct stats.
Then, assuming your mapping looks something along the lines of
{
"mappings": {
"properties": {
"tokens": {
"type": "nested",
"properties": {
"items": {
"type": "nested"
}
}
}
}
}
}
the following query can be executed:
GET index_name/_search
{
"size": 0,
"aggs": {
"by_token_name": {
"nested": {
"path": "tokens"
},
"aggs": {
"token_name": {
"terms": {
"field": "tokens.name.keyword"
},
"aggs": {
"by_max_actions": {
"nested": {
"path": "tokens.items"
},
"aggs": {
"max_actions": {
"terms": {
"field": "tokens.items.action.keyword"
}
}
}
}
}
}
}
}
}
}
yielding these buckets:
[
{
"key" : "comp1", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 3,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action1", <--
"doc_count" : 2
},
{
"key" : "action2", <--
"doc_count" : 1
}
]
}
}
},
{
"key" : "comp2", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 2,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action2", <--
"doc_count" : 1
},
{
"key" : "action3", <--
"doc_count" : 1
}
]
}
}
}
]
which can be easily post-processed at client side.

Resources