ElasticSearch: Query to find max of count of objects based on field value - elasticsearch

For the example document below, I want to find the maximum count of each action per component name across all documents in the index. Could you please help me find a way to do this?
Expected result assuming only one document present in the Index:
comp1 -> action1 -> max 2 times
comp1 -> action2 -> max 1 time
comp2 -> action2 -> max 1 time
comp2 -> action3 -> max 1 time
Sample Document:
{
"id": "AC103902:A13A_AC140008:01BB_5FA2E8FA_1C08:0007",
"tokens": [
{
"name": "comp1",
"items": [
{
"action": "action1",
"attr": "value"
},
{
"action": "action1",
"attr": "value"
},
{
"action": "action2",
"attr": "value"
}
]
},
{
"name": "comp2",
"items": [
{
"action": "action2",
"attr": "value"
},
{
"action": "action3",
"attr": "value"
}
]
}
]
}
ElasticSearch Version: 7.9
I can loop through each document and calculate this on the client side, but I am curious to know whether there is already an ES query that can produce this kind of summary from the documents in the index.

You'll need to define both the tokens array and the tokens.items array as nested in order to get the correct stats.
Then, assuming your mapping looks something along the lines of
{
"mappings": {
"properties": {
"tokens": {
"type": "nested",
"properties": {
"items": {
"type": "nested"
}
}
}
}
}
}
the following query can be executed:
GET index_name/_search
{
"size": 0,
"aggs": {
"by_token_name": {
"nested": {
"path": "tokens"
},
"aggs": {
"token_name": {
"terms": {
"field": "tokens.name.keyword"
},
"aggs": {
"by_max_actions": {
"nested": {
"path": "tokens.items"
},
"aggs": {
"max_actions": {
"terms": {
"field": "tokens.items.action.keyword"
}
}
}
}
}
}
}
}
}
}
yielding these buckets:
[
{
"key" : "comp1", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 3,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action1", <--
"doc_count" : 2
},
{
"key" : "action2", <--
"doc_count" : 1
}
]
}
}
},
{
"key" : "comp2", <--
"doc_count" : 1,
"by_max_actions" : {
"doc_count" : 2,
"max_actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "action2", <--
"doc_count" : 1
},
{
"key" : "action3", <--
"doc_count" : 1
}
]
}
}
}
]
which can be easily post-processed at client side.

Related

Count number of inner elements of array property (Including repeated values)

Given I have the following records.
[
{
"profile": "123",
"inner": [
{
"name": "John"
}
]
},
{
"profile": "456",
"inner": [
{
"name": "John"
},
{
"name": "John"
},
{
"name": "James"
}
]
}
]
I want to get something like:
"aggregations": {
"name": {
"buckets": [
{
"key": "John",
"doc_count": 3
},
{
"key": "James",
"doc_count": 1
}
]
}
}
I'm a beginner using Elasticsearch, and this seems to be a pretty simple operation to do, but I can't find how to achieve this.
If I try a simple terms aggregation, it returns 2 for John instead of 3.
Example request I'm trying:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
}
}
}
}
How can I possibly achieve this?
Additional Info: It will be used on Kibana later.
I can change mapping to whatever I want, but AFAIK Kibana doesn't like the "Nested" type. :(
You need to add a value_count aggregation. By default, terms only returns a doc_count (the number of matching documents), whereas the value_count aggregation counts how many values the given field has in those documents.
So, for your purposes:
{
"size": 0,
"aggs": {
"name": {
"terms": {
"field": "inner.name"
},
"aggs": {
"total": {
"value_count": {
"field": "inner.name"
}
}
}
}
}
}
Which returns:
"aggregations" : {
"name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "John",
"doc_count" : 2,
"total" : {
"value" : 3
}
},
{
"key" : "James",
"doc_count" : 1,
"total" : {
"value" : 2
}
}
]
}
}

How do I compare two source IP from two different specific log in elastic search

In Elasticsearch I want to compare two logs (natlog and Gateway log) with DSL Query.
The NAT log contains srcip1 and the gateway log contains srcip2.
If the condition srcip1 === srcip2 is satisfied, I want "agent.id" to be displayed in the result.
I will then add this on top of the correlated query I have already written:
{
"query": {
"bool": {
"should": [
{
"match": {
"location": "\\Users\\Saad\\Desktop\\nat.log"
}
},
{
"match": {
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log"
}
}
],
"must": [
{
"term": {
"data.srcip": "1.1.1.1"
}
}
]
}
},
"fields": [
"data.srcip1"
],
"_source": false
}
I tried multiple things but did not succeed.
To display summaries of data, you use aggregations. If you want to compare the different agents per log type for a given IP, the query looks like this:
Ingest data
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\attendance-logs-with-ports.log",
"data": {
"srcip2": "1.1.1.1"
},
"agent": {
"id": "agent_1"
}
}
POST test_saad/_doc
{
"location": "\\Users\\Saad\\Desktop\\nat.log",
"data": {
"srcip1": "1.1.1.1"
},
"agent": {
"id": "agent_2"
}
}
Request
POST test_saad/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"term": {
"data.srcip1.keyword": "1.1.1.1"
}
},
{
"term": {
"data.srcip2.keyword": "1.1.1.1"
}
}
],
"minimum_should_match": 1
}
},
{
"bool": {
"should": [
{
"term": {
"location.keyword": """\Users\Saad\Desktop\nat.log"""
}
},
{
"term": {
"location.keyword": """\Users\Saad\Desktop\attendance-logs-with-ports.log"""
}
}
],
"minimum_should_match": 1
}
}
]
}
},
"aggs": {
"log_types": {
"terms": {
"field": "location.keyword",
"size": 10
},
"aggs": {
"agent_types": {
"terms": {
"field": "agent.id.keyword",
"size": 10
}
}
}
}
}
}
Response
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"log_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : """\Users\Saad\Desktop\nat.log""",
"doc_count" : 2,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
},
{
"key" : "agent_2",
"doc_count" : 1
}
]
}
},
{
"key" : """\Users\Saad\Desktop\attendance-logs-with-ports.log""",
"doc_count" : 1,
"agent_types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "agent_1",
"doc_count" : 1
}
]
}
}
]
}
}
}

Return top N buckets only

So in elastic search I can do something like this:
{
"aggs": {
"title": {
"terms": {
"field": "title",
"shard_size": 50,
"size": 5
}
}
},
"query": {...},
"size": 0
}
And this will return me the document counts of the top 5 titles, so we end up with something like (in part):
"buckets" : [
{
"key" : "Delivery Driver",
"doc_count" : 1495
},
{
"key" : "Assistant Manager",
"doc_count" : 1250
},
{
"key" : "Server",
"doc_count" : 1175
},
{
"key" : "Dishwasher",
"doc_count" : 966
},
{
"key" : "Team Member",
"doc_count" : 960
}
]
But now I need to have the document counts in some custom buckets, so I do something like this:
{
"aggs": {
"loc": {
"filters": {
"filters": {
"1042_2": {
"terms": {
"counties": [
...
]
}
},
"1594_2": {
"terms": {
"counties": [
...
]
}
},
"1714_2": {
"terms": {
"counties": [
...
]
}
},
"1746_2": {
"terms": {
"counties": [
...
]
}
},
"1814_2": {
"terms": {
"counties": [
...
]
}
},
"1943_2": {
"terms": {
"counties": [
...
]
}
},
"2658_2": {
"terms": {
"counties": [
...
]
}
}
}
}
}
},
"query": {...},
"size": 0
}
Note that there are 7 buckets, because we don't know which are the largest. Running this will return us:
"buckets" : {
"1042_2" : {
"doc_count" : 23687
},
"1594_2" : {
"doc_count" : 8951
},
"1714_2" : {
"doc_count" : 52555
},
"1746_2" : {
"doc_count" : 60534
},
"1814_2" : {
"doc_count" : 63956
},
"1943_2" : {
"doc_count" : 25533
},
"2658_2" : {
"doc_count" : 534
}
}
But I would like it to only return me the largest 5 instead of all the buckets. Is there a way to restrict it to only the n largest buckets in the same way that the size parameter under terms did?
The size parameter does not make sense for filters aggregation, because by specifying the filters you already explicitly specify/control the number of buckets to get created and returned.
What you may want to consider, though, is to have all potential buckets created and then have them sorted by descending count with an order clause.
On client side then you simply "consume" the first n buckets.

Elasticsearch aggregations: how to get bucket with 'other' results of terms aggregation?

I use an aggregation to collect data from a nested field and I'm a little stuck.
Example of document:
{
...
rectangle: {
attributes: [
{_id: 'some_id', ...}
]
}
ES allows grouping data by rectangle.attributes._id, but is there any way to get an 'other' bucket that collects the documents that were not added to any of the groups? Or maybe there is a way to write a query that creates a bucket for documents matching {"rectangle.attributes._id": {$ne: "{currentDoc}.rectangle.attributes._id"}}
I think a bucket would be perfect because I need to do further aggregations with the 'other' docs.
Or maybe there's some cool workaround
I use query like this for aggregation
"aggs": {
"attributes": {
"nested": {
"path": "rectangle.attributes"
},
"aggs": {
"attributesCount": {
"cardinality": {
"field": "rectangle.attributes._id.keyword"
}
},
"entries": {
"terms": {
"field": "rectangle.attributes._id.keyword"
}
}
}
}
}
And get this result
"buckets" : [
{
"key" : "some_parent_id",
"doc_count" : 27616,
"attributes" : {
"doc_count" : 45,
"entries" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "some_id",
"doc_count" : 45,
"attributeOptionsCount" : {
"value" : 2
}
}
]
}
}
}
]
result like this would be perfect:
"buckets" : [
{
"key" : "some_parent_id",
"doc_count" : 1000,
"attributes" : {
"doc_count" : 145,
"entries" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "some_id",
"doc_count" : 45
},
{
"key" : "other",
"doc_count" : 100
}
]
}
}
}
]
You can make use of the `missing` parameter of the terms aggregation. Update the aggregation as shown below:
"aggs": {
"attributes": {
"nested": {
"path": "rectangle.attributes"
},
"aggs": {
"attributesCount": {
"cardinality": {
"field": "rectangle.attributes._id.keyword"
}
},
"entries": {
"terms": {
"field": "rectangle.attributes._id.keyword",
"missing": "other"
}
}
}
}
}

ElasticSearch: Aggregate Over a Collected Set of Results

Let's say I have a set of... burgers...
For each burger, I have a set of images relating to each component of the burger.
Unfortunately, there isn't any consistency in the structure of these components (I didn't write it).
Here is an example of two documents:
{
"bunsResource": {
"image": {
"url": "./buns_1.png",
"who": "Sam"
},
"buns": [
{
"image": {
"url": "./top-bun_1.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_1.png",
"who": "Sarah"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
},
{
"bunsResource": {
"image": {
"url": "./buns_2.png",
"who": "Jim"
},
"buns": [
{
"image": {
"url": "./top-bun_2.png",
"who": "Jim"
}
},
{
"image": {
"url": "./bottom-bun_2.png",
"who": "Kathy"
}
}
]
},
"pattyResource": {
"image": {
"url": "./patties_1.png",
"who": "Kathy"
},
"patties": [
{
"image": {
"url": "./patty_1.jpg",
"who": "Kathy"
}
}
]
}
}
What I need is a set of photographer / image count.
{
"who": "Sam",
"count": 1
},
{
"who": "Jim",
"count": 3
},
{
"who": "Sarah",
"count": 1
},
{
"who": "Kathy",
"count": 3
}
That is a UNIQUE image count, mind you!
I haven't been able to figure out how to achieve this...
I assume that I need to first resolve each burger to a unique set of url / who, then aggregate from there, but I can't figure out how to get the flattened list of url / who per burger.
It depends on whether the patties and buns arrays are nested or not. If they are not, then it's easy, you can simply run a terms aggregation using a script that gathers all the who fields from everywhere in the document:
POST not-nested/_search
{
"size": 0,
"aggs": {
"script": {
"terms": {
"script": {
"source": """
def list = new ArrayList();
list.addAll(doc['pattyResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.image.who.keyword'].values);
list.addAll(doc['bunsResource.buns.image.who.keyword'].values);
list.addAll(doc['pattyResource.patties.image.who.keyword'].values);
return list;
"""
}
}
}
}
}
That will return this:
"aggregations" : {
"script" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 2
},
{
"key" : "Sam",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
}
However, if it's nested, things get more complicated as you'll need some client-side work to figure out the final counts, but we can simplify that client-side work with a few aggregations:
POST nested/_search
{
"size": 0,
"aggs": {
"bunsWho": {
"terms": {
"field": "bunsResource.image.who.keyword"
}
},
"bunsWhoNested": {
"nested": {
"path": "bunsResource.buns"
},
"aggs": {
"who": {
"terms": {
"field": "bunsResource.buns.image.who.keyword"
}
}
}
},
"pattiesWho": {
"terms": {
"field": "pattyResource.image.who.keyword"
}
},
"pattiesWhoNested": {
"nested": {
"path": "pattyResource.patties"
},
"aggs": {
"who": {
"terms": {
"field": "pattyResource.patties.image.who.keyword"
}
}
}
}
}
}
That will return this:
"aggregations" : {
"pattiesWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
},
"bunsWhoNested" : {
"doc_count" : 4,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 2
},
{
"key" : "Kathy",
"doc_count" : 1
},
{
"key" : "Sarah",
"doc_count" : 1
}
]
}
},
"pattiesWhoNested" : {
"doc_count" : 2,
"who" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Kathy",
"doc_count" : 2
}
]
}
},
"bunsWho" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Jim",
"doc_count" : 1
},
{
"key" : "Sam",
"doc_count" : 1
}
]
}
}
And then you can simply create some client-side logic (here some sample code in Node.js) that adds the numbers up:
// Running total of bucket counts per photographer, keyed by the bucket key.
var whos = {};

/**
 * Adds `count` to the running total stored under `who`.
 * A photographer seen for the first time starts from 0.
 */
function recordWho(who, count) {
  var previous = whos[who] || 0;
  whos[who] = previous + count;
}

// The four terms aggregations, visited in the same order as before so the
// keys of `whos` are inserted (and therefore logged) in the same order.
var aggs = resp.aggregations;
[
  aggs.pattiesWho,
  aggs.pattiesWhoNested.who,
  aggs.bunsWho,
  aggs.bunsWhoNested.who
].forEach(function (termsAgg) {
  termsAgg.buckets.forEach(function (bucket) {
    recordWho(bucket.key, bucket.doc_count);
  });
});

console.log(whos);
=>
{ Kathy: 5, Jim: 3, Sam: 1, Sarah: 1 }

Resources