ELASTICSEARCH - Get a count of values from the most recent document - elasticsearch

I can't get a count of fields with a filtered document value.
I have this json
``
{
"took" : 6,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "net",
"_type" : "_doc",
"_id" : "RTHRTH",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-05-31 19:01:01",
"data" : [...]
}
},
{
"_index" : "net",
"_type" : "_doc",
"_id" : "LLLoIJBHHM",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-06-23 15:11:59",
"data" : [...]
}
}
]
}
}
``
The "data" field contains further fields nested inside other fields.
I want to select only the most recent document and then count how many times a certain value occurs within that document.
This is my query:
`{
"query": {
"match": {
"name.keyword": "net"
}
},
"sort": [
{
"created_at.keyword": {
"order": "desc"
}
}
],
"size": 1,
"aggs": {
"CountValue": {
"terms": {
"field": "data.add.serv.desc.keyword",
"include": "nginx"
}
}
}
}`
And the output is:
`{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"CountValue" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "nginx",
"doc_count" : 2
}
]
}
}`
I suspect that doc_count is the number of documents the value appears in, not the number of times the value is repeated within the filtered document.
I would be very grateful for any advice!

Unless any of the fields under the path data.add.serv are of the nested type, the terms agg will produce per-whole-doc results, not per-field.
For example:
POST example/_doc
{
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
produces doc_count==1.
When, however, specified as nested:
DELETE example
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
}
}
}
}
POST example/_doc
{"serv":[{"desc":"nginx"},{"desc":"nginx"},{"desc":"nginx"}]}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
we end up with doc_count==3.
This has to do with the way non-nested array types are flattened and de-duplicated. Ultimately, you may need to reindex your documents after applying the nested mapping.
EDIT
In order to only take the latest doc, you could do the following:
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
},
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
}
}
}
}
then
POST example/_doc
{
"created_at" : "2020-05-31 19:01:01",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
POST example/_doc
{
"created_at" : "2020-06-23 15:11:59",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then use a terms agg of size 1, sorted by timestamp desc:
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "created_at",
"order": {
"_term": "desc"
},
"size": 1
},
"aggs": {
"NAME2": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
}
}

Related

ElasticSearch aggregation for filtering users who had both events

I've written a query which correctly returns the 6000+ events my users had:
GET /<app_logs-2022.11.23*>/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"context.identity.type": "login"
}
},
{
"term": {
"context.identity.type": "login_error"
}
}
],
"minimum_should_match": 1
}
},
"_source": [
"context.identity.user_id",
"context.identity.type"
],
"size": 3
}
And I get a data set like this:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 15,
"successful" : 15,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6001,
"max_score" : 10.722837,
"hits" : [
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "bb469377-0618-49a6-a643-1201dc84c829",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "8f4e82a0-f333-4096-bfb6-767fed924093",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login_error"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "7090be5a-8b53-4723-a1ac-223476a000f1",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "75bcb301-1cee-4b3b-aa1b-adbe4c011388",
"type" : "login_error"
}
}
}
}
]
}
}
But I can't figure out how to get the number of users who had both login and login_error events. I've tried the cardinality aggregation, the terms aggregation and several more, but all of them just split the types into buckets showing a sum and don't group by user. I want to find how many users had problems first but then managed to log in in the end.
The best I've managed to achieve is to get buckets by user_id and output the cardinality of type for each of them:
"aggs": {
"results": {
"terms": {
"field": "context.identity.user_id",
"size": 300
},
"aggs": {
"events": {
"cardinality": {
"field": "context.identity.type"
}
}
}
}
}
I created an example based on the sentence "I want to find how many users had problems first but then managed to log in in the end."
It works like this:
Make aggs based on user_id
Make sub-aggs by type
Ignore the ones that don't contain login_error
#put mapping
PUT test_stack_login
{
"mappings": {
"properties": {
"context.identity.user_id": {
"type": "keyword"
},
"context.identity.type": {
"type": "keyword"
}
}
}
}
#put example docs
POST test_stack_login/_bulk?refresh&pretty
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":2,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":3,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login"}
#Run the query
GET test_stack_login/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "context.identity.user_id",
"size": 1000
},
"aggs": {
"context_identity_type": {
"terms": {
"field": "context.identity.type",
"size": 10
}
},
"login_error_exist": {
"bucket_selector": {
"buckets_path": {
"var1": "context_identity_type['login_error']>_count"
},
"script": "params.var1 != null"
}
}
}
}
}
}
#the result will look like the buckets excerpt below
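You will get the user_ids whose context.identity.type field contains both login and login_error events. The keys in the response give you the user_ids of users who have logged in unsuccessfully at least once and successfully at least once. (Note that the bucket_selector above only checks for a login_error bucket; if some users have login_error events only, add an analogous check for the login bucket.)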
"buckets" : [
{"key" : "1" ...},
{"key" : "4" ...}
]

Query filter for searching rollup index works with epoch time fails with date math

How do we query (filter) a rollup index?
For example, based on the query here
Request:
{
"size": 0,
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}
Response:
{
"took" : 93,
"timed_out" : false,
"terminated_early" : false,
"_shards" : ... ,
"hits" : {
"total" : {
"value": 0,
"relation": "eq"
},
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"timeline" : {
"buckets" : [
{
"key_as_string" : "2018-01-18T00:00:00.000Z",
"key" : 1516233600000,
"doc_count" : 6,
"nodes" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "a",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 5.1499998569488525
}
},
{
"key" : "b",
"doc_count" : 2,
"max_temperature" : {
"value" : 201.0
},
"avg_voltage" : {
"value" : 5.700000047683716
}
},
{
"key" : "c",
"doc_count" : 2,
"max_temperature" : {
"value" : 202.0
},
"avg_voltage" : {
"value" : 4.099999904632568
}
}
]
}
}
]
}
}
}
How can I filter to, say, the last 3 days? Is that possible?
As a test case, I used a fixed_interval of 1m (one minute, and also 60 minutes) and tried the following query, but it failed with the error "all query shards failed". Is it possible to filter rollup aggregations with a query?
Test Query for searching rollup index
{
"size": 0,
"query": {
"range": {
"timestamp": {
"gte": "now-3d/d",
"lt": "now/d"
}
}
}
"aggregations": {
"timeline": {
"date_histogram": {
"field": "timestamp",
"fixed_interval": "7d"
},
"aggs": {
"nodes": {
"terms": {
"field": "node"
},
"aggs": {
"max_temperature": {
"max": {
"field": "temperature"
}
},
"avg_voltage": {
"avg": {
"field": "voltage"
}
}
}
}
}
}
}
}

Elasticsearch filter sub object before search

Let's say I have an index with a document like this:
{
"id": 6,
"name": "some name",
"users": [
{
"id": 1,
"name": "User1",
"isEnabled": false,
},
{
"id": 2,
"name": "User2",
"isEnabled": false,
},
{
"id": 3,
"name": "User3",
"isEnabled": true,
},
]
}
What I need is to return that document when a user searches for the name "some name", but I also want to filter out all users that are not enabled, and if there are no enabled users at all, omit the document entirely.
I tried to use filters like this:
{
"query": {
"bool": {
"must": {
"match": {
"name": "some name"
}
},
"filter": {
"term": {
"users.isEnabled": true
}
}
}
}
}
But in that case I get the document with all of its users, no matter whether a user is enabled or not. Is there a way to do this? I could filter the users out in code after getting the data from Elasticsearch, but that can break pagination if I remove documents without enabled users from the result set.
I'm a bit new to Elasticsearch, and so far I can't find how to do it. Thank you in advance!
Elasticsearch will return the whole document if there is any match. If you update your mapping and make the users array nested, you can achieve this by using inner hits. This is a basic example mapping that works:
{
"mappings": {
"properties": {
"name": {
"type": "text"
},
"users": {
"type": "nested"
}
}
}
}
If you send a query like the following, the response will contain id and name from the parent document, plus inner_hits containing only the users that match your isEnabled query.
{
"_source": ["id", "name"],
"query": {
"bool": {
"must": [
{
"match": {
"name": "some name"
}
},
{
"nested": {
"path": "users",
"query": {
"term": {
"users.isEnabled": {
"value": true
}
}
},
"inner_hits": {}
}
}
]
}
}
}
This is an example response
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9375811,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.9375811,
"_source" : {
"name" : "some name",
"id" : 6
},
"inner_hits" : {
"users" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.540445,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "users",
"offset" : 2
},
"_score" : 1.540445,
"_source" : {
"id" : 3,
"name" : "User3",
"isEnabled" : true
}
}
]
}
}
}
}
]
}
}
Then you can do the mapping in the application.

global sorting across different buckets after aggregation in elasticsearch

A sample document from my index is shown below.
{"rackName" : "rack005", "roomName" : "roomB", "power" : 132, "timestamp" : 1594540106208}
What I want to do is get the latest data for each rack in a given room and then sort the results by power.
With the code below I got close to my target. I'm losing my mind over the last step, which seems to be sorting my data across different buckets by the field 'power'.
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"timestamp": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
-----------------------------------result-------------------------------------------------------
"aggregations" : {
"rk_ag" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "rack003",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "0FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack003",
"roomName" : "roomB",
"power" : 115,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack004",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "1FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack004",
"roomName" : "roomB",
"power" : 108,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack005",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "2FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack005",
"roomName" : "roomB",
"power" : 118,
"timestamp" : 1594540114492
},
"sort" : [
1594540114492
]
}
]
}
}
}
]
}
}
You're sorting by timestamp instead of power. Try this instead:
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
You can sort by multiple fields too.
Adding to @Joe's answer: as he mentioned, you can use multiple fields in the sort.
Below query would give you what you are looking for:
POST my_rack_index/_search
{
"size": 0,
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [ <---- Note this part
{
"timestamp": {
"order": "desc"
}
},
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
So now, if for a rack you have two documents with the same rackName and exactly the same power, the one with the latest timestamp will show up in the response.
The way the sort works is that it first sorts by timestamp and then sorts by power while keeping the timestamp ordering intact (i.e. power only breaks ties between documents with equal timestamps).

Elasticsearch Filtering Parents by Filtered Child Document Count

I'm attempting to do some elasticsearch query fu on a set of data I have.
I have a user document that is the parent to many child page view documents. I'm looking to return all users that have viewed a specific page an arbitrary amount of times (defined by user input box). So far, I've got a has_child query that will return me all the users that have a page view with certain ids. However, this will return those parents with all their children. Next, I've tried to write an aggregation on those query results, that will essentially do the same has_child query in aggregation form. Now, I have the right document count for my filtered child documents. I need to use this document count to go back and filter the parents. To explain the query in words, "return to me all the users that have viewed a specific page more than 4 times". It's possible that I may need to restructure my data. Any thoughts?
Here is my query thus far:
curl -XGET 'http://localhost:9200/development_users/_search?pretty=true' -d '
{
"query" : {
"has_child" : {
"type" : "page_view",
"query" : {
"terms" : {
"viewed_id" : [175,180]
}
}
}
},
"aggs" : {
"to_page_view": {
"children": {
"type" : "page_view"
},
"aggs" : {
"page_views_that_match" : {
"filter" : { "terms": { "viewed_id" : [175,180] } }
}
}
}
}
}'
This returns me a response like:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [ {
"_index" : "development_users",
"_type" : "user",
"_id" : "22548",
"_score" : 1.0,
"_source":{"id":22548,"account_id":1009}
} ]
},
"aggregations" : {
"to_page_view" : {
"doc_count" : 53,
"page_views_that_match" : {
"doc_count" : 2
}
}
}
}
Associated Mappings:
{
"development_users" : {
"mappings" : {
"page_view" : {
"dynamic" : "false",
"_parent" : {
"type" : "user"
},
"_routing" : {
"required" : true
},
"properties" : {
"created_at" : {
"type" : "date",
"format" : "date_time"
},
"id" : {
"type" : "integer"
},
"viewed_id" : {
"type" : "integer"
},
"time_on_page" : {
"type" : "integer"
},
"title" : {
"type" : "string"
},
"type" : {
"type" : "string"
},
"updated_at" : {
"type" : "date",
"format" : "date_time"
},
"url" : {
"type" : "string"
}
}
},
"user" : {
"dynamic" : "false",
"properties" : {
"account_id" : {
"type" : "integer"
},
"id" : {
"type" : "integer"
}
}
}
}
}
}
Okay, so this is kind of involved. I made a few simplifications to keep it straight in my head. First, I used this mapping:
PUT /test_index
{
"mappings": {
"page_view": {
"_parent": {
"type": "development_user"
},
"properties": {
"viewed_id": {
"type": "string"
}
}
},
"development_user": {
"properties": {
"id": {
"type": "string"
}
}
}
}
}
Then I added some data. In this little universe, I have three users and two pages. I want to find users who have viewed "page_a" at least twice, so if I construct the correct query only user 3 will be returned.
POST /test_index/development_user/_bulk
{"index":{"_type":"development_user","_id":1}}
{"id":"user_1"}
{"index":{"_type":"page_view","_parent":1}}
{"viewed_id":"page_a"}
{"index":{"_type":"development_user","_id":2}}
{"id":"user_2"}
{"index":{"_type":"page_view","_parent":2}}
{"viewed_id":"page_b"}
{"index":{"_type":"development_user","_id":3}}
{"id":"user_3"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_b"}
To get that answer we'll use aggregations. Notice that I don't want documents returned (the normal way), but I do want to filter down the documents we analyze, because it will make things more efficient. So I use the same basic filter you had before.
So the aggregation tree starts with terms_parent_id, which just separates the parent documents. Inside that I have children_page_view, which (via its filter_page_ids sub-aggregation) filters the child documents down to the ones I want ("page_a"). Next to it in the hierarchy is bucket_selector_page_id_term_count, which uses a bucket selector (you'll need ES 2.x) to filter the parent documents down to those meeting the criterion. Finally, there is a top hits aggregation, which shows us the documents that match the requirements.
POST /test_index/development_user/_search
{
"size": 0,
"query": {
"has_child": {
"type": "page_view",
"query": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
},
"aggs": {
"terms_parent_id": {
"terms": {
"field": "id"
},
"aggs": {
"children_page_view": {
"children": {
"type": "page_view"
},
"aggs": {
"filter_page_ids": {
"filter": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
}
},
"bucket_selector_page_id_term_count": {
"bucket_selector": {
"buckets_path": {
"children_count": "children_page_view>filter_page_ids._count"
},
"script": "children_count >= 2"
}
},
"top_hits_users": {
"top_hits": {
"_source": {
"include": [
"id"
]
}
}
}
}
}
}
}
which returns:
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"terms_parent_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user_3",
"doc_count": 1,
"children_page_view": {
"doc_count": 3,
"filter_page_ids": {
"doc_count": 2
}
},
"top_hits_users": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "development_user",
"_id": "3",
"_score": 1,
"_source": {
"id": "user_3"
}
}
]
}
}
}
]
}
}
}
Here's all the code I used:
http://sense.qbox.io/gist/43f24461448519dc884039db40ebd8e2f5b7304f

Resources