How to derive a field from two fields in an Elasticsearch index?

I have an index with fields:
room_name
start_date (start time room is used)
end_date (end time room is used)
I am writing a curl command to get the hours during which a room was used.
Is it possible?
Here is my current curl command:
curl -XGET "https://localhost:9200/testindex/_search?pretty" -H 'Content-Type: application/json' -d'
{
"aggs": {
"room_bucket":{
"terms": {
"field": "room_name.keyword",
},
"aggs":{
"hour_bucket": {
"terms": {
"script": {
"inline": "def l = doc[\"start_date \"].value;\nif ( l <= 20 && l >= 9 ) {\n return l;\n}",
"lang": "painless"
},
"order": {
"_key": "asc"
},
"value_type": "long"
}
}
}
}
}
}'
Here is the result:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : 1.0,
"hits" : [
{
"_index" : "testindex",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"log_version" : 1,
"start_date" : 10,
"end_date" : 11,
"room_name" : "room_Y"
}
},
{
"_index" : "testindex",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"log_version" : 1,
"start_date" : 11,
"end_date" : 13,
"room_name" : "room_V"
}
},
{
"_index" : "testindex",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"log_version" : 1,
"start_date" : 10,
"end_date" : 12,
"room_name" : "room_Y"
}
}
]
},
"aggregations" : {
"room_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "room_V",
"doc_count" : 1,
"hour_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 11,
"doc_count" : 1
}
]
}
},
{
"key" : "room_Y",
"doc_count" : 1,
"hour_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 1
}
]
}
}
]
}
}
}
But my expected result in the "aggregations" is the following:
"aggregations" : {
"room_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "room_V",
"doc_count" : 1,
"hour_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 11,
"doc_count" : 1
},
{
"key" : 12,
"doc_count" : 1
},
{
"key" : 13,
"doc_count" : 1
}
]
}
},
{
"key" : "room_Y",
"doc_count" : 1,
"hour_bucket" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 10,
"doc_count" : 2
},
{
"key" : 11,
"doc_count" : 2
},
{
"key" : 12,
"doc_count" : 1
}
]
}
}
]
}
}
In the current result, only the start_date is taken into account.
However, in the expected output, room_V should have "key" : 11, "key" : 12, and "key" : 13 (with a doc_count of 1 for each key), because based on start_date and end_date the room was used from 11 to 13.

You can achieve what you want by leveraging LongStream and creating an array of all the hours in the interval, like this:
curl -XGET "https://localhost:9200/testindex/_search?pretty" -H 'Content-Type: application/json' -d'
{
"aggs": {
"room_bucket": {
"terms": {
"field": "room_name.keyword"
},
"aggs": {
"hour_bucket": {
"terms": {
"script": {
"inline": """
return LongStream.rangeClosed(doc.start_date.value, doc.end_date.value).toArray();
""",
"lang": "painless"
},
"order": {
"_key": "asc"
},
"value_type": "long"
}
}
}
}
}
}'
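For the three sample documents above, LongStream.rangeClosed(start, end) expands each document into every hour in its interval: room_V's document (start_date 11, end_date 13) yields [11, 12, 13], so each hour lands in its own bucket with doc_count 1, while room_Y's two overlapping documents give hours 10 and 11 a doc_count of 2, as in the expected output. If you want to reproduce this locally, here is a sketch of indexing the sample documents with the bulk API (reusing the testindex name from the question):
curl -XPOST "https://localhost:9200/testindex/_bulk?pretty" -H 'Content-Type: application/json' -d'
{ "index": { "_id": "1" } }
{ "log_version": 1, "start_date": 10, "end_date": 11, "room_name": "room_Y" }
{ "index": { "_id": "2" } }
{ "log_version": 1, "start_date": 11, "end_date": 13, "room_name": "room_V" }
{ "index": { "_id": "3" } }
{ "log_version": 1, "start_date": 10, "end_date": 12, "room_name": "room_Y" }
'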

Related

How to get word count in docs as an aggregate over time in Elasticsearch?

I am trying to get word count trends in docs as an aggregate result. Using the following approach I am able to get the doc count aggregation result, but I am not able to find any resources explaining how to get the word count for the months of jan, feb & mar.
PUT test/_doc/1
{
"description" : "one two three four",
"month" : "jan"
}
PUT test/_doc/2
{
"description" : "one one test test test",
"month" : "feb"
}
PUT test/_doc/3
{
"description" : "one one one test",
"month" : "mar"
}
GET test/_search
{
"size": 0,
"query": {
"match": {
"description": {
"query": "one"
}
}
},
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
}
}
}
}
OUTPUT
{
"took" : 706,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1
},
{
"key" : "jan",
"doc_count" : 1
},
{
"key" : "mar",
"doc_count" : 1
}
]
}
}
}
EXPECTED WORD COUNT OVER MONTH
"aggregations" : {
"monthly_count" : {
"buckets" : [
{
"key" : "feb",
"word_count" : 2
},
{
"key" : "jan",
"word_count" : 1
},
{
"key" : "mar",
"word_count" : 3
}
]
}
}
Maybe this query can help you:
GET test/_search
{
"size": 0,
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
},
"aggs": {
"count_word_one": {
"terms": {
"script": {
"source": """
def str = doc['description.keyword'].value;
def array = str.splitOnToken(' ');
int i = 0;
for (item in array) {
if(item == 'one'){
i++;
}
}
return i;
"""
},
"size": 10
}
}
}
}
}
}
Response:
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "2",
"doc_count" : 1
}
]
}
},
{
"key" : "jan",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1
}
]
}
},
{
"key" : "mar",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 1
}
]
}
}
]
}
}
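If you want the word count as a single number per month (closer to the expected output shape) instead of nested buckets, the inner terms aggregation could be replaced by a scripted sum metric. A sketch along the lines of the script above, with the word 'one' still hardcoded:
GET test/_search
{
  "size": 0,
  "aggs": {
    "monthly_count": {
      "terms": {
        "field": "month.keyword"
      },
      "aggs": {
        "word_count": {
          "sum": {
            "script": {
              "source": "int i = 0; for (item in doc['description.keyword'].value.splitOnToken(' ')) { if (item == 'one') { i++; } } return i;"
            }
          }
        }
      }
    }
  }
}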

How to design the index to support "who viewed you" and "viewed by me" information

UserA views UserB
UserA views UserC
UserD views UserA
Who Viewed You queries:-
Who Viewed You should show UserD for UserA
Who Viewed You should show UserA for UserB
Who Viewed You should show UserA for UserC
Viewed By Me queries:-
Viewed By Me should show UserA for UserD
How should we model the users index to fetch the above information?
The users index contains first_name, last_name, gender, ...
I would just save an array in a visitors field (or a visited field, depending on which has the lower cardinality).
I guess the docs can get huge, so to optimize (and avoid a large number of updates) I would have "visits_logs" indices containing just the logs, plus an ILM policy with a short delete phase (one index per day, keeping one week of data before deletion):
{"visitor": "userA", "visited": "userB", "#timestamp": 12345678990}
Then at night, use a transform or a manual aggregation to populate an aggregation index per period:
PUT visits/_doc
{
"visitor": "UserA",
"#timestamp": "today",
"visited": {
"users": ["UserB", "UserC", "UserD"],
"quantity": 3
}
}
Details really depend on your actual use case and the volume of your data.
But I think it's a robust solution.
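For reference, the delete phase mentioned above could be expressed as an ILM policy roughly like this (a sketch: the policy name visits_logs_policy is made up, and the min_age of 7d matches the one-week retention):
PUT _ilm/policy/visits_logs_policy
{
  "policy": {
    "phases": {
      "delete": {
        "min_age": "7d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}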
UPDATE:
The queries would be:
If you want to know all users visited by UserA
GET test/_search
{
"query": {
"match": {
"visitor": "UserA"
}
}
}
The response will look like this, and you just have to merge the visited arrays:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.4700036,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "5k-z3XQBDjdqjSSDl_K5",
"_score" : 0.4700036,
"_source" : {
"#timestamp" : "today",
"visited" : {
"users" : [
"UserB",
"UserC",
"UserD"
],
"quantity" : 3
},
"visitor" : "UserA"
}
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "Ksaz3XQBk-8NpR_boPe2",
"_score" : 0.4700036,
"_source" : {
"#timestamp" : "today",
"visited" : {
"users" : [
"UserB",
"UserC",
"UserD"
],
"quantity" : 3
},
"visitor" : "UserA"
}
}
]
}
}
If you want to get "who visited UserB":
GET test/_search
{
"query": {
"match": {
"visited.users": "UserB"
}
},
"_source": ["#timestamp", "visitor"]
}
The visitor values in the hits are then your answer.
You can get a more consolidated result with aggregations:
GET test/_search
{
"size": 0,
"query": {
"match": {
"visited.users": "UserB"
}
},
"aggs": {
"visitors": {
"terms": {
"field": "visitor.keyword",
"size": 10
}
}
}
}
With a result like
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"visitors" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "UserA",
"doc_count" : 2
}
]
}
}
}
And for the visited side:
GET test/_search
{
"size": 0,
"query": {
"match": {
"visitor": "UserA"
}
},
"aggs": {
"visits": {
"terms": {
"field": "visited.users.keyword",
"size": 10
}
}
}
}
with a result like:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"visits" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "UserB",
"doc_count" : 2
},
{
"key" : "UserC",
"doc_count" : 2
},
{
"key" : "UserD",
"doc_count" : 2
}
]
}
}
}

Get an aggregate count in Elasticsearch based on a particular unique-ID field

I have created an index and indexed the documents in Elasticsearch, and it's working fine. The challenge is that I have to get an aggregate count of the Category field based on the unique UserID. I have given my sample documents below.
{
"UserID":"A1001",
"Category":"initiated",
"policyno":"5221"
},
{
"UserID":"A1001",
"Category":"pending",
"policyno":"5222"
},
{
"UserID":"A1001",
"Category":"pending",
"policyno":"5223"
},
{
"UserID":"A1002",
"Category":"completed",
"policyno":"5224"
}
Sample output for UserID "A1001":
initiated-1
pending-2
Sample output for UserID "A1002":
completed-1
How can I get the aggregate count from the JSON documents given above, like the sample output mentioned?
I suggest a terms aggregation as shown in the following:
{
"size": 0,
"aggs": {
"By_ID": {
"terms": {
"field": "UserID.keyword"
},
"aggs": {
"By_Category": {
"terms": {
"field": "Category.keyword"
}
}
}
}
}
}
Here is a snippet of the response:
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"By_ID" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "A1001",
"doc_count" : 3,
"By_Category" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "pending",
"doc_count" : 2
},
{
"key" : "initiated",
"doc_count" : 1
}
]
}
},
{
"key" : "A1002",
"doc_count" : 1,
"By_Category" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "completed",
"doc_count" : 1
}
]
}
}
]
}
}
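If you only need the breakdown for a single user, say A1001, a variant of the same query adds a term filter so that only that user's documents are aggregated (a sketch):
{
  "size": 0,
  "query": {
    "term": {
      "UserID.keyword": "A1001"
    }
  },
  "aggs": {
    "By_Category": {
      "terms": {
        "field": "Category.keyword"
      }
    }
  }
}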

How can I extend an Elasticsearch date range histogram aggregation query?

Hi, I have an Elasticsearch index named mep-report.
Each document has a status field. The possible values for the status field are "ENROUTE", "SUBMITTED", "DELIVERED", and "FAILED". Below is a sample from the index with 6 documents.
{
"took" : 10,
"timed_out" : false,
"_shards" : {
"total" : 13,
"successful" : 13,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1094313,
"max_score" : 1.0,
"hits" : [
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837500",
"_score" : 1.0,
"_source" : {
"status" : "ENROUTE",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837501",
"_score" : 1.0,
"_source" : {
"status" : "ENROUTE",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837502",
"_score" : 1.0,
"_source" : {
"status" : "SUBMITTED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837503",
"_score" : 1.0,
"_source" : {
"status" : "DELIVERED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837504",
"_score" : 1.0,
"_source" : {
"status" : "FAILED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
},
{
"_index" : "mep-reports-2019.09.11",
"_type" : "doc",
"_id" : "68e8e03f-baf8-4bfc-a920-58e26edf835c-353899837504",
"_score" : 1.0,
"_source" : {
"status" : "FAILED",
"#timestamp" : "2019-09-11T10:21:26.000Z"
}
}
}
I would like to get an aggregation histogram distribution, something like messages_processed, messages_delivered, and messages_failed:
messages_processed: 3 (2 documents with status ENROUTE + 1 document with status SUBMITTED)
messages_delivered: 1 (1 document with status DELIVERED)
messages_failed: 2 (2 documents with status FAILED)
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 13,
"successful" : 13,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 21300,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"performance_over_time" : {
"buckets" : [
{
"key_as_string" : "2020-02-21",
"key" : 1582243200000,
"doc_count" : 6,
"message_processed": 3,
"message_delivered": 1,
"message_failed": 2
}
]
}
}
}
The following is my current query. I would like to modify it to get additional statistics such as message_processed, message_delivered, and message_failed. Kindly let me know.
{ "size": 0, "query": { "bool": { "must": [ { "range": { "#timestamp": { "from": "2020-02-21T00:00Z", "to": "2020-02-21T23:59:59.999Z", "include_lower": true, "include_upper": true, "format": "yyyy-MM-dd'T'HH:mm:ss.SSSZ ||yyyy-MM-dd'T'HH:mmZ", "boost": 1.0 } } } ], "adjust_pure_negative": true, "boost": 1.0 } }, "aggregations": { "performance_over_time": { "date_histogram": { "field": "#timestamp", "format": "yyyy-MM-dd", "interval": "1d", "offset": 0, "order": { "_key": "asc" }, "keyed": false, "min_doc_count": 0 } } } }
You are almost there with the query; you just need to add a terms aggregation. Looking at your request, I've come up with a scripted terms aggregation.
I've also changed the date histogram aggregation's interval parameter to calendar_interval so that you get the values per calendar date.
Query Request:
POST <your_index_name>/_search
{
"size": 0,
"query":{
"bool":{
"must":[
{
"range":{
"#timestamp":{
"from":"2019-09-10",
"to":"2019-09-12",
"include_lower":true,
"include_upper":true,
"boost":1.0
}
}
}
],
"adjust_pure_negative":true,
"boost":1.0
}
},
"aggs":{
"message_processed":{
"date_histogram": {
"field": "#timestamp",
"calendar_interval": "1d" <----- Note this
},
"aggs": {
"my_messages": {
"terms": {
"script": { <----- Core Logic of Terms Agg
"source": """
if (doc['status'].value == "ENROUTE" || doc['status'].value == "SUBMITTED") {
return "message_processed";
} else if (doc['status'].value == "DELIVERED") {
return "message_delivered";
} else {
return "message_failed";
}
""",
"lang": "painless"
},
"size": 10
}
}
}
}
}
}
Note that the core logic of what you are looking for is inside the scripted terms aggregation. The logic is self-explanatory if you go through it; feel free to modify it to fit your needs.
For the sample data you've shared, you would get the result in the below format:
Response:
{
"took" : 144,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 6,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"message_processed" : {
"buckets" : [
{
"key_as_string" : "2019-09-11T00:00:00.000Z",
"key" : 1568160000000,
"doc_count" : 6,
"my_messages" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "message_processed",
"doc_count" : 3
},
{
"key" : "message_failed",
"doc_count" : 2
},
{
"key" : "message_delivered",
"doc_count" : 1
}
]
}
}
]
}
}
}
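As an alternative to the script, the same three groups can be expressed with a filters aggregation, which avoids running Painless per document. A sketch, assuming status is indexed as a keyword field:
"aggs": {
  "performance_over_time": {
    "date_histogram": {
      "field": "#timestamp",
      "calendar_interval": "1d"
    },
    "aggs": {
      "my_messages": {
        "filters": {
          "filters": {
            "message_processed": { "terms": { "status": [ "ENROUTE", "SUBMITTED" ] } },
            "message_delivered": { "term": { "status": "DELIVERED" } },
            "message_failed": { "term": { "status": "FAILED" } }
          }
        }
      }
    }
  }
}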

Select aggregations based on sub-aggregation doc count

I am aiming to select only those aggregations that match the min_doc_count defined in the sub-aggregations. Not sure if it is possible.
Basically I want to select only those buckets that have propertyid belonging to a particular import.
Here is my query.
GET properties/_search
{
"size": 0,
"query": {
"terms": {
"Agency_Id": [
"16"
]
}
},
"aggregations": {
"property_id": {
"terms": {
"field": "PropertyId",
"min_doc_count": 2,
"size": 10000
},
"aggregations": {
"import_filter": {
"filter": {
"term": {
"Import_Id": "90040"
}
},
"aggregations": {
"import_id": {
"terms": {
"field": "Import_Id",
"min_doc_count": 1,
"size": 10000
}
}
}
}
}
}
}
}
Actual result
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1163,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"property_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "011162330",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6065590",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6289352",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "gd-00-022386",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 0,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
}
]
}
}
}
Expected
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1163,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"property_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "011162330",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6065590",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6289352",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
}
]
}
}
}
Based on my understanding of your query, you need a bucket selector aggregation.
Query:
GET properties/_search
{
"size": 0,
"query": {
"terms": {
"Agency_Id": [
"16"
]
}
},
"aggregations": {
"property_id": {
"terms": {
"field": "PropertyId",
"min_doc_count": 2,
"size": 10000
},
"aggregations": {
"import_filter": {
"filter": {
"term": {
"Import_Id": "90040"
}
},
"aggregations": {
"import_id": {
"terms": {
"field": "Import_Id",
"min_doc_count": 1,
"size": 10000
}
}
}
},
"mybucket_selector": { ---> select product bucket if import bucket has any value
"bucket_selector": {
"buckets_path": {
"FinalCount": "import_filter>import_id._bucket_count"
},
"script": "params.FinalCount>0"
}
}
}
}
}
}
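Note that bucket_selector is a pipeline aggregation evaluated once per property_id bucket, so parent buckets whose script returns false are dropped from the response. If you don't need the inner import_id breakdown, the buckets_path could also point directly at the filter's own doc count (a sketch):
"mybucket_selector": {
  "bucket_selector": {
    "buckets_path": {
      "FinalCount": "import_filter._count"
    },
    "script": "params.FinalCount > 0"
  }
}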
