How to get word count in docs as an aggregate over time in Elasticsearch? - elasticsearch

I am trying to get word count trends in docs as an aggregate result. Using the following approach I am able to get the doc count aggregation result, but I am not able to find any resources showing how to get the word count for the months of jan, feb & mar.
PUT test/_doc/1
{
"description" : "one two three four",
"month" : "jan"
}
PUT test/_doc/2
{
"description" : "one one test test test",
"month" : "feb"
}
PUT test/_doc/3
{
"description" : "one one one test",
"month" : "mar"
}
GET test/_search
{
"size": 0,
"query": {
"match": {
"description": {
"query": "one"
}
}
},
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
}
}
}
}
OUTPUT
{
"took" : 706,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1
},
{
"key" : "jan",
"doc_count" : 1
},
{
"key" : "mar",
"doc_count" : 1
}
]
}
}
}
EXPECTED WORD COUNT OVER MONTH
"aggregations" : {
"monthly_count" : {
"buckets" : [
{
"key" : "feb",
"word_count" : 2
},
{
"key" : "jan",
"word_count" : 1
},
{
"key" : "mar",
"word_count" : 3
}
]
}
}

Maybe this query can help you:
GET test/_search
{
"size": 0,
"aggs": {
"monthly_count": {
"terms": {
"field": "month.keyword"
},
"aggs": {
"count_word_one": {
"terms": {
"script": {
"source": """
def str = doc['description.keyword'].value;
def array = str.splitOnToken(' ');
int i = 0;
for (item in array) {
if(item == 'one'){
i++
}
}
return i;
"""
},
"size": 10
}
}
}
}
}
}
Response:
"aggregations" : {
"monthly_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "feb",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "2",
"doc_count" : 1
}
]
}
},
{
"key" : "jan",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1
}
]
}
},
{
"key" : "mar",
"doc_count" : 1,
"count_word_one" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 1
}
]
}
}
]
}
}

Related

Get an aggregate count in elasticsearch based on particular uniqueid field

I have created an index and indexed the documents in Elasticsearch, and that is working fine. The challenge is that I have to get an aggregate count of the Category field based on a unique ID (UserID). My sample documents are given below.
{
"UserID":"A1001",
"Category":"initiated",
"policyno":"5221"
},
{
"UserID":"A1001",
"Category":"pending",
"policyno":"5222"
},
{
"UserID":"A1001",
"Category":"pending",
"policyno":"5223"
},
{
"UserID":"A1002",
"Category":"completed",
"policyno":"5224"
}
**Sample output for UserID - "A1001"**
initiated-1
pending-2
**Sample output for UserID - "A1002"**
completed-1
How can I get the aggregate count from the JSON documents given above, in the form of the sample output shown above?
I suggest a terms aggregation as shown in the following:
{
"size": 0,
"aggs": {
"By_ID": {
"terms": {
"field": "UserID.keyword"
},
"aggs": {
"By_Category": {
"terms": {
"field": "Category.keyword"
}
}
}
}
}
}
Here is a snippet of the response:
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"By_ID" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "A1001",
"doc_count" : 3,
"By_Category" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "pending",
"doc_count" : 2
},
{
"key" : "initiated",
"doc_count" : 1
}
]
}
},
{
"key" : "A1002",
"doc_count" : 1,
"By_Category" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "completed",
"doc_count" : 1
}
]
}
}
]
}
}

ElasticSearch - Sort on the subaggregation

I am quite new to Elasticsearch and I am trying to sort on a sub-aggregation; that is, my results should be sorted based on the sub-aggregation first. I have tried a lot of things to enable this sort, but it isn't working. Can anyone help with this?
{
"aggs": {
"distinct_part": {
"terms": {
"field": "part",
"size": 1000
}
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer",
"size": 1000
}
}
}
}
I am trying to sort on the manufacturer; my entire result should be sorted on that. Can someone point me to how I can achieve that?
I tested your query locally and made a small correction, assuming I understood your issue correctly. I ingested the following data into the index "subsorting":
"part": "car",
"manufacturer": "brandA"
"part": "car",
"manufacturer": "brandB"
"part": "car",
"manufacturer": "brandC"
"part": "motor",
"manufacturer": "brandA"
"part": "motor",
"manufacturer": "brandB"
"part": "motor",
"manufacturer": "brandC"
Note: Both part and manufacturer are mapped as text.
GET subsorting/_search
{
"size": 0,
"aggs": {
"distinct_part": {
"terms": {
"field": "part.keyword",
"size": 1000
},
"aggs": {
"distinct_manufacturer": {
"terms": {
"field": "manufacturer.keyword",
"order": {
"_key": "asc"
},
"size": 1000
}
}
}
}
}
}
If both fields "part" and "manufacturer" are mapped as keyword, remove the ".keyword" suffix from the field names in the query.
The response from the above query, sorted in ascending order, is as follows:
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 2
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandA",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandC",
"doc_count" : 1
}
]
}
}
]
}
}
If you need the result in descending order, here is the response with "_key": "desc":
"aggregations" : {
"distinct_part" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "motor",
"doc_count" : 4,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 2
}
]
}
},
{
"key" : "car",
"doc_count" : 3,
"distinct_manufacturer" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "brandC",
"doc_count" : 1
},
{
"key" : "brandB",
"doc_count" : 1
},
{
"key" : "brandA",
"doc_count" : 1
}
]
}
}
]
}
}
Links:
https://www.elastic.co/guide/en/elasticsearch/reference/7.9/search-aggregations-bucket-terms-aggregation.html

Select aggregations based on sub aggregation results doc count

I am aiming to select only those buckets whose sub-aggregation satisfies the min_doc_count defined in it. I am not sure if this is possible.
Basically, I want to select only those buckets whose PropertyId belongs to a particular import.
Here is my query.
GET properties/_search
{
"size": 0,
"query": {
"terms": {
"Agency_Id": [
"16"
]
}
},
"aggregations": {
"property_id": {
"terms": {
"field": "PropertyId",
"min_doc_count": 2,
"size": 10000
},
"aggregations": {
"import_filter": {
"filter": {
"term": {
"Import_Id": "90040"
}
},
"aggregations": {
"import_id": {
"terms": {
"field": "Import_Id",
"min_doc_count": 1,
"size": 10000
}
}
}
}
}
}
}
}
Actual result
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1163,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"property_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "011162330",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6065590",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6289352",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "gd-00-022386",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 0,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
}
]
}
}
}
Expected
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1163,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"property_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "011162330",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6065590",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
},
{
"key" : "6289352",
"doc_count" : 2,
"import_filter" : {
"doc_count" : 1,
"import_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 90040,
"doc_count" : 1
}
]
}
}
}
]
}
}
}
Based on my understanding of your question, you need a bucket selector aggregation.
Query:
GET properties/_search
{
"size": 0,
"query": {
"terms": {
"Agency_Id": [
"16"
]
}
},
"aggregations": {
"property_id": {
"terms": {
"field": "PropertyId",
"min_doc_count": 2,
"size": 10000
},
"aggregations": {
"import_filter": {
"filter": {
"term": {
"Import_Id": "90040"
}
},
"aggregations": {
"import_id": {
"terms": {
"field": "Import_Id",
"min_doc_count": 1,
"size": 10000
}
}
}
},
"mybucket_selector": { ---> select product bucket if import bucket has any value
"bucket_selector": {
"buckets_path": {
"FinalCount": "import_filter>import_id._bucket_count"
},
"script": "params.FinalCount>0"
}
}
}
}
}
}

how to get buckets count in elasticsearch aggregations?

I'm trying to get the number of buckets in an aggregation within a specific datetime range:
{
"size": 0,
"aggs": {
"filtered_aggs": {
"filter": {
"range": {
"datetime": {
"gte": "2017-03-01T00:00:00.000Z",
"lte": "2017-06-01T00:00:00.000Z"
}
}
},
"aggs": {
"addr": {
"terms": {
"field": "region",
"size": 10000
}
}
}
}
}
}
output:
"took" : 317,
"timed_out" : false,
"num_reduce_phases" : 3,
"_shards" : {
"total" : 1118,
"successful" : 1118,
"failed" : 0
},
"hits" : {
"total" : 1899658551,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"filtered_aggs" : {
"doc_count" : 88,
"addr" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "NY",
"doc_count" : 36
},
{
"key" : "CA",
"doc_count" : 13
},
{
"key" : "JS",
"doc_count" : 7
..........
Is there a way to return both requests (buckets + total bucket count) in one search?
I'm using Elasticsearch 5.5.0
Can I get all of them?

Sorting in sub aggregated result of a sub sub aggregated in elasticsearch

I have to rewrite the following SQL query as a NoSQL query.
SELECT count(1) as total,
count(CASE WHEN updated >= now() - '1 day'::interval THEN 1 END) as daily,
count(CASE WHEN updated >= now() - '7 day'::interval THEN 1 END) as weekly,
count(CASE WHEN updated >= now() - '30 day'::interval THEN 1 END) as monthly,
status_code, state
FROM alerts
GROUP BY status_code, state
ORDER BY total DESC, status_code, state
Following is output for SQL Query
total | daily | weekly | monthly | status_code | state
------------------------------------------------------------------------------------
2 0 0 1 test1 ACTIVE
2 0 1 2 test1 INACTIVE
2 1 1 1 test2 INACTIVE
1 0 0 1 test3 ACTIVE
I got stuck on ordering by the 'total' column while writing the NoSQL query.
Below is the NoSQL query I used:
{
"stateAggregation": {
"terms": {
"field": "state"
},
"aggs": {
"statusCodeAggregation": {
"terms": {
"field": "status_code"
} ,
"aggs": {
"total" : {
"date_range": {
"field": "updated",
"ranges": [{ "to": "now" }]
}
},
"daily" : {
"date_range": {
"field": "updated",
"ranges": [{ "from": "now-1d/d" }]
}
},
"weekly" : {
"date_range": {
"field": "updated",
"ranges": [{ "from": "now-7d/d" }]
}
},
"monthly" : {
"date_range": {
"field": "updated",
"ranges": [{ "from": "now-30d/d" }]
}
}
}
}
}
}
}
Following is output for NOSQL Query
{
"took" : 8,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"aggregations" : {
"stateAggregation" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "active",
"doc_count" : 3,
"statusCodeAggregation" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "test 1",
"doc_count" : 2,
"weekly" : {
"buckets" : [ {
"key" : "2015-09-04T00:00:00.000Z-*",
"from" : 1.4413248E12,
"from_as_string" : "2015-09-04T00:00:00.000Z",
"doc_count" : 0
} ]
},
"total" : {
"buckets" : [ {
"key" : "*-2015-09-11T12:42:58.463Z",
"to" : 1.441975378463E12,
"to_as_string" : "2015-09-11T12:42:58.463Z",
"doc_count" : 2
} ]
},
"monthly" : {
"buckets" : [ {
"key" : "2015-08-12T00:00:00.000Z-*",
"from" : 1.4393376E12,
"from_as_string" : "2015-08-12T00:00:00.000Z",
"doc_count" : 1
} ]
},
"daily" : {
"buckets" : [ {
"key" : "2015-09-10T00:00:00.000Z-*",
"from" : 1.4418432E12,
"from_as_string" : "2015-09-10T00:00:00.000Z",
"doc_count" : 0
} ]
}
}, {
"key" : "test",
"doc_count" : 1,
"weekly" : {
"buckets" : [ {
"key" : "2015-09-04T00:00:00.000Z-*",
"from" : 1.4413248E12,
"from_as_string" : "2015-09-04T00:00:00.000Z",
"doc_count" : 1
} ]
},
"total" : {
"buckets" : [ {
"key" : "*-2015-09-11T12:42:58.463Z",
"to" : 1.441975378463E12,
"to_as_string" : "2015-09-11T12:42:58.463Z",
"doc_count" : 1
} ]
},
"monthly" : {
"buckets" : [ {
"key" : "2015-08-12T00:00:00.000Z-*",
"from" : 1.4393376E12,
"from_as_string" : "2015-08-12T00:00:00.000Z",
"doc_count" : 1
} ]
},
"daily" : {
"buckets" : [ {
"key" : "2015-09-10T00:00:00.000Z-*",
"from" : 1.4418432E12,
"from_as_string" : "2015-09-10T00:00:00.000Z",
"doc_count" : 1
} ]
}
} ]
}
}, {
"key" : "mute",
"doc_count" : 2,
"statusCodeAggregation" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "test",
"doc_count" : 2,
"weekly" : {
"buckets" : [ {
"key" : "2015-09-04T00:00:00.000Z-*",
"from" : 1.4413248E12,
"from_as_string" : "2015-09-04T00:00:00.000Z",
"doc_count" : 1
} ]
},
"total" : {
"buckets" : [ {
"key" : "*-2015-09-11T12:42:58.463Z",
"to" : 1.441975378463E12,
"to_as_string" : "2015-09-11T12:42:58.463Z",
"doc_count" : 2
} ]
},
"monthly" : {
"buckets" : [ {
"key" : "2015-08-12T00:00:00.000Z-*",
"from" : 1.4393376E12,
"from_as_string" : "2015-08-12T00:00:00.000Z",
"doc_count" : 2
} ]
},
"daily" : {
"buckets" : [ {
"key" : "2015-09-10T00:00:00.000Z-*",
"from" : 1.4418432E12,
"from_as_string" : "2015-09-10T00:00:00.000Z",
"doc_count" : 1
} ]
}
} ]
}
} ]
}
}
}
Can anyone please help me modify the NoSQL query to apply ordering on the 'total' aggregation?
When I try to add an order on total in the status code aggregation:
"statusCodeAggregation": {
"terms": {
"field": "status_code",
"order" :{ "total._count" : "desc" }
}
Then I got the following error:
AggregationExecutionException[Invalid terms aggregation order path [total._count]. Terms buckets can only be sorted on a sub-aggregator path that is built out of zero or more single-bucket aggregations within the path and a final single-bucket or a metrics aggregation at the path end.]}

Resources