How to perform sub-aggregation in elasticsearch? - elasticsearch

I have a set of article documents in elasticsearch with fields content and publish_datetime.
I am trying to retrieve most frequent words from articles with publish year == 2021.
GET articles/_search
{
"query": {
"match_all": {}
},
"aggs": {
"word_counts": {
"terms": {
"field": "content"
}
},
"publish_datetime": {
"terms": {
"field": "publish_datetime"
}
},
"aggs": {
"word_counts_2021": {
"bucket_selector": {
"buckets_path": {
"word_counts": "word_counts",
"pd": "publish_datetime"
},
"script": "LocalDateTime.parse(params.pd).getYear() == 2021"
}
}
}
}
}
This fails on
{
"error" : {
"root_cause" : [
{
"type" : "parsing_exception",
"reason" : "Unknown aggregation type [word_counts_2021]",
"line" : 17,
"col" : 25
}
],
"type" : "parsing_exception",
"reason" : "Unknown aggregation type [word_counts_2021]",
"line" : 17,
"col" : 25,
"caused_by" : {
"type" : "named_object_not_found_exception",
"reason" : "[17:25] unknown field [word_counts_2021]"
}
},
"status" : 400
}
which does not make sense, because word_counts_2021 is the name of the aggregation according to the docs. It's not an aggregation type. I am the one who picks the name, so I thought it could have basically any value.
Does anyone have any idea what's going on there? So far, it seems like a pretty unintuitive service to me.

The agg as you have written it seems to be filtering publish_datetime buckets so that you only include those in the year 2021. To do that, you must nest the sub-agg under that particular terms aggregation.
Like so:
GET articles/_search
{
"query": {
"match_all": {}
},
"aggs": {
"word_counts": {
"terms": {
"field": "content"
}
},
"publish_datetime": {
"terms": {
"field": "publish_datetime"
},
"aggs": {
"word_counts_2021": {
"bucket_selector": {
"buckets_path": {
"pd": "publish_datetime"
},
"script": "LocalDateTime.parse(params.pd).getYear() == 2021"
}
}
}
}
}
}
But, if that field has a date time type, I would suggest simply filtering with a range query and then aggregating your documents.

Related

Elasticsearch, composite and sub(?) aggregations

I'm using a composite aggregation to scroll through the whole data set (it's like pagination).
Suppose I have car sales data.
For each day, I'd like to count the number of cars sold per car brand:
{
day1: {
honda: 3,
bmw: 5
},
day2: {
honda: 4,
audi: 1,
tesla:5
}
}
I'm doing something like the following but it doesn't work
GET _search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"date": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "1d"
},
"aggs": {
"car_brand": {
"terms": {
"field": "car_brands"
}
}
}
}
}
]
}
}
}
}
with error message
{
"error" : {
"root_cause" : [
{
"type" : "x_content_parse_exception",
"reason" : "[14:17] [composite] failed to parse field [sources]"
}
],
"type" : "x_content_parse_exception",
"reason" : "[14:17] [composite] failed to parse field [sources]",
"caused_by" : {
"type" : "illegal_state_exception",
"reason" : "expected value but got [FIELD_NAME]"
}
},
"status" : 400
}
Composite aggs cannot directly accept sub-aggs. Go with
GET _search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"date": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "1d"
}
}
},
{
"car_brand": {
"terms": {
"field": "car_brands"
}
}
}
]
}
}
}
}
instead.

I need to get average document count by date in elasticsearch

I want to get the average document count per date without retrieving the whole set of bucket data and computing the average by hand, because there are years of data and when I group by date I get a too_many_buckets_exception.
So my current query is
{
"query": {
"bool": {
"must": [],
"filter": []
}
},
"aggs": {
"groupByChannle": {
"terms": {
"field": "channel"
},
"aggs": {
"docs_per_day": {
"date_histogram": {
"field": "message_date",
"fixed_interval": "1d"
}
}
}
}
}
}
How can I get an average doc count grouped by message_date (day) and channel, without retrieving a buckets array like this one?
"buckets" : [
{
"key_as_string" : "2018-03-17 00:00:00",
"key" : 1521244800000,
"doc_count" : 4027
},
{
"key_as_string" : "2018-03-18 00:00:00",
"key" : 1521331200000,
"doc_count" : 10133
},
...thousands of rows
]
my index structure looks like this
"mappings" : {
"properties" : {
"channel" : {
"type" : "keyword"
},
"message" : {
"type" : "text"
},
"message_date" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
},
}
}
With this query, I want to get JUST AN AVERAGE DOC COUNT BY DATE and nothing else.
"avg_count": {
"avg_bucket": {
"buckets_path": "docs_per_day>_count"
}
}
Add the snippet above right after the end of the docs_per_day aggregation, as its sibling.
avg_count provides the average count.
_count refers to the document count of each docs_per_day bucket.
I think you can use a stats aggregation together with a bucket script:
{
"size": 0,
"aggs": {
"term": {
"terms": {
"field": "chanel"
},
"aggs": {
"stats": {
"stats": {
"field": "message_date"
}
},
"result": {
"bucket_script": {
"buckets_path": {
"max" : "stats.max",
"min" : "stats.min",
"count" : "stats.count"
},
"script": "params.count/((params.max - params.min)/1000/86400)"
}
}
}
}
}
}

Count the percentage of character fields

I want to calculate the percentage of the data in a specified field.
This is my RESTful API request:
Restful API:
GET _search
{
"_source": {
"includes": [ "FIRST_SWITCHED","LAST_SWITCHED","IPV4_DST_ADDR","L4_DST_PORT","IPV4_SRC_ADDR","L7_PROTO_NAME","IN_BYTES","IN_PKTS","OUT_BYTES","OUT_PKTS"]
},
"from" : 0, "size" : 10000,
"query": {
"bool": {
"must": [
{
"match" : { "_index" : "logstash-2017.12.22" }
},
{
"match_phrase":{"IPV4_SRC_ADDR":"192.168.0.159"}
},
{
"range" : {
"LAST_SWITCHED" : {
"gte" : 1513683600
}
}
}
]
}
},
"aggs": {
"IN_PKTS": {
"sum": {
"field": "IN_PKTS"
}
},
"IN_BYTES": {
"sum": {
"field": "IN_BYTES"
}
},
"OUT_BYTES": {
"sum": {
"field": "OUT_BYTES"
}
},
"OUT_PKTS": {
"sum": {
"field": "OUT_PKTS"
}
},
"percent":{
"significant_terms" : {
"field" : "L7_PROTO_NAME",
"percentage":{}
}},
"protocol" : {
"terms" : {
"field" : "PROTOCOL",
"include" : ["17", "6"]
}
},
"Using_port_count" : {
"cardinality" : {
"field" : "L4_SRC_PORT"
}
}
}
}
But there are some errors.
These are the error messages:
error messages:
"reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [L7_PROTO_NAME] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
Thank you in advance!
OK, I found the answer!
Just add .keyword to the field name, and then it runs:
"field" : "L7_PROTO_NAME.keyword"

ElasticSearch: Is min_doc_count supported for Metric Aggregations

I am new to Elastic Search and am trying to make a query with Metric aggregation for my docs. But when I add the field: min_doc_count=1 for my sum metric aggregation, I get an error:
`
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "[sum] unknown field [min_doc_count], parser not found"
}
],
"type": "illegal_argument_exception",
"reason": "[sum] unknown field [min_doc_count], parser not found"
},
"status": 400
}
`
What am I missing here?
`
{
"aggregations" : {
"myKey" : {
"sum" : {
"field" : "field1",
"min_doc_count": 1
}
}
}
}
`
I'm not sure why/where you have the sum keyword?
The idea of min_doc_count is to make sure buckets returned by a given aggs query contain at least N documents, the example below would only return subject buckets for subjects that appear in 10 or more documents.
GET _search
{
"aggs" : {
"docs_per_subject" : {
"terms" : {
"field" : "subject",
"min_doc_count": 10
}
}
}
}
So with that in mind, yours would refactor to the following... Although when setting min_doc_count to 1, it's not really necessary to keep the parameter at all.
GET _search
{
"aggs" : {
"docs_per_subject" : {
"terms" : {
"field" : "field1",
"min_doc_count": 1
}
}
}
}
If you wish to sum only non-zero values of field you can filter those zero-values out in a query section:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"field": {
"gt": 0
}
}
}
]
}
},
"aggregations": {
"myKey": {
"sum": {
"field": "field1"
}
}
}
}
See Bool Query and Range Term

Elasticsearch sort inside top_hits aggregation

I have an index of messages where I store messageHash for each message too. I also have many more fields along with them. There are multiple duplicate message fields in the index e.g. "Hello". I want to retrieve unique messages.
Here is the query I wrote to search for unique messages and sort them by date. That is, among all duplicates, I want the message with the latest date to be returned.
{
"query": {
"bool": {
"must": {
"match_phrase": {
"message": "Hello"
}
}
}
},
"sort": [
{
"date": {
"order": "desc"
}
}
],
"aggs": {
"top_messages": {
"terms": {
"field": "messageHash"
},
"aggs": {
"top_messages_hits": {
"top_hits": {
"sort": [
{
"date": {
"order": "desc"
}
},
"_score"
],
"size": 1
}
}
}
}
}
}
The problem is that it's not sorted by date. It's sorted by doc_count. I just get the sort values in the response, not the real sorted results. What's wrong? I'm now wondering if it is even possible to do it.
EDIT:
I tried substituting "terms" : { "field" : "messageHash", "order" : { "mydate" : "desc" } } , "aggs" : { "mydate" : { "max" : { "field" : "date" } } } for "terms": { "field": "messageHash" } but I get:
{
"error" : {
"root_cause" : [
{
"type" : "parsing_exception",
"reason" : "Found two sub aggregation definitions under [top_messages]",
"line" : 1,
"col" : 412
}
],
"type" : "parsing_exception",
"reason" : "Found two sub aggregation definitions under [top_messages]",
"line" : 1,
"col" : 412
},
"status" : 400
}

Resources