Converting SQL query to ElasticSearch Query - caching

I want to convert the following SQL query to an Elasticsearch one. Can anyone help with this?
select csgg, sum(amount) from table1
where type in ('a','b','c') and year=2016 and fc="33" group by csgg having sum(amount)=0
I tried the following way:
{
"size": 500,
"query" : {
"constant_score" : {
"filter" : {
"bool" : {
"must" : [
{"term" : {"fc" : "33"}},
{"term" : {"year" : 2016}}
],
"should" : [
{"terms" : {"type" : ["a","b","c"] }}
]
}
}
}
},
"aggs": {
"group_by_csgg": {
"terms": {
"field": "csgg"
},
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
}
}
}
}
}
But I am not sure if I am doing it right, as the results do not validate.
It seems the query needs to be added inside the aggregation.

Assuming that you use Elasticsearch 2.x, there is a possibility to have the having-semantics in Elasticsearch.
I'm not aware of a possibility prior to 2.0.
You can use the new pipeline aggregation, the Bucket Selector Aggregation, which selects only the buckets that meet a certain criterion:
POST test/test/_search
{
"size": 0,
"query" : {
"constant_score" : {
"filter" : {
"bool" : {
"must" : [
{"term" : {"fc" : "33"}},
{"term" : {"year" : 2016}},
{"terms" : {"type" : ["a","b","c"] }}
]
}
}
}
},
"aggs": {
"group_by_csgg": {
"terms": {
"field": "csgg",
"size": 100
},
"aggs": {
"sum_amount": {
"sum": {
"field": "amount"
}
},
"no_amount_filter": {
"bucket_selector": {
"buckets_path": {"sumAmount": "sum_amount"},
"script": "sumAmount == 0"
}
}
}
}
}
}
However, there are two caveats. Depending on your configuration, it might be necessary to enable scripting like this:
script.aggs: true
script.groovy: true
Moreover, because it works on the parent buckets, it is not guaranteed that you get all buckets with amount = 0. If the terms aggregation returns only terms whose summed amount is != 0, you will get no results.

Related

I need to get average document count by date in elasticsearch

I want to get the average document count by date without fetching the whole set of buckets and computing the average by hand, because there are years of data and when I group by date I get a too_many_buckets_exception.
So my current query is
{
"query": {
"bool": {
"must": [],
"filter": []
}
},
"aggs": {
"groupByChannle": {
"terms": {
"field": "channel"
},
"aggs": {
"docs_per_day": {
"date_histogram": {
"field": "message_date",
"fixed_interval": "1d"
}
}
}
}
}
}
How can I get an average doc count grouped by message_date (day) and channel without retrieving the buckets array for this data:
"buckets" : [
{
"key_as_string" : "2018-03-17 00:00:00",
"key" : 1521244800000,
"doc_count" : 4027
},
{
"key_as_string" : "2018-03-18 00:00:00",
"key" : 1521331200000,
"doc_count" : 10133
},
...thousands of rows
]
my index structure looks like this
"mappings" : {
"properties" : {
"channel" : {
"type" : "keyword"
},
"message" : {
"type" : "text"
},
"message_date" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
},
}
}
With this query, I want to get JUST AN AVERAGE DOC COUNT BY DATE and nothing else.
"avg_count": {
"avg_bucket": {
"buckets_path": "docs_per_day>_count"
}
}
Add the snippet above right after the end of the docs_per_day aggregation.
avg_count provides the average count.
_count refers to the document count of each bucket.
I think you can use a stats aggregation together with a bucket script:
{
"size": 0,
"aggs": {
"term": {
"terms": {
"field": "channel"
},
"aggs": {
"stats": {
"stats": {
"field": "message_date"
}
},
"result": {
"bucket_script": {
"buckets_path": {
"max" : "stats.max",
"min" : "stats.min",
"count" : "stats.count"
},
"script": "params.count/((params.max - params.min)/1000/86400)"
}
}
}
}
}
}

ElasticSearch Query aggregations per #timestamp hour

I'm making a query on Elasticsearch, against Metricbeat data, to rank the most-used processes per hour. At the moment I'm aggregating by process start time and process name; I need to "divide" these groups hourly using the field "#timestamp".
This is my current query:
GET metricbeat*/_search?
{"query": {
"bool": {
"must": [
{ "wildcard" : { "beat.hostname" : "ibmcx*" }},
{ "range": {
"#timestamp": {
"gte": "2019-03-22T00:00:00",
"lte": "2019-03-23T00:00:00"}}},
{"terms" : { "beat.hostname" : ["ibmcxapp101", "ibmcxapp102", "ibmcxapp103",
"ibmcxapp104", "ibmcxapp105", "ibmcxapp106", "ibmcxapp107",
"ibmcxapp108", "ibmcxapp109", "ibmcxapp110", "ibmcxapp111",
"ibmcxapp112", "ibmcxapp113", "ibmcxapp114", "ibmcxapp115",
"ibmcxapp116", "ibmcxapp117", "ibmcxapp118", "ibmcxapp119",
"ibmcxapp120", "ibmcxapp121", "ibmcxapp122", "ibmcxxaa100",
"ibmcxxaa101", "ibmcxxaa102", "ibmcxxaa103", "ibmcxxaa104",
"ibmcxxaa105", "ibmcxxaa106", "ibmcxxaa107", "ibmcxxaa108",
"ibmcxxaa109", "ibmcxxaa110", "ibmcxxaa111", "ibmcxxaa112",
"ibmcxxaa201", "ibmcxxaa202", "ibmcxxaa203", "ibmcxxaa204"
] }},
{"exists": {"field": "system.process.cmdline"}}
],
"must_not": [
{"term" : { "system.process.username" : "NT AUTHORITY\\SYSTEM" }},
{"term" : { "system.process.username" : "NT AUTHORITY\\NETWORK SERVICE" }},
{"term" : { "system.process.username" : "NT AUTHORITY\\LOCAL SERVICE" }},
{"term" : { "system.process.username" : "NT AUTHORITY\\Servicio de red"}},
{"term" : { "system.process.username" : "" }}
]
}
},
"size": 0,
"aggs": {
"group_by_start_time": {
"terms": {
"field": "system.process.cpu.start_time"
},
"aggs": {
"group_by_name": {
"terms": {
"field": "system.process.name.keyword"
}
}
}
}
},
"size": 0,
"sort" : [
{ "system.process.cpu.start_time" : {"order" : "asc"}},
{ "#timestamp" : {"order" : "asc"}},
{ "system.process.pid" : {"order" : "desc"}}
]}
It's a bit hard to follow and reproduce — a minimal example (I think the entire query is not really needed) and sample docs would go a long way.
If you want to have an hourly aggregation, the first thing you'll need to do is that aggregation and then run the others inside.
The minimal example for an hourly aggregation would be:
POST /metricbeat*/_search?size=0
{
"aggs" : {
"metrics_per_hour" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
}
}
}
}
Folding in the other aggregation would look like this:
POST /metricbeat*/_search?size=0
{
"aggs" : {
"metrics_per_hour" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs" : {
...
}
}
}
}
PS: If you are using a daily index pattern, you could just use the right day instead of the wildcard one and then skip this part of the query:
"range": {
"#timestamp": {
"gte": "2019-03-22T00:00:00",
"lte": "2019-03-23T00:00:00"
}
}

Count the percentage of character fields

I want to compute the percentage of each value of a specified field.
This is my RESTful API request:
Restful API:
GET _search
{
"_source": {
"includes": [ "FIRST_SWITCHED","LAST_SWITCHED","IPV4_DST_ADDR","L4_DST_PORT","IPV4_SRC_ADDR","L7_PROTO_NAME","IN_BYTES","IN_PKTS","OUT_BYTES","OUT_PKTS"]
},
"from" : 0, "size" : 10000,
"query": {
"bool": {
"must": [
{
"match" : { "_index" : "logstash-2017.12.22" }
},
{
"match_phrase":{"IPV4_SRC_ADDR":"192.168.0.159"}
},
{
"range" : {
"LAST_SWITCHED" : {
"gte" : 1513683600
}
}
}
]
}
},
"aggs": {
"IN_PKTS": {
"sum": {
"field": "IN_PKTS"
}
},
"IN_BYTES": {
"sum": {
"field": "IN_BYTES"
}
},
"OUT_BYTES": {
"sum": {
"field": "OUT_BYTES"
}
},
"OUT_PKTS": {
"sum": {
"field": "OUT_PKTS"
}
},
"percent":{
"significant_terms" : {
"field" : "L7_PROTO_NAME",
"percentage":{}
}},
"protocol" : {
"terms" : {
"field" : "PROTOCOL",
"include" : ["17", "6"]
}
},
"Using_port_count" : {
"cardinality" : {
"field" : "L4_SRC_PORT"
}
}
}
}
But there are some errors.
These are the error messages:
error messages:
"reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [L7_PROTO_NAME] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
thank you in advance!
OK, I found the answer!
Just add .keyword to the field name here and then it runs:
"field" : "L7_PROTO_NAME.keyword"

Elasticsearch Filter Query

I am using Elasticsearch 1.5.2. I stored some products with a field named "allergic" and some others without this field. The values of this field can be fish, milk, nuts, etc. I want to make a query that returns only products which do not have the "allergic" field at all, and to integrate this into another aggregation query. I want to make just one query: first eliminate products which have the "allergic" field and then execute the aggregation query of the second block.
How to integrate this :
{
"constant_score" : {
"filter" : {
"missing" : { "field" : "allergic" }
}
}
}
to this aggregation query:
POST tes1/_search?search_type=count
{
"aggs" : {
"fruits" : {
"filter" : {
"query":{
"query_string": {
"query": "Fruits",
"fields": [
"category"
]
}
}},
"aggs" : {
"minprice": {
"top_hits": {
"sort": [
{
"prix en €/kg": {
"order": "asc"
}
}
], "size":400
}
}
}
}} }
You need to add the query part before the aggregation call. This will filter the results and then run aggregation on the resultset.
POST tes1/_search
{
"_source": false,
"size": 1000,
"query":
{ "constant_score" : {
"filter" : {
"missing" : { "field" : "allergic" }
}
}
},
"aggs" : {
"fruits" : {
"filter" : {
"query":{
"query_string": {
"query": "Fruits",
"fields": [
"category"
]
}
}},
"aggs" : {
"minprice": {
"top_hits": {
"sort": [
{
"prix en €/kg": {
"order": "asc"
}
}
], "size":400
}
}
}
}} }
On a side note please consider upgrading ElasticSearch to the latest version as 1.x is no longer supported.

Post filter on subaggregation in elasticsearch

I am trying to run a post filter on the aggregated data, but it is not working as I expected. Can someone review my query and tell me if I am doing anything wrong here?
"query" : {
"bool" : {
"must" : {
"range" : {
"versionDate" : {
"from" : null,
"to" : "2016-04-22T23:13:50.000Z",
"include_lower" : false,
"include_upper" : true
}
}
}
}
},
"aggregations" : {
"associations" : {
"terms" : {
"field" : "association.id",
"size" : 0,
"order" : {
"_term" : "asc"
}
},
"aggregations" : {
"top" : {
"top_hits" : {
"from" : 0,
"size" : 1,
"_source" : {
"includes" : [ ],
"excludes" : [ ]
},
"sort" : [ {
"versionDate" : {
"order" : "desc"
}
} ]
}
},
"disabledDate" : {
"filter" : {
"missing" : {
"field" : "disabledDate"
}
}
}
}
}
}
}
STEPS in the query:
Filter by indexDate less than or equal to a given date.
Aggregate based on formId. Forming buckets per formId.
Sort in descending order and return top hit result per bucket.
Run a subaggregation filter after the sort subaggregation and remove all the documents from buckets where disabled date is not null.(Which is not working)
The whole purpose of post_filter is to run after aggregations have been computed. As such, post_filter has no effect whatsoever on aggregation results.
What you can do in your case is to apply a top-level filter aggregation so that documents with no disabledDate are not taken into account in aggregations, i.e. consider only documents with disabledDate.
{
"query": {
"bool": {
"must": {
"range": {
"versionDate": {
"from": null,
"to": "2016-04-22T23:13:50.000Z",
"include_lower": true,
"include_upper": true
}
}
}
}
},
"aggregations": {
"with_disabled": {
"filter": {
"exists": {
"field": "disabledDate"
}
},
"aggs": {
"form.id": {
"terms": {
"field": "form.id",
"size": 0
},
"aggregations": {
"top": {
"top_hits": {
"size": 1,
"_source": {
"includes": [],
"excludes": []
},
"sort": [
{
"versionDate": {
"order": "desc"
}
}
]
}
}
}
}
}
}
}
}

Resources