How to calculate average X per Y in Elasticsearch?

Let's say I have a list of events, like 'pageview'. I want to calculate the average pageviews per session.
My document looks like this:
{
  "sessionID": "xxx",
  "action": "pageview"
}
What I tried to do was first aggregate by sessionID and then apply an avg child aggregation, but the result is not what I expected.
I'm very new to Elasticsearch. What would be the logic to build such an aggregation in ES?
Thanks

You've started correctly by aggregating on the sessionID field. Next you need a filter sub-aggregation on the action field to match only pageview actions. Your aggregation query would look like this:
{
  "size": 0,
  "aggs": {
    "sessions": {
      "terms": {
        "field": "sessionID"
      },
      "aggs": {
        "pageviews": {
          "filter": {
            "term": {
              "action": "pageview"
            }
          }
        }
      }
    }
  }
}
This is going to give you the total doc_count for each of your sessions, and in each session bucket you'll get the doc_count for the pageview actions within that session.
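For illustration, the response would look roughly like this (the session key and counts are made-up values):
{
  "aggregations": {
    "sessions": {
      "buckets": [
        {
          "key": "xxx",
          "doc_count": 7,
          "pageviews": {
            "doc_count": 3
          }
        }
      ]
    }
  }
}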
The average can then easily be calculated with
response.aggregations.sessions.buckets.forEach(function(session) {
  var actionsInSession = session.doc_count;          // total actions in this session
  var pageviewActions = session.pageviews.doc_count; // pageview actions only
  var avg = pageviewActions / actionsInSession;
  // do something with the average value
});
UPDATE
If you're using (or willing to use) ES 2.0, you can get ES to calculate those averages for you using pipeline aggregations.
{
  "size": 0,
  "aggs": {
    "sessions": {
      "terms": {
        "field": "sessionID"
      },
      "aggs": {
        "total": {
          "value_count": {
            "field": "sessionID"
          }
        },
        "pageviews": {
          "filter": {
            "term": {
              "action": "pageview"
            }
          },
          "aggs": {
            "cnt": {
              "value_count": {
                "field": "action"
              }
            }
          }
        },
        "avg": {
          "bucket_script": {
            "buckets_path": {
              "total": "total",
              "pageviews": "pageviews>cnt"
            },
            "script": "pageviews / total"
          }
        }
      }
    }
  }
}
In each sessionID bucket, you'll get an avg value giving the ratio of pageview actions to total actions for that session.
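Note that on ES 5.x and later the default script language is Painless, where buckets_path variables are exposed through params, so the bucket_script part of the query above would become:
"avg": {
  "bucket_script": {
    "buckets_path": {
      "total": "total",
      "pageviews": "pageviews>cnt"
    },
    "script": "params.pageviews / params.total"
  }
}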

Related

How to do proportions in an Elasticsearch query

I have a field in my data that has four unique values across all the records. I have to aggregate the records by each unique value and find the proportion of each value in the data, essentially (number of records for each unique value / total number of records). Is there a way to do this with Elasticsearch dashboards? I have used a terms aggregation to aggregate the fields and applied the value_count metric aggregation to get the doc_count value, but I am not able to use a bucket script to do the division. I am getting the error "buckets_path must reference either a number value or a single value numeric metric aggregation, got: [StringTerms] at aggregation [latest_version]".
Below is my code:
{
  "size": 0,
  "aggs": {
    "BAR": {
      "date_histogram": {
        "field": "timestamp",
        "calendar_interval": "day"
      },
      "aggs": {
        "latest_version": {
          "filter": {
            "match_phrase": {
              "log": "main_filter"
            }
          },
          "aggs": {
            "latest_version_count": {
              "terms": {
                "field": "field_name"
              },
              "aggs": {
                "version_count": {
                  "value_count": {
                    "field": "field_name"
                  }
                }
              }
            },
            "sum_buckets": {
              "sum_bucket": {
                "buckets_path": "latest_version_count>_count"
              }
            }
          }
        },
        "BAR-percentage": {
          "bucket_script": {
            "buckets_path": {
              "eachVersionCount": "latest_version>latest_version_count",
              "totalVersionCount": "latest_version>sum_buckets"
            },
            "script": "params.eachVersionCount/params.totalVersionCount"
          }
        }
      }
    }
  }
}
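The error happens because eachVersionCount points at latest_version_count, which is a multi-bucket terms aggregation, while bucket_script can only reference single-value numeric metrics (or a bucket's _count). Since the field has only four unique values, one possible workaround (a sketch, where "value1" is a hypothetical stand-in for one of the four values) is to declare one filter aggregation per value, each of which exposes a single _count that a bucket_script can divide by the bucket total:
{
  "size": 0,
  "aggs": {
    "BAR": {
      "date_histogram": {
        "field": "timestamp",
        "calendar_interval": "day"
      },
      "aggs": {
        "total": {
          "value_count": { "field": "field_name" }
        },
        "value1_count": {
          "filter": { "term": { "field_name": "value1" } }
        },
        "value1_percentage": {
          "bucket_script": {
            "buckets_path": {
              "part": "value1_count>_count",
              "total": "total"
            },
            "script": "params.part / params.total"
          }
        }
      }
    }
  }
}
Repeat the filter/bucket_script pair for the other three values, or do the division client-side from the terms buckets and sum_buckets you already have.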

Get very large total result count from pipeline aggregation

I have a query that I'm executing on an event table, which finds all productIds for product events where the active field changed from one date to another. This query returns an extremely large dataset, which I plan to paginate using partitions.
In order to know how large my partitions should be, I need a total count of the docs returned by this query. However, if I run the query itself and return all of the docs, I unsurprisingly get a memory error (this occurs even if I use a filter to return just the count).
Is there a way to process and return just the total result count?
{
  "query": {
    "bool": {
      "should": [
        {
          "range": {
            "timeRange": { "gte": "2022-05-22T00:00:00.000Z", "lte": "2022-05-22T00:00:00.000Z" }
          }
        },
        {
          "range": {
            "timeRange": { "gte": "2022-05-01T00:00:00.000Z", "lte": "2022-05-01T00:00:00.000Z" }
          }
        }
      ]
    }
  },
  "version": true,
  "aggs": {
    "total_entities": {
      "stats_bucket": {
        "buckets_path": "group_by_entity_id>distinct_val_count"
      }
    },
    "group_by_entity_id": {
      "terms": {
        "field": "productId",
        "size": 500000
      },
      "aggs": {
        "distinct_val_count": {
          "cardinality": {
            "field": "active"
          }
        },
        "distinct_val_count_filter": {
          "bucket_selector": {
            "buckets_path": {
              "distinct_val_count": "distinct_val_count"
            },
            "script": "params.distinct_val_count > 1"
          }
        }
      }
    }
  }
}
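One way to bring back only the number you need (a sketch, assuming the index is called events): add "size": 0 so no hits are serialized, and use filter_path so that only the count from the stats_bucket survives in the response. The count field of total_entities is the number of productId buckets that passed the bucket_selector.
GET events/_search?filter_path=aggregations.total_entities.count
{
  "size": 0,        // return aggregations only, no hits
  "query": { ... }, // same bool/should query as above
  "aggs": { ... }   // same aggs as above
}
Note that a terms aggregation with "size": 500000 is itself memory-hungry; if even that fails, a composite aggregation lets you walk the productId buckets page by page instead.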

SQL to ES: how to limit, page, and order results of an aggregation

SELECT
  max(timestamp) AS first_time,
  min(timestamp) AS last_time,
  src_ip,
  threat_target,
  count(*) AS count
FROM traffic
GROUP BY src_ip, threat_target
ORDER BY first_time DESC
LIMIT 0, 10
I want to get this result, but I don't know how to apply the limit size or where to put the sort.
{
  "size": 0,
  "aggregations": {
    "src_ip": {
      "terms": {
        "field": "src_ip.keyword"
      },
      "aggregations": {
        "threat_target": {
          "terms": {
            "field": "threat_target.keyword"
          },
          "aggregations": {
            "last_time": {
              "max": {
                "field": "timestamp"
              }
            },
            "first_time": {
              "min": {
                "field": "timestamp"
              }
            }
          }
        }
      }
    }
  }
}
Aggregation pagination is generally not supported in Elasticsearch; however, the composite aggregation provides a way to paginate your aggregation:
Unlike the other multi-bucket aggregations, the composite aggregation can be used to paginate all buckets from a multi-level aggregation efficiently.
(Excerpt from the composite aggregation ES docs.)
Except for "ORDER BY first_time desc", the query below should work for you. I don't think ordering on any fields other than the grouping fields (src_ip, threat_target) is possible.
GET traffic/_search
{
  "size": 0,
  "aggs": {
    "my_bucket": {
      "composite": {
        "size": 2, // <=========== PAGE SIZE
        /* "after": { // <========== INCLUDE THIS from the second request onwards, passing the after_key of the previous response here for the next page
          "src_ip": "1.2.3.5",
          "threat_target": "T3"
        }, */
        "sources": [
          {
            "src_ip": {
              "terms": {
                "field": "src_ip.keyword",
                "order": "desc"
              }
            }
          },
          {
            "threat_target": {
              "terms": {
                "field": "threat_target.keyword"
              }
            }
          }
        ]
      },
      "aggs": {
        "first_time": {
          "max": {
            "field": "timestamp"
          }
        }
      }
    }
  }
}
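The response then contains an after_key object; to fetch the next page, the same request is sent again with that object passed back verbatim (values here are hypothetical):
"composite": {
  "size": 2,
  "after": {
    "src_ip": "1.2.3.5",
    "threat_target": "T3"
  },
  "sources": [ ... ] // unchanged from the first request
}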

Can I sort grouped search result by formula?

I am trying to implement a query which will sort aggregated results by a formula.
For example, we have the following entities:
{
  "price": "1000",
  "zip": "77777",
  "field1": "1",
  "field2": "5"
},
{
  "price": "2222",
  "zip": "77777",
  "field1": "2",
  "field2": "5"
},
{
  "price": "1111",
  "zip": "77777",
  "field1": "1",
  "field2": "5"
}
Now, my query without sorting looks like:
POST /entities/_search
{
  "size": 0,
  "query": {
    "term": {
      "zip": {
        "value": "77777"
      }
    }
  },
  "aggs": {
    "my composite": {
      "composite": {
        "size": 500,
        "sources": [
          {
            "field1_term": {
              "terms": {
                "field": "field1"
              }
            }
          },
          {
            "field2_term": {
              "terms": {
                "field": "field2"
              }
            }
          }
        ]
      },
      "aggs": {
        "avg_price_per_group": {
          "avg": {
            "field": "price"
          }
        },
        "results_per_group": {
          "top_hits": {
            "size": 100,
            "_source": {
              "include": ["entity_id", "price"]
            }
          }
        }
      }
    }
  }
}
First I need to group results by field1 and field2 and then calculate the average price for each group.
Then I need to divide the price of each doc by the average price and sort the documents based on this value.
Is it possible to do this somehow?
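Dividing each hit's price by its group's average inside a single aggregation isn't something the query above can express, but a common two-pass workaround (a sketch, not a confirmed solution) is to first run the composite query to collect avg_price_per_group for each bucket, then issue one search per group that sorts with a script, passing the group's average in as a parameter. The 1444.33 below is a hypothetical average for the field1=1, field2=5 group, and price is assumed to be mapped as a numeric type:
POST /entities/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "zip": "77777" } },
        { "term": { "field1": "1" } },
        { "term": { "field2": "5" } }
      ]
    }
  },
  "sort": {
    "_script": {
      "type": "number",
      "order": "desc",
      "script": {
        "source": "doc['price'].value / params.avgPrice",
        "params": { "avgPrice": 1444.33 } // group average from the first pass
      }
    }
  }
}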

Compute the "fill rate" of a field in Elasticsearch

I would like to compute the ratio of documents that have a value for a given field in my index.
I managed to count how many documents miss the field:
GET profiles/_search
{
  "aggs": {
    "profiles_wo_country": {
      "missing": {
        "field": "country"
      }
    }
  },
  "size": 0
}
I also managed to count how many documents have the field:
GET profiles/_search
{
  "query": {
    "filtered": {
      "query": { "match_all": {} },
      "filter": {
        "exists": {
          "field": "country"
        }
      }
    }
  },
  "size": 0
}
Naturally I can also get the total number of documents in the index. How can I compute the ratio?
An easy way to get the numbers you need out of a single query is the following:
POST profiles/_search?filter_path=hits.total,aggregations.existing.doc_count
{
  "size": 0,
  "aggs": {
    "existing": {
      "filter": {
        "exists": {
          "field": "country"
        }
      }
    }
  }
}
You'll get a response like this one:
{
  "hits": {
    "total": 37258601
  },
  "aggregations": {
    "existing": {
      "doc_count": 9287160
    }
  }
}
And then in your client code, you can simply do
fill_rate = (aggregations.existing.doc_count / hits.total) * 100
And you're good to go.
