Get very large total result count from pipeline aggregation - elasticsearch

I have a query that I'm executing on an event table, which finds all productIds for product events where the active field changed from one date to another. This query returns an extremely large dataset, which I plan to paginate using partitions.
In order to know how large my partitions should be, I need a total count of docs returned by this query. However, if I run the query itself and return all of the docs, I unsurprisingly get a memory error (this occurs even if I use filter to return just the count).
Is there a way to process and return just the total result count?
{
"query": {
"bool": {
"should": [{
"range": {
"timeRange": { "gte": "2022-05-22T00:00:00.000Z", "lte": "2022-05-22T00:00:00.000Z" }
}
}, {
"range": {
"timeRange": { "gte": "2022-05-01T00:00:00.000Z", "lte": "2022-05-01T00:00:00.000Z" }
}
}
]
}
},
"version": true,
"aggs": {
"total_entities": {
"stats_bucket": {
"buckets_path": "group_by_entity_id>distinct_val_count"
}
},
"group_by_entity_id": {
"terms": {
"field": "productId",
"size": 500000
},
"aggs": {
"distinct_val_count": {
"cardinality": {
"field": "active"
}
},
"distinct_val_count_filter": {
"bucket_selector": {
"buckets_path": {
"distinct_val_count": "distinct_val_count"
},
"script": "params.distinct_val_count > 1"
}
}
}
}
}
}

Related

Use distinct field for count with significant_terms in Elastic Search

Is there a way to get the significant_terms aggregation to use document counts based on a distinct field?
I have an index with posts and their hashtags but they are from multiple sources so there will be multiple ones with the same permalink field but I only want to count unique permalinks per each hashtag. I have managed to get the unique totals using the cardinality aggregation: (i.e. "cardinality": { "field": "permalink.keyword" }) but can't work out how to do this with the Significant terms aggregation. My query is as follows:
GET /posts-index/_search
{
"aggregations": {
"significant_hashtag": {
"significant_terms": {
"background_filter": {
"bool": {
"filter": [
{
"range": {
"created": {
"gte": 1656414622,
"lte": 1656630000
}
}
}
]
}
},
"field": "hashtag.keyword",
"mutual_information": {
"background_is_superset": false,
"include_negatives": true
},
"size": 100
}
}
},
"query": {
"bool": {
"filter": [
{
"range": {
"created": {
"gte": 1656630000,
"lte": 1659308400
}
}
}
]
}
},
"size": 0
}

Elasticsearch Pagination with timestamp range

Elasticsearch official documentation introduce that elasticsearch can realize pagination by composite aggregations.
The composite aggregation will fetch data many times to get all results.
So my question is, Can I use range from now-1h to now when I execute composite aggregation?
If I can. How to composite aggregation query keep source data unchanging when every range query have different now.
If I can't. My query below has no error and the result seems to be right.
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": "now-1h"
}
}
}
]
}
},
"aggs": {
"user_device": {
"composite": {
"after": {
"user_name": "alen.lv"
},
"size": 100,
"sources": [
{
"user_name": {
"terms": {
"field": "user_name"
}
}
}
]
},
"aggs": {
"user_mac": {
"terms": {
"field": "user_mac",
"size": 1000
}
}
}
}
}
}

ElasticSearch: Filter by distinct count during aggregation

The following query returns distinct Ids in order by largest distinct count of Ids. What I would like to do is "include only those IDs for which total number of documents is less than 2000"
{
"size": "0",
"query": {
"range": {
"#timestamp": {
"gte": "2020-10-20T00:00:00",
"lt": "2020-10-21T00:00:00"
}
}
},
"aggs": {
"ids": {
"terms": {
"field": "Id.keyword",
"size": 1000
}
}
}
}
I tried adding filter by 'doc_count' but that didn't help. How do I do this?
You can filter the buckets using bucket_selector aggregation
Bucket Selector Aggregation is a parent pipeline aggregation which executes a script which determines
whether the current bucket will be retained in the parent multi-bucket
aggregation.
{
"size": "0",
"query": {
"range": {
"#timestamp": {
"gte": "2020-10-20T00:00:00",
"lt": "2020-10-21T00:00:00"
}
}
},
"aggs": {
"ids": {
"terms": {
"field": "Id.keyword",
"size": 1000
},
"aggs": {
"count_filter": {
"bucket_selector": {
"buckets_path": {
"values": "_count"
},
"script": "params.values < 2000" <-- note this
}
}
}
}
}
}

Need aggregation of only the query results

I need to do an aggregation but only with the limited results I get from the query, but it is not working, it returns other results outside the size limit of the query. Here is the query I am doing
{
"size": 500,
"query": {
"bool": {
"must": [
{
"term": {
"tags.keyword": "possiblePurchase"
}
},
{
"term": {
"clientName": "Ci"
}
},
{
"range": {
"firstSeenDate": {
"gte": "now-30d"
}
}
}
],
"must_not": [
{
"term": {
"tags.keyword": "skipPurchase"
}
}
]
}
},
"sort": [
{
"firstSeenDate": {
"order": "desc"
}
}
],
"aggs": {
"byClient": {
"terms": {
"field": "clientName",
"size": 25
},
"aggs": {
"byTarget": {
"terms": {
"field": "targetName",
"size": 6
},
"aggs": {
"byId": {
"terms": {
"field": "id",
"size": 5
}
}
}
}
}
}
}
}
I need the aggregations to only consider the first 500 results of the query, sorted by the field I am requesting on the query. I am completely lost. Thanks for the help
Scope of the aggregation is the number of hits of your query, the size parameter is only used to specify the number of hits to fetch and display.
If you want to restrict the scope of the aggregation on the first n hits of a query, I would suggest the sampler aggregation in combination with your query

Query elasticsearch with multiple numeric ranges

{
"query": {
"filtered": {
"query": {
"match": {
"log_path": "message_notification.log"
}
},
"filter": {
"numeric_range": {
"time_taken": {
"gte": 10
}
}
}
}
},
"aggs": {
"distinct_user_ids": {
"cardinality": {
"field": "user_id"
}
}
}
}
I have to run this query 20 times as i want to know notification times above each of the following thresholds- [10,30,60,120,240,300,600,1200..]. Right now, i am running a loop and making 20 queries for fetching this.
Is there a more sane way to query elasticsearch once and get ranges that fall into these thresholds respectively?
What you probably want is a "range aggregation".
Here is the possible query where you can add more range or alter them -
{
"size": 0,
"query": {
"match": {
"log_path": "message_notification.log"
}
},
"aggs": {
"intervals": {
"range": {
"field": "time_taken",
"ranges": [
{
"to": 50
},
{
"from": 50,
"to": 100
},
{
"from": 100
}
]
},
"aggs": {
"distinct_user_ids": {
"cardinality": {
"field": "user_id"
}
}
}
}
}
}

Resources