limiting date histogram to a date range without affecting results - elasticsearch

I want to limit the results to a date range while performing the date histogram. But it seems to affect the results set (hits). Is there any way that I can do the same, but not affect the hits area?

Filter aggregation would be an ideal match here.
{
  "query": {
    "match": {
      "Content": "my query"
    }
  },
  "aggs": {
    "filterByDate": {
      "filter": {
        "range": {
          "<dateField>": {
            "gte": "<StartDate>",
            "lt": "<EndDate>"
          }
        }
      },
      "aggs": {
        "dateStats": {
          "date_histogram": {
            "field": "<dateField>",
            "interval": "<interval>"
          }
        }
      }
    }
  }
}

Related

Use distinct field for count with significant_terms in Elastic Search

Is there a way to get the significant_terms aggregation to use document counts based on a distinct field?
I have an index with posts and their hashtags, but the posts come from multiple sources, so there will be multiple documents with the same permalink field, and I only want to count unique permalinks for each hashtag. I have managed to get the unique totals using the cardinality aggregation (i.e. "cardinality": { "field": "permalink.keyword" }), but I can't work out how to do this with the significant_terms aggregation. My query is as follows:
GET /posts-index/_search
{
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        { "range": { "created": { "gte": 1656630000, "lte": 1659308400 } } }
      ]
    }
  },
  "aggregations": {
    "significant_hashtag": {
      "significant_terms": {
        "field": "hashtag.keyword",
        "size": 100,
        "background_filter": {
          "bool": {
            "filter": [
              { "range": { "created": { "gte": 1656414622, "lte": 1656630000 } } }
            ]
          }
        },
        "mutual_information": {
          "background_is_superset": false,
          "include_negatives": true
        }
      }
    }
  }
}

Get very large total result count from pipeline aggregation

I have a query that I'm executing on an event table, which finds all productIds for product events where the active field changed from one date to another. This query returns an extremely large dataset, which I plan to paginate using partitions.
In order to know how large my partitions should be, I need a total count of docs returned by this query. However, if I run the query itself and return all of the docs, I unsurprisingly get a memory error (this occurs even if I use filter to return just the count).
Is there a way to process and return just the total result count?
{
  "query": {
    "bool": {
      "should": [
        {
          "range": {
            "timeRange": { "gte": "2022-05-22T00:00:00.000Z", "lte": "2022-05-22T00:00:00.000Z" }
          }
        },
        {
          "range": {
            "timeRange": { "gte": "2022-05-01T00:00:00.000Z", "lte": "2022-05-01T00:00:00.000Z" }
          }
        }
      ]
    }
  },
  "version": true,
  "aggs": {
    "total_entities": {
      "stats_bucket": {
        "buckets_path": "group_by_entity_id>distinct_val_count"
      }
    },
    "group_by_entity_id": {
      "terms": {
        "field": "productId",
        "size": 500000
      },
      "aggs": {
        "distinct_val_count": {
          "cardinality": {
            "field": "active"
          }
        },
        "distinct_val_count_filter": {
          "bucket_selector": {
            "buckets_path": {
              "distinct_val_count": "distinct_val_count"
            },
            "script": "params.distinct_val_count > 1"
          }
        }
      }
    }
  }
}

Difference between the result of two date fields then getting average

I am looking to get the average of the difference between two different fields in an Elastic DB. I have been able to write a query that returns the last 1000 results; however, I am not sure how to compute the difference for each result and then get an overall average.
Elastic query below:
POST my_index/_search
{
  "size": 1000,
  "_source": [
    "date.time.received",
    "date.time.sent"
  ],
  "query": {
    "bool": {
      "must": [
        { "range": { "date.time.received": { "gte": "2019-06-19" } } },
        { "range": { "date.time.sent": { "gte": "2019-06-19" } } }
      ]
    }
  }
}
I am using average aggregation and script
POST testindex5/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "date.time.received": {
              "gte": "2019-06-19"
            }
          }
        },
        {
          "range": {
            "date.time.sent": {
              "gte": "2019-06-19"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "avg_resp": {
      "avg": {
        "script": "(doc['date.time.received'].value.toInstant().toEpochMilli()- doc['date.time.sent'].value.toInstant().toEpochMilli())/1000/86400 /* convert to days */"
      }
    }
  }
}

Elasticsearch: Group by timeframe

I made this query to get the number of requests a user made in the last month (or day) compared to the rest of the users.
{
  "query": {
    "bool": {
      "must": [
        { "range": { "created": { "gte": 1554854400000 } } }
      ]
    }
  },
  "aggs": {
    "requests": {
      "filters": {
        "other_bucket_key": "all",
        "filters": {
          "user": { "match": { "user_id": "XXXXXX" } }
        }
      }
    }
  }
}
These are all the requests made in the selected period of time.
Now, I want to get the number of requests / day the user made in the last month compared to the rest of the users.
I was able to obtain this using Date Histogram Aggregation for the total number of requests made but I can't figure out how to split that into user and the rest.
I don't know if that's possible or maybe there's another way of doing this.
You're on the right path, you simply need to combine the date_histogram daily aggregation and the filters aggregation you already have:
{
  "query": {
    "bool": {
      "must": [
        { "range": { "created": { "gte": 1554854400000 } } }
      ]
    }
  },
  "aggs": {
    "per_day": {
      "date_histogram": { "field": "created", "interval": "day" },
      "aggs": {
        "requests": {
          "filters": {
            "other_bucket_key": "all",
            "filters": {
              "user": { "match": { "user_id": "XXXXXX" } }
            }
          }
        }
      }
    }
  }
}
For each day, you're going to get the number of requests made by the user vs the number of requests for all other users.

How to limit a date histogram aggregation of nested documents to a specific date range?

Version
Using Elasticsearch 1.7.2
Objective
I would like to create a graph of the number of predictions made by users per day for the last n days. In this case, 10 days.
Current query
{
  "size": 0,
  "aggs": {
    "predictions": {
      "nested": { "path": "user_answers" },
      "aggs": {
        "predictions_over_time": {
          "date_histogram": {
            "field": "user_answers.created",
            "interval": "day",
            "format": "yyyy-MM-dd",
            "min_doc_count": 0
          }
        }
      }
    }
  }
}
Issue
This query will return a histogram but will return buckets for all available dates across all documents. It doesn't restrict to a specific date range.
What have I tried?
I've tried a number of approaches to solving this, all of which have failed.
* Range filter, then histogram that
* Date range aggregation, then histogram the buckets
* Using extended_bounds with full dates, now-10d, and also timestamps
* Trying a range filter inside the histogram aggregation
Any guidance would be appreciated! Thanks.
query didn't work for me in that situation, what I used is a third aggs:
{
  "size": 0,
  "aggs": {
    "user_answers": {
      "nested": { "path": "user_answers" },
      "aggs": {
        "timed_user_answers": {
          "filter": {
            "range": {
              "user_answers.created": {
                "gte": "now-10d",
                "lte": "now"
              }
            }
          },
          "aggs": {
            "predictions_over_time": {
              "date_histogram": {
                "field": "user_answers.created",
                "interval": "day",
                "format": "yyyy-MM-dd",
                "min_doc_count": 0
              }
            }
          }
        }
      }
    }
  }
}
One aggs specifies nested, one specifies filter, and the last specifies the actual aggregation. Don't know why this syntax makes sense, but you seem to not be able to use two on the same aggs.
You need to add a query. It can be any kind of query, but it must not be a post_filter. It should be nested and contain the date range. One way is to define a constant_score query: inside it, use a nested filter that wraps a range filter.
{
  "query": {
    "constant_score": {
      "filter": {
        "nested": {
          "path": "user_answers",
          "filter": {
            "range": {
              "user_answers.created": {
                "gte": "now-10d",
                "lte": "now"
              }
            }
          }
        }
      }
    }
  }
}
Confirm if this works for you.

Resources