Elastic get documents with min/max and last value - elasticsearch

Let's say I have an elastic index with the following data:
{"var1": 14, "time": "2019-02-12T13:01:00.000Z"}
{"var2": 1423, "time": "2019-02-12T13:01:02.000Z"}
{"var3": 114, "time": "2019-02-12T13:01:03.000Z"}
{"var2": 214, "time": "2019-02-12T13:01:04.000Z"}
{"var3": 414, "time": "2019-02-12T13:01:05.000Z"}
{"var1": 124, "time": "2019-02-12T13:01:06.000Z"}
{"var2": 914, "time": "2019-02-12T13:01:07.000Z"}
{"var3": 8614, "time": "2019-02-12T13:01:06.000Z"}
{"var2": 74, "time": "2019-02-12T13:01:07.000Z"}
{"var3": 174, "time": "2019-02-12T13:01:08.000Z"}
{"var4": 144, "time": "2019-02-12T13:01:09.000Z"}
{"var4": 714, "time": "2019-02-12T13:01:10.000Z"}
{"var4": 813, "time": "2019-02-12T13:01:11.000Z"}
{"var2": 65, "time": "2019-02-12T13:01:12.000Z"}
{"var1": 321, "time": "2019-02-12T13:01:13.000Z"}
I would like to write ONE query that can retrieve the minimum of a variable, the maximum of a variable and the last n values of a variable in a given time interval.
It is important that I get the actual document that holds the min, the max, or the last value (this is why I'm using top_hits for the min and max instead of the min or max aggregations).
So far I have this query:
{
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "time": {
              "gte": "2019-02-12T13:01:00.000Z",
              "lt": "2019-02-12T13:01:15.000Z"
            }
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "max_var1": {
      "top_hits": {
        "size": 1,
        "sort": [
          { "var1": { "order": "desc" } }
        ]
      }
    },
    "min_var2": {
      "top_hits": {
        "size": 1,
        "sort": [
          { "var2": { "order": "asc" } }
        ]
      }
    },
    "last_var4": {
      "top_hits": {
        "size": 3,
        "sort": [
          { "time": { "order": "desc" } }
        ],
        "_source": ["var4"]
      }
    }
  }
}
The query correctly returns the min and the max, but it doesn't return the correct last 3 values for var4, because it takes the last documents from all documents in the given time interval, not only those that actually contain var4.
So the question is how to get the last n documents for a given variable inside this query.
I know I could use the multi search API to execute several queries at once, but I would like to know if it is possible to have it in one query.
Thanks.

Filtered aggregation to the rescue. Simply make sure to constrain the last_var4 aggregation to only those docs that actually have the field var4.
{
  ...
  "aggs": {
    "last_var4": {
      "filter": {
        "bool": {
          "filter": {
            "exists": {
              "field": "var4"
            }
          }
        }
      },
      "aggs": {
        "last_var4": {
          "top_hits": {
            "size": 3,
            "sort": [
              {
                "time": {
                  "order": "desc"
                }
              }
            ],
            "_source": [
              "var4"
            ]
          }
        }
      }
    }
  }
}
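For reference, a minimal sketch of the full combined request with this fix applied (same index, fields, and time range as in the question; note that a bare exists query also works as the filter, without the bool wrapper):
POST /myindex/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "time": {
              "gte": "2019-02-12T13:01:00.000Z",
              "lt": "2019-02-12T13:01:15.000Z"
            }
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "max_var1": {
      "top_hits": {
        "size": 1,
        "sort": [{ "var1": { "order": "desc" } }]
      }
    },
    "min_var2": {
      "top_hits": {
        "size": 1,
        "sort": [{ "var2": { "order": "asc" } }]
      }
    },
    "last_var4": {
      "filter": {
        "exists": { "field": "var4" }
      },
      "aggs": {
        "hits": {
          "top_hits": {
            "size": 3,
            "sort": [{ "time": { "order": "desc" } }],
            "_source": ["var4"]
          }
        }
      }
    }
  }
}
The index name /myindex and the inner agg name hits are illustrative; only the last_var4 branch differs from the original query.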

Related

Calculate the counts of last snapshot of a record in ElasticSearch

I am storing snapshots of data in Elasticsearch. I want to perform a count metric aggregation on the latest snapshot of each entry; the purpose is to know what state my current (latest) data is in.
I have something like this
[
  {
    "id": 2,
    "state": "deleted",
    "timestamp": "2019-11-20T18:18:09+00:00"
  },
  {
    "id": 2,
    "state": "published",
    "timestamp": "2019-11-19T18:18:09+00:00"
  },
  {
    "id": 3,
    "state": "published",
    "timestamp": "2019-10-17T18:18:09+00:00"
  },
  {
    "id": 3,
    "state": "draft",
    "timestamp": "2019-10-16T18:18:09+00:00"
  }
]
I tried this
POST /snapshots/_search
{
  "query": {
    "match_all": {}
  },
  "size": 0,
  "aggs": {
    "2": {
      "terms": {
        "field": "state.keyword"
      },
      "aggs": {
        "1": {
          "top_hits": {
            "size": 1,
            "sort": [
              {
                "timestamp": {
                  "order": "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}
But the problem is that it first creates a bucket per state, and only then sorts and computes the top_hits inside each bucket, so instead of
deleted = 1
published = 1
draft = 0
It returns
deleted = 1
published = 1
draft = 1
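One workaround, sketched below under the assumption that id is a numeric field (use id.keyword if it is a string), is to bucket by id first and pull the latest snapshot per id with top_hits; the per-state counts then have to be tallied client-side from the single hit in each bucket, since a terms bucket cannot be re-bucketed by the state of its own top hit:
POST /snapshots/_search
{
  "size": 0,
  "aggs": {
    "by_id": {
      "terms": {
        "field": "id",
        "size": 10000
      },
      "aggs": {
        "latest": {
          "top_hits": {
            "size": 1,
            "sort": [{ "timestamp": { "order": "desc" } }],
            "_source": ["state"]
          }
        }
      }
    }
  }
}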

Elasticsearch Query 30Day Price Difference

I currently have Elasticsearch indices for a product that span a year, with one index per month (I think; it could be one per year if I don't have as much data as I think I do). Each day a process grabs all the prices of these products and puts them into Elasticsearch. I am trying to build a query that can give me the percent change of each product's prices within the last 30 days.
Example...
{
  "timestamp": "2019-09-18T02:38:51.417Z",
  "productId": 1,
  "marketPrice": 5.00,
  "lowPrice": 4.30
},
{
  "timestamp": "2019-08-30T02:38:51.417Z", (THIS SHOULD BE IGNORED)**
  "productId": 1,
  "marketPrice": 100.00,
  "lowPrice": 200.15
},
{
  "timestamp": "2019-08-18T02:38:51.417Z",
  "productId": 1,
  "marketPrice": 10.00,
  "lowPrice": 2.15
},
{
  "timestamp": "2019-09-18T02:38:51.417Z",
  "productId": 2,
  "marketPrice": 2.00,
  "lowPrice": 1.00
},
{
  "timestamp": "2019-08-18T02:38:51.417Z",
  "productId": 2,
  "marketPrice": 3.00,
  "lowPrice": 2.00
}
Result Example
{
  "productId": 1,
  "marketPriceChangeWithin30Days": 200%,
  "lowPriceChangeWithin30Days": 200%
},
{
  "productId": 2,
  "marketPriceChangeWithin30Days": 150%,
  "lowPriceChangeWithin30Days": 200%
}
** The (THIS SHOULD BE IGNORED) marker is there because the only two values that should be compared are the one with the latest timestamp and the one with the closest timestamp around 30 days in the past.
The query should then return product ids 1 and 2 with the percent change, as shown in the example result.
You can leverage the derivative pipeline aggregation to achieve exactly what you expect:
POST /sales/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "timestamp",
        "interval": "month"
      },
      "aggs": {
        "marketPrice": {
          "sum": {
            "field": "marketPrice"
          }
        },
        "lowPrice": {
          "sum": {
            "field": "lowPrice"
          }
        },
        "marketPriceDiff": {
          "derivative": {
            "buckets_path": "marketPrice"
          }
        },
        "lowPriceDiff": {
          "derivative": {
            "buckets_path": "lowPrice"
          }
        }
      }
    }
  }
}
UPDATE:
Given your updated requirements, I'd suggest using the serial_diff pipeline aggregation with a lag of 30 days:
POST /sales/_search
{
  "size": 0,
  "query": {
    "range": {
      "timestamp": {
        "gte": "now-31d",
        "lte": "now"
      }
    }
  },
  "aggs": {
    "products": {
      "terms": {
        "field": "productId",
        "size": 10
      },
      "aggs": {
        "histo": {
          "date_histogram": {
            "field": "timestamp",
            "interval": "day",
            "min_doc_count": 0
          },
          "aggs": {
            "marketPrice": {
              "avg": {
                "field": "marketPrice"
              }
            },
            "lowPrice": {
              "avg": {
                "field": "lowPrice"
              }
            },
            "30d_diff_marketPrice": {
              "serial_diff": {
                "buckets_path": "marketPrice",
                "lag": 30
              }
            },
            "30d_diff_lowPrice": {
              "serial_diff": {
                "buckets_path": "lowPrice",
                "lag": 30
              }
            }
          }
        }
      }
    }
  }
}
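If you want the change expressed as a percentage rather than an absolute difference, a bucket_script pipeline aggregation could be layered on top; a minimal sketch of one such agg, placed next to the serial_diff aggs inside histo (the name 30d_pct_marketPrice is just illustrative):
"30d_pct_marketPrice": {
  "bucket_script": {
    "buckets_path": {
      "current": "marketPrice",
      "diff": "30d_diff_marketPrice"
    },
    "script": "params.diff / (params.current - params.diff) * 100"
  }
}
Here params.current - params.diff reconstructs the value from 30 days earlier, so the script returns the change as a percentage of that old value; buckets missing either path are skipped by the default gap_policy.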

Unique values query without aggregations

We have an index of unique products where each document represents a single product, with the following fields: product_id, group_id, group_score, and product_score.
Consider the following index:
{
  "product_id": "100-001",
  "group_id": "100",
  "group_score": 100,
  "product_score": 60
},
{
  "product_id": "100-002",
  "group_id": "100",
  "group_score": 100,
  "product_score": 40
},
{
  "product_id": "100-003",
  "group_id": "100",
  "group_score": 100,
  "product_score": 50
},
{
  "product_id": "200-001",
  "group_id": "200",
  "group_score": 73,
  "product_score": 20
},
{
  "product_id": "200-002",
  "group_id": "200",
  "group_score": 73,
  "product_score": 53
}
Every group contains ~1-200 products.
We are trying to write a query that matches the following conditions:
1. Products should be sorted by their group_score (desc).
2. No more than one product per group_id.
3. Get the product with the highest product_score within the group.
For example, applying the query on the above should return:
{
  "product_id": "100-001"
},
{
  "product_id": "200-002"
}
We ended up with the following query:
{
  "size": 0,
  "aggs": {
    "group_by_group_id": {
      "terms": {
        "field": "group_id",
        "order": {
          "max_group_score": "desc"
        }
      },
      "aggs": {
        "top_scores_hits": {
          "top_hits": {
            "sort": [
              {
                "product_score": {
                  "order": "desc"
                }
              }
            ],
            "size": 1
          }
        },
        "max_group_score": {
          "max": {
            "field": "group_score"
          }
        }
      }
    }
  }
}
The problem is that the query is really slow because of the aggregations, and search performance is important.
We would love to hear your opinion about a better/more efficient solution.
Changing the index structure is tolerable.
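Since changing the approach is on the table, one aggregation-free alternative worth considering is field collapsing (available since ES 5.3); a sketch, assuming group_id is mapped as a keyword field and the index is named products:
GET products/_search
{
  "query": {
    "match_all": {}
  },
  "collapse": {
    "field": "group_id"
  },
  "sort": [
    { "group_score": { "order": "desc" } },
    { "product_score": { "order": "desc" } }
  ],
  "_source": ["product_id"]
}
Collapsing keeps only the first hit per group_id under the given sort, so the secondary sort on product_score makes each group's representative its highest-scoring product, and the hits come back ordered by group_score; this is typically much cheaper than a terms + top_hits aggregation.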

elastic search : Aggregating the specific nested documents only

I want to aggregate over only those nested documents that satisfy the given query.
Let me explain it through an example. I have inserted two records in my index:
The first document is:
{
  "project": [
    {
      "subject": "maths",
      "marks": 47
    },
    {
      "subject": "computers",
      "marks": 22
    }
  ]
}
The second document is:
{
  "project": [
    {
      "subject": "maths",
      "marks": 65
    },
    {
      "subject": "networks",
      "marks": 72
    }
  ]
}
Each record contains subjects along with their marks. From these documents, I need the average for the maths subject alone.
The query I tried is:
{
  "size": 0,
  "aggs": {
    "avg_marks": {
      "avg": {
        "field": "project.marks"
      }
    }
  },
  "query": {
    "bool": {
      "must": [
        {
          "query_string": {
            "query": "project.subject:maths",
            "analyze_wildcard": true,
            "default_field": "*"
          }
        }
      ]
    }
  }
}
This returns the average over all the marks, which is not what I need:
{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "avg_marks": {
      "value": 51.5
    }
  }
}
I just need the average for the maths subject from the given documents; the expected result is 56.00.
Any help with the query or an idea will be appreciated.
Thanks in advance.
First, your mapping needs to specify that the index has a nested field, like the following:
PUT /nested-index
{
  "mappings": {
    "document": {
      "properties": {
        "project": {
          "type": "nested",
          "properties": {
            "subject": {
              "type": "keyword"
            },
            "marks": {
              "type": "long"
            }
          }
        }
      }
    }
  }
}
Then insert your first document:
PUT nested-index/document/1
{
  "project": [
    {
      "subject": "maths",
      "marks": 47
    },
    {
      "subject": "computers",
      "marks": 22
    }
  ]
}
and then the second one:
PUT nested-index/document/2
{
  "project": [
    {
      "subject": "maths",
      "marks": 65
    },
    {
      "subject": "networks",
      "marks": 72
    }
  ]
}
Then run the aggregation, specifying the nested structure, like this:
GET nested-index/_search
{
  "size": 0,
  "aggs": {
    "subjects": {
      "nested": {
        "path": "project"
      },
      "aggs": {
        "subjects": {
          "terms": {
            "field": "project.subject",
            "size": 10
          },
          "aggs": {
            "average": {
              "avg": {
                "field": "project.marks"
              }
            }
          }
        }
      }
    }
  }
}
As for why your query doesn't work and gives that result: without a nested mapping, the project objects are flattened onto the parent document, so the avg aggregation averages every marks value in a matching document's array; the subject you filtered on doesn't matter.
With those two docs, because both contain the maths subject, the average is calculated as:
(47 + 22 + 65 + 72) / 4 = 51.5
If you ask for the average for networks, only one document matches, but the average is still taken over all values in its array:
(65 + 72) / 2 = 68.5
So you need to use a nested structure in this case.
If you are interested in just one subject, you can restrict the aggregation to that subject, like this (subject equal to "maths"):
GET nested-index/_search
{
  "size": 0,
  "aggs": {
    "project": {
      "nested": {
        "path": "project"
      },
      "aggs": {
        "subjects": {
          "filter": {
            "term": {
              "project.subject": "maths"
            }
          },
          "aggs": {
            "average": {
              "avg": {
                "field": "project.marks"
              }
            }
          }
        }
      }
    }
  }
}
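With the two example documents, the maths filter matches two nested entries (47 and 65), so the response should contain something along these lines (abridged; the nested agg's doc_count counts nested documents, hence 4):
"aggregations": {
  "project": {
    "doc_count": 4,
    "subjects": {
      "doc_count": 2,
      "average": {
        "value": 56.0
      }
    }
  }
}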

How to add paging in Elasticsearch's aggregation?

I have an elasticsearch request as below:
{
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "poi_id"
      },
      "aggs": {
        "sum(price)": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}
I want to add paging to this request, just like:
select poi_id, sum(price) from table group by poi_id limit 0,2
I've searched a lot and found a link about it: https://github.com/elastic/elasticsearch/issues/4915
But I still couldn't work out how to implement it.
Is there any way to implement it in Elasticsearch itself, rather than in my application?
I am currently working through a solution for paging aggregation results. What you want to use is the terms aggregation's partition feature. This section in the official docs is very helpful:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_filtering_values_with_partitions
To adapt your example, the terms setting would be updated as follows.
{
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "poi_id",
        "include": {
          "partition": 0,
          "num_partitions": 100
        },
        "size": 10000
      },
      "aggs": {
        "sum(price)": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}
This will group your results into 100 partitions (num_partitions), return at most 10k buckets for each (size), and retrieve the first such partition (partition: 0).
If you have more than 10k unique values for the field you are aggregating on (and want to return all of them), you will want to increase the size value, or potentially compute size and num_partitions dynamically based on the cardinality of your field.
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#search-aggregations-metrics-cardinality-aggregation
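For instance, a minimal sketch of fetching the cardinality up front (field name as in the question; the index name is illustrative):
GET my-index/_search
{
  "size": 0,
  "aggs": {
    "poi_count": {
      "cardinality": {
        "field": "poi_id"
      }
    }
  }
}
num_partitions could then be chosen as roughly ceil(poi_count / partition size); since the cardinality count is approximate, leave some headroom.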
You might also want to use the show_term_doc_count_error setting to make sure your aggregation is returning accurate counts. https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_per_bucket_document_count_error
Hope that's helpful.
Late to the party, but I just discovered composite aggregations in v6.3+. These allow:
1. A more SQL-like grouping
2. Pagination by use of the after_key
Saved our day, hope it will help others too.
Example: getting the number of hits per hour between two dates, grouped on 5 fields:
GET myindex-idx/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "docType": "myDOcType" } },
        {
          "range": {
            "#date": { "gte": "2019-06-19T21:00:00", "lt": "2019-06-19T22:00:00" }
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "mybuckets": {
      "composite": {
        "size": 100,
        "sources": [
          {
            "#date": {
              "date_histogram": {
                "field": "#date",
                "interval": "hour",
                "format": "date_hour"
              }
            }
          },
          { "field_1": { "terms": { "field": "field_1" } } },
          { "field_2": { "terms": { "field": "field_2" } } },
          { "field_3": { "terms": { "field": "field_3" } } },
          { "field_4": { "terms": { "field": "field_4" } } },
          { "field_5": { "terms": { "field": "field_5" } } }
        ]
      }
    }
  }
}
Produces:
{
  "took": 255,
  "timed_out": false,
  "_shards": {
    "total": 80,
    "successful": 80,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 46989,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "mybuckets": {
      "after_key": {
        "#date": "2019-06-19T21",
        "field_1": 262,
        "field_2": 347,
        "field_3": 945,
        "field_4": 2258,
        "field_5": 0
      },
      "buckets": [
        {
          "key": {
            "#date": "2019-06-19T21",
            "field_1": 56,
            "field_2": 106,
            "field_3": 13224,
            "field_4": 46239,
            "field_5": 0
          },
          "doc_count": 3
        },
        {
          "key": {
            "#date": "2019-06-19T21",
            "field_1": 56,
            "field_2": 106,
            "field_3": 32338,
            "field_4": 76919,
            "field_5": 0
          },
          "doc_count": 2
        },
        ....
The following page is then requested like this, passing the previous response's after_key in the query's after object:
GET myindex-idx/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "docType": "myDOcType" } },
        {
          "range": {
            "#date": { "gte": "2019-06-19T21:00:00", "lt": "2019-06-19T22:00:00" }
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "mybuckets": {
      "composite": {
        "size": 100,
        "sources": [
          {
            "#date": {
              "date_histogram": {
                "field": "#date",
                "interval": "hour",
                "format": "date_hour"
              }
            }
          },
          { "field_1": { "terms": { "field": "field_1" } } },
          { "field_2": { "terms": { "field": "field_2" } } },
          { "field_3": { "terms": { "field": "field_3" } } },
          { "field_4": { "terms": { "field": "field_4" } } },
          { "field_5": { "terms": { "field": "field_5" } } }
        ],
        "after": {
          "#date": "2019-06-19T21",
          "field_1": 262,
          "field_2": 347,
          "field_3": 945,
          "field_4": 2258,
          "field_5": 0
        }
      }
    }
  }
}
This pages through the results until mybuckets comes back empty.
You can use the from and size parameters in your request; see https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html for more information. Note that these page through the search hits rather than the aggregation buckets. Your request would be something like this:
{
  "from": 0,
  "size": 10,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "poi_id"
      },
      "aggs": {
        "sum(price)": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
}
