elasticsearch return hits found in aggregation - elasticsearch

I am trying to get rows from my database that have a unique 'sku' field.
I have a working query which counts this number correctly, my query:
GET _search
{
"size": 0,
"aggs": {
"unique_products":{
"cardinality":{
"field":"sku.keyword"
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
Returns this result:
{
"took" : 129,
"timed_out" : false,
"_shards" : {
"total" : 514,
"successful" : 514,
"skipped" : 98,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 150,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_products" : {
"value" : 38
}
}
}
Which correctly reports the number of unique_products as 38.
I am trying to edit this query so that it will actually return all 38 unique products, but am unsure how, I started by trying to return the top hit from the agg result:
GET _search
{
"size": 0,
"aggs": {
"unique_products":{
"cardinality":{
"field":"sku.keyword"
}
},
"top_hits": {
"size": 1,
"_source": {
"include": [
"sku", "source_store"
]
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
But got an error in my result saying:
{
"error": {
"root_cause": [
{
"type": "parsing_exception",
"reason": "Expected [START_OBJECT] under [size], but got a [VALUE_NUMBER] in [top_hits]",
"line": 10,
"col": 13
}
],
"type": "parsing_exception",
"reason": "Expected [START_OBJECT] under [size], but got a [VALUE_NUMBER] in [top_hits]",
"line": 10,
"col": 13
},
"status": 400
}
Is a cardinality agg still my best bet for returning all 38 unique products? thanks

While the cardinality aggregation gives the unique count, it cannot accept sub-aggs. In other words top_hits cannot be used here directly.
The approach was correct but you may first want to bucketize the skus and then retrieve the underlying docs using top_hits:
{
"size": 0,
"aggs": {
"unique_products": {
"cardinality": {
"field": "sku.keyword"
}
},
"terms_agg": {
"terms": {
"field": "sku.keyword",
"size": 100
},
"aggs": {
"top_hits_agg": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"sku",
"source_store"
]
}
}
}
}
}
},
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "(merch1: 'Dog') AND ((store_name: 'walmart')) AND product_gap: 'yes'"
}
},
{
"range": {
"capture_date": {
"format": "date",
"gte": "2020-05-13",
"lte": "2020-08-03"
}
}
}
]
}
}
}
FYI The reason your query threw an exception is that top_hits is an agg type and, just like unique_products, it was missing its own name.

Related

Fetch the details of events occurred exactly x times in desired duration

In ElasticSearch, I need to fetch the records only if the Event name occurred exactly x times in n days or a particular duration.
Sample index data is as below:
{"event":{"name":"event1"},"timestamp":"2010-06-20"}
I'm able to get the records of the minimum occurrence of desired event name in a particular duration. But instead of minimum, I want the exact matching count. Here's what I tried:
{
"_source": true,
"size": 0,
"query": {
"bool": {
"filter":
{
"range": { "timestamp": { "gte": "2010", "lte": "2016" }}
},
"must":
[
{ "match": { "event.name.keyword": "event1" }}
]
}
},
"aggs": {
"occurrence": {
"terms": {
"field": "event.name.keyword",
"min_doc_count": 5,
"size": 10
}
}
}
}
Another way to achieve the same is by using value_count. But here as well, I'm unable to add a condition to match exact occurrences.
{
"_source": true,
"size": 0,
"query": {
"bool": {
"filter":
{
"range": { "timestamp": { "gte": "2010", "lte": "2016" }}
},
"must":
[
{ "match": { "event.name.keyword": "event1" }}
]
}
},
"aggs": {
"occurrence": {
"value_count": {
"field": "event.name.keyword"
}
}
}
}
It provides the output as (Other output is removed for brevity):
"aggregations" : {
"occurrence" : {
"value" : 2
}
}
But I need to add a condition in the output of aggr (occurrence here) to exactly match the occurrence so that I can get the records only if the event occurred exactly x times.
Can some ES experts help me on this?
You can use Bucket Selector Aggregation and add condition as shown below for the count. Below query will give you only event which is occurs total 5 times. You can add a query clause for whatever filter you want to apply like date range or event name or anything else.
{
"size": 0,
"aggs": {
"count": {
"terms": {
"field": "event.name.keyword",
"size": 10
},
"aggs": {
"val_count": {
"value_count": {
"field": "event.name.keyword"
}
},
"selector": {
"bucket_selector": {
"buckets_path": {
"my_var1": "val_count"
},
"script": "params.my_var1 == 5"
}
}
}
}
}
}
You will get result something like below:
"aggregations" : {
"count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "event1",
"doc_count" : 5,
"val_count" : {
"value" : 5
}
},
{
"key" : "event8",
"doc_count" : 5,
"val_count" : {
"value" : 5
}
}
]
}
}

Elasticsearch filter results by field

I'm new to dsl and this seems simple. The code should count total entries by the hour, within the date range specified. I added a bool such that the results should have a field called 'message' which should contain '[success'
GET sample_index/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"message": "[sucess"
}
}
]
},
"range": {
"timestamp": {
"gte": "2021-01-01",
"lte": "2021-01-10"
}
}
},
"aggs": {
"hit_count_per_day": {
"date_histogram": {
"field": "timestamp",
"calendar_interval": "hour"
}
}
}
}
The error returned is
{
"error" : {
"root_cause" : [
{
"type" : "parsing_exception",
"reason" : "[bool] malformed query, expected [END_OBJECT] but found [FIELD_NAME]",
"line" : 13,
"col" : 5
}
],
"type" : "parsing_exception",
"reason" : "[bool] malformed query, expected [END_OBJECT] but found [FIELD_NAME]",
"line" : 13,
"col" : 5
},
"status" : 400
}
You need to include the range query also in the must clause. Modify your query as shown below
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"message": "[sucess"
}
},
{
"range": {
"timestamp": {
"gte": "2021-01-01",
"lte": "2021-01-10"
}
}
}
]
}
},
"aggs": {
"hit_count_per_day": {
"date_histogram": {
"field": "timestamp",
"calendar_interval": "hour"
}
}
}
}

Elasticsearch, composite and sub(?) aggregations

I'm using composite to scroll through whole data. (it's like pagination)
Suppose a car selling data,
For each day, I'd like to count the number of cars sold per car-brand
{
day1: {
honda: 3,
bmw: 5
},
day2: {
honda: 4,
audi: 1,
tesla:5
}
}
I'm doing something like the following but it doesn't work
GET _search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"date": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "1d"
},
"aggs": {
"car_brand": {
"terms": {
"field": "car_brands"
}
}
}
}
}
]
}
}
}
}
with error message
{
"error" : {
"root_cause" : [
{
"type" : "x_content_parse_exception",
"reason" : "[14:17] [composite] failed to parse field [sources]"
}
],
"type" : "x_content_parse_exception",
"reason" : "[14:17] [composite] failed to parse field [sources]",
"caused_by" : {
"type" : "illegal_state_exception",
"reason" : "expected value but got [FIELD_NAME]"
}
},
"status" : 400
}
Composite aggs cannot directly accept sub-aggs. Go with
GET _search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"date": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "1d"
}
}
},
{
"car_brand": {
"terms": {
"field": "car_brands"
}
}
}
]
}
}
}
}
instead.

How to aggregate until a certain value is reached in ElasticSearch?

I would like to aggregate a list of documents (each of them has two fields - timestamp and amount) by "amount" field until a certain value is reached. For example I would like to get list of documents sorted by timestamp which total amount is equal to 100. Is it possible to do in one query?
Here is my query which returns total amount - I would like to add here a condition to stop aggregation when a certain value is reached.
{
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": 1525168583
}
}
}
]
}
},
"aggs": {
"total_amount": {
"sum": {
"field": "amount"
}
}
},
"sort": [
"timestamp"
],
"size": 10000
}
Thank You
It's perfectly possible using a combination of function_score scripting for mimicking sorting, filter aggs for the range gte query and a healthy amount of scripted_metric aggs to limit the summation up to a certain amount.
Let's first set up a mapping and ingest some docs:
PUT summation
{
"mappings": {
"properties": {
"timestamp": {
"type": "date",
"format": "epoch_second"
}
}
}
}
POST summation/_doc
{
"context": "newest",
"timestamp": 1587049128,
"amount": 20
}
POST summation/_doc
{
"context": "2nd newest",
"timestamp": 1586049128,
"amount": 30
}
POST summation/_doc
{
"context": "3rd newest",
"timestamp": 1585049128,
"amount": 40
}
POST summation/_doc
{
"context": "4th newest",
"timestamp": 1585049128,
"amount": 30
}
Then perform the query:
GET summation/_search
{
"size": 0,
"aggs": {
"filtered_agg": {
"filter": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1585049128
}
}
},
{
"function_score": {
"query": {
"match_all": {}
},
"script_score": {
"script": {
"source": "return (params['now'] - doc['timestamp'].date.toMillis())",
"params": {
"now": 1587049676
}
}
}
}
}
]
}
},
"aggs": {
"limited_sum": {
"scripted_metric": {
"init_script": """
state['my_hash'] = new HashMap();
state['my_hash'].put('sum', 0);
state['my_hash'].put('docs', new ArrayList());
""",
"map_script": """
if (state['my_hash']['sum'] <= 100) {
state['my_hash']['sum'] += doc['amount'].value;
state['my_hash']['docs'].add(doc['context.keyword'].value);
}
""",
"combine_script": "return state['my_hash']",
"reduce_script": "return states[0]"
}
}
}
}
}
}
yielding
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"filtered_agg" : {
"meta" : { },
"doc_count" : 4,
"limited_sum" : {
"value" : {
"docs" : [
"newest",
"2nd newest",
"3rd newest",
"4th newest"
],
"sum" : 120
}
}
}
}
}
I've chosen here to only return the doc.contexts but you can adjust it to retrieve whatever you like -- be it IDs, amounts etc.

How can I aggregate over the _score

I tried to run an aggregate query over the _score field on Elastic Search with no results. Seems it is not possible to use the _score field, maybe because it is not a field of the document. How can I aggregate over the _score ?
This is my query:
{
"_source": false, "explain": false, "from": 0, "size": 0,
"aggs" : {
"score_ranges" : {
"range" : {
"field" : "_score",
"ranges" : [
{ "to" : 50 },
{ "from" : 50, "to" : 75 },
{ "from" : 75 }
]
}
}
},
"query": {
"function_score": {
"query": {
"match_all": { }
}
}
}
}
"aggs": {
"scores_histogram": {
"histogram": {
"script": "return _score.doubleValue() * 10",
"interval": 3
}
}
}
or, with ranges:
"aggs": {
"score_ranges": {
"range": {
"script": "_score",
"ranges": [
{
"to": 50
},
{
"from": 50,
"to": 75
},
{
"from": 75
}
]
}
}
}
And you need to enable dynamic scripting.

Resources