Elastic aggregation on specific values from within one field - elasticsearch

I am migrating my db from postgres to elasticsearch. My postgres query looks like this:
select site_id, count(*) from r_2332 where site_id in ('1300','1364') and date >= '2021-01-25' and date <= '2021-01-30' group by site_id
The expected result is as follows:
site_id count
1300 1234
1364 2345
I am trying to derive the same result from elasticsearch aggs. I have tried the following:
GET /r_2332/_search
{
"query": {
"bool" : {
"should" : [
{"match" : {"site_id": "1300"}},
{"match" : {"site_id": "1364"}}
],"minimum_should_match": 1
}
},
"aggs" : {
"footfall" : {
"range" : {
"field" : "date",
"ranges" : [
{
"from":"2021-01-21",
"to":"2021-01-30"
}
]
}
}
}
}
This gives me the result as follows:
"aggregations":{"footfall":{"buckets":[{"key":"2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z","from":1.6111872E12,"from_as_string":"2021-01-21T00:00:00.000Z","to":1.6119648E12,"to_as_string":"2021-01-30T00:00:00.000Z","doc_count":2679}]}}
and this:
GET /r_2332/_search
{
"query": {
"terms": {
"site_id": [ "1300", "1364" ],
"boost": 1.0
}
},
"aggs" : {
"footfall" : {
"range" : {
"field" : "date",
"ranges" : [
{
"from":"2021-01-21",
"to":"2021-01-30"
}
]
}
}
}
}
This provided the same result:
"aggregations":{"footfall":{"buckets":[{"key":"2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z","from":1.6111872E12,"from_as_string":"2021-01-21T00:00:00.000Z","to":1.6119648E12,"to_as_string":"2021-01-30T00:00:00.000Z","doc_count":2679}]}}
How do I get the result separately for each site_id?

You can use a combination of terms and range aggregation to achieve your task
Adding a working example with index data, search query and search result
Index Data:
{
"site_id":1365,
"date":"2021-01-24"
}
{
"site_id":1300,
"date":"2021-01-22"
}
{
"site_id":1300,
"date":"2020-01-22"
}
{
"site_id":1364,
"date":"2021-01-24"
}
Search Query:
{
"size": 0,
"aggs": {
"siteId": {
"terms": {
"field": "site_id",
"include": [
1300,
1364
]
},
"aggs": {
"footfall": {
"range": {
"field": "date",
"ranges": [
{
"from": "2021-01-21",
"to": "2021-01-30"
}
]
}
}
}
}
}
}
Search Result:
"aggregations": {
"siteId": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1300,
"doc_count": 2,
"footfall": {
"buckets": [
{
"key": "2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z",
"from": 1.6111872E12,
"from_as_string": "2021-01-21T00:00:00.000Z",
"to": 1.6119648E12,
"to_as_string": "2021-01-30T00:00:00.000Z",
"doc_count": 1 // note this
}
]
}
},
{
"key": 1364,
"doc_count": 1,
"footfall": {
"buckets": [
{
"key": "2021-01-21T00:00:00.000Z-2021-01-30T00:00:00.000Z",
"from": 1.6111872E12,
"from_as_string": "2021-01-21T00:00:00.000Z",
"to": 1.6119648E12,
"to_as_string": "2021-01-30T00:00:00.000Z",
"doc_count": 1 // note this
}
]
}
}
]
}
}

This might perform better
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
"1300",
"1365"
]
}
},
{
"range": {
"date": {
"gte": "2021-01-21",
"lte": "2021-01-24"
}
}
}
]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id"
}
}
}
}

Related

Elasticsearch - getting aggregated data based on unique values from field

In my elasticsearch (7.13) index, I have the following dataset:
maid site_id date hour
m1 1300 2021-06-03 1
m1 1300 2021-06-03 2
m1 1300 2021-06-03 1
m2 1300 2021-06-03 1
I am trying to get unique count of records for each date and site_id from the above table. The desired result is
maid site_id date count
m1 1300 2021-06-03 1
m2 1300 2021-06-03 1
I have millions of maids for each site_id and the dates span across two years. I am using the following code with cardinality on maid, assuming that it will return the unique maids.
GET /r_2332/_search
{
"size":0,
"aggs": {
"site_id": {
"terms": {
"field": "site_id",
"size":100,
"include": [
1171, 1048
]
},"aggs" : {
"bydate" : {
"range" : {
"field": "date","ranges" : [
{
"from": "2021-04-08",
"to": "2021-04-22"
}
]
},"aggs" : {
"rdate" : {
"terms" : {
"field":"date"
},"aggs" :{
"maids" : {
"cardinality": {
"field": "maid"
}
}
}
}
}
}
}
}
}
}
This still returns the data with all the duplicate values. How do I include the maid field in my query so that the data is filtered on unique maid values?
You can use multi terms aggregation along with cardinality aggregation if you want to get unique documents based on site_id and maid
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
"1300",
"1301"
]
}
},
{
"range": {
"date": {
"gte": "2021-06-02",
"lte": "2021-06-03"
}
}
}
]
}
},
"aggs": {
"group_by": {
"multi_terms": {
"terms": [
{
"field": "site_id"
},
{
"field": "maid.keyword"
}
]
},
"aggs": {
"type_count": {
"cardinality": {
"field": "site_id"
}
}
}
}
}
}
Search Result will be
"aggregations": {
"group_by": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
1300,
"m1"
],
"key_as_string": "1300|m1",
"doc_count": 3,
"type_count": {
"value": 1 // note this
}
},
{
"key": [
1300,
"m2"
],
"key_as_string": "1300|m2",
"doc_count": 1,
"type_count": {
"value": 1 // note this
}
}
]
}

ElasticSearch Nested NumericRangeQuery using min value from list for comparison

I have the following data:
[{
"id": "1",
"listItems": [
{
"key": "li1",
"value": 100
},
{
"key": "li2",
"value": 5000
}
]
},
{
"id": "2",
"listItems": [
{
"key": "li3",
"value": 200
},
{
"key": "li2",
"value": 2000
}
]
}]
I'm trying to do a NumericRangeQuery filter so that the MIN value in each document's listItems falls within a range. So for example, my range is 150 to 15000.
The only way I know how to write this is using a script query but it doesn't appear to work as the code still seems to grab any value under the listItems to attempt to match up against the range instead of grabbing the MIN like I told it to. Here's my query:
{
"track_total_hits": true,
"from": 0,
"min_score": 0.0,
"query": {
"bool": {
"must": [
{
"nested": {
"path": "listItems",
"query": {
"script": {
"script": "double minVal = 0; minVal = doc['listItems.value'][0]; for (wp in doc['listItems.value']) {if (wp < minVal) { minVal = wp;}} return minVal >= 150 && minVal <= 15000"
}
}
}
}
]
}
}}
Anybody seeing something I don't?
The search query performs the following aggregation :
Terms aggregation on the id field
Min aggregation on listItems.value
Bucket Selector aggregation that is a parent pipeline aggregation that executes a script that determines whether the current bucket will be retained in the parent multi-bucket aggregation.
Adding a working example with index mapping, index data, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"listItems": {
"type": "nested"
},
"id":{
"type":"text",
"fielddata": true
}
}
}
}
Index Data:
{
"id" : "1",
"listItems" :
[
{
"key" : "li1",
"value" : 100
},
{
"key" : "li2",
"value" : 5000
}
]
}
{
"id" : "2",
"listItems" :
[
{
"key" : "li3",
"value" : 200
},
{
"key" : "li2",
"value" : 2000
}
]
}
Search Query:
{
"size": 0,
"aggs": {
"id_terms": {
"terms": {
"field": "id"
},
"aggs": {
"nested_entries": {
"nested": {
"path": "listItems"
},
"aggs": {
"min_position": {
"min": {
"field": "listItems.value"
}
}
}
},
"value_range": {
"bucket_selector": {
"buckets_path": {
"totalValues": "nested_entries>min_position"
},
"script": "params.totalValues >= 150 && params.totalValues <= 15000"
}
}
}
}
}
}
Search Result:
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "2",
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 200.0
}
}
}
]
}
}

Elasticsearch: aggregation and select docs only having max value of field

I am using elastic search 6.5.
Basically, based on my query my index can return multiple documents, I need only those documents which has the max value for a particular field.
E.g.
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"size" : 10000
}
The above query will return lots of documents/messages as per the query. The sample data returned is:
"header" : {
"id" : "Text_20190702101200123_111",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T10:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702151200123_121",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex2",
"timestamp": "2019-07-02T15:12:00.123Z",
}
}
-----------------
"header" : {
"id" : "Text_20190702081200133_124",
"date" : "2019-07-02"
"field": "ABC"
},
"body" : {
"meta" : {
"field" : "myregex1",
"timestamp": "2019-07-02T08:12:00.133Z",
}
}
So based on the above 3 documents, I only want the max timestamp one to be shown i.e. "timestamp": "2019-07-02T15:12:00.123Z"
I only want one document in above example.
I tried doing it as below:
{
"query": {
"bool": {
"must": [
{
"match": { "header.date" : "2019-07-02" }
},
{
"match": { "header.field" : "ABC" }
},
{
"bool": {
"should": [
{
"regexp": { "body.meta.field": "myregex1" }
},
{
"regexp": { "body.meta.field": "myregex2" }
}
]
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "header.id",
"order": { "group_docs" : "desc" }
},
"aggs" : {
"group_docs": { "max" : { "field": "body.meta.timestamp" } }
}
}
},
"size": "10000"
}
Executing the above, I am still getting all the 3 documents, instead of only one.
I do get the buckets though, but I need only one of them and not all the buckets.
The output in addition to all the records,
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Text_20190702151200123_121",
"doc_count": 29,
"group_docs": {
"value": 1564551683867,
"value_as_string": "2019-07-02T15:12:00.123Z"
}
},
{
"key": "Text_20190702101200123_111",
"doc_count": 29,
"group_docs": {
"value": 1564551633912,
"value_as_string": "2019-07-02T10:12:00.123Z"
}
},
{
"key": "Text_20190702081200133_124",
"doc_count": 29,
"group_docs": {
"value": 1564510566971,
"value_as_string": "2019-07-02T08:12:00.133Z"
}
}
]
}
}
What am I missing here?
Please note that I can have more than one message for the same timestamp. So I want them all, i.e. all the messages/documents belonging to the max timestamp.
In above example there are 29 messages for same timestamp (It can go to any number). So there are 29 * 3 messages being retrieved by my query after using the above aggregation.
Basically I am able to group correctly; I am looking for something like HAVING in SQL.

Elasticsearch: Can I return only the cardinality of a buckets agg, without returning all the buckets?

Take the following query and result,
POST index/_search
{
"size": 0,
"aggs": {
"perDeviceAggregation": {
"terms": {
"field": "deviceID"
},
"aggs": {
"score_avg": {
"avg": {
"field": "device_score"
}
}
}
},
"count":{
"cardinality": {
"field": "deviceID"
}
}
}
}
result:
"aggregations": {
"aads": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "aa",
"doc_count": 3,
"score_avg": {
"value": 3.8
}
},
{
"key": "bb",
"doc_count": 1,
"score_avg": {
"value": 3.8
}
}
]
},
"count": {
"value": 2
}
}
That's great. But in my situation, I don't really care about information about each bucket. I only want to know the # of buckets. Something like the following:
"aggregations": {
"aads": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"bucket_count": 2
}
}
Is this possible in Elasticsearch?
Edit:
You might wonder why I calculate an average (which limits me to using terms instead of cardinality) if I don't care about what's in the buckets. I do use the average to do a range aggregation. The above question was simplified; my actual problem is like the following:
POST index/_search
{
"size": 0,
"aggs" : {
"mos_over_time" : {
"range" : {
"field" : "device_score",
"ranges" : [
{ "from" : 0.0, "to" : 2.6 },
{ "from" : 2.6, "to" : 4.0 },
{ "from" : 4.0 }
]
},
"aggs": {
"perDeviceAggregation": {
"terms": {
"field": "deviceID"
},
"aggs": {
"score_avg": {
"avg": {
"field": "device_score"
}
}
}
},
"count":{
"cardinality": {
"field": "deviceID"
}
}
}
}
}
}

Elasticsearch query array field

I used Elasticsearch in my project and stored some values in it. I want to query an array field from Elasticsearch and count how many times each combination of values occurs. For example, in the code below, the combination of images and price appears two times.
{
"missing_fields_arr": ["images", "price"]
},
{
"missing_fields_arr": ["price"]
},
{
"missing_fields_arr": ["images"]
},
{
"missing_fields_arr": ["images", "price"]
}
and I expected output should be
"aggregations": {
"missing_fields": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "images, price",
"doc_count": 2
},
{
"key": "price",
"doc_count": 1
},
{
"key": "images",
"doc_count": 1
}
]
}
}
My code is here,
{
"query":{
"bool":{
"must":[
{
"range": {
"#timestamp":{
"gte": "2017-07-20T00:00:00.000Z",
"lte": "2017-07-28T23:59:59.999Z"
}
}
},
{
"term": {
"tracker_name": true
}
}
]
}
},
"from": 0,
"size": 0,
"aggregations" : {
"missing_fields": {"terms": {"field": "missing_fields_arr.raw", "size": 0} }
}
}
You need to use the count API; it's much more efficient than the search:
of course combined with a little bit of regex
ex :
curl -XGET 'localhost:9200/product/item/_count?pretty' -H 'Content-Type: application/json' -d'
{ "query" : { "regexp" : { "missing_fields_arr.raw" : "image|price" } } }'
GET /product/item/_count
{
"query" : {
"regexp" : { "missing_fields_arr.raw" : "image|price" }
}
}
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-count.html
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-valuecount-aggregation.html

Resources