I have an ES query in which I want to retrieve 100 documents matching the query criteria and then run an aggregation on only those documents. What happens instead is that the query returns 100 hits and, if I provide size 100 to the aggregation, the aggregation returns 100 buckets, but the ids in the hits do not match the keys in the buckets.
I tried loading all values with "size": 0 but I have large records and this takes a lot of time.
I have also tried using 2 queries (using Terms agg which is quite heavy) but I want to accomplish this with one query if possible. Is there any way to achieve this?
{
"size": 10,
"query": {
"bool": {
"must": [
{
"range": {
"amount": {
"gte": 10000,
"lte": 20000
}
}
}
]
}
},
"_source": {
"include":["id","amount"]
},
"aggs": {
"ID": {
"terms": {
"field": "id"
},
"aggs": {
"SumAgg": {
"sum": {
"field": "paidAmount"
}
}
}
}
}
}
Edit:
Here is the response:
"hits": {
"total": 712,
"max_score": 1,
"hits": [
{
"_score": 1,
"_source": {
"amount": 15732,
"id": 18xxxxxxx108
}
},
{
"_score": 1,
"_source": {
"amount": 11485,
"id": 33xxxxxxx107
}
},
{
"_score": 1,
"_source": {
"amount": 16757,
"id": 34xxxxxxx286
}
},
{
"_score": 1,
"_source": {
"amount": 16134,
"id": 29xxxxxxx018
}
},
{
"_score": 1,
"_source": {
"amount": 11767,
"id": 11xxxxxxx017
}
},
{
"_score": 1,
"_source": {
"amount": 16744,
"id": 38xxxxxxx106
}
},
{
"_score": 1,
"_source": {
"amount": 10587,
"id": 34xxxxxxx113
}
},
{
"_score": 1,
"_source": {
"amount": 18704,
"id": 34xxxxxxx177
}
},
{
"_score": 1,
"_source": {
"amount": 10077,
"id": 13xxxxxxx306
}
},
{
"_score": 1,
"_source": {
"amount": 12812,
"id": 46xxxxxxx334
}
}
]
},
"aggregations": {
"ID": {
"doc_count_error_upper_bound": 7,
"sum_other_doc_count": 702,
"buckets": [
{
"key": 24,
"doc_count": 1,
"SumAgg": {
"value": 17176
}
},
{
"key": 27,
"doc_count": 1,
"SumAgg": {
"value": 19924
}
},
{
"key": 81,
"doc_count": 1,
"SumAgg": {
"value": 19784
}
},
{
"key": 93,
"doc_count": 1,
"SumAgg": {
"value": 10942
}
},
{
"key": 124,
"doc_count": 1,
"SumAgg": {
"value": 12337
}
},
{
"key": 148,
"doc_count": 1,
"SumAgg": {
"value": 18604
}
},
{
"key": 158,
"doc_count": 1,
"SumAgg": {
"value": 14680
}
},
{
"key": 217,
"doc_count": 1,
"SumAgg": {
"value": 17295
}
},
{
"key": 273,
"doc_count": 1,
"SumAgg": {
"value": 10989
}
},
{
"key": 321,
"doc_count": 1,
"SumAgg": {
"value": 13917
}
}
]
}
}
I want the ids to be the same in both contexts (the hits and the aggregation buckets).
I just realized that Elasticsearch has an aggregation called the Sampler Aggregation, which allows you to run an aggregation on only the top-scoring sample of documents.
This was an experimental feature up to version 5.x, but it appears to have been released as a regular feature from version 6.0 onwards. My bad for missing this! :(
Below is how your query would be formulated:
Query:
POST <your_index_name>/_search
{
"size":10,
"query":{
"bool":{
"must":[
{
"range":{
"amount":{
"gte":10000,
"lte":20000
}
}
}
]
}
},
"_source":{
"include":[
"id",
"amount"
]
},
"aggs":{
"mysampler":{
"sampler":{ <---- Note this
"shard_size":10
},
"aggs":{
"ID":{
"terms":{
"field":"id"
},
"aggs":{
"SumAgg":{
"sum":{
"field":"amount"
}
}
}
}
}
}
}
}
Hope this helps and if you think this helps, feel free to accept this as answer and/or upvote it!!
Related
I am using Elasticsearch v7.1.0 to perform a composite aggregation to paginate my results.
The data in the index I am querying and aggregating looks like this.
{
"sequence": "SEQ-A123",
"timestamp": "2022-05-11T12:26:54Z",
"owner": "b96e1abb08d44a6a9871f567aa392167",
"serialNo": "A5645",
"value": 45,
"ctags": [
{
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
},
{
"sequence": "SEQ-B123",
"timestamp": "2022-05-11T12:26:54Z",
"owner": "b96e1abb08d44a6a9871f567aa392165",
"serialNo": "A8456",
"value": 87,
"ctags": [
{
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
},
{
"sequence": "SEQ-C123",
"timestamp": "2022-05-11T12:26:54Z",
"owner": "b96e1abb08d44a6a9871f567aa392165",
"serialNo": "A59",
"value": 87,
"ctags": [
{
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
}, ...
The Query I am executing on elasticsearch is this.
{
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": "2022-05-01T00:00:00.000Z",
"lte": "2022-05-30T23:59:59.999Z"
}
}
},
{
"terms": {
"sequence.keyword": [
"SEQ-A123",
"SEQ-B123"
]
}
}
],
"must_not": [
{
"term": {
"serialNo.keyword": "test"
}
}
]
}
},
"size": 0,
"aggs": {
"sequence": {
"composite": {
"sources": [
{
"bkt_sequence": {
"terms": {
"field": "sequence.keyword"
}
}
}
],
"after": {
"bkt_sequence": ""
},
"size": 2
},
"aggs": {
"serialNo": {
"terms": {
"field": "serialNo.keyword"
},
"aggs": {
"usageStats": {
"stats": {
"field": "value"
}
},
"ctags": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"owner",
"ctags"
]
}
}
}
}
}
}
}
}
}
The result I am getting against this query looks like this.
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 94,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"sequence": {
"after_key": {
"bkt_sequence": "SEQ-B123"
},
"buckets": [
{
"key": {
"bkt_sequence": "SEQ-A123"
},
"doc_count": 47,
"serialNo": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 37,
"buckets": [
"0": {
"key": "A5645",
"doc_count": 1,
"ctags": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 2,
"hits": [
"0": {
"_index": "seq-serial",
"_type": "_doc",
"_id": "1",
"_score": 2,
"_source": {
"owner": "b96e1abb08d44a6a9871f567aa392167",
"ctags": [
"0": {
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
}
}
]
}
},
"usageStats": {
"count": 1,
"min": 45,
"max": 45,
"avg": 45,
"sum": 45
}
},
"1": {
"key": "A5646",
"doc_count": 1,
"ctags": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 2,
"hits": [
"0": {
"_index": "seq-serial",
"_type": "_doc",
"_id": "27",
"_score": 2,
"_source": {
"owner": "b96e1abb08d44a6a9871f567aa392169",
"ctags": [
"0": {
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
}
}
]
}
},
"usageStats": {
"count": 1,
"min": 85,
"max": 85,
"avg": 85,
"sum": 85
}
},
"2": {
...
},
"3": {
...
},
"4": {
...
},
"5": {
...
},
"6": {
...
},
"7": {
...
},
"8": {
...
},
"9": {
...
}
]
}
},
{
"key": {
"bkt_sequence": "SEQ-B123"
},
"doc_count": 47,
"serialNo": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 37,
"buckets": [
"0": {
"key": "A8456",
"doc_count": 1,
"ctags": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 2,
"hits": [
"0": {
"_index": "seq-serial",
"_type": "_doc",
"_id": "48",
"_score": 2,
"_source": {
"owner": "b96e1abb08d44a6a9871f567aa392167",
"ctags": [
"0": {
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
}
}
]
}
},
"usageStats": {
"count": 1,
"min": 45,
"max": 45,
"avg": 45,
"sum": 45
}
},
"1": {
"key": "A7590",
"doc_count": 1,
"ctags": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 2,
"hits": [
"0": {
"_index": "seq-serial",
"_type": "_doc",
"_id": "74",
"_score": 2,
"_source": {
"owner": "b96e1abb08d44a6a9871f567aa392169",
"ctags": [
"0": {
"name": "project",
"value": "cd8041f817634e7784b8c0cb5b069d4b"
}
]
}
}
]
}
},
"usageStats": {
"count": 1,
"min": 85,
"max": 85,
"avg": 85,
"sum": 85
}
},
"2": {
...
},
"3": {
...
},
"4": {
...
},
"5": {
...
},
"6": {
...
},
"7": {
...
},
"8": {
...
},
"9": {
...
}
]
}
}
]
}
}
}
As you can see, a total of 94 documents match this query.
47 belong to the SEQ-A123 bucket and the other 47 belong to the SEQ-B123 bucket, but for each bucket only 10 of the 47 documents are returned in the response.
How can I get all 47 documents in the result and still use pagination at the sequence field level?
By default, the Terms Aggregation returns only the top 10 buckets (one bucket per distinct term), not all of them.
You just need to add a size parameter to the terms aggregation named serialNo.
Below is what your query would look like:
POST test_index/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": "2022-05-01T00:00:00.000Z",
"lte": "2022-05-30T23:59:59.999Z"
}
}
},
{
"terms": {
"sequence.keyword": [
"SEQ-A123",
"SEQ-B123"
]
}
}
],
"must_not": [
{
"term": {
"serialNo.keyword": "test"
}
}
]
}
},
"size": 0,
"aggs": {
"sequence": {
"composite": {
"sources": [
{
"bkt_sequence": {
"terms": {
"field": "sequence.keyword"
}
}
}
],
"after": {
"bkt_sequence": ""
},
"size": 2
},
"aggs": {
"serialNo": {
"terms": {
"field": "serialNo.keyword",
"size": 100 <----------- Note this here
},
"aggs": {
"usageStats": {
"stats": {
"field": "value"
}
},
"ctags": {
"top_hits": {
"size": 1,
"_source": {
"include": [
"owner",
"ctags"
]
}
}
}
}
}
}
}
}
}
I have an elasticsearch index (v6.8) that contains documents that may share a similar value for a field.
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 123,
"owner": "B",
"price": 20
},
{
"siren": 456,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "C",
"price": 30
}
]
I would like to get all documents from owners A and B, but deduplicated on the siren field. I don't care which of the duplicate documents is returned (the one from owner A or from owner B). The result would be:
[
{
"siren": 123,
"owner": "A",
"price": 10
},
{
"siren": 456,
"owner": "A",
"price": 10
}
]
Also, I would like my aggregations to count documents deduplicated on the same field.
I have tried
{
"query": {
"bool": {
"must": [
[
{
"terms": {
"owner": [
"A",
"B"
]
}
}
]
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "price",
"size": 20
}
}
}
}
But this counts the "same" document multiple times.
You can use terms aggregation on the siren field along with top hits aggregation
{
"size":0,
"query": {
"bool": {
"must": [
{
"terms": {
"owner.keyword": [
"A",
"B"
]
}
}
]
}
},
"aggs": {
"by_price": {
"terms": {
"field": "siren",
"size": 20
},
"aggs": {
"top_sales_hits": {
"top_hits": {
"_source": {
"includes": [
"siren",
"owner",
"price"
]
},
"size": 1
}
}
}
}
}
}
Search Result will be
"aggregations": {
"by_price": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123,
"doc_count": 2,
"top_sales_hits": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 123,
"price": 10
}
}
]
}
}
},
{
"key": 456,
"doc_count": 1,
"top_sales_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "66226467",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"owner": "A", // note this
"siren": 456,
"price": 10
}
}
]
}
}
}
]
}
}
I am new to Elasticsearch and am requesting some help.
Basically, I have some 2 million documents in my Elasticsearch index, and the documents look like the one below:
{
"_index": "flipkart",
"_type": "PSAD_ThirdParty",
"_id": "430001_MAM_2016-02-04",
"_version": 1,
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 206,
"bookedNumbers": 15000,
"ut": 1454567400000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
I want to write an aggregation query which satisfies below conditions:
1) First query based on "_index", "_type" and "pliId".
2) Do aggregation sum on metrics.value based on metrics.id = "Metric1".
Basically I need to query records based on some fields and aggregate sum on a particular metrics value based on metrics id.
Please can you help me in getting my query right.
Your metrics field needs to be of type nested:
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
}
}
}
If you want Metric1 to match exactly as written, including the upper-case letter, then, as shown above, the id field needs to be not_analyzed.
Then, if you only want metrics.id = "Metric1" aggregations, you need something like this:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"pliId": 206
}
}
]
}
}
}
},
"aggs": {
"by_metrics": {
"nested": {
"path": "metrics"
},
"aggs": {
"metric1_only": {
"filter": {
"bool": {
"must": [
{
"term": {
"metrics.id": {
"value": "Metric1"
}
}
}
]
}
},
"aggs": {
"by_metric_id": {
"terms": {
"field": "metrics.id"
},
"aggs": {
"total_delivery": {
"sum": {
"field": "metrics.value"
}
}
}
}
}
}
}
}
}
}
Created new index:
Method : PUT ,
URL : http://localhost:9200/google/
Body:
{
"mappings": {
"PSAD_Primary": {
"properties": {
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
},
"value": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then I inserted some 200 thousand documents, then ran the query, and it worked.
Response:
{
"took": 34,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "google",
"_type": "PSAD_Primary",
"_id": "383701291_MAM_2016-01-06",
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 221244,
"bookedNumbers": 15000,
"ut": 1452061800000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
]
},
"aggregations": {
"by_metrics": {
"doc_count": 3,
"metric1_only": {
"doc_count": 1,
"by_metric_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Metric1",
"doc_count": 1,
"total_delivery": {
"value": 70
}
}
]
}
}
}
}
}
Given the following elasticsearch document, how would I construct a search that would sum the values of the seconds column for a given datetime range?
See below for my current query.
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "searchdb",
"_type": "profile",
"_id": "1825",
"_score": 1,
"_source": {
"id": 1825,
"market": "Chicago",
"geo_location": {
"lat": 41.1234,
"lon": -87.5678
},
"hourly_values": [
{
"datetime": "1997-07-16T19:00:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
}
]
}
},
{
"_index": "searchdb",
"_type": "profile",
"_id": "1808",
"_score": 1,
"_source": {
"id": 1808,
"market": "Chicago",
"geo_location": {
"lat": 41.1234,
"lon": -87.5678
},
"hourly_values": [
{
"datetime": "1997-07-16T19:00:00.00+00:00",
"seconds": 900
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 1200
},
{
"datetime": "1997-07-16T19:20:00.00+00:00",
"seconds": 800
}
]
}
}
]
}
Below is my current query. The problem with it is it doesn't take into consideration the datetime field. I need to be able to sum only the seconds values that fall within a given datetime range in the query.
{
"aggs": {
"Ids": {
"terms": {
"field": "id",
"size": 0
},
"aggs": {
"Nesting": {
"nested": {
"path": "hourly_values"
},
"aggs": {
"availability": {
"sum": {
"field": "hourly_values.seconds"
}
}
}
}
}
}
}
}
I know you can use a range, something like this:
"filter" : {
"range" : { "timestamp" : { "from" : "now/1d+9.5h", "to" : "now/1d+16h" }}
}
but I can't figure out how to integrate that into my query to get the desired output.
For clarity, my desired output is to return each of the objects returned from the query, and the values of the summation of the seconds fields, but I only want to sum the values for the given time range.
I think this can be done with filter aggregation
Try this
{
"aggs": {
"Ids": {
"terms": {
"field": "id",
"size": 0
},
"aggs": {
"Nesting": {
"nested": {
"path": "hourly_values"
},
"aggs": {
"filtered_result": {
"filter": {
"query": {
"range": {
"hourly_values.datetime": {
"gt": "1997-07-16T19:10:00.00+00:00",
"lt": "1997-07-16T19:22:00.00+00:00"
}
}
}
},
"aggs": {
"availability": {
"sum": {
"field": "hourly_values.seconds"
}
}
}
}
}
}
}
}
},
"size": 0
}
The result I get
"aggregations": {
"Ids": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1808",
"doc_count": 1,
"Nesting": {
"doc_count": 3,
"filtered_result": {
"doc_count": 2,
"availability": {
"value": 2000
}
}
}
},
{
"key": "1825",
"doc_count": 1,
"Nesting": {
"doc_count": 3,
"filtered_result": {
"doc_count": 2,
"availability": {
"value": 2400
}
}
}
}
]
}
}
Does this help?
I have documents in elasticsearch (1.5) that looks like:
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
Not all of the documents contain all three options (male/female/unknown).
I would like to get the sum of all values for each gender name, like this:
{
"buckets": [
{
"key": "unknown",
"doc_count": 112,
"gender_a": {
"value": 462
}
},
{
"key": "male",
"doc_count": 107,
"gender_a": {
"value": 438
}
},
{
"key": "female",
"doc_count": 36,
"gender_a": {
"value": 186
}
}
]
}
i tried this query:
{
"aggs": {
"gender_name": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_sum": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
But something weird is going on, and I don't get the right values.
Any idea what I am missing?
You will probably need to make sure that your "gender" property has type "nested". With that, I was able to make the following do what I think you're asking.
First I set up a simple index:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"gender": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"value": {
"type": "long"
}
}
}
}
}
}
}
Then added a couple of docs:
PUT /test_index/doc/1
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
PUT /test_index/doc/2
{
"gender": [
{
"name": "male",
"value": 8
},
{
"name": "female",
"value": 4
}
]
}
Then I was able to get total counts by gender name as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"genders": {
"nested": {
"path": "gender"
},
"aggs": {
"gender_terms": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_name_value_sums": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"genders": {
"doc_count": 5,
"gender_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 2,
"gender_name_value_sums": {
"value": 88,
"value_as_string": "88.0"
}
},
{
"key": "male",
"doc_count": 2,
"gender_name_value_sums": {
"value": 97,
"value_as_string": "97.0"
}
},
{
"key": "unknown",
"doc_count": 1,
"gender_name_value_sums": {
"value": 12,
"value_as_string": "12.0"
}
}
]
}
}
}
}
Here is the code I used to test it:
http://sense.qbox.io/gist/d4533215806b858aa2cc1565546d167fdec3c973