Elasticsearch terms and sum aggregation

I have documents in Elasticsearch (1.5) that look like:
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
Not all of the documents contain all three options (male/female/unknown).
I would like to get the sum of the values for each gender name, like this:
{
"buckets": [
{
"key": "unknown",
"doc_count": 112,
"gender_a": {
"value": 462
}
},
{
"key": "male",
"doc_count": 107,
"gender_a": {
"value": 438
}
},
{
"key": "female",
"doc_count": 36,
"gender_a": {
"value": 186
}
}
]
}
I tried this query:
{
"aggs": {
"gender_name": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_sum": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
but something weird is going on and I don't get the right values.
Any idea what I am missing?

You will probably need to make sure that your "gender" property has type "nested". With that, I was able to make the following do what I think you're asking.
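For context, the "weird" values come from how Elasticsearch indexes object arrays by default: without a nested mapping the inner objects are flattened, so the association between each name and its value is lost. Roughly speaking (this is a sketch of the internal representation, not an actual API response), your first document is indexed as if it were:
{
  "gender.name": ["unknown", "male", "female"],
  "gender.value": [12, 89, 84]
}
so the sum inside the "male" bucket adds up every value in the document, not just 89. The nested type keeps each name/value pair in its own hidden sub-document, which is what the aggregation below relies on.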
First I set up a simple index:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"gender": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"value": {
"type": "long"
}
}
}
}
}
}
}
Then added a couple of docs:
PUT /test_index/doc/1
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
PUT /test_index/doc/2
{
"gender": [
{
"name": "male",
"value": 8
},
{
"name": "female",
"value": 4
}
]
}
Then I was able to get total counts by gender name as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"genders": {
"nested": {
"path": "gender"
},
"aggs": {
"gender_terms": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_name_value_sums": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"genders": {
"doc_count": 5,
"gender_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 2,
"gender_name_value_sums": {
"value": 88,
"value_as_string": "88.0"
}
},
{
"key": "male",
"doc_count": 2,
"gender_name_value_sums": {
"value": 97,
"value_as_string": "97.0"
}
},
{
"key": "unknown",
"doc_count": 1,
"gender_name_value_sums": {
"value": 12,
"value_as_string": "12.0"
}
}
]
}
}
}
}
Here is the code I used to test it:
http://sense.qbox.io/gist/d4533215806b858aa2cc1565546d167fdec3c973
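One caveat if you already have data in your index: an existing object field can't be switched to "nested" in place, so you would need to create a new index with the nested mapping and reindex your documents into it. To check how the field is currently mapped (the index name here is just a placeholder):
GET /your_index/_mapping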

Related

Elasticsearch - Count number of occurrences per field per document

Is it possible to calculate the number of occurrences of distinct values in a list field?
For example, given the following data:
[
{
"page":1,
"colors":[
{
"color": red
},
{
"color": white
},
{
"color": red
}
]
},
{
"page":2,
"colors":[
{
"color": yellow
},
{
"color": yellow
}
]
}
]
Is it possible to get a result like the following:
{
"page":1,
"colors_count":[
{
"Key": red,
"Count": 2
},
{
"Key": white,
"Count": 1
}
]
},
{
"page":2,
"colors_count":[
{
"Key": yellow,
"Count": 2
}
]
}
I tried using a terms aggregation, but I got the number of distinct values, so for page 1 I got red:1 and white:1.
Yes, you can do it. A plain terms aggregation counts documents rather than value occurrences, so duplicates inside one document collapse into a single count; you will have to use the nested field type and a nested aggregation so that each colors entry is counted separately.
Mapping:
PUT colors
{
"mappings": {
"properties": {
"page" : { "type": "keyword" },
"colors": {
"type": "nested",
"properties": {
"color": {
"type": "keyword"
}
}
}
}
}
}
Insert Documents:
PUT colors/_doc/1
{
"page": 1,
"colors": [
{
"color": "red"
},
{
"color": "white"
},
{
"color": "red"
}
]
}
PUT colors/_doc/2
{
"page": 2,
"colors": [
{
"color": "yellow"
},
{
"color": "yellow"
}
]
}
Query:
GET colors/_search
{
"size" :0,
"aggs": {
"groupByPage": {
"terms": {
"field": "page"
},
"aggs": {
"colors": {
"nested": {
"path": "colors"
},
"aggs": {
"genres": {
"terms": {
"field": "colors.color"
}
}
}
}
}
}
}
}
Output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"groupByPage": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", // page field value
"doc_count": 1,
"colors": {
"doc_count": 3,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "red",
"doc_count": 2
},
{
"key": "white",
"doc_count": 1
}
]
}
}
},
{
"key": "2", // page field value
"doc_count": 1,
"colors": {
"doc_count": 2,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "yellow",
"doc_count": 2
}
]
}
}
}
]
}
}
}
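One usage note: a terms aggregation only returns the top 10 keys per level by default, so with many pages or many distinct colors you may want to raise "size" on the relevant terms aggregations (the values below are arbitrary):
GET colors/_search
{
  "size": 0,
  "aggs": {
    "groupByPage": {
      "terms": { "field": "page", "size": 1000 },
      "aggs": {
        "colors": {
          "nested": { "path": "colors" },
          "aggs": {
            "genres": {
              "terms": { "field": "colors.color", "size": 100 }
            }
          }
        }
      }
    }
  }
}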

Is there any way to aggregate only on query hits in Elasticsearch?

I have an ES query that retrieves 100 elements matching the query criteria, and I then want to run an aggregation on those values. But if I set size to 100, the query returns 100 hits and the aggregation returns 100 buckets, and the hits do not match the values in the buckets.
I tried loading all values with "size": 0, but I have a large number of records and this takes a lot of time.
I have also tried using two queries (using a terms aggregation, which is quite heavy), but I want to accomplish this with one query if possible. Is there any way to achieve this?
{
"size": 10,
"query": {
"bool": {
"must": [
{
"range": {
"amount": {
"gte": 10000,
"lte": 20000
}
}
}
]
}
},
"_source": {
"include":["id","amount"]
},
"aggs": {
"ID": {
"terms": {
"field": "id"
},
"aggs": {
"SumAgg": {
"sum: {
"field": "paidAmount"
}
}
}
}
}
}
Edit:
Here is the response:
"hits": {
"total": 712,
"max_score": 1,
"hits": [
{
"_score": 1,
"_source": {
"amount": 15732,
"id": 18xxxxxxx108
}
},
{
"_score": 1,
"_source": {
"amount": 11485,
"id": 33xxxxxxx107
}
},
{
"_score": 1,
"_source": {
"amount": 16757,
"id": 34xxxxxxx286
}
},
{
"_score": 1,
"_source": {
"amount": 16134,
"id": 29xxxxxxx018
}
},
{
"_score": 1,
"_source": {
"amount": 11767,
"id": 11xxxxxxx017
}
},
{
"_score": 1,
"_source": {
"amount": 16744,
"id": 38xxxxxxx106
}
},
{
"_score": 1,
"_source": {
"amount": 10587,
"id": 34xxxxxxx113
}
},
{
"_score": 1,
"_source": {
"amount": 18704,
"id": 34xxxxxxx177
}
},
{
"_score": 1,
"_source": {
"amount": 10077,
"id": 13xxxxxxx306
}
},
{
"_score": 1,
"_source": {
"amount": 12812,
"id": 46xxxxxxx334
}
}
]
},
"aggregations": {
"ID": {
"doc_count_error_upper_bound": 7,
"sum_other_doc_count": 702,
"buckets": [
{
"key": 24,
"doc_count": 1,
"SumAgg": {
"value": 17176
}
},
{
"key": 27,
"doc_count": 1,
"SumAgg": {
"value": 19924
}
},
{
"key": 81,
"doc_count": 1,
"SumAgg": {
"value": 19784
}
},
{
"key": 93,
"doc_count": 1,
"SumAgg": {
"value": 10942
}
},
{
"key": 124,
"doc_count": 1,
"SumAgg": {
"value": 12337
}
},
{
"key": 148,
"doc_count": 1,
"SumAgg": {
"value": 18604
}
},
{
"key": 158,
"doc_count": 1,
"SumAgg": {
"value": 14680
}
},
{
"key": 217,
"doc_count": 1,
"SumAgg": {
"value": 17295
}
},
{
"key": 273,
"doc_count": 1,
"SumAgg": {
"value": 10989
}
},
{
"key": 321,
"doc_count": 1,
"SumAgg": {
"value": 13917
}
}
]
}
}
I want the ids to be the same in both contexts.
I just realized that Elasticsearch has an aggregation called the sampler aggregation, which allows you to run an aggregation on just a sample of the top-scoring documents.
It was an experimental feature up to version 5.x, but it appears to have become generally available from 6.0 onwards. My bad for missing out on this!
Below is how your query would be formulated:
Query:
POST <your_index_name>/_search
{
"size":10,
"query":{
"bool":{
"must":[
{
"range":{
"amount":{
"gte":10000,
"lte":20000
}
}
}
]
}
},
"_source":{
"include":[
"id",
"amount"
]
},
"aggs":{
"mysampler":{
"sampler":{ <---- Note this
"shard_size":10
},
"aggs":{
"ID":{
"terms":{
"field":"id"
},
"aggs":{
"SumAgg":{
"sum":{
"field":"amount"
}
}
}
}
}
}
}
}
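One caveat, based on how the sampler aggregation is documented to behave: shard_size is applied per shard, so on an index with more than one shard the sample can contain more than 10 documents in total, and it is made up of the top-scoring documents per shard rather than exactly the 10 hits in your response (with a constant-score range filter like this one, all scores are equal anyway). If the aggregation has to match the returned page exactly, running a second query filtered on the returned ids is still the only precise option; a sketch with placeholder ids:
POST <your_index_name>/_search
{
  "size": 0,
  "query": {
    "terms": { "id": [111, 222, 333] }
  },
  "aggs": {
    "ID": {
      "terms": { "field": "id" },
      "aggs": {
        "SumAgg": { "sum": { "field": "amount" } }
      }
    }
  }
}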
Hope this helps!

Convert SQL query to Elasticsearch

I need to convert this SQL query to Elasticsearch, but I am facing the problem that HAVING is not supported there yet.
Select sum(count) as count,prop1
from
(
SELECT Count(*) as count,prop1 FROM [table1] group by prop1,prop2
having count = 1
)
group by prop1
order by count desc limit 10
I tried this query in Elasticsearch:
GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term":
{
"field": "test"
}
}
]
}
},
"aggs": {
"aggregation": {
"terms": {
"field": "prop1"
},
"aggs": {
"subaggregation": {
"terms": {
"field": "prop2",
"order": {
"_count": "desc"
}
}
},
"test":{
"bucket_selector": {
"buckets_path":
{
"test1": "_count"
},
"script":"params.test1 == 1"
}
}
}
}
}
}
Here is the mapping that I use:
PUT /index
{
"mappings" : {
"timeline" : {
"properties" : {
"prop1" : {
"type" : "keyword"
},
"prop2" : {
"type" : "keyword"
}
}
}
}
}
but I cannot get the sub-aggregation buckets that have count == 1.
Here is the output of the suggested answer:
{
"took": 344,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 852146,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 646,
"sum_other_doc_count": 37299,
"buckets": [
{
"key": "porp1-key",
"doc_count": 348178,
"prop2": {
"doc_count_error_upper_bound": 130,
"sum_other_doc_count": 345325,
"buckets": [
{
"key": "e1552d2d-da84-4588-9b65-16c33848bb94_1",
"doc_count": 558,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "04b1a8eb-f876-459b-af9b-855493318dca_426",
"doc_count": 383,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "b165d2c7-6a23-4a4d-adbb-3b2a79d4c627_80",
"doc_count": 344,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c4ea55dc-c3b3-492b-98a2-1ad004212c3d_99",
"doc_count": 297,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "dfc1ae22-5c7f-49ab-8488-207661b43716_294",
"doc_count": 264,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "28815490-e7ce-420b-bab8-57a6ffc3f56a_572",
"doc_count": 239,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c3c56ec8-e0ff-46ea-841d-cc22b2dc65f6_574",
"doc_count": 217,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "473289b8-fb73-4cbb-b8d7-a5386846745f_34",
"doc_count": 187,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "670cb862-7976-4fd5-ba3f-3f8b7c03d615_11",
"doc_count": 185,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "41870755-96dd-4a00-ab76-632a1dfaecb5_341",
"doc_count": 179,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
}
]
},
"final": {
"value": 0
}
} ]
}
}
}
Try this. The final aggregation will give you the desired output.
GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"field": "test"
}
}
]
}
},
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final": {
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Working code:
PUT prop
{
"mappings": {
"prop": {
"properties": {
"prop1": {
"type": "keyword"
},
"prop2": {
"type": "keyword"
}
}
}
}
}
POST _bulk
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q1"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q5"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q6"}
GET prop/prop/_search
{
"size": 0,
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final":{
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Output:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "p1",
"doc_count": 3,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q2",
"doc_count": 2,
"prop2_count": {
"value": 2
},
"prop2_check": {
"value": 0
}
},
{
"key": "q1",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 1
}
},
{
"key": "p2",
"doc_count": 2,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q5",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
},
{
"key": "q6",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 2
}
}
]
}
}
}
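One caveat worth keeping in mind: the terms aggregations only consider their top buckets (size 10 here), and the prop2 groups with a count of exactly 1 are by definition the rarest ones, so with a large sum_other_doc_count (as in the output you posted) most of them never reach the prop2_check step and final is undercounted. Raising "size" on the prop2 terms aggregation reduces this at the cost of memory; a sketch against your index (the size value is arbitrary, and the query filter is omitted here):
GET /analytics_data/_search
{
  "size": 0,
  "aggs": {
    "prop1": {
      "terms": { "field": "prop1", "size": 10 },
      "aggs": {
        "prop2": {
          "terms": { "field": "prop2", "size": 10000 },
          "aggs": {
            "prop2_count": { "value_count": { "field": "prop2" } },
            "prop2_check": {
              "bucket_script": {
                "buckets_path": { "count": "prop2_count.value" },
                "script": "(params.count == 1) ? 1 : 0"
              }
            }
          }
        },
        "final": {
          "sum_bucket": { "buckets_path": "prop2>prop2_check" }
        }
      }
    }
  }
}
The same applies to the outer prop1 terms aggregation if you need the SQL's ORDER BY / LIMIT semantics to be exact, since its top 10 buckets are selected by document count, not by the final sum.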

Elasticsearch multi-value field aggregation

My indexed documents have a schema:
{
...
'authors': [{'first name': 'John', 'last name': 'Smith'},
{'first name': 'Mark', 'last name': 'Spencer'}]
...
}
I would like to search them and aggregate by individual authors, to get a list of the top authors occurring in my hits. The terms aggregation seems to be a match for my needs, but I'm not able to get it working for a field with a list of values. Any help?
You will probably want to use a nested type; then you can use a nested aggregation on the author names.
As an example, I set up a simple index like this:
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"title": {
"type": "string"
},
"authors": {
"type": "nested",
"properties": {
"first_name": {
"type": "string"
},
"last_name": {
"type": "string"
}
}
}
}
}
}
}
Then added a couple of docs:
PUT /test_index/doc/1
{
"title": "Book 1",
"authors": [
{
"first_name": "John",
"last_name": "Smith"
},
{
"first_name": "Mark",
"last_name": "Spencer"
}
]
}
PUT /test_index/doc/2
{
"title": "Book 2",
"authors": [
{
"first_name": "Ben",
"last_name": "Jones"
},
{
"first_name": "Tom",
"last_name": "Lawrence"
}
]
}
Then I can get the list of (analyzed) author last names with:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_authors": {
"nested": {
"path": "authors"
},
"aggs": {
"author_last_names": {
"terms": {
"field": "authors.last_name"
}
}
}
}
}
}
...
{
"took": 71,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_authors": {
"doc_count": 4,
"author_last_names": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "jones",
"doc_count": 1
},
{
"key": "lawrence",
"doc_count": 1
},
{
"key": "smith",
"doc_count": 1
},
{
"key": "spencer",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/ca94cc11a12f8e4fed5c62c52966128b9a6f58de
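A follow-up note: the bucket keys come back lowercased ("jones", "smith") because string fields are analyzed by default, and multi-word names would also get split into separate tokens. If you want buckets with the original values, map the name fields as not_analyzed (ES 1.x syntax, matching the version used elsewhere on this page):
PUT /test_index
{
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "doc": {
      "properties": {
        "title": {
          "type": "string"
        },
        "authors": {
          "type": "nested",
          "properties": {
            "first_name": {
              "type": "string",
              "index": "not_analyzed"
            },
            "last_name": {
              "type": "string",
              "index": "not_analyzed"
            }
          }
        }
      }
    }
  }
}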

Find min value of field in nested array object after aggregation

I would like to find the minimum value of a field in a nested array object after aggregation.
Data example:
[
{
"id": "i1",
"version": 1,
"entries": [
{
"name": "n1",
"position": 1
}, {
"name": "n2",
"position": 2
}
]
}, {
"id": "i1"
"version": 2,
"entries": [
{
"name": "n2",
"position": 3
}, {
"name": "n3",
"position": 4
}
]
},
{
"id": "i2",
"version": 1,
"entries": [
{
"name": "n1",
"position": 8
}, {
"name": "n2",
"position": 7
}
]
}, {
"id": "i2"
"version": 2,
"entries": [
{
"name": "n2",
"position": 6
}, {
"name": "n3",
"position": 5
}
]
}
]
Pseudo Query:
SELECT min(entries["n2"].position) WHERE entries.name="n2" GROUP BY id;
Expected Result:
[
{
"id": "i1",
"min(position)": 2
}, {
"id": "i2",
"min(position)": 6
}
]
I can do this in application code, but it's not performant, as I need to return the document sources, which can be quite large.
I am thinking of denormalizing the data, but I would first like to know whether this is possible at all.
You can do it by nesting several aggregations like this:
terms agg -> nested agg -> filter agg -> min agg
To test it I set up an index:
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"doc": {
"properties": {
"entries": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"position": {
"type": "long"
}
}
},
"id": {
"type": "string"
},
"version": {
"type": "long"
}
}
}
}
}
And indexed your docs:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"id":"i1","version":1,"entries":[{"name":"n1","position":1},{"name":"n2","position":2}]}
{"index":{"_id":2}}
{"id":"i1","version":2,"entries":[{"name":"n2","position":3},{"name":"n3","position":4}]}
{"index":{"_id":3}}
{"id":"i2","version":1,"entries":[{"name":"n1","position":8},{"name":"n2","position":7}]}
{"index":{"_id":4}}
{"id":"i2","version":2,"entries":[{"name":"n2","position":6},{"name":"n3","position":5}]}
Here is the query:
POST /test_index/_search?search_type=count
{
"aggs": {
"id_terms": {
"terms": {
"field": "id"
},
"aggs": {
"nested_entries": {
"nested": {
"path": "entries"
},
"aggs": {
"filter_name": {
"filter": {
"term": {
"entries.name": "n2"
}
},
"aggs": {
"min_position": {
"min": {
"field": "position"
}
}
}
}
}
}
}
}
}
}
and the result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "i1",
"doc_count": 2,
"nested_entries": {
"doc_count": 4,
"filter_name": {
"doc_count": 2,
"min_position": {
"value": 2,
"value_as_string": "2.0"
}
}
}
},
{
"key": "i2",
"doc_count": 2,
"nested_entries": {
"doc_count": 4,
"filter_name": {
"doc_count": 2,
"min_position": {
"value": 6,
"value_as_string": "6.0"
}
}
}
}
]
}
}
}
Here is the code I used all together:
http://sense.qbox.io/gist/34a013099ef07fb527d9d7cf8490ad1bbafa718b
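A portability note: the min aggregation above references the field as just "position", which relies on the short field-name resolution that pre-2.0 versions of Elasticsearch allowed. On Elasticsearch 2.0 and later you have to use the full path, so the innermost aggregation would become:
"min_position": {
  "min": {
    "field": "entries.position"
  }
}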
