ElasticSearch equivalent query for SQL group by multiple columns

I have a few million documents with name and version (both of type keyword) as properties in each. What is the equivalent Elastic query for group by name, version?
I have tried the following query:
{
"size":0,
"query": {
"bool": {
"filter": {
"range": {
"time": {
"gte": "2017-01-28",
"lte": "2017-02-28"
}
}
}
}
},
"aggs": {
"group_by_name": {
"terms": {
"field": "name"
},
"aggs": {
"group_by_version": {
"terms": {
"field": "version"
}
}
}
}
}
}
However, the results are not the same as doing GROUP BY name, version.
The results are grouped by name and within each group, they are grouped by version.
How do I modify the above query to group by name, version tuple and return results in descending order?
Your help is greatly appreciated.
Update:
What I get is:
{
"took": 1424,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 115,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_name": {
"doc_count_error_upper_bound": 2,
"sum_other_doc_count": 115,
"buckets": [
{
"key": "product1",
"doc_count": 50,
"group_by_version": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 50,
"buckets": [
{
"key": "1.0",
"doc_count": 40
},
{
"key": "2.0",
"doc_count": 10
}
]
}
},
{
"key": "product3",
"doc_count": 35,
"group_by_version": {
"doc_count_error_upper_bound": 4,
"sum_other_doc_count": 35,
"buckets": [
{
"key": "8.0",
"doc_count": 20
},
{
"key": "9.0",
"doc_count": 15
}
]
}
},
{
"key": "product2",
"doc_count": 30,
"group_by_version": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 30,
"buckets": [
{
"key": "4.0",
"doc_count": 25
},
{
"key": "5.0",
"doc_count": 5
}
]
}
}
]
}
}
}
What I want is:
name      version  count
product1  1.0      40
product2  4.0      25
product3  8.0      20
product3  9.0      15
product1  2.0      10
product2  5.0      5
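For reference, a flat (name, version) grouping ordered by count can be approximated with a terms aggregation keyed on a script that concatenates the two fields (keeping the same range filter as above). This is only a sketch, assuming both fields are keyword fields with doc_values; on older clusters the script key may be inline instead of source, and on Elasticsearch 7.12+ a multi_terms aggregation does the same thing without scripting:
{
  "size": 0,
  "aggs": {
    "group_by_name_version": {
      "terms": {
        "script": {
          "lang": "painless",
          "source": "doc['name'].value + ' ' + doc['version'].value"
        },
        "order": { "_count": "desc" },
        "size": 100
      }
    }
  }
}
Each bucket key is then a concatenated pair such as "product1 1.0" with its doc_count, which maps directly onto the table above; the size of 100 is arbitrary and should be raised to cover all distinct name/version pairs.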

Related

How to write an elastic query to find consecutive intervals advancing one day

I cannot figure out how to query Elasticsearch for data over multiple intervals that advance in one-day increments through the end of the month.
For instance, I want to look at 7-day intervals in the month of January: 1-7, 2-8, 3-9, 4-10, etc. But with the query below I get non-overlapping buckets instead: 1-7, 8-15, 16-23, etc.
Does anyone know if this is possible in Elasticsearch, or how to write a query that returns the overlapping consecutive-day results described above?
Here is my attempt:
{
"size": 0,
"query": {
"bool": {,
"filter": [
{
"range": {
"associated_datetime": {
"gte": "14/12/2021 19:31:56",
"lte": "14/12/2022 19:31:56",
"format": "dd/MM/yyyy HH:mm:ss"
}
}
}
]
}
},
"aggs": {
"incident": {
"date_histogram": {
"field": "associated_datetime",
"calendar_interval": "week"
},
"aggs": {
"associated_to.id": {
"terms": {
"size": 10000,
"field": "associated_to.id"
}
}
}
}
}
}
Output for the above query looks like this (aggregation object):
"aggregations": {
"incident": {
"buckets": [
{
"key_as_string": "2022-01-03T00:00:00.000Z",
"key": 1641168000000,
"doc_count": 2,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 2
}
]
}
},
{
"key_as_string": "2022-01-10T00:00:00.000Z",
"key": 1641772800000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-01-17T00:00:00.000Z",
"key": 1642377600000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-03-07T00:00:00.000Z",
"key": 1646611200000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-03-21T00:00:00.000Z",
"key": 1647820800000,
"doc_count": 7,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 37,
"doc_count": 2
},
{
"key": 38,
"doc_count": 2
},
{
"key": 39,
"doc_count": 2
},
{
"key": 40,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-05-16T00:00:00.000Z",
"key": 1652659200000,
"doc_count": 1,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4,
"doc_count": 1
}
]
}
},
{
"key_as_string": "2022-11-14T00:00:00.000Z",
"key": 1668384000000,
"doc_count": 3,
"associated_to.id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 2,
"doc_count": 2
},
{
"key": 37,
"doc_count": 1
},
{
"key": 38,
"doc_count": 1
},
{
"key": 39,
"doc_count": 1
},
{
"key": 40,
"doc_count": 1
},
{
"key": 41,
"doc_count": 1
},
{
"key": 42,
"doc_count": 1
}
]
}
}
]
}
}
One way to do it is with a date_range aggregation (note: the to date of each range is exclusive):
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"range": {
"associated_datetime": {
"gte": "14/12/2021 19:31:56",
"lte": "14/12/2022 19:31:56",
"format": "dd/MM/yyyy HH:mm:ss"
}
}
}
]
}
},
"aggs": {
"incident": {
"date_range": {
"field": "associated_datetime",
"ranges": [
{
"from": "2022-01-01",
"to": "2022-01-08"
},
{
"from": "2022-01-02",
"to": "2022-01-09"
},
{
"from": "2022-01-03",
"to": "2022-01-10"
},
...
]
},
"aggs": {
"associated_to.id": {
"terms": {
"size": 10000,
"field": "associated_to.id"
}
}
}
}
}
}
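If you need many overlapping windows, it can also help to give each range an explicit key and request keyed buckets, so each window can be addressed by name in the response instead of by the auto-generated from/to key. A sketch of the same aggregation with that option (the key names here are arbitrary):
"aggs": {
  "incident": {
    "date_range": {
      "field": "associated_datetime",
      "keyed": true,
      "ranges": [
        { "key": "jan_01_to_07", "from": "2022-01-01", "to": "2022-01-08" },
        { "key": "jan_02_to_08", "from": "2022-01-02", "to": "2022-01-09" },
        { "key": "jan_03_to_09", "from": "2022-01-03", "to": "2022-01-10" }
      ]
    },
    "aggs": {
      "associated_to.id": {
        "terms": {
          "size": 10000,
          "field": "associated_to.id"
        }
      }
    }
  }
}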

Aggs percentage doc_count

So I know my total hits are 182 documents:
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
Then I run an aggregation to find out how many documents have the source instagram or twitter, and it returns:
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
}
Is it possible to get the percentage of documents that have source twitter and instagram?
In this case the percentage of documents with source instagram would be 89% and twitter 11%.
My aggregation code is like this:
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
}
}
Let me know if this is possible.
Thank you
Sure, it is possible using a bucket_script pipeline aggregation. Since bucket_script works on the buckets of a parent multi-bucket aggregation, the query below wraps everything in a filters aggregation with a single match_all bucket.
An example query might look like this:
{
"size": 0,
"aggs": {
"filters_agg": {
"filters": {
"filters": {
"sourceCount": {
"match_all": {}
}
}
},
"aggs": {
"bySource": {
"terms": {
"field": "profile.source.keyword"
}
},
"instagram_count_percentage": {
"bucket_script": {
"buckets_path": {
"instagram_count": "bySource['instagram']>_count",
"total_count": "_count"
},
"script": "Math.round((params.instagram_count * 100)/params.total_count)"
}
},
"twitter_count_percentage": {
"bucket_script": {
"buckets_path": {
"twitter_count": "bySource['twitter']>_count",
"total_count": "_count"
},
"script": "Math.round((params.twitter_count * 100)/params.total_count)"
}
}
}
}
}
}
And the response could be something like this:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 182,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"filters_agg": {
"buckets": {
"sourceCount": {
"doc_count": 182,
"bySource": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "instagram",
"doc_count": 162
},
{
"key": "twitter",
"doc_count": 20
}
]
},
"instagram_count_percentage": {
"value": 89
},
"twitter_count_percentage": {
"value": 11
}
}
}
}
}
}
Adjust it or use it as inspiration, depending on your case and your mapping.

ElasticSearch aggregation by all tokens in a string field

I have ElasticSearch 2.4 and I'm trying to do an aggregation on an analyzed string field that contains multiple tokens. The field in question is an address field called mailingAddress. For example, below are a few results from a query that looks for NY in the address field.
{
"from": 0,
"size": 100,
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"customerprofile.mailingAddress": {
"query": "NY",
"fuzziness": 0,
"operator": "or"
}
}
},
{
"match": {
"customerprofile.companyId": {
"query": "999",
"fuzziness": 0,
"operator": "or"
}
}
}
]
}
}
]
}
}
}
returns
"hits":[
{
"_index":"wht_index_prod_v33_es24",
"_type":"customerprofile",
"_id":"2044",
"_score":2.9787974,
"_source":{
"customerId":2044,
"companyId":2007,
"fullName":"John Doe",
"email":"jon#aol.com",
"pictureURL":"john.png",
"profilePictureContentType":"image/png",
"phone":"(703) 999-8888",
"mailingAddress":"100 Lake Braddock Drive\nBurke, NY 22015",
"gender":"Male",
"emergencyContactsIds":[
],
"wantCorrespondence":false
}
},
{
"_index":"wht_index_prod_v33_es24",
"_type":"customerprofile",
"_id":"2045",
"_score":2.9787974,
"_source":{
"customerId":2045,
"companyId":2007,
"fullName":"Jane Anderson",
"email":"janea#touchva.net",
"pictureURL":"JAnderson.png",
"profilePictureContentType":"image/png",
"phone":"(434) 111-2345",
"mailingAddress":"PO Box 333, Boydton, NY 23917",
"gender":"Male",
"emergencyContactsIds":[
],
"wantCorrespondence":false
}
},
..
..
]
The question
When I do the aggregation by mailingAddress I expect to see buckets for each word in the text field. From the results above I expect to also find a bucket key named 'NY', but there isn't one. Can anyone explain why? My guess is that it has too few entries.
The aggregation:
{
"size": 0,
"aggs": {
"group_by_age": {
"terms": {
"field": "mailingAddress"
},
"aggs": {
"group_by_gender": {
"terms": {
"field": "gender"
}
}
}
}
}
}
Aggregation results:
{
"took": 16,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 401,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"group_by_age": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 1041,
"buckets": [
{
"key": "st",
"doc_count": 30,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 17
},
{
"key": "male",
"doc_count": 13
}
]
}
},
{
"key": "ca",
"doc_count": 28,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 21
},
{
"key": "male",
"doc_count": 7
}
]
}
},
{
"key": "dr",
"doc_count": 16,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 13
},
{
"key": "male",
"doc_count": 3
}
]
}
},
{
"key": "street",
"doc_count": 15,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 11
},
{
"key": "male",
"doc_count": 4
}
]
}
},
{
"key": "ave",
"doc_count": 14,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 7
}
]
}
},
{
"key": "box",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 9
},
{
"key": "male",
"doc_count": 2
}
]
}
},
{
"key": "fl",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 9
},
{
"key": "male",
"doc_count": 2
}
]
}
},
{
"key": "va",
"doc_count": 11,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "male",
"doc_count": 6
},
{
"key": "female",
"doc_count": 5
}
]
}
},
{
"key": "n",
"doc_count": 10,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 3
}
]
}
},
{
"key": "az",
"doc_count": 9,
"group_by_gender": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 7
},
{
"key": "male",
"doc_count": 2
}
]
}
}
]
}
}
}
By default, a terms aggregation returns only the top 10 terms, but you can return more by specifying a size in your aggregation, like this:
{
"size": 0,
"aggs": {
"group_by_age": {
"terms": {
"field": "mailingAddress",
"size": 50 <---- add this
},
"aggs": {
"group_by_gender": {
"terms": {
"field": "gender"
}
}
}
}
}
}
Your mileage may vary and you might need to increase the size in order to really see NY.
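If you are only interested in a handful of specific tokens, another option is the include parameter of the terms aggregation, which restricts the buckets to terms matching a regular expression. A sketch, assuming the default analyzer has lowercased the address tokens (so the stored term is ny rather than NY):
{
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": {
        "field": "mailingAddress",
        "include": "ny|va|ca"
      },
      "aggs": {
        "group_by_gender": {
          "terms": {
            "field": "gender"
          }
        }
      }
    }
  }
}
The aggregation name group_by_state and the token list are just examples; adjust them to the tokens you actually care about.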

In elasticsearch, how to group by value inside nested array

Say I have the following documents:
1st doc:
{
productName: "product1",
tags: [
{
"name":"key1",
"value":"value1"
},
{
"name":"key2",
"value":"value2"
}
]
}
2nd doc:
{
productName: "product2",
tags: [
{
"name":"key1",
"value":"value1"
},
{
"name":"key2",
"value":"value3"
}
]
}
I know that if I want to group by productName, I could use a terms aggregation:
"terms": {
"field": "productName"
}
which will give me two buckets with the keys "product1" and "product2".
However, what should the query be if I want to group by tag value for a given tag name? That is, if I group by tags with name==key1, I expect one bucket with key "value1"; whereas if I group by tags with name==key2, I expect two buckets with keys "value2" and "value3".
In other words, what should the query look like to group by the 'value' inside a nested array rather than by the 'key'? Any suggestions?
It sounds like a nested terms aggregation is what you're looking for.
With the two documents you posted, this query:
POST /test_index/_search
{
"size": 0,
"aggs": {
"product_name_terms": {
"terms": {
"field": "product_name"
}
},
"nested_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"tags_name_terms": {
"terms": {
"field": "tags.name"
}
},
"tags_value_terms": {
"terms": {
"field": "tags.value"
}
}
}
}
}
}
returns this:
{
"took": 67,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"product_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"nested_tags": {
"doc_count": 4,
"tags_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "key1",
"doc_count": 2
},
{
"key": "key2",
"doc_count": 2
}
]
},
"tags_value_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "value1",
"doc_count": 2
},
{
"key": "value2",
"doc_count": 1
},
{
"key": "value3",
"doc_count": 1
}
]
}
}
}
}
Here is some code I used to test it:
http://sense.qbox.io/gist/a9a172f41dbd520d5e61063a9686055681110522
EDIT: Filter by Nested Value
As per your comment, if you want to filter the nested results by tag name, you can add another "layer" of aggregation making use of the filter aggregation, as follows:
POST /test_index/_search
{
"size": 0,
"aggs": {
"nested_tags": {
"nested": {
"path": "tags"
},
"aggs": {
"filter_tag_name": {
"filter": {
"term": {
"tags.name": "key1"
}
},
"aggs": {
"tags_name_terms": {
"terms": {
"field": "tags.name"
}
},
"tags_value_terms": {
"terms": {
"field": "tags.value"
}
}
}
}
}
}
}
}
which returns:
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_tags": {
"doc_count": 4,
"filter_tag_name": {
"doc_count": 2,
"tags_name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "key1",
"doc_count": 2
}
]
},
"tags_value_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "value1",
"doc_count": 2
}
]
}
}
}
}
}
Here's the updated code:
http://sense.qbox.io/gist/507c3aabf36b8f6ed8bb076c8c1b8552097c5458
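Note that the nested aggregation only works if tags is mapped with the nested type; with the default object mapping the name/value pairs get flattened and can match across different tags. A minimal mapping sketch for an Elasticsearch 1.x/2.x cluster (the index and type names are placeholders, and not_analyzed string fields stand in for what would be keyword fields on newer versions):
PUT /test_index
{
  "mappings": {
    "doc": {
      "properties": {
        "productName": { "type": "string", "index": "not_analyzed" },
        "tags": {
          "type": "nested",
          "properties": {
            "name": { "type": "string", "index": "not_analyzed" },
            "value": { "type": "string", "index": "not_analyzed" }
          }
        }
      }
    }
  }
}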

Elasticsearch include field in result set of aggregation

How can a field of type string be included in the result set of an aggregation?
For example given the following mapping:
{
"sport": {
"mappings": {
"runners": {
"properties": {
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"region": {
"type": "string"
},
"sport": {
"type": "string"
}
}
}
}
}
}
Sample data:
curl -XPOST "http://localhost:9200/sport/_bulk" -d'
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Gary", "city":"New York","region":"A","sport":"Soccer"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Bob", "city":"New York","region":"A","sport":"Tennis"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Mike", "city":"Atlanta","region":"B","sport":"Soccer"}
'
How can the field name be included in the result set of this aggregation:
{
"size": 0,
"aggregations": {
"agg": {
"terms": {
"field": "city"}
}
}
}
This seems to do what you want, if I'm understanding you correctly:
POST /sport/_search
{
"size": 0,
"aggregations": {
"city_terms": {
"terms": {
"field": "city"
},
"aggs": {
"name_terms": {
"terms": {
"field": "name"
}
}
}
}
}
}
With the data you provided, it returns:
{
"took": 43,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"city_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "york",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "atlanta",
"doc_count": 1,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mike",
"doc_count": 1
}
]
}
}
]
}
}
}
(You may want to add "index":"not_analyzed" to one or both fields in your mapping, if these results are not what you were expecting.)
Here's the code I used to test it:
http://sense.qbox.io/gist/07735aadc082c1c60409931c279f3fd85a340dbb
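Another way to carry document fields into aggregation results, not covered in the answer above, is a top_hits sub-aggregation, which returns the _source of a few sample documents per bucket. A sketch (the sub-aggregation name sample_docs and the size of 3 are arbitrary):
POST /sport/_search
{
  "size": 0,
  "aggregations": {
    "city_terms": {
      "terms": {
        "field": "city"
      },
      "aggs": {
        "sample_docs": {
          "top_hits": {
            "_source": ["name"],
            "size": 3
          }
        }
      }
    }
  }
}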
