Elasticsearch - Count number of occurrence perd field per document - elasticsearch

Is it possible to calculate the number of occurence of distinct values in a list field.
For example, let the following data:
[
{
"page":1,
"colors":[
{
"color": red
},
{
"color": white
},
{
"color": red
}
]
},
{
"page":2,
"colors":[
{
"color": yellow
},
{
"color": yellow
}
]
}
]
Is it possible to get a result as the follwing:
{
"page":1,
"colors_count":[
{
"Key": red,
"Count": 2
},
{
"Key": white,
"Count": 1
},
]
},
{
"page":2,
"colors_count":[
{
"Key": yellow,
"Count": 2
}
]
}
I tried using term aggregation but I got the number of distinct values, so for page:1 i got red:1 and white:1.

Yes, you can do it. you will have to use nested_field type and nested_Agg
Mapping:
PUT colors
{
"mappings": {
"properties": {
"page" : { "type": "keyword" },
"colors": {
"type": "nested",
"properties": {
"color": {
"type": "keyword"
}
}
}
}
}
}
Insert Documents:
PUT colors/_doc/1
{
"page": 1,
"colors": [
{
"color": "red"
},
{
"color": "white"
},
{
"color": "red"
}
]
}
PUT colors/_doc/2
{
"page": 2,
"colors": [
{
"color": "yellow"
},
{
"color": "yellow"
}
]
}
Query:
GET colors/_search
{
"size" :0,
"aggs": {
"groupByPage": {
"terms": {
"field": "page"
},
"aggs": {
"colors": {
"nested": {
"path": "colors"
},
"aggs": {
"genres": {
"terms": {
"field": "colors.color"
}
}
}
}
}
}
}
}
Output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"groupByPage": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", // page field value
"doc_count": 1,
"colors": {
"doc_count": 3,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "red",
"doc_count": 2
},
{
"key": "white",
"doc_count": 1
}
]
}
}
},
{
"key": "2", // page field value
"doc_count": 1,
"colors": {
"doc_count": 2,
"genres": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "yellow",
"doc_count": 2
}
]
}
}
}
]
}
}
}

Related

Elasticsearch - Count duplicated and unique values for a nested field

Elasticsearch - Count duplicated and unique values
I need also same kind of count but that field is in nested properties as
[{
"firstname": "john",
"lastname": "doe",
"addressList": [{
"addressId": 39640,
"txt": "sdf",
}, {
"addressId": 39641,
"txt": "NEW",
}, {
"addressId": 39640,
"txt": "sdf",
}, {
"addressId": 39641,
"txt": "NEW"
}
]
}, {
"firstname": "jane",
"lastname": "smith",
"addressList": [{
"addressId": 39644,
"txt": "sdf",
}, {
"addressId": 39642,
"txt": "NEW",
}, {
"addressId": 39644,
"txt": "sdf",
}, {
"addressId": 39642,
"txt": "NEW"
}
]
}
]
what would be the query for addressId duplicate counts ? Need you help on this user:3838328
I got the answer for nested field duplicate counts as
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"prop_counts": {
"nested": {
"path": "addressList"
},
"aggs": {
"duplicate_aggs": {
"terms": {
"field": "addressList.addressId",
"min_doc_count": 2,
"size": 100 <----- Note this
}
},
"duplicate_bucketcount": {
"stats_bucket": {
"buckets_path": "duplicate_aggs._count"
}
},
"nonduplicate_aggs": {
"terms": {
"field": "addressList.addressId",
"size": 100 <---- Note this
},
"aggs": {
"equal_one": {
"bucket_selector": {
"buckets_path": {
"count": "_count"
},
"script": "params.count == 1"
}
}
}
},
"nonduplicate_bucketcount": {
"sum_bucket": {
"buckets_path": "nonduplicate_aggs._count"
}
}
}
}
}
}
Response as
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop_counts": {
"doc_count": 8,
"duplicate_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": 39640,
"doc_count": 2
}, {
"key": 39641,
"doc_count": 2
}, {
"key": 39644,
"doc_count": 2
}, {
"key": 39642,
"doc_count": 2
}
]
},
"nonduplicate_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"duplicate_bucketcount": {
"count": 4,
"min": 2,
"max": 2,
"avg": 2,
"sum": 8
},
"nonduplicate_bucketcount": {
"value": 0
}
}
}
}

Convert sql query to elasticsearch

I need to convert this query into elastic search, but I am facing the problem that in elastic search (having) is not supported yet.
Select sum(count) as count,prop1
from
(
SELECT Count(*) as count,prop1 FROM [table1] group by prop1,prop2
having count = 1
)
group by prop1
order by count desc limit 10
I try this query in elastic search:
`GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term":
{
"field": "test"
}
}
]
}
},
"aggs": {
"aggregation": {
"terms": {
"field": "prop1"
},
"aggs": {
"subaggregation": {
"terms": {
"field": "prop2",
"order": {
"_count": "desc"
}
}
},
"test":{
"bucket_selector": {
"buckets_path":
{
"test1": "_count"
},
"script":"params.test1 == 1"
}
}
}
}
}
}`
Here is the mapping that I use:
PUT /index
{
"mappings" : {
"timeline" : {
"properties" : {
"prop1" : {
"type" : "keyword"
},
"prop2" : {
"type" : "keyword"
}
}
}
}
}
but I cannot get the sub-aggregation buckets who have count == 1
Here is the output of the suggested answer :
{
"took": 344,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 852146,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 646,
"sum_other_doc_count": 37299,
"buckets": [
{
"key": "porp1-key",
"doc_count": 348178,
"prop2": {
"doc_count_error_upper_bound": 130,
"sum_other_doc_count": 345325,
"buckets": [
{
"key": "e1552d2d-da84-4588-9b65-16c33848bb94_1",
"doc_count": 558,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "04b1a8eb-f876-459b-af9b-855493318dca_426",
"doc_count": 383,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "b165d2c7-6a23-4a4d-adbb-3b2a79d4c627_80",
"doc_count": 344,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c4ea55dc-c3b3-492b-98a2-1ad004212c3d_99",
"doc_count": 297,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "dfc1ae22-5c7f-49ab-8488-207661b43716_294",
"doc_count": 264,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "28815490-e7ce-420b-bab8-57a6ffc3f56a_572",
"doc_count": 239,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "c3c56ec8-e0ff-46ea-841d-cc22b2dc65f6_574",
"doc_count": 217,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "473289b8-fb73-4cbb-b8d7-a5386846745f_34",
"doc_count": 187,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "670cb862-7976-4fd5-ba3f-3f8b7c03d615_11",
"doc_count": 185,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
},
{
"key": "41870755-96dd-4a00-ab76-632a1dfaecb5_341",
"doc_count": 179,
"prop2_count": {
"value": 0
},
"prop2_check": {
"value": 0
}
}
]
},
"final": {
"value": 0
}
} ]
}
}
}
Try this. Aggregation final will give you the desired output.
GET /analytics_data/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"field": "test"
}
}
]
}
},
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final": {
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Working code :
PUT prop
{
"mappings": {
"prop": {
"properties": {
"prop1": {
"type": "keyword"
},
"prop2": {
"type": "keyword"
}
}
}
}
}
POST _bulk
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q1"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p1","prop2":"q2"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q5"}
{"index":{"_index":"prop","_type":"prop"}}
{"prop1":"p2","prop2":"q6"}
GET prop/prop/_search
{
"size": 0,
"aggs": {
"prop1": {
"terms": {
"field": "prop1",
"size": 10
},
"aggs": {
"prop2": {
"terms": {
"field": "prop2",
"size": 10
},
"aggs": {
"prop2_count": {
"value_count": {
"field": "prop2"
}
},
"prop2_check": {
"bucket_script": {
"buckets_path": {
"count": "prop2_count.value"
},
"script": "(params.count == 1) ? 1 : 0"
}
}
}
},
"final":{
"sum_bucket": {
"buckets_path": "prop2>prop2_check"
}
}
}
}
}
}
Output :
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"prop1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "p1",
"doc_count": 3,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q2",
"doc_count": 2,
"prop2_count": {
"value": 2
},
"prop2_check": {
"value": 0
}
},
{
"key": "q1",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 1
}
},
{
"key": "p2",
"doc_count": 2,
"prop2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "q5",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
},
{
"key": "q6",
"doc_count": 1,
"prop2_count": {
"value": 1
},
"prop2_check": {
"value": 1
}
}
]
},
"final": {
"value": 2
}
}
]
}
}
}

query for elasticsearch returning count

I am struggling to create the query/rule that will help me create an alerting script. I want to query the elasticsearch API for counts on a specific index so that I can get alerted when the count reaches a certain threshold.
The following query is an attempt as I have no experience with this:
{
"query": {
"filtered": {
"query": {
"query_string": {
"analyze_wildcard": true,
"query": "*"
}
},
"filter": {
"bool": {
"must": [
{
"query": {
"match": {
"PStream": {
"query": "*",
"type": "phrase"
}
}
}
},
{
"range": {
"#timestamp": {
"gte": 1447789445320,
"lte": 1447793045320
}
}
}
],
"must_not": []
}
}
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
},
"size": 500,
"sort": [
{
"#timestamp": {
"order": "desc",
"unmapped_type": "boolean"
}
}
],
"aggs": {
"2": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"pre_zone": "-05:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 0,
"extended_bounds": {
"min": 1447789445317,
"max": 1447793045317
}
}
}
},
The field PStream is the field that I am focused on
EDIT:
An example of the data going to the index:
{
"_index": "logstash-2015.11.17",
"_type": "logs",
"_id": "AVEXMKu2YVnF1NOjr9YT",
"_score": null,
"_source": {
"authorUrl": "",
"postUrl": "",
"pubDate": "2015-11-17T15:18:24",
"scrapeDate": "2015-11-17T15:44:03",
"clientId": "136902834",
"query": "Jenny Balatsinou",
"PType": "post",
"tLatency": 1539,
"PLang": "en",
"PStream": "864321",
"PName": "xStackOverflow",
"#version": "1",
"#timestamp": "2015-11-17T20:44:03.400Z"
},
"fields": {
"#timestamp": [
1447793043400
],
"pubDate": [
1447773504000
],
"scrapeDate": [
1447775043000
]
},
"sort": [
1447793043400
]
there are about 20 million of these messages getting indexed daily into Elasticsearch. I have created a dashboard in Kibana where I view this data and stats. I would like to write the proper query that I can use in a java program that periodically runs and checks this index using this query. It should return the hourly total count grouped by the PStream variable which has multiple values. So anytime the value is 0 it will send an alert.
Eg. Output:
"result": {
"total": 74,
"successful": 63,
"failed": 11,
{
{
"index": "logstash-2015.11.08",
"PStream": "37647338933",
"Count": 1234532
},
{
"index": "logstash-2015.11.08",
"PStream": "45345343566",
"Count": 156532
},
As a quick example (per comments above), I just set up a trivial index:
DELETE /test_index
PUT /test_index
added some (simplified) data:
PUT /test_index/doc/_bulk
{"index":{"_id":1}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":2}}
{"PStream": "864321","#timestamp": "2015-11-17T21:44:03.400Z"}
{"index":{"_id":3}}
{"PStream": "864321","#timestamp": "2015-11-17T20:44:03.400Z"}
{"index":{"_id":4}}
{"PStream": "864322","#timestamp": "2015-11-17T21:44:03.400Z"}
And now I can get the "PStream" terms inside an hour histogram:
POST /test_index/_search
{
"size": 0,
"aggs" : {
"timestamp_histogram" : {
"date_histogram" : {
"field" : "#timestamp",
"interval" : "hour"
},
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
}
}
}
}
}
}
...
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 2,
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 1
},
{
"key": "864322",
"doc_count": 1
}
]
}
}
]
}
}
}
or the other way around:
POST /test_index/_search
{
"size": 0,
"aggs": {
"pstream_terms": {
"terms": {
"field": "PStream"
},
"aggs": {
"timestamp_histogram": {
"date_histogram": {
"field": "#timestamp",
"interval": "hour"
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"pstream_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "864321",
"doc_count": 3,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T20:00:00.000Z",
"key": 1447790400000,
"doc_count": 2
},
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
},
{
"key": "864322",
"doc_count": 1,
"timestamp_histogram": {
"buckets": [
{
"key_as_string": "2015-11-17T21:00:00.000Z",
"key": 1447794000000,
"doc_count": 1
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/6c0c30db1cf0fb8529bcfec21c0ce5c02a5ae94c

Elasticsearch include field in result set of aggregation

How can field of type string be included in the result set of an aggregation?
For example given the following mapping:
{
"sport": {
"mappings": {
"runners": {
"properties": {
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"region": {
"type": "string"
},
"sport": {
"type": "string"
}
}
}
}
}
}
Sample data:
curl -XPOST "http://localhost:9200/sport/_bulk" -d'
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Gary", "city":"New York","region":"A","sport":"Soccer"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Bob", "city":"New York","region":"A","sport":"Tennis"}
{"index":{"_index":"sport","_type":"runner"}}
{"name":"Mike", "city":"Atlanta","region":"B","sport":"Soccer"}
'
How can the field name be included in result set of the aggregation:
{
"size": 0,
"aggregations": {
"agg": {
"terms": {
"field": "city"}
}
}
}
This seems to do what you want, if I'm understanding you correctly:
POST /sport/_search
{
"size": 0,
"aggregations": {
"city_terms": {
"terms": {
"field": "city"
},
"aggs": {
"name_terms": {
"terms": {
"field": "name"
}
}
}
}
}
}
With the data you provided, it returns:
{
"took": 43,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"city_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "york",
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bob",
"doc_count": 1
},
{
"key": "gary",
"doc_count": 1
}
]
}
},
{
"key": "atlanta",
"doc_count": 1,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mike",
"doc_count": 1
}
]
}
}
]
}
}
}
(You may want to add "index":"not_analyzed" to one or both fields in your mapping, if these results are not what you were expecting.)
Here's the code I used to test it:
http://sense.qbox.io/gist/07735aadc082c1c60409931c279f3fd85a340dbb

elasticsearch terms and sum aggregation

I have documents in elasticsearch (1.5) that looks like:
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
not all of the documents contains the three options (male/female/unknown)
i would like to get the sum of all values per each gender name. like that:
{
"buckets": [
{
"key": "unknown",
"doc_count": 112,
"gender_a": {
"value": 462
}
},
{
"key": "male",
"doc_count": 107,
"gender_a": {
"value": 438
}
},
{
"key": "female",
"doc_count": 36,
"gender_a": {
"value": 186
}
}
]
}
i tried this query:
{
"aggs": {
"gender_name": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_sum": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
but something weird is going on, and i don't get the right values.
any idea what i am missing ?
You will probably need to make sure that your "gender" property has type "nested". With that, I was able to make the following do what I think you're asking.
First I set up a simple index:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"gender": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"value": {
"type": "long"
}
}
}
}
}
}
}
Then added a couple of docs:
PUT /test_index/doc/1
{
"gender": [
{
"name": "unknown",
"value": 12
},
{
"name": "male",
"value": 89
},
{
"name": "female",
"value": 84
}
]
}
PUT /test_index/doc/2
{
"gender": [
{
"name": "male",
"value": 8
},
{
"name": "female",
"value": 4
}
]
}
Then I was able to get total counts by gender name as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"genders": {
"nested": {
"path": "gender"
},
"aggs": {
"gender_terms": {
"terms": {
"field": "gender.name"
},
"aggs": {
"gender_name_value_sums": {
"sum": {
"field": "gender.value"
}
}
}
}
}
}
}
}
...
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"genders": {
"doc_count": 5,
"gender_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "female",
"doc_count": 2,
"gender_name_value_sums": {
"value": 88,
"value_as_string": "88.0"
}
},
{
"key": "male",
"doc_count": 2,
"gender_name_value_sums": {
"value": 97,
"value_as_string": "97.0"
}
},
{
"key": "unknown",
"doc_count": 1,
"gender_name_value_sums": {
"value": 12,
"value_as_string": "12.0"
}
}
]
}
}
}
}
Here is the code I used to test it:
http://sense.qbox.io/gist/d4533215806b858aa2cc1565546d167fdec3c973

Resources