Get all documents having a unique value with elastic - elasticsearch

for example:
i have many documents like this:
email status
1#123.com open
1#123.com click
2#123.com open
3#123.com open
I want to query all documents whose only status value is "open". Because the record "1#123.com" also has a "click" status, "1#123.com" should not be returned.
I tried the query below, but it does not give the result I expect:
{
"aggs": {
"hard_bounce_count": {
"filter": {
"term": {
"actionStatus": "open"
}
},
"aggs": {
"email_count": {
"value_count": {
"field": "email"
}
}
}
My expected response looks like this:
2#123.com open
3#123.com open
How can I do this? Thanks.

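Here, the outer terms aggregation (named EMAIL_LIST) creates one bucket per email (note that a terms aggregation returns only the top `size` buckets, 10 by default, so raise `size` if you have more emails). Within each email bucket, a filter aggregation named OPEN counts the documents whose status is "open", and another filter aggregation named OTHER_THAN_OPEN counts the documents whose status is anything other than "open".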
{
"size": 0,
"aggs": {
"EMAIL_LIST": {
"terms": {
"field": "email.keyword"
},
"aggs": {
"OPEN": {
"filter": {
"bool": {
"must": [
{
"term": {
"status": "open"
}
}
]
}
}
},
"OTHER_THAN_OPEN": {
"filter": {
"bool": {
"must_not": [
{
"term": {
"status": "open"
}
}
]
}
}
},
"SELECTION_SCRIPT": {
"bucket_selector": {
"buckets_path": {
"open_count": "OPEN._count",
"other_than_open_count": "OTHER_THAN_OPEN._count"
},
"script": "params.other_than_open_count==0 && params.open_count>0"
}
}
}
}
}
}
The "bucket_selector" aggregation above keeps only those buckets that have only the "open" status, so the output is:
"aggregations": {
"EMAIL_LIST": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "2#123.com",
"doc_count": 1,
"OTHER_THAN_OPEN": {
"doc_count": 0
},
"OPEN": {
"doc_count": 1
}
},
{
"key": "3#123.com",
"doc_count": 1,
"OTHER_THAN_OPEN": {
"doc_count": 0
},
"OPEN": {
"doc_count": 1
}
}
]
}
}
So the final answer will be the emails "2#123.com" and "3#123.com".

I can query this too.
{
"aggs": {
"email": {
"terms": {
"field": "email"
},
"aggs": {
"status_group": {
"terms": {
"field": "status"
}
}
}
}
}
}
response:
"aggregations": {
"email": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "1#123.com",
"doc_count": 2,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "click",
"doc_count": 1
}, {
"key": "open",
"doc_count": 1
}
]
}
}, {
"key": "2#123.com",
"doc_count": 1,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "open",
"doc_count": 1
}
]
}
}, {
"key": "3#123.com",
"doc_count": 1,
"status_group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "open",
"doc_count": 1
}
]
}
}
]
}
}
But how can I exclude "1#123.com" from the resulting buckets? I eventually need the statistics of all eligible documents.

Related

ElasticSearch filtering results according buckets length ( number of unique keys in a bucket)

I want to write down an elastic aggregation which only returns a key only if its inner bucket's length is greater than 1.
"aggs": {
"product_definitions": {
"terms": {
"field": "definition_name",
"size": 200,
"exclude": "NO_MATCH",
"min_doc_count": 5
},
"aggs": {
"product_instances": {
"terms": {
"field": "data_source_name",
"size": 100
}
}
}
}
}
This is my aggregation, and it returns:
"aggregations": {
"product_definitions": {
"doc_count_error_upper_bound": 10,
"sum_other_doc_count": 29281,
"buckets": [
{
"key": "DANA ANTRİKOT KG",
"doc_count": 13,
"product_instances": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "SariyerMarketCom",
"doc_count": 13
}
]
}
},
{
"key": "Keskinoğlu Piliç Salam 700G",
"doc_count": 10,
"product_instances": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "HappyCenterComTr",
"doc_count": 9
},
{
"key": "SanalMarketComTr",
"doc_count": 1
}
]
}
},
{
"key": "Doğuş Filiz Çayı 1000 G",
"doc_count": 9,
"product_instances": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "HappyCenterComTr",
"doc_count": 7
},
{
"key": "SanalMarketComTr",
"doc_count": 2
}
]
}
}
]
}
}
I want keys in product_definitions only if their product_instances buckets contain at least two keys. In this example it should return only the 2nd and 3rd keys and not the 1st, because the bucket of the 1st key contains only one key, which is
"buckets": [
{
"key": "SariyerMarketCom",
"doc_count": 13
}
]
You can leverage the bucket_selector pipeline aggregation to achieve that, like this:
"aggs": {
"product_definitions": {
"terms": {
"field": "definition_name",
"size": 200,
"exclude": "NO_MATCH",
"min_doc_count": 5
},
"aggs": {
"product_instances": {
"terms": {
"field": "data_source_name",
"size": 100
}
},
"minimum_2": {
"bucket_selector": {
"buckets_path": {
"count": "product_instances._bucket_count"
},
"script": "params.count >= 2"
}
}
}
}
}

Buckets size filter in Elasticsearch

Here is my query result
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 502,
"max_score": 0,
"hits": []
},
"aggregations": {
"HIGH_RISK_USERS": {
"doc_count": 1004,
"USERS_COUNT": {
"doc_count_error_upper_bound": 5,
"sum_other_doc_count": 437,
"buckets": [
{
"key": "49",
"doc_count": 502,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key": "02122219455#53.205.223.157",
"doc_count": 44,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 42
},
{
"key": "CallFrom",
"doc_count": 2
}
]
}
},
{
"key": "+02129916178#53.205.223.157",
"doc_count": 2,
"NAME": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "caller",
"doc_count": 2
}
]
}
}
]
}
}
}
}
Here is my query
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "x_nova_extensions.entities",
"query": {
"bool": {
"filter": [
{
"match": {
"x_nova_extensions.entities.text": "49"
}
},
{
"terms": {
"x_nova_extensions.entities.type": [
"sourceCountryCode",
"CallerIPCountryCode",
"CallerIPCountryName",
"CallerIPCountryCode",
"CallerPhoneCountryName"
]
}
}
]
}
}
}
}
]
}
},
"aggs": {
"HIGH_RISK_USERS": {
"nested": {
"path": "x_nova_extensions.entities"
},
"aggs": {
"USERS_COUNT": {
"terms": {
"field": "x_nova_extensions.entities.text",
"size": 10,
"order": {
"_count": "desc"
}
},
"aggs": {
"NAME": {
"terms": {
"field": "x_nova_extensions.entities.type",
"include": [
"caller",
"callee",
"CallFrom",
"CallTo"
]
}
}
}
}
}
}
}
}
I want my query to return only the buckets whose inner NAME buckets array is not empty (buckets[].size > 0).
I searched on the internet and couldn't find any specific keyword or anything else for this. I am not even sure whether Elasticsearch supports this, and I want to be sure that it does.
Is there a keyword for this, or how else can I handle it?
Thanks
I think what you are looking for is a pipeline aggregation (bucket_selector).
That way, you can access the inner bucket count and filter the result accordingly.
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"nameCount": "NAME._bucket_count"
},
"script": {
"source": "params.nameCount != 0"
}
}
}
}
}
But please pay attention to your Elasticsearch version; the way this is applied can differ between versions.

Elasticsearch: filter aggregation using bucket value

Not sure how to formulate the question.
I'm using Elasticsearch 2.2.
Let's start with an example of the dataset, made of 4 documents:
[
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": {},
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "a" },
"coverage_entity": { "uuid": "b" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 1
}
},
{
"header": {
"called_entity": { "uuid": "b" },
"coverage_entity": { "uuid": "a" },
"sucessful_transfers": 0
}
}
]
called_entity always has a uuid.
coverage_entity can be empty, or have a uuid.
I use a script to aggregate on either called_entity.uuid or coverage_entity.uuid:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
}
}
}
}
}
So now, the aggregation has generated terms from either header.called_entity.uuid, or header.coverage_entity.uuid.
How can I filter my aggregation using the value of the aggregation key? For example, if I want to count, for each bucket, how many documents have their uuid taken from header.called_entity.uuid only. Something like that:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script" : "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.successful_transfers"
}
},
"from_called_entity": {
"filter": {
"term": { "header.called_entity.uuid": BUCKET_KEY }
}
}
}
}
}
}
Not sure this is possible. The key itself is only available as a sorting option.
Could you use something like this:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"dim1": {
"terms": {
"script": "return doc['header.called_entity.uuid'] + doc['header.coverage_entity.uuid']",
"size": 10
},
"aggs": {
"successful_transfers": {
"sum": {
"field": "header.sucessful_transfers"
}
}
}
},
"called_entity_source": {
"terms": {
"field": "header.called_entity.uuid",
"size": 10
}
},
"coverage_entity_source": {
"terms": {
"field": "header.coverage_entity.uuid",
"size": 10
}
}
}
}
And the output will be something like this:
"called_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 2
}
]
},
"coverage_entity_source": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 2
},
{
"key": "b",
"doc_count": 1
}
]
},
"dim1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 4,
"successful_transfers": {
"value": 3
}
},
{
"key": "b",
"doc_count": 3,
"successful_transfers": {
"value": 2
}
}
]
}
If you really need the JSON in that specific shape, add a final step in your application that post-processes the result a bit. The result above does contain the info you need, but the keys from coverage_entity_source and called_entity_source are not nested under the dim1 aggregation.

elasticsearch aggregations sort on buckets keys

How do I sort Elasticsearch aggregation buckets on keys? I have nested aggregations and want to sort on the bucket results of my 2nd aggregation.
Like I have:
"result": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 20309,
"doc_count": 752,
"Events": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "impression",
"doc_count": 30
},
{
"key": "page_view",
"doc_count": 10
},
...
]
}
},
{
"key": 20771,
"doc_count": 46,
"Events": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "impression",
"doc_count": 32
},
{
"key": "page_view",
"doc_count": 9
},
...
]
}
},
I want the outer buckets to be sorted in desc/asc order by the doc count of the impression key or of the page_view key in my Events aggregation.
How do I achieve such results set?
Here is my query
GET someindex/useractivity/_search?search_type=count
{
"size": 1000000,
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"range": {
"created_on": {
"from": "2015-01-12",
"to": "2016-05-12"
}
}
},
{
"term": {
"group_id": 1
}
}
]
}
}
}
},
"aggs": {
"result": {
"terms": {
"field": "entity_id",
"size": 1000000
},
"aggs": {
"Events": {
"terms": {
"field": "event_type",
"min_doc_count": 0,
"size": 10
}
}
}
}
}
}
I have tried using _key, but it sorts within the bucket. I want to sort across all buckets. For example, I have a key impression, and I want my result buckets to be sorted by this key's count, not sorted within each bucket.
For example, if I sort on impression in descending order, my result should be
"buckets": [
{
"key": 20771,
"doc_count": 46,
"Events": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "impression",
"doc_count": 32
},
{
"key": "page_view",
"doc_count": 9
},
...
]
}
},
{
"key": 20309,
"doc_count": 752,
"Events": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "impression",
"doc_count": 30
},
{
"key": "page_view",
"doc_count": 10
},
...
]
}
},
i.e. the bucket with the maximum impression count should be on top (order the buckets by impression in descending order).
Try this aggregation:
{
"size": 0,
"aggs": {
"result": {
"terms": {
"field": "entity_id",
"size": 10,
"order": {
"impression_Events": "desc"
}
},
"aggs": {
"Events": {
"terms": {
"field": "event_type",
"min_doc_count": 0,
"size": 10
}
},
"impression_Events": {
"filter": {
"term": {
"event_type": "impression"
}
}
}
}
}
}
}

elasticsearch sort aggregation on categorical value

In elasticsearch, I can aggregate and sort the aggregation on a second aggregation's numeric field.
e.g.
GET myindex/_search
{
"size":0,
"aggs": {
"a1": {
"terms": {
"field": "FIELD1",
"size":0,
"order": {"a2": "desc"}
},
"aggs":{
"a2":{
"sum":{
"field":"FIELD2"
}
}
}
}
}
}
However, I want to sort the aggregation on a categorical field value, i.e. let's say the value of FIELD2 is one of ("a", "b", "c") -- I want to sort a1 first by all documents with FIELD2: "a", then FIELD2: "b", then FIELD2: "c".
In my case, every FIELD1 has a unique FIELD2. So I really just want a way to sort the a1 results by FIELD2.
I am not sure what exactly you want, but I tried the following.
I created index with mapping
PUT your_index
{
"mappings": {
"your_type": {
"properties": {
"name": {
"type": "string"
},
"fruit" : {"type" : "string", "index": "not_analyzed"}
}
}
}
}
Then I indexed a few documents like this
PUT your_index/your_type/1
{
"name" : "federer",
"fruit" : "orange"
}
Then I sorted all players and their fruits with the following aggregation
{
"size": 0,
"aggs": {
"a1": {
"terms": {
"field": "name",
"order": {
"_term": "asc"
}
},
"aggs": {
"a2": {
"terms": {
"field": "fruit",
"order": {
"_term": "asc"
}
}
}
}
}
}
}
The result I got is
"aggregations": {
"a1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "federer",
"doc_count": 3,
"a2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "green apple",
"doc_count": 1
},
{
"key": "orange",
"doc_count": 2
}
]
}
},
{
"key": "messi",
"doc_count": 2,
"a2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "apple",
"doc_count": 1
},
{
"key": "banana",
"doc_count": 1
}
]
}
},
{
"key": "nadal",
"doc_count": 2,
"a2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "blueberry",
"doc_count": 1
},
{
"key": "watermelon",
"doc_count": 1
}
]
}
},
{
"key": "ronaldo",
"doc_count": 2,
"a2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "banana",
"doc_count": 1
},
{
"key": "watermelon",
"doc_count": 1
}
]
}
}
]
}
}
Make sure your FIELD2 is not_analyzed or you will get unexpected results.
Does this help?
I found a way that works. You must first aggregate on FIELD2, then on FIELD1.
{
"size": 0,
"aggs": {
"a2": {
"terms": {
"size": 0,
"field": "FIELD2",
"order": {
"_term": "asc"
}
},
"aggs": {
"a1": {
"terms": {
"size": 0,
"field": "FIELD1",
"order": {
"_term": "asc"
}
}
}
}
}
}
}

Resources