Elasticsearch Aggregation: How to Sort Bucket Order - elasticsearch

ES Version: 1.5 (Amazon Elasticsearch)
My goal: Have search results with deduplication on a certain field. I am currently doing some research with aggregation that deals with the deduplication. So, my result is a list of buckets, each containing a single hit. However, I can't find a way to order the list of buckets.
Current query:
curl -XGET "http://localhost:9200/myidx/product/_search?search_type=count" -d '{
"size": 2,
"query": {
"function_score": {
"field_value_factor": {
"field": "relevance",
"factor": 2.0
},
"query": { "term": { "title": "abcd" } },
"score_mode": "multiply",
"boost_mode": "multiply"
}
},
"aggs": {
"unique": {
"terms": {
"field": "groupid",
"size": 2
},
"aggs": {
"sample": {
"top_hits": {
"size": 1
}
}
}
}
}
}'
Result:
{ ...
"aggregations": {
"unique": {
"doc_count_error_upper_bound": 1,
"sum_other_doc_count": 39,
"buckets": [
{
"key": 717878424,
"doc_count": 14,
"sample": {
"hits": {
"total": 14,
"max_score": 45.856163,
"hits": [
{
"_index": "myidx",
"_type": "product",
"_id": "89531",
"_score": 45.856163,
"_source": { ... }
}
]
}
}
},
{
"key": 717878423,
"doc_count": 8,
"sample": {
"hits": {
"total": 8,
"max_score": 68.78424,
"hits": [
{
"_index": "myidx",
"_type": "product",
"_id": "89517",
"_score": 68.78424,
"_source": { ... }
}
]
}
}
}
]
}
}
}
I would like to see the second bucket with the max_score=68.78424 as the first. Is this possible?
If aggregations is not a recommended solution, please tell.

Yes, you can do it by adding another sub-aggregation on the max document score and sorting the unique terms aggregation by that score.
curl -XGET "http://localhost:9200/myidx/product/_search?search_type=count" -d '{
"size": 2,
"query": {
"function_score": {
"field_value_factor": {
"field": "relevance",
"factor": 2.0
},
"query": { "term": { "title": "abcd" } },
"score_mode": "multiply",
"boost_mode": "multiply"
}
},
"aggs": {
"unique": {
"terms": {
"field": "groupid",
"size": 2,
"order": {
"max_score": "desc"
}
},
"aggs": {
"max_score": {
"max": {
"script": "doc.score"
}
},
"sample": {
"top_hits": {
"size": 1
}
}
}
}
}
}'

Related

Elasticsearch top_hits aggregation result and doc_count are different

Query
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"field": "holderInfo.raw",
"size": 50
},
"aggregations": {
"tops": {
"top_hits": {
"_source": {
"includes": ["uid"]
}
}
}
}
}
}
}
Result
{
...
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"498": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "MATSUSHITA ELECTRIC INDUSTRIAL",
"doc_count": 5,
"tops": {
"hits": {
"total": 5,
"max_score": 1,
"hits": [
{
"_index": "someindex",
"_id": "03a3",
"_score": 1,
"_source": {
"uid": "03a3"
}
},
{
"_index": "someindex",
"_id": "08a2",
"_score": 1,
"_source": {
"uid": "08a2"
}
},
{
"_index": "someindex",
"_id": "84a1",
"_score": 1,
"_source": {
"uid": "84a1"
}
}
]
}
}
}
]
}
}
}
"08a2", "08a3", "03a2", "03a3" and "84a1" each clearly have 'MATSUSHITA ELECTRIC INDUSTRIAL' in the holderInfo.raw field.
Therefore, there are 5 cases in the doc_count, but only "03a3", "08a2", and "84a1" are output in the top_hits results, and "08a3" and "03a2" are omitted.
Query
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"script": {
"inline": "doc['holderInfo.raw'].value"
},
"size": 50
}
}
}
}
Result
{
...
"hits": {
"total": 5,
"max_score": 0,
"hits": []
},
"aggregations": {
"498": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "MATSUSHITA ELECTRIC INDUSTRIAL",
"doc_count": 3
}
]
}
}
}
In addition, two cases are omitted when aggregating with script.
I'd like to know why some uids are missing.
I'm in a situation where I have to use the elasticsearch version 2.2. I want to know if it's an elasticsearch bug that occurs in an old version or a user's fault.
Thanks!
By default, the top_hits aggregation returns the first 3 top hits. You just need to increase the size parameter:
GET /someindex/_search
{
"size": 0,
"query": {
"ids": {
"types": [],
"values": ["08a2","08a3","03a2","03a3","84a1"]
}
},
"aggregations": {
"498": {
"terms": {
"field": "holderInfo.raw",
"size": 50
},
"aggregations": {
"tops": {
"top_hits": {
"size": 5, <---- add this
"_source": {
"includes": ["uid"]
}
}
}
}
}
}
}

I want to show Top 10 records and apply filter for specific fields in Elastic search

This is the query to get the Top 10 records. There is a Field name Answer inside this we have a record "UNHANDLED". I want to exclude the UNHANDLED inside the Answer field.
How to write the query to get both Top 10 and Exclude UNHANDLED
GET /logstash-sdc-mongo-abcsearch/_search?size=0
{
"aggs": {
"top_tags": {
"terms": {
"field": "question.keyword"
},
"aggs": {
"top_faq_hits": {
"top_hits": {
"_source": {
"includes": [
"answer"
]
},
"size": 1
}
}
}
}
}
}
You can use the must_not clause to exclude the documents that contain UNHANDLED in the answer field. Try out the below query -
Index Mapping:
{
"mappings": {
"properties": {
"question": {
"type": "keyword"
},
"answer": {
"type": "keyword"
}
}
}
}
Index Data:
{
"question": "a",
"answer": "b"
}
{
"question": "c",
"answer": "UNHANDLED"
}
Search Query:
{
"query": {
"bool": {
"must_not": {
"term": {
"answer": "UNHANDLED"
}
}
}
},
"aggs": {
"top_tags": {
"terms": {
"field": "question"
},
"aggs": {
"top_faq_hits": {
"top_hits": {
"_source": {
"includes": [
"answer"
]
},
"size": 1
}
}
}
}
}
}
Search Result:
"aggregations": {
"top_tags": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "a",
"doc_count": 1,
"top_faq_hits": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.0,
"hits": [
{
"_index": "65563925",
"_type": "_doc",
"_id": "1",
"_score": 0.0,
"_source": {
"answer": "b"
}
}
]
}
}
}
]
}
}
Update 1:
Based on the comments below, try out the below query:
{
"query": {
"bool": {
"must_not": {
"term": {
"answer": "UNHANDLED"
}
},
"must": {
"term": {
"source": "sonax"
}
}
}
},
"aggs": {
"top_tags": {
"terms": {
"field": "question"
},
"aggs": {
"top_faq_hits": {
"top_hits": {
"_source": {
"includes": [
"answer"
]
},
"size": 1
}
}
}
}
}
}

Filtering documents after aggregation

In Elasticsearch, I am storing item state snapshots in an append-only scheme.
For example:
POST /item/item
{
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
POST /item/item
{
"id": "2",
"time": "2018-09-19T00:01:00Z",
"status": "ON_HOLD"
}
POST /item/item
{
"id": "2",
"time": "2018-09-19T00:02:00Z",
"status": "DONE"
}
Now, what I wish to achieve is answer the following question: what items are still on hold? (status==ON_HOLD).
In this simple case, the answer would be:
{
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
So, in order to get the last state of an item, I use a terms aggregation, on id, like so:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 10
},
"aggs": {
"top_items": {
"top_hits": {
"size": 1,
"sort": [
{
"time": {
"order": "desc"
}
}
],
"_source": {
"includes": ["*"]
}
}
}
}
}
}
}
This gives me the last available state of each item identified by its id:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "2",
"doc_count": 2,
"top_items": {
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "S-5eCGYBNyILygyml2jR",
"_score": null,
"_source": {
"id": "2",
"time": "2018-09-19T00:02:00Z",
"status": "DONE"
},
"sort": [
1537315320000
]
}
]
}
}
},
{
"key": "1",
"doc_count": 1,
"top_items": {
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "Se5eCGYBNyILygymjmg0",
"_score": null,
"_source": {
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
},
"sort": [
1537315200000
]
}
]
}
}
}
]
}
}
}
Now the problem is I would like to filter the result (after aggregation) on Elasticsearch's side (not client).
I tried a bucket_selector aggregation but it complains since the top_hits result is not a number or single value numeric aggregation.
I also tried to add a script_field to get a numeric value but cannot seem to use this after:
"script_fields": {
"on_hold": {
"script": {
"lang": "painless",
"source": "doc['status.keyword'].value == 'ON_HOLD' ? 1 : 0"
}
}
}
Is what I want to do even possible on Elasticsearch's side or do I have to do it on the client side?
PS: adding the filter before the aggregation does not provide correct result as it will return items who have been ON_HOLD at any point in time.
EDIT:
Alright I am getting somewhere with:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 50
},
"aggs": {
"top_item": {
"terms": {
"size": 1,
"field": "time",
"order": {
"_key": "desc"
}
},
"aggs": {
"on_hold": {
"filter": {
"term": {
"status.keyword": "ON_HOLD"
}
},
"aggs": {
"document": {
"top_hits": {
"size": 1,
"_source": ["*"]
}
}
}
}
}
}
}
}
}
}
The top_hits aggregation is a metrics and not a bucket aggregation, so it does not do the job and must be used last.
One last problem though: filtered out buckets leave empty leaves:
"hits": []
Is there any way to remove such branches ending in empty leaves from the result tree? Thanks
Alright, I found the complete solution to the problem, including filtering out empty branches in the aggregation tree:
GET /item/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"id": {
"terms": {
"field": "id.keyword",
"size": 50
},
"aggs": {
"top_item": {
"terms": {
"size": 1,
"field": "time",
"order": {
"_key": "desc"
}
},
"aggs": {
"on_hold": {
"filter": {
"term": {
"status.keyword": "ON_HOLD"
}
},
"aggs": {
"document": {
"top_hits": {
"size": 1,
"_source": ["*"]
}
}
}
},
"remove_filtered": {
"bucket_selector": {
"buckets_path": {
"count": "on_hold._count"
},
"script": {
"source": "params.count != 0"
}
}
}
}
},
"remove_empty": {
"bucket_selector": {
"buckets_path": {
"count": "top_item._bucket_count"
},
"script": "params.count != 0"
}
}
}
}
}
}
This gives the following output which was expected:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1",
"doc_count": 1,
"top_item": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1537315200000,
"key_as_string": "2018-09-19T00:00:00.000Z",
"doc_count": 1,
"on_hold": {
"doc_count": 1,
"document": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "item",
"_type": "item",
"_id": "HvywM2YB5Ei0wOZMeia9",
"_score": 1,
"_source": {
"id": "1",
"time": "2018-09-19T00:00:00Z",
"status": "ON_HOLD"
}
}
]
}
}
}
}
]
}
}
]
}
}
}

How to sort bucket result based on viewed_timestamp in ElasticSearch?

I am new to Elastic Search. I want to find the top 10 unique recent visited doc_id.
I have done first aggregation on doc_id and added sub-aggregation to sort each group and get a single result. Now I want to sort this bucket.
I am not able to sort the bucket's result based on viewed_timestamp. How can I add order during first aggregation?
I have tried other solutions given on stack overflow, but it is not working for me. Can anyone help me to solve this problem?
Query
{
"query": {
"constant_score": {
"filter": {
"term": { "username": "nil#gmail.com" }
}
}
},
"size":0,
"aggs":{
"title": {
"terms": {
"field": "doc_id",
"size":0
}
,
"aggs": {
"top": {
"top_hits": {
"sort": [
{
"viewed_timestamp": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
Bucket result:
{
"aggregations": {
"title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "b003",
"doc_count": 3,
"top_tag_hits": {
"hits": {
"total": 3,
"max_score": null,
"hits": [{
"_index": "visitedData",
"_type": "userdoc",
"_id": "AVak51Sp",
"_score": null,
"_source": {
"viewed_timestamp": "20160819T152359",
"content_type": "bp",
"title": "Data print",
"doc_id": "BP003"
},
"sort": [
1471620239000
]
}]
}
}
}, {
"key": "bp004",
"doc_count": 3,
"top_tag_hits": {
"hits": {
"total": 3,
"max_score": null,
"hits": [{
"_index": "visitedData",
"_type": "userdoc",
"_id": "AVak513Y8G",
"_score": null,
"_source": {
"viewed_timestamp": "20160819T152401",
"content_type": "bp",
"title": "Application Print",
"doc_id": "BP004"
},
"sort": [
1471620241000
]
}]
}
}
}]
}
}
}
It is because your viewed_timestamp type is not date, it is a timestamp string. You should change this field to date format, such as:
"updateTime": "2017-01-12T21:28:49.562065"
If you're only trying to order by the timestamp, you could try using a max aggregation, like in this example:
https://www.elastic.co/guide/en/elasticsearch/reference/5.6/search-aggregations-metrics-top-hits-aggregation.html#_field_collapse_example

How do I perform an "OR" filter on an aggregate?

I am trying to grab the first 10 documents grouped by domain. These 10 documents need to have a value for "crawl_date" that haven't been crawled for a while or haven't been crawled at all (eg a blank value). I have:
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '
{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"or":[
"term": {"crawl_date": ""},
"term": {"crawl_date": ""} // how do I put a range here? e.g. <= '2014-12-31'
]
},
"terms": {
"field": "domain"
}
}
}
}'
I am new to ES and using version 2.2. Since the documentation isn't fully updated I am struggling.
EDIT:
To clarify, I need 10 urls that haven't been crawled or haven't been crawled for a while. Each of those 10 urls has to come from a unique domain so that when I crawl them I don't overload someone's server.
Another Edit:
So, I need something like this (1 link for each of 10 unique domains):
1. www.domain1.com/page
2. www.domain2.com/url
etc...
Instead, I am getting just the domain and the number of pages:
"buckets": [
{
"key": "http://www.dailymail.co.uk",
"doc_count": 212
},
{
"key": "https://sedo.com",
"doc_count": 196
},
{
"key": "http://www.foxnews.com",
"doc_count": 118
},
{
"key": "http://data.worldbank.org",
"doc_count": 117
},
{
"key": "http://detail.1688.com",
"doc_count": 117
},
{
"key": "https://twitter.com",
"doc_count": 112
},
{
"key": "http://search.rakuten.co.jp",
"doc_count": 104
},
{
"key": "https://in.1688.com",
"doc_count": 92
},
{
"key": "http://www.abc.net.au",
"doc_count": 87
},
{
"key": "http://sport.lemonde.fr",
"doc_count": 85
}
]
The "hits" returns multiple pages for just 1 domain:
"hits": [
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/w/at-the-edge-of-the-orchard-tracy-chevalier/1121908441?ean=9780525953005",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
{
"_index": "tester",
"_type": "test",
"_id": "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb",
"_score": 1,
"_source": {
"domain": "http://www.barnesandnoble.com",
"crawl_date": "0001-01-01T00:00:00Z"
}
},
etc....
Barnes and Noble will quickly ban my UA if I try to crawl that many domains at the same time.
I need something like this:
1. "http://www.dailymail.co.uk/page/text.html",
2. "https://sedo.com/another/page"
3. "http://www.barnesandnoble.com/b/bargain-books/_/N-8qb"
4. "http://www.starbucks.com/homepage/"
etc.
Using Aggregations
If you want to use aggregations, I'd suggest using the terms aggregations to remove the duplicates from your result set and as sub aggregation, I'd use the top_hits aggregation, which gives you the best hit from the aggregated documents of each domain (per default the score for each document within a domain should be the same.)
Consequently the query will look like that:
POST sites/page/_search
{
"size": 0,
"aggs": {
"filtered_domains": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "domain",
"size": 10
},
"aggs": {
"pages": {
"top_hits": {
"size": 1
}
}
}
}
}
}
}
}
Giving you results like that
"aggregations": {
"filtered_domains": {
"doc_count": 3,
"domains": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "barnesandnoble.com",
"doc_count": 2,
"pages": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "barnesandnoble.com/test2.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "barnesandnoble.com"
}
}
]
}
}
},
{
"key": "starbucks.com",
"doc_count": 1,
"pages": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1,
"_source": {
"crawl_date": "1982-05-16",
"domain": "starbucks.com"
}
}
]
}
}
}
]
}
}
Using Parent/Child Aggregations
If you can change your index structure, I'd suggest to create an index with either parent/child relationship or nested documents.
If you do so, you can select 10 distinct domains and retrieve one (or more) specific pages of this url.
Let me show you an example with parent/child (if you use sense, you should be able to just copy paste):
First generate the mappings for the documents:
PUT /sites
{
"mappings": {
"domain": {},
"page": {
"_parent": {
"type": "domain"
},
"properties": {
"crawl_date": {
"type": "date"
}
}
}
}
}
Insert some documents
PUT sites/domain/barnesandnoble.com
{}
PUT sites/domain/starbucks.com
{}
PUT sites/domain/dailymail.co.uk
{}
POST /sites/page/_bulk
{ "index": { "_id": "barnesandnoble.com/test.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "barnesandnoble.com/test2.html", "parent": "barnesandnoble.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "starbucks.com/index.html", "parent": "starbucks.com" }}
{ "crawl_date": "1982-05-16" }
{ "index": { "_id": "dailymail.co.uk/index.html", "parent": "dailymail.co.uk" }}
{}
Search for the urls to crawl
POST /sites/domain/_search
{
"query": {
"has_child": {
"type": "page",
"query": {
"bool": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2016-01-01"
}
}
}]
}
}
}
},
"inner_hits": {
"size": 1
}
}
}
}
We do a has_child query on the parent type and therefore receive only distinct urls of the parent type. To get the specific pages, we have to add an inner_hits query, which gives us the child documents leading to the hits in the parent type.
If you set inner_hits size to 1, you get only one page per domain.
You can even add a sorting in the inner_hits query... For example, you can sort by the crawl_date. ;)
The above search gives you the following result:
"hits": [
{
"_index": "sites",
"_type": "domain",
"_id": "starbucks.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "starbucks.com/index.html",
"_score": 1.9664046,
"_routing": "starbucks.com",
"_parent": "starbucks.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "dailymail.co.uk",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 1,
"max_score": 1.9664046,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "dailymail.co.uk/index.html",
"_score": 1.9664046,
"_routing": "dailymail.co.uk",
"_parent": "dailymail.co.uk",
"_source": {}
}
]
}
}
}
},
{
"_index": "sites",
"_type": "domain",
"_id": "barnesandnoble.com",
"_score": 1,
"_source": {},
"inner_hits": {
"page": {
"hits": {
"total": 2,
"max_score": 1.4142135,
"hits": [
{
"_index": "sites",
"_type": "page",
"_id": "barnesandnoble.com/test.html",
"_score": 1.4142135,
"_routing": "barnesandnoble.com",
"_parent": "barnesandnoble.com",
"_source": {
"crawl_date": "1982-05-16"
}
}
]
}
}
}
}
]
Finally, let me note one thing. Parent/child relationship comes with small costs at query time. If this isn't a problem for your use case, I'd go for this solution.
I suggest you use the exists filter instead of trying to match an empty term (the missing filter is deprecated in 2.2). Then, the range filter will help you filter out the documents you don't need.
Finally, since you have used the absolute URL as id, make sure to aggregate on the _uid field and not the domain field, that way you'll get unique counts per exact page.
curl -XPOST 'http://localhost:9200/tester/test/_search' -d '{
"size": 10,
"aggs": {
"group_by_domain": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": {
"exists": {
"field": "crawl_date"
}
}
}
},
{
"range": {
"crawl_date": {
"lte": "2014-12-31T00:00:00.000"
}
}
}
]
}
},
"aggs": {
"domains": {
"terms": {
"field": "_uid"
}
}
}
}
}
}'
You have to use Filter Aggregation and then sub-aggregation
{
"size": 10,
"aggs": {
"filter_date": {
"filter": {
"bool": {
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "crawl_date"
}
}
]
}
},
{
"range": {
"crawl_date": {
"from": "now-100d"
}
}
}
]
}
},
"aggs": {
"group_by_domain": {
"terms": {
"field": "domain"
}
}
}
}
}
}

Resources