Elasticsearch complex aggregation on match results - elasticsearch

I'm trying to get the top 5 teams based on win rate (matches won / all matches) having the match result documents in the index (see below).
The fields I need to get in the bucket:
apiId
name
winrate (won/total)
I guess this would require complex aggregation calculation witch is far beyond my current elasticsearch skills.
Elasticsearch version: 7.15.2
Could anyone help me with such elasticsearch query?
Thanks a lot in advance!
{
"lastModified": "2022-01-14T09:33:48.232Z",
"uuid": "01234567",
"started": "2022-01-14T09:31:27.651Z",
"editing": false,
"approved": true,
"statistics": {
"teams": [
{
"name": "Team1",
"score": 0,
"winner": false,
"apiId": "1"
},
{
"name": "Team2",
"score": 2,
"winner": true,
"apiId": "2"
}
]
}
}
and the mapping:
{
"mappings": {
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"approved": {
"type": "boolean"
},
"editing": {
"type": "boolean"
},
"ended": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"game": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"lastModified": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"started": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"statistics": {
"type": "nested",
"include_in_parent": true,
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"teams": {
"type": "nested",
"include_in_parent": true,
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"apiId": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"score": {
"type": "long"
},
"winner": {
"type": "boolean"
}
}
}
}
},
"uuid": {
"type": "keyword"
}
}
}
}
Edit:
Based on #ilvar answer, I constructed the query:
{
"query": {
"bool": {
"must": [
{
"term": {
"approved": true
}
}
]
}
},
"size": 0,
"aggs": {
"top_teams": {
"nested": {
"path": "statistics.teams"
},
"aggs": {
"the_all": {
"multi_terms": {
"terms": [
{
"field": "statistics.teams.name.keyword"
},
{
"field": "statistics.teams.apiId"
}
]
}
},
"the_won": {
"filter": {
"terms": {
"statistics.teams.winner": [
true
]
}
},
"aggs": {
"teams": {
"multi_terms": {
"terms": [
{
"field": "statistics.teams.name.keyword"
},
{
"field": "statistics.teams.apiId"
}
]
}
}
}
}
}
}
}
}
Which gives me:
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"top_teams": {
"doc_count": 10,
"the_all": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
"Team1",
2
],
"key_as_string": "Team1|2",
"doc_count": 2
},
{
"key": [
"Team2",
3
],
"key_as_string": "Team2|3",
"doc_count": 2
},
{
"key": [
"Team3",
5
],
"key_as_string": "Team3|5",
"doc_count": 2
},
{
"key": [
"Team4",
1
],
"key_as_string": "Team4|1",
"doc_count": 1
},
{
"key": [
"Team5",
4
],
"key_as_string": "Team5|4",
"doc_count": 1
},
{
"key": [
"Team6",
7
],
"key_as_string": "Team6|7",
"doc_count": 1
},
{
"key": [
"Team7",
6
],
"key_as_string": "Team7|6",
"doc_count": 1
}
]
},
"the_won": {
"doc_count": 4,
"teams": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
"Team2",
5
],
"key_as_string": "Team2|5",
"doc_count": 2
},
{
"key": [
"Team4",
2
],
"key_as_string": "Team4|2",
"doc_count": 1
},
{
"key": [
"Team7",
3
],
"key_as_string": "Team7|3",
"doc_count": 1
}
]
}
}
}
}
}
But I still cannot get the winrate from two siblings, where one sibling might have missing teams that have not won any match.
Should I use some pipeline aggregation?

The final solution I came out with is bellow. the sort pipeline aggregation did the final part of the job
{
"query": {
"bool": {
"must": [
{
"term": {
"approved": true
}
}
]
}
},
"size": 0,
"aggs": {
"top_teams": {
"nested": {
"path": "statistics.teams"
},
"aggs": {
"the_all": {
"terms": {
"field": "statistics.teams.apiId"
},
"aggs": {
"the_won_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
true
]
}
}
},
"the_lost_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
false
]
}
}
},
"the_all_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
true,
false
]
}
}
},
"the_winrate": {
"bucket_script": {
"buckets_path": {
"the_won_count": "the_won_by_team._count",
"the_all_count": "the_all_by_team._count"
},
"script": "params.the_won_count / params.the_all_count"
}
},
"the_sort": {
"bucket_sort": {
"sort": [
{
"the_winrate": "desc"
},
{
"the_all_by_team._count": "desc"
}
],
"size": 5
}
}
}
}
}
}
}
}

This is very similar to the example docs have for nested aggregation. The only difference would be that you'll have a terms aggregation on the top level instead of filter so you get back all of the teams.

Related

How to aggregate matched terms in a query_string search?

I wish to search wildcard terms in a nested list of dict and then obtain a list of terms and its uuid grouped by matched wildcard.
I've the following mapping in my index:
"mappings": {
"properties": {
"uuid": {
"type": "keyword"
},
"urls": {
"type": "nested",
"properties": {
"url": {
"type": "keyword"
},
"is_visited": {
"type": "boolean"
}
}
}
}
}
and a lot of data such this:
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd"
"urls": [
{
"is_visited": true,
"url": "https://www.google.com"
},
{
"is_visited": false,
"url": "https://www.facebook.com"
},
{
"is_visited": true,
"url": "https://www.twitter.com"
},
]
},
{
"uuid":"4a1c695d-756b-4d9d-b3a0-cf524d955884"
"urls": [
{
"is_visited": true,
"url": "https://www.stackoverflow.com"
},
{
"is_visited": false,
"url": "https://www.facebook.com"
},
{
"is_visited": false,
"url": "https://drive.google.com"
},
{
"is_visited": false,
"url": "https://maps.google.com"
},
]
}
...
I wish to search via wildcard "*google.com OR *twitter.com" and obtain something like this:
"hits": [
"*google.com": [
{
"uuid": "4a1c695d-756b-4d9d-b3a0-cf524d955884",
"_source": {
"is_visited": false,
"url": "https://drive.google.com"
}
},
{
"id": "4a1c695d-756b-4d9d-b3a0-cf524d955884",
"_source": {
"is_visited": false,
"url": "https://maps.google.com"
}
},
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd",
"_source": {
"is_visited": true,
"url": "https://www.google.com"
}
}
]
"*twitter.com": [
{
"uuid":"afa9ac03-0723-4d66-ae18-08a51e2973bd",
"_source": {
"is_visited": true,
"url": "https://www.twitter.com"
},
},
]
]
This is my (python) search query:
body = {
#"_source": False,
"size": 100,
"query": {
"nested": {
"path": "urls",
"query":{
"query_string":{
"query": f"urls.url:{urlToSearch}",
}
}
,"inner_hits": {
"size":100 # returns top 100 results
}
}
}
}
but it returns an hit for each matched term instead of aggregate them in a list similar to what I would like to get.
EDIT
This is my setting and mapping:
{
"settings": {
"analysis": {
"char_filter": {
"my_filter": {
"type": "mapping",
"mappings": [
"- => _",
]
},
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"char_filter": [
"my_filter"
],
"filter": [
"lowercase",
]
}
}
}
},
"mappings": {
"properties": {
"uuid": {
"type": "keyword"
},
"urls": {
"type": "nested",
"properties": {
"url": {
"type": "keyword"
},
"is_visited": {
"type": "boolean"
}
}
}
}
}
}
Elasticsearch will not provide the output you want the way you set up the query.
This scenario to be an aggregation. My suggestion was to apply the nested query and use aggregation on the results.
Attention point wildcard query:
Avoid beginning patterns with * or ?. This can increase the iterations
needed to find matching terms and slow search performance.
{
"size": 0,
"query": {
"nested": {
"path": "urls",
"query": {
"bool": {
"should": [
{
"wildcard": {
"urls.url": {
"value": "*google.com"
}
}
},
{
"wildcard": {
"urls.url": {
"value": "*twitter.com"
}
}
}
]
}
}
}
},
"aggs": {
"agg_providers": {
"nested": {
"path": "urls"
},
"aggs": {
"google.com": {
"terms": {
"field": "urls.url",
"include": ".*google.com",
"size": 10
}
},
"twitter.com": {
"terms": {
"field": "urls.url",
"include": ".*twitter.com",
"size": 10
}
}
}
}
}
}
Results:
"aggregations": {
"agg_providers": {
"doc_count": 7,
"twitter.com": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "https://www.twitter.com",
"doc_count": 1
}
]
},
"google.com": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "https://drive.google.com",
"doc_count": 1
},
{
"key": "https://maps.google.com",
"doc_count": 1
},
{
"key": "https://www.google.com",
"doc_count": 1
}
]
}
}
}

elastic search nested sub aggregations

We are using elastic search which holds records as documents with following definition
{
"loadtender": {
"aliases": {},
"mappings": {
"_doc": {
"_meta": {
"version": 20
},
"properties": {
"carrierId": {
"type": "long"
},
"destinationData": {
"type": "keyword"
},
"destinationZip": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 50
}
}
},
"effStartTime": {
"type": "date"
},
"endTime": {
"type": "date"
},
"id": {
"type": "long"
},
"mustRespondByTime": {
"type": "date"
},
"orgdiv": {
"type": "keyword"
},
"originData": {
"type": "keyword"
},
"originZip": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 50
}
}
},
"purchaseOrderNum": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 255
}
}
},
"startTime": {
"type": "date"
},
"tenderStatus": {
"type": "keyword"
},
"tenderedTime": {
"type": "date"
}
}
}
},
"settings": {
"index": {
"creation_date": "1655105542470",
"number_of_shards": "5",
"number_of_replicas": "1",
"uuid": "ohcXgA8EQ5iJj0X6_4BqXA",
"version": {
"created": "6080499"
},
"provided_name": "loadtender"
}
}
}
}
I am trying to search records to return me following filtered results
Input Parameter : startDate (yesterday), originData.originCity and originData.destinationCity
Output Required:
Three buckets for 0-30 days, 30-60 days and 60-90 days
buckets of distinct originData.city and destinationData.city combinations under each of the above
Under each of the above, buckets of data for each unique carrierId and the corresponding record list / count
Basically I was trying to achieve something like the below
{
"aggregations": {
"aggr": {
"buckets": [
{
"key": "0-30 days",
"doc_count": 10,
"aggr": {
"buckets": [
{
"key": "(originCity)Menasha, WI, US|Hanover, MD, US (DestinationCity)",
"aggr": {
"buckets": [
{
"key": "10183-carrierId",
"count": 10
}
]
}
}
]
}
},
{
"key": "30-60 days",
"doc_count": 11,
"aggr": {
"buckets": [
{
"key": "Dallas, TX, US|Houston, TX, US",
"aggr": {
"buckets": [
{
"key": "10183-carrierId",
"count": 10
},
{
"key": "10022-carrierId",
"count": 1
}
]
}
}
]
}
}
]
}
}
}
I've tried the following but I think I am not finding a way to filter it further using the sub aggregators.
{
"_source":["id", "effStartTime", "carrierId", "originData", "destinationData"],
"size": 100,
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"range": {
"startTime": {
"from": "2021-08-27T23:59:59.000Z",
"to": "2022-09-01T00:00:00.000Z",
"include_lower": true,
"include_upper": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"must_not": [
{
"term": {
"tenderStatus": {
"value": "REMOVED",
"boost": 1
}
}
}
],
"filter" : {
"exists" : {
"field" : "carrierId"
}
},
"adjust_pure_negative": true,
"boost": 1
}
},
"aggregations": {
"aggr": {
"terms": {
"script": "doc['originData'].values[0] + '|' + doc['destinationData'].values[0]"
}
}
}
}
I started beginning to think if this is even possible OR should I shift to issuing multiple queries for the same
I was able to achieve the same using the following sub-aggregations:
"aggregations": {
"aggr":{
"date_range": {
"field": "startTime",
"format": "MM-yyyy",
"ranges": [
{"to": "now-1M/M", "from": "now"}, --> now to 30 days back
{"to": "now-1M/M", "from": "now-2M/M"}, from 30 days back to 60 days back
{"to": "now-2M/M", "from": "now-3M/M"}, from 60 days back to 90 days back
{"to": "now-3M/M", "from": "now-12M/M"}
]
},
"aggregations": {
"aggr":{
"terms": {
"script": "doc['originData'].values[0] + '|' + doc['destinationData'].values[0]" --> concatenated origin and destination address as a key
},
"aggregations": {
"aggr": {
"terms": {
"field": "carrierId" --> nested carrier count
}
}
}
}
}
}
}
Following is the response template that I receive.
"aggregations": {
"aggr": {
"buckets": [
{
"key": "09-2021-06-2022",
"from": 1630454400000,
"from_as_string": "09-2021",
"to": 1654041600000,
"to_as_string": "06-2022",
"doc_count": 1,
"aggr": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Dallas, TX, US|Houston, TX, US",
"doc_count": 14,
"aggr": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 10022,
"doc_count": 14
}
]
}
}
]
}
}
]
}
}
Thank you to all of you for your efforts and time. Do let me know if you discover any better way.

ElasticSearch - indexing URLs and aggregating on Domains

Similar to this question Elasticsearch aggregate on URL hostname
I'm wanting to store URLs and be able to aggregate on Domains
Using ES6 I have the following mapping defined
PUT /test
{
"settings": {
"analysis": {
"filter": {
"url_domains": {
"type": "pattern_capture",
"preserve_original" : false,
"patterns": [
"https?:\/\/([^/]+)"
]
}
},
"analyzer": {
"url": {
"tokenizer": "uax_url_email",
"filter": [
"url_domains",
"lowercase",
"unique"
]
},
"full_url": {
"tokenizer": "uax_url_email",
"filter": ["lowercase", "stop"]
}
}
}
},
"mappings": {
"urls": {
"properties": {
"url": {
"type": "text",
"fielddata" : true,
"fields": {
"domain": {
"type": "text",
"fielddata" : true,
"analyzer": "url"
},
"full": {
"type": "text",
"fielddata" : true,
"analyzer": "full_url"
}
}
}
}
}
}
}
I then load in the following data
POST /test/urls/_bulk
{"index":{"_id":"1"}}
{"url": "http://gmail1.com/hello/world"}
{"index":{"_id":"2"}}
{"url": "https://gmail2.com/hello"}
{"index":{"_id":"3"}}
{"url": "http://gmail3.com/"}
{"index":{"_id":"4"}}
{"url": "https://gmail4.com"}
{"index":{"_id":"5"}}
{"url": "https://www.gmail5.com"}
{"index":{"_id":"6"}}
{"url": "www.gmail6.coom"}
{"index":{"_id":"7"}}
{"url": "www.gmail7.co.uk/hello"}
When I do the following query
GET /test/urls/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"by_domain": {
"terms": {
"field": "url.domain",
"size": 10
}
}
}
}
The following aggregations are returned:
"aggregations": {
"by_domain": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "coom",
"doc_count": 1
},
{
"key": "gmail1.com",
"doc_count": 1
},
{
"key": "gmail2.com",
"doc_count": 1
},
{
"key": "gmail3.com",
"doc_count": 1
},
{
"key": "gmail4.com",
"doc_count": 1
},
{
"key": "www.gmail5.com",
"doc_count": 1
},
{
"key": "www.gmail6",
"doc_count": 1
},
{
"key": "www.gmail7.co.uk/hello",
"doc_count": 1
}
]
}
}
From running the Analyzer I can see that documents 6 and 7 are not treated as URL's, but are of type ALPHANUM.
How can I get the domains of documents 6 and 7 correctly extracted ?
Bit of persistence paid off ! Changed the pattern to this:
"patterns": [
"https?:\/\/([^/]+)",
"(www[^/]+)"
]

Further filtering of aggregations

I have a question regarding aggregation in elastic search. I have a document like the following:
{
"_index": "products",
"_type": "product",
"_id": "ID-12345",
"_score": 1,
"_source": {
"created_at": "2017-08-04T17:56:44.592Z",
"updated_at": "2017-08-04T17:56:44.592Z",
"product_information": {
"sku": "12345",
"name": "Product Name",
"price": 25,
"brand": "Brand Name",
"url": "URL"
},
"product_detail": {
"description": "Product description text here.",
"string_facets": [
{
"facet_name": "Colour",
"facet_value": "Grey"
},
{
"facet_name": "Category",
"facet_value": "Linen"
},
{
"facet_name": "Category",
"facet_value": "Throws & Blanket"
},
{
"facet_name": "Keyword",
"facet_value": "Contemporary"
},
{
"facet_name": "Keyword",
"facet_value": "Sophisticated"
}
]
}
}
}
I am storing product information such as Colour, Material, Category and Keywords within the product_detail.string_facets field. I'd like to use this for aggregation to get Colour/Material/Category/Keyword suggestions but as separate buckets. I.e, there is a separate bucket for each of those string_facet types as defined in product_detail.string_facets.facet_name.
This is the query I have at the moment which is returning data, but not as I expect. First the query (this was just to try and get Colours):
{
"from": 0,
"size": 12,
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "Rug",
"fields": ["product_information.name", "product_detail.string_facets.facet_value"]
}
},
{
"multi_match": {
"query": "Blue",
"fields": ["product_information.name", "product_detail.string_facets.facet_name"]
}
}
],
"minimum_should_match": "100%"
}
},
"aggs": {
"suggestions": {
"filter": { "term": { "product_detail.string_facets.facet_name.keyword": "Colour" }},
"aggs": {
"colours": {
"terms": {
"field": "product_detail.string_facets.facet_value.keyword",
"size": 10
}
}
}
}
}
}
This is giving me output like the following:
"aggregations": {
"suggestions": {
"doc_count": 21,
"colours": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 23,
"buckets": [
{
"key": "Rug",
"doc_count": 21
},
{
"key": "Blue",
"doc_count": 18
},
{
"key": "Bold",
"doc_count": 7
},
{
"key": "Modern",
"doc_count": 6
},
{
"key": "Multi-Coloured",
"doc_count": 5
},
{
"key": "Contemporary",
"doc_count": 4
},
{
"key": "Traditional",
"doc_count": 4
},
{
"key": "White",
"doc_count": 4
},
{
"key": "Luxurious",
"doc_count": 3
},
{
"key": "Minimal",
"doc_count": 3
}
]
}
}
}
It has given me the results of all facet_name rather those of facet_type Colour as I thought it would.
Any help would be greatly appreciated. Elasticsearch seems very powerful but the documentation is quite daunting!
You did not show how the mapping looks like, but I suppose that product_detail.string_facets field is just an inner object field and that is the reason why you get this kind of result. With this type of mapping Elasticsearch flattens the array into a simple list of field names and values. In your case it becomes:
{
"product_detail.string_facets.facet_name": ["Colour", "Category", "Keyword"],
"product_detail.string_facets.facet_value": ["Grey", "Linen", "Throws & Blanket", "Contemporary", "Sophisticated"]
}
As you can see, based on this structure, Elasticsearch cannot know how to aggregate the data.
To make it work product_detail.string_facets field should be of type nested. Mapping for string_facets should be similar to this (note "type": "nested"):
"string_facets": {
"type": "nested",
"properties": {
"facet_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"facet_value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
Now I index following document:
{
"created_at": "2017-08-04T17:56:44.592Z",
"updated_at": "2017-08-04T17:56:44.592Z",
"product_information": {
"sku": "12345",
"name": "Rug",
"price": 25,
"brand": "Brand Name",
"url": "URL"
},
"product_detail": {
"description": "Product description text here.",
"string_facets": [
{
"facet_name": "Colour",
"facet_value": "Blue"
},
{
"facet_name": "Colour",
"facet_value": "Red"
},
{
"facet_name": "Category",
"facet_value": "Throws & Blanket"
},
{
"facet_name": "Keyword",
"facet_value": "Contemporary"
}
]
}
}
Now, to get aggregation of colour suggestions as separate buckets, you can try this query (I simplified the bool query for the need of my document):
{
"from": 0,
"size": 12,
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "Rug",
"fields": ["product_information.name", "product_detail.string_facets.facet_value"]
}
}
]
}
},
"aggs": {
"facets": {
"nested" : {
"path" : "product_detail.string_facets"
},
"aggs": {
"suggestions": {
"filter": { "term": { "product_detail.string_facets.facet_name.keyword": "Colour" }},
"aggs": {
"colours": {
"terms": {
"field": "product_detail.string_facets.facet_value.keyword",
"size": 10
}
}
}
}
}
}
}
}
And result:
{
...,
"hits": {
...
},
"aggregations": {
"facets": {
"doc_count": 5,
"suggestions": {
"doc_count": 2,
"colours": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Blue",
"doc_count": 1
},
{
"key": "Red",
"doc_count": 1
}
]
}
}
}
}
}

ElasticSearch - Aggregations with document details

I need to aggregate the following documents:
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
by actor, and would like to get the following structure:
[
{"name": "Christian Bale"},
{"movies": [
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6"
},
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0"
}, ...
]
Beyong using a standard term aggregation based on the casting.name field, how can I retrieve the releaseDate and imdbRate of the related documents?
For each actor, I also need movies to be sorted by releaseDate asc.
Can I perform this using one single request?
As you have an array of casting objects in your documents you'll need to use the nested type in your mapping. To get the aggregations you want you need a combination of Terms Aggregations, Nested Aggregations and Reverse Nested Aggregations. Below is an example.
Create and index with the mapping:
POST /test
{
"mappings": {
"movie": {
"properties": {
"title": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"releaseDate": {
"type": "string",
"index": "not_analyzed"
},
"casting": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"category": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
}
Index the documents:
POST /test/movie/1
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
POST /test/movie/2
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
And finally search:
POST /test/movie/_search?search_type=count
{
"aggs": {
"nested_path": {
"nested": {
"path": "casting"
},
"aggs": {
"actor_name": {
"terms": {
"field": "casting.name.raw"
},
"aggs": {
"movies": {
"reverse_nested": {},
"aggs": {
"movie_title": {
"terms": {
"field": "title.raw"
},
"aggs": {
"release_date": {
"terms": {
"field": "releaseDate"
}
},
"imdbRate_date": {
"terms": {
"field": "imdbRate"
}
}
}
}
}
}
}
}
}
}
}
}
The response for Christian Bale is:
{
"key": "Christian Bale",
"doc_count": 2,
"movies": {
"doc_count": 2,
"movie_title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "American Psycho",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7/06/2000",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7.6",
"doc_count": 1
}
]
}
},
{
"key": "The Dark Knight",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "13/08/2008",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9.0",
"doc_count": 1
}
]
}
}
]
}
}
}

Resources