Further filtering of aggregations - elasticsearch

I have a question regarding aggregation in elastic search. I have a document like the following:
{
"_index": "products",
"_type": "product",
"_id": "ID-12345",
"_score": 1,
"_source": {
"created_at": "2017-08-04T17:56:44.592Z",
"updated_at": "2017-08-04T17:56:44.592Z",
"product_information": {
"sku": "12345",
"name": "Product Name",
"price": 25,
"brand": "Brand Name",
"url": "URL"
},
"product_detail": {
"description": "Product description text here.",
"string_facets": [
{
"facet_name": "Colour",
"facet_value": "Grey"
},
{
"facet_name": "Category",
"facet_value": "Linen"
},
{
"facet_name": "Category",
"facet_value": "Throws & Blanket"
},
{
"facet_name": "Keyword",
"facet_value": "Contemporary"
},
{
"facet_name": "Keyword",
"facet_value": "Sophisticated"
}
]
}
}
}
I am storing product information such as Colour, Material, Category and Keywords within the product_detail.string_facets field. I'd like to use this for aggregation to get Colour/Material/Category/Keyword suggestions but as separate buckets. I.e, there is a separate bucket for each of those string_facet types as defined in product_detail.string_facets.facet_name.
This is the query I have at the moment which is returning data, but not as I expect. First the query (this was just to try and get Colours):
{
"from": 0,
"size": 12,
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "Rug",
"fields": ["product_information.name", "product_detail.string_facets.facet_value"]
}
},
{
"multi_match": {
"query": "Blue",
"fields": ["product_information.name", "product_detail.string_facets.facet_name"]
}
}
],
"minimum_should_match": "100%"
}
},
"aggs": {
"suggestions": {
"filter": { "term": { "product_detail.string_facets.facet_name.keyword": "Colour" }},
"aggs": {
"colours": {
"terms": {
"field": "product_detail.string_facets.facet_value.keyword",
"size": 10
}
}
}
}
}
}
This is giving me output like the following:
"aggregations": {
"suggestions": {
"doc_count": 21,
"colours": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 23,
"buckets": [
{
"key": "Rug",
"doc_count": 21
},
{
"key": "Blue",
"doc_count": 18
},
{
"key": "Bold",
"doc_count": 7
},
{
"key": "Modern",
"doc_count": 6
},
{
"key": "Multi-Coloured",
"doc_count": 5
},
{
"key": "Contemporary",
"doc_count": 4
},
{
"key": "Traditional",
"doc_count": 4
},
{
"key": "White",
"doc_count": 4
},
{
"key": "Luxurious",
"doc_count": 3
},
{
"key": "Minimal",
"doc_count": 3
}
]
}
}
}
It has given me the results of all facet_name rather those of facet_type Colour as I thought it would.
Any help would be greatly appreciated. Elasticsearch seems very powerful but the documentation is quite daunting!

You did not show how the mapping looks like, but I suppose that product_detail.string_facets field is just an inner object field and that is the reason why you get this kind of result. With this type of mapping Elasticsearch flattens the array into a simple list of field names and values. In your case it becomes:
{
"product_detail.string_facets.facet_name": ["Colour", "Category", "Keyword"],
"product_detail.string_facets.facet_value": ["Grey", "Linen", "Throws & Blanket", "Contemporary", "Sophisticated"]
}
As you can see, based on this structure, Elasticsearch cannot know how to aggregate the data.
To make it work product_detail.string_facets field should be of type nested. Mapping for string_facets should be similar to this (note "type": "nested"):
"string_facets": {
"type": "nested",
"properties": {
"facet_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"facet_value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
Now I index following document:
{
"created_at": "2017-08-04T17:56:44.592Z",
"updated_at": "2017-08-04T17:56:44.592Z",
"product_information": {
"sku": "12345",
"name": "Rug",
"price": 25,
"brand": "Brand Name",
"url": "URL"
},
"product_detail": {
"description": "Product description text here.",
"string_facets": [
{
"facet_name": "Colour",
"facet_value": "Blue"
},
{
"facet_name": "Colour",
"facet_value": "Red"
},
{
"facet_name": "Category",
"facet_value": "Throws & Blanket"
},
{
"facet_name": "Keyword",
"facet_value": "Contemporary"
}
]
}
}
Now, to get aggregation of colour suggestions as separate buckets, you can try this query (I simplified the bool query for the need of my document):
{
"from": 0,
"size": 12,
"query": {
"bool": {
"should": [
{
"multi_match": {
"query": "Rug",
"fields": ["product_information.name", "product_detail.string_facets.facet_value"]
}
}
]
}
},
"aggs": {
"facets": {
"nested" : {
"path" : "product_detail.string_facets"
},
"aggs": {
"suggestions": {
"filter": { "term": { "product_detail.string_facets.facet_name.keyword": "Colour" }},
"aggs": {
"colours": {
"terms": {
"field": "product_detail.string_facets.facet_value.keyword",
"size": 10
}
}
}
}
}
}
}
}
And result:
{
...,
"hits": {
...
},
"aggregations": {
"facets": {
"doc_count": 5,
"suggestions": {
"doc_count": 2,
"colours": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Blue",
"doc_count": 1
},
{
"key": "Red",
"doc_count": 1
}
]
}
}
}
}
}

Related

elastic search nested sub aggregations

We are using elastic search which holds records as documents with following definition
{
"loadtender": {
"aliases": {},
"mappings": {
"_doc": {
"_meta": {
"version": 20
},
"properties": {
"carrierId": {
"type": "long"
},
"destinationData": {
"type": "keyword"
},
"destinationZip": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 50
}
}
},
"effStartTime": {
"type": "date"
},
"endTime": {
"type": "date"
},
"id": {
"type": "long"
},
"mustRespondByTime": {
"type": "date"
},
"orgdiv": {
"type": "keyword"
},
"originData": {
"type": "keyword"
},
"originZip": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 50
}
}
},
"purchaseOrderNum": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 255
}
}
},
"startTime": {
"type": "date"
},
"tenderStatus": {
"type": "keyword"
},
"tenderedTime": {
"type": "date"
}
}
}
},
"settings": {
"index": {
"creation_date": "1655105542470",
"number_of_shards": "5",
"number_of_replicas": "1",
"uuid": "ohcXgA8EQ5iJj0X6_4BqXA",
"version": {
"created": "6080499"
},
"provided_name": "loadtender"
}
}
}
}
I am trying to search records to return me following filtered results
Input Parameter : startDate (yesterday), originData.originCity and originData.destinationCity
Output Required:
Three buckets for 0-30 days, 30-60 days and 60-90 days
buckets of distinct originData.city and destinationData.city combinations under each of the above
Under each of the above, buckets of data for each unique carrierId and the corresponding record list / count
Basically I was trying to achieve something like the below
{
"aggregations": {
"aggr": {
"buckets": [
{
"key": "0-30 days",
"doc_count": 10,
"aggr": {
"buckets": [
{
"key": "(originCity)Menasha, WI, US|Hanover, MD, US (DestinationCity)",
"aggr": {
"buckets": [
{
"key": "10183-carrierId",
"count": 10
}
]
}
}
]
}
},
{
"key": "30-60 days",
"doc_count": 11,
"aggr": {
"buckets": [
{
"key": "Dallas, TX, US|Houston, TX, US",
"aggr": {
"buckets": [
{
"key": "10183-carrierId",
"count": 10
},
{
"key": "10022-carrierId",
"count": 1
}
]
}
}
]
}
}
]
}
}
}
I've tried the following but I think I am not finding a way to filter it further using the sub aggregators.
{
"_source":["id", "effStartTime", "carrierId", "originData", "destinationData"],
"size": 100,
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"range": {
"startTime": {
"from": "2021-08-27T23:59:59.000Z",
"to": "2022-09-01T00:00:00.000Z",
"include_lower": true,
"include_upper": true,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
}
],
"must_not": [
{
"term": {
"tenderStatus": {
"value": "REMOVED",
"boost": 1
}
}
}
],
"filter" : {
"exists" : {
"field" : "carrierId"
}
},
"adjust_pure_negative": true,
"boost": 1
}
},
"aggregations": {
"aggr": {
"terms": {
"script": "doc['originData'].values[0] + '|' + doc['destinationData'].values[0]"
}
}
}
}
I started beginning to think if this is even possible OR should I shift to issuing multiple queries for the same
I was able to achieve the same using the following sub-aggregations:
"aggregations": {
"aggr":{
"date_range": {
"field": "startTime",
"format": "MM-yyyy",
"ranges": [
{"to": "now-1M/M", "from": "now"}, --> now to 30 days back
{"to": "now-1M/M", "from": "now-2M/M"}, from 30 days back to 60 days back
{"to": "now-2M/M", "from": "now-3M/M"}, from 60 days back to 90 days back
{"to": "now-3M/M", "from": "now-12M/M"}
]
},
"aggregations": {
"aggr":{
"terms": {
"script": "doc['originData'].values[0] + '|' + doc['destinationData'].values[0]" --> concatenated origin and destination address as a key
},
"aggregations": {
"aggr": {
"terms": {
"field": "carrierId" --> nested carrier count
}
}
}
}
}
}
}
Following is the response template that I receive.
"aggregations": {
"aggr": {
"buckets": [
{
"key": "09-2021-06-2022",
"from": 1630454400000,
"from_as_string": "09-2021",
"to": 1654041600000,
"to_as_string": "06-2022",
"doc_count": 1,
"aggr": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Dallas, TX, US|Houston, TX, US",
"doc_count": 14,
"aggr": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 10022,
"doc_count": 14
}
]
}
}
]
}
}
]
}
}
Thank you to all of you for your efforts and time. Do let me know if you discover any better way.

Elasticsearch complex aggregation on match results

I'm trying to get the top 5 teams based on win rate (matches won / all matches) having the match result documents in the index (see below).
The fields I need to get in the bucket:
apiId
name
winrate (won/total)
I guess this would require complex aggregation calculation witch is far beyond my current elasticsearch skills.
Elasticsearch version: 7.15.2
Could anyone help me with such elasticsearch query?
Thanks a lot in advance!
{
"lastModified": "2022-01-14T09:33:48.232Z",
"uuid": "01234567",
"started": "2022-01-14T09:31:27.651Z",
"editing": false,
"approved": true,
"statistics": {
"teams": [
{
"name": "Team1",
"score": 0,
"winner": false,
"apiId": "1"
},
{
"name": "Team2",
"score": 2,
"winner": true,
"apiId": "2"
}
]
}
}
and the mapping:
{
"mappings": {
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"approved": {
"type": "boolean"
},
"editing": {
"type": "boolean"
},
"ended": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"game": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"lastModified": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"started": {
"type": "date",
"format": "date_optional_time||epoch_millis"
},
"statistics": {
"type": "nested",
"include_in_parent": true,
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"teams": {
"type": "nested",
"include_in_parent": true,
"properties": {
"_class": {
"type": "keyword",
"index": false,
"doc_values": false
},
"apiId": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"score": {
"type": "long"
},
"winner": {
"type": "boolean"
}
}
}
}
},
"uuid": {
"type": "keyword"
}
}
}
}
Edit:
Based on #ilvar answer, I constructed the query:
{
"query": {
"bool": {
"must": [
{
"term": {
"approved": true
}
}
]
}
},
"size": 0,
"aggs": {
"top_teams": {
"nested": {
"path": "statistics.teams"
},
"aggs": {
"the_all": {
"multi_terms": {
"terms": [
{
"field": "statistics.teams.name.keyword"
},
{
"field": "statistics.teams.apiId"
}
]
}
},
"the_won": {
"filter": {
"terms": {
"statistics.teams.winner": [
true
]
}
},
"aggs": {
"teams": {
"multi_terms": {
"terms": [
{
"field": "statistics.teams.name.keyword"
},
{
"field": "statistics.teams.apiId"
}
]
}
}
}
}
}
}
}
}
Which gives me:
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"top_teams": {
"doc_count": 10,
"the_all": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
"Team1",
2
],
"key_as_string": "Team1|2",
"doc_count": 2
},
{
"key": [
"Team2",
3
],
"key_as_string": "Team2|3",
"doc_count": 2
},
{
"key": [
"Team3",
5
],
"key_as_string": "Team3|5",
"doc_count": 2
},
{
"key": [
"Team4",
1
],
"key_as_string": "Team4|1",
"doc_count": 1
},
{
"key": [
"Team5",
4
],
"key_as_string": "Team5|4",
"doc_count": 1
},
{
"key": [
"Team6",
7
],
"key_as_string": "Team6|7",
"doc_count": 1
},
{
"key": [
"Team7",
6
],
"key_as_string": "Team7|6",
"doc_count": 1
}
]
},
"the_won": {
"doc_count": 4,
"teams": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": [
"Team2",
5
],
"key_as_string": "Team2|5",
"doc_count": 2
},
{
"key": [
"Team4",
2
],
"key_as_string": "Team4|2",
"doc_count": 1
},
{
"key": [
"Team7",
3
],
"key_as_string": "Team7|3",
"doc_count": 1
}
]
}
}
}
}
}
But I still cannot get the winrate from two siblings, where one sibling might have missing teams that have not won any match.
Should I use some pipeline aggregation?
The final solution I came out with is bellow. the sort pipeline aggregation did the final part of the job
{
"query": {
"bool": {
"must": [
{
"term": {
"approved": true
}
}
]
}
},
"size": 0,
"aggs": {
"top_teams": {
"nested": {
"path": "statistics.teams"
},
"aggs": {
"the_all": {
"terms": {
"field": "statistics.teams.apiId"
},
"aggs": {
"the_won_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
true
]
}
}
},
"the_lost_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
false
]
}
}
},
"the_all_by_team": {
"filter": {
"terms": {
"statistics.teams.winner": [
true,
false
]
}
}
},
"the_winrate": {
"bucket_script": {
"buckets_path": {
"the_won_count": "the_won_by_team._count",
"the_all_count": "the_all_by_team._count"
},
"script": "params.the_won_count / params.the_all_count"
}
},
"the_sort": {
"bucket_sort": {
"sort": [
{
"the_winrate": "desc"
},
{
"the_all_by_team._count": "desc"
}
],
"size": 5
}
}
}
}
}
}
}
}
This is very similar to the example docs have for nested aggregation. The only difference would be that you'll have a terms aggregation on the top level instead of filter so you get back all of the teams.

how do I implement a single-word auto complete using Elasticsearch 6

I would like to implement an single word autocomplete using elasticsearch 6. I have seen a fair amount of posts on how to do this using lesser versions however, it seems that autocomplete has changed significantly in the last version.
I am using the standard mapping for autocomplete:
PUT advertising_tins
{
"settings": {
"analysis": {
"analyzer": {
"completion_analyzer": {
"type": "custom",
"filter": [
"lowercase",
"completion_filter"
],
"tokenizer": "keyword"
}
},
"filter": {
"completion_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 24
}
}
}
},
"mappings": {
"item": {
"properties": {
"date": {
"type": "long"
},
"id": {
"type": "text"
},
"title": {
"type": "text"
},
"suggest": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
},
"completion": {
"type": "text",
"analyzer": "completion_analyzer",
"search_analyzer": "standard"
}
}
}
}
}
}
}
I am indexing like this:
POST advertising_tins/item/_bulk
{"index":{}}
{"date": 20180217, "title": "Vintage Spice Cardboard Tin of Mace Dainty Brand St. Paul, MN 1 oz.","id": "305232814","suggest": [ "spice","cardboard","tin","mace","dainty","brand","st","paul","mn","oz"]}
And querying like this:
POST advertising_tins/_search?pretty
{
"size": 0,
"query": {
"term": {
"suggest.completion": "car"
}
},
"aggs": {
"suggestions": {
"terms": {
"field": "suggest.raw"
}
}
}
}
However my results return all terms in the suggest field instead of just single term "cardboard".
{
"took": 4,
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"suggestions": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "brand",
"doc_count": 1
},
{
"key": "cardboard",
"doc_count": 1
},
{
"key": "dainty",
"doc_count": 1
},
{
"key": "mace",
"doc_count": 1
},
{
"key": "mn",
"doc_count": 1
},
{
"key": "oz",
"doc_count": 1
},
{
"key": "paul",
"doc_count": 1
},
{
"key": "spice",
"doc_count": 1
},
{
"key": "st",
"doc_count": 1
},
{
"key": "tin",
"doc_count": 1
}
]
}
}
}
And idea how I fix this and get just a single term match?
You are almost there. It can be achieved with the default Completion Suggester, you only need to change the type of your completion field to "completion":
"mappings": {
"item": {
"properties": {
"date": {
"type": "long"
},
"id": {
"type": "text"
},
"title": {
"type": "text"
},
"suggest": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
},
"completion": {
"type" : "completion", <--- here
"analyzer": "completion_analyzer",
"search_analyzer": "standard"
}
}
}
}
}
}
And add a "suggest" part into your query:
POST advertising_tins/_search
{
"size": 0,
"query": {
"term": {
"suggest.completion": "car"
}
},
"suggest" : { <--- Here goes he suggest query
"my-suggestion" : {
"text" : "car",
"completion" : {
"field" : "suggest.completion"
}
}
},
"aggs": {
"suggestions": {
"terms": {
"field": "suggest.raw"
}
}
}
}
The response will look like this:
{
// ...
"hits": //... ,
"aggregations": // ...,
"suggest": {
"my-suggestion": [
{
"text": "car",
"offset": 0,
"length": 3,
"options": [
{
"text": "cardboard", <--- here is the suggestion
"_index": "advertising_tins",
"_type": "item",
"_id": "GLeUqGEBVrFe7u7pR5uA",
"_score": 1,
"_source": {
"date": 20180217,
"title": "Vintage Spice Cardboard Tin of Mace Dainty Brand St. Paul, MN 1 oz.",
"id": "305232814",
"suggest": [
"spice",
"cardboard",
"tin",
"mace",
"dainty",
"brand",
"st",
"paul",
"mn",
"oz"
]
}
}
]
}
]
}
}
The response also includes the _source of the suggested document, so you might not even need to use "query" and "aggs" parts.
Hope that helps!

Elasticsearch: Ordering with respect to Nested Sum Aggregation

The use case is to create aggregations to find top selling products in a region sorted by the counts of products sold. The data is stored in an index in elasticsearch.
I want to sort my 'group_by_name' aggregations output by the sum_quantity value aggregation which is in the last nested aggregation/ two levels in after 'group_by_name' with an intermediate aggregation 'group_by_sku'. The default output is sorted by doc_count. I want the aggregation to be sorted by the 'sum_quantity' aggregation value.
I have an index with the following mapping:
{"settings": {
"index": {
"number_of_shards": 2,
"number_of_replicas": 0
},
"analysis":{
"analyzer":{
"autocomplete":{
"type":"custom",
"tokenizer":"standard",
"filter":[ "standard", "lowercase", "ngram" ]
}
},
"filter":{
"ngram":{
"type":"ngram",
"min_gram":3,
"max_gram":25
}
}
} },"mappings": {
"farmer_products_map":{
"properties": {
"state": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"district": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"taluka": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"village": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"createdOn": {
"type": "date",
"format": "epoch_millis"
},
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"productName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
},
"autocomplete":{
"analyzer": "autocomplete",
"type": "string"
}
}
},
"crop": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"sku": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"quantity": {
"type": "integer"
},
"farmerId": {
"type": "integer"
},
"orderId": {
"type": "integer"
}
}} }}
The following are the sample documents:
[{
"_index": "farmer_products_index_adv",
"_type": "farmer_products_map",
"_id": "AVtCttf0IP9v8cUTtoiz",
"_score": 1,
"_source": {
"orderId": 469173,
"category": "Hardware",
"farmerId": 509583,
"district": "",
"brand": "Honda",
"taluka": "",
"crop": "",
"productName": "Honda BRUSH CUTTER UMK 435-T U2NT",
"state": "",
"sku": "AGS-HW-471",
"village": "",
"quantity": 1
},{
"_index": "farmer_products_index_adv",
"_type": "farmer_products_map",
"_id": "AVtCttf0IP9v8cUTtoi1",
"_score": 1,
"_source": {
"orderId": 469177,
"category": "Crop Nutrition",
"farmerId": 13732,
"district": "Banaskantha",
"brand": "Unassigned Brand",
"taluka": "Kankrej",
"crop": "",
"productName": "Free Power Gel - Plant Nutrient (500 Ml)",
"state": "Gujarat",
"sku": "AGS-CN-006",
"village": "Nanota",
"quantity": 1
}}]
I wish to perform the following query aggregation:
{
"query": {
"bool": {
"must": [{
"match": {
"state": {
"query": "Maharashtra",
"fuzziness": 3,
"prefix_length": 2
}
}
}, {
"match": {
"district": {
"query": "Wardha",
"fuzziness": 3,
"prefix_length": 2
}
}
}, {
"match": {
"taluka": {
"query": "Wardha",
"fuzziness": 3,
"prefix_length": 2
}
}
}]
}
},
"size": 0,
"aggs": {
"group_by_state": {
"terms": {
"field": "state.keyword"
},
"aggs": {
"group_by_district": {
"terms": {
"field": "district.keyword"
},
"aggs": {
"group_by_taluka": {
"terms": {
"field": "taluka.keyword"
},
"aggs": {
"group_by_name": {
"terms": {
"field": "productName.keyword"
},
"aggs": {
"group_by_sku": {
"terms": {
"field": "sku.keyword"
},
"aggs": {
"sum_quantity": {
"sum": {
"field": "quantity"
}
}
}
}
}
}
}
}
}
}
}
}
}}
The current output of the aggregation is:
[{
"key": "Free MH HDPE Tarpaulin Tape Black 3mtr roll",
"doc_count": 13,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-HW-410",
"doc_count": 13,
"sum_quantity": {
"value": 13
}
}]}}, {
"key": "Tarpaulin Sheet 11*15 (Tadpatri) 250 GSM",
"doc_count": 10,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-HW-326",
"doc_count": 10,
"sum_quantity": {
"value": 10
}
}]
}}, {
"key": "Free Humic power Advanced powder 95% (250 Gms)",
"doc_count": 6,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-CN-036",
"doc_count": 6,
"sum_quantity": {
"value": 18
}
}]
}}]
I want the output to be sorted by the sum_quantity value:
[{
"key": "Free Humic power Advanced powder 95% (250 Gms)",
"doc_count": 6,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-CN-036",
"doc_count": 6,
"sum_quantity": {
"value": 18
}
}]}}, {
"key": "Free MH HDPE Tarpaulin Tape Black 3mtr roll",
"doc_count": 13,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-HW-410",
"doc_count": 13,
"sum_quantity": {
"value": 13
}
}]
}}, {
"key": "Tarpaulin Sheet 11*15 (Tadpatri) 250 GSM",
"doc_count": 10,
"group_by_sku": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "AGS-HW-326",
"doc_count": 10,
"sum_quantity": {
"value": 10
}
}]
}}]
How can I achieve this? I tried approaches suggested in other stackoverflow questions using "reverse_nested" but was unable to arrive at a solution.

ElasticSearch - Aggregations with document details

I need to aggregate the following documents:
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
by actor, and would like to get the following structure:
[
{"name": "Christian Bale"},
{"movies": [
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6"
},
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0"
}, ...
]
Beyong using a standard term aggregation based on the casting.name field, how can I retrieve the releaseDate and imdbRate of the related documents?
For each actor, I also need movies to be sorted by releaseDate asc.
Can I perform this using one single request?
As you have an array of casting objects in your documents you'll need to use the nested type in your mapping. To get the aggregations you want you need a combination of Terms Aggregations, Nested Aggregations and Reverse Nested Aggregations. Below is an example.
Create and index with the mapping:
POST /test
{
"mappings": {
"movie": {
"properties": {
"title": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"releaseDate": {
"type": "string",
"index": "not_analyzed"
},
"casting": {
"type": "nested",
"properties": {
"name": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"category": {
"type": "string",
"fields":{
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
}
}
Index the documents:
POST /test/movie/1
{
"title": "American Psycho",
"releaseDate": "7/06/2000",
"imdbRate": "7.6",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Justin Theroux",
"category": "Actor"
}
]
}
POST /test/movie/2
{
"title": "The Dark Knight",
"releaseDate": "13/08/2008",
"imdbRate": "9.0",
"casting": [
{
"name": "Christian Bale",
"category": "Actor"
},
{
"name": "Morgan Freeman",
"category": "Actor"
}
]
}
And finally search:
POST /test/movie/_search?search_type=count
{
"aggs": {
"nested_path": {
"nested": {
"path": "casting"
},
"aggs": {
"actor_name": {
"terms": {
"field": "casting.name.raw"
},
"aggs": {
"movies": {
"reverse_nested": {},
"aggs": {
"movie_title": {
"terms": {
"field": "title.raw"
},
"aggs": {
"release_date": {
"terms": {
"field": "releaseDate"
}
},
"imdbRate_date": {
"terms": {
"field": "imdbRate"
}
}
}
}
}
}
}
}
}
}
}
}
The response for Christian Bale is:
{
"key": "Christian Bale",
"doc_count": 2,
"movies": {
"doc_count": 2,
"movie_title": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "American Psycho",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7/06/2000",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "7.6",
"doc_count": 1
}
]
}
},
{
"key": "The Dark Knight",
"doc_count": 1,
"release_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "13/08/2008",
"doc_count": 1
}
]
},
"imdbRate_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9.0",
"doc_count": 1
}
]
}
}
]
}
}
}

Resources