Perform multi-field / multi-dimensional aggregations with nested fields in Elastic Search - elasticsearch

I am tracking attendance of few students. I am storing their details in the index like the below.
Each doc in "entries" have few other fields. The following data shows that a student has attended 6 classes on "Monday".
"entries" is of type "nested"
{
reg_id: 1111,
"entires" : [
{
id: "123"
day: 'Monday'
},
{
id: "1234",
attendance: true
},
{
id: "12345",
classes_attended: 6
}
],
}
I want the count of each classes_attended of students for each day.
For Example "72 entries of students found for "Monday", who has attended 6 classes"
Sample desired output - This is just a sample I am completely fine if the output schema is changed.
[
{
"day" : "monday",
"classes_attended": 6,
count: 4
},
{
"day" : "monday",
"classes_attended": 1,
count: 5
},
{
"day" : "tuesday",
"classes_attended": 5,
count: 2
},
{
"day" : "tuesday",
"classes_attended": 6,
count: 1
}
]
Not sure How to start with the aggregations query:
I tried with the following query but I know its not the correct solution
"aggs": {
"attendance_aggs": {
"nested": {
"path": "entries"
},
"aggs": {
"days": {
"terms": {
"field": "entries.day"
},
"aggs": {
"attended": {
"reverse_nested": {},
"aggs":{
"class_attended_day": {
"terms": {
"field": "entries.classes_attended"
},
"aggs": {
"class_attended_days_count": {
"reverse_nested": {},
"aggs": {
"classes_attended_final": {
"cardinality": {
"field": "entries.class_attended"
}
}
}
}
}
}
}
}
}
}
}
}
}

It's unclear what the top-level object is so I'm going to assume it's a "student attendance entry per day". I'm also unsure what the entries.ids represent but I'll assume you'll be needing them at some later point so I'll keep them untouched.
Now, since all that your entries objects have in common is the id, they can be decoupled. Meaning that you should be using nested if any only if you share some attributes across all objects which need their attribute connections preserved. Since I don't see entries.id anywhere in your aggs, I'd recommend the following adjustments to your mapping:
PUT students
{
"mappings": {
"properties": {
"day": { ------------
"type": "keyword" |
}, |
"attendance": { |
"type": "boolean" | <--
}, |
"classes_attended": { |
"type": "integer" |
}, ------------
"entries": {
"type": "nested",
"properties": {
"day": {
"type": "keyword",
"copy_to": "day" <--
},
"attendance": {
"type": "boolean",
"copy_to": "attendance" <--
},
"classes_attended": {
"type": "integer",
"copy_to": "classes_attended" <--
}
}
}
}
}
}
and here's your query:
GET students/_search
{
"size": 0,
"aggs": {
"days": {
"terms": {
"field": "day"
},
"aggs": {
"classes_attended": {
"terms": {
"field": "classes_attended"
},
"aggs": {
"student_count": {
"cardinality": {
"field": "_id"
}
}
}
}
}
}
}
}
The response can then be post-processed into whatever you prefer.
EDIT
You could hijack reverse_nested but will need to come back to it as you're referencing other nested entries:
GET students/_search
{
"size": 0,
"aggs": {
"attendance_aggs": {
"nested": {
"path": "entries"
},
"aggs": {
"days": {
"terms": {
"field": "entries.day"
},
"aggs": {
"attended": {
"reverse_nested": {},
"aggs": {
"class_attended_day": {
"nested": {
"path": "entries"
},
"aggs": {
"class_attended_day": {
"terms": {
"field": "entries.classes_attended"
},
"aggs": {
"classes_attended_final": {
"cardinality": {
"field": "entries.classes_attended"
}
}
}
}
}
}
}
}
}
}
}
}
}
}

Related

Elasticsearch aggregation with unqiue counting

My documents consist of a history of orders and their state, here a minimal example:
{
"orderNumber" : "xyz",
"state" : "shipping",
"day" : "2022-07-20",
"timestamp" : "2022-07-20T15:06:44.290Z",
}
the state can be strings like shipping, processing, redo,...
For every possible state, I need to count the number of orders that had this state at some point during a day, without counting a state twice for the same orderNumber that day (which can happen if there is a problem and it needs to start from the beginning that same day).
My aggregation looks like this:
GET order-history/_search
{
"aggs": {
"countDays": {
"terms": {
"field": "day",
"order": {
"_key": "desc"
},
"size": 20
},
"aggs": {
"countStates": {
"terms": {
"field": "state.keyword",
"size": 10
}
}
}
}
}
, "size": 1
}
However, this will count a state for a given orderNumber twice if it reappears that same day. How would I prevent it from counting a state twice for each orderNumber, if it is on the same day?
Tldr;
I don't think there is a flexible and simple solution.
But if you know in advance the number of state that exists. Maybe through another aggregation query, to get all type of state.
You could do the following
POST /_bulk
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"shipping","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"redo","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"xyz","state":"shipping","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"bbb","state":"processing","day":"2022-07-20"}
{"index":{"_index":"73138766"}}
{"orderNumber":"bbb","state":"shipping","day":"2022-07-20"}
GET 73138766/_search
{
"size": 0,
"aggs": {
"per_day": {
"date_histogram": {
"field": "day",
"calendar_interval": "day"
},
"aggs": {
"shipping": {
"filter": { "term": { "state.keyword": "shipping" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
},
"processing": {
"filter": { "term": { "state.keyword": "processing" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
},
"redo": {
"filter": { "term": { "state.keyword": "redo" }
},
"aggs": {
"orders": {
"cardinality": {
"field": "orderNumber.keyword"
}
}
}
}
}
}
}
}
You will obtain the following results
{
"aggregations": {
"per_day": {
"buckets": [
{
"key_as_string": "2022-07-20T00:00:00.000Z",
"key": 1658275200000,
"doc_count": 5,
"shipping": {
"doc_count": 3,
"orders": {
"value": 2
}
},
"processing": {
"doc_count": 1,
"orders": {
"value": 1
}
},
"redo": {
"doc_count": 1,
"orders": {
"value": 1
}
}
}
]
}
}
}

Subtract numeric fields between two documents with different timestamp

Lets say I have these data samples:
{
"date": "2019-06-16",
"rank": 150
"name": "doc 1"
}
{
"date": "2019-07-16",
"rank": 100
"name": "doc 1"
}
{
"date": "2019-06-16",
"rank": 50
"name": "doc 2"
}
{
"date": "2019-07-16",
"rank": 80
"name": "doc 2"
}
The expected result is by subtracting the rank field from two same name of docs with different date (old date - new date):
{
"name": "doc 1",
"diff_rank": 50
}
{
"name": "doc 2",
"diff_rank": -30
}
And sort by diff_rank if possible, otherwise I will just sort manually after getting the result.
What I have tried is by using date_histogram and serial_diff but some results are missing the diff_rank value in somehow which I am sure the data exist:
{
"aggs" : {
"group_by_name": {
"terms": {
"field": "name"
},
"aggs": {
"days": {
"date_histogram": {
"field": "date",
"interval": "day"
},
"aggs": {
"the_rank": {
"sum": {
"field": "rank"
}
},
"diff_rank": {
"serial_diff": {
"buckets_path": "the_rank",
"lag" : 30 // 1 month or 30 days in this case
}
}
}
}
}
}
}
}
The help will be much appreciated to solve my issue above!
Finally, I found a method from official doc using Filter, Bucket Script aggregation and Bucket Sort to sort the result. Here is the final snippet code:
{
"size": 0,
"aggs" : {
"group_by_name": {
"terms": {
"field": "name",
"size": 50,
"shard_size": 10000
},
"aggs": {
"last_month_rank": {
"filter": {
"term": {"date": "2019-06-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"latest_rank": {
"filter": {
"term": {"date": "2019-07-17"}
},
"aggs": {
"rank": {
"sum": {
"field": "rank"
}
}
}
},
"diff_rank": {
"bucket_script": {
"buckets_path": {
"lastMonthRank": "last_month_rank>rank",
"latestRank": "latest_rank>rank"
},
"script": "params.lastMonthRank - params.latestRank"
}
},
"rank_bucket_sort": {
"bucket_sort": {
"sort": [
{"diff_rank": {"order": "desc"}}
],
"size": 50
}
}
}
}
}
}

Unable to create nested date aggregation query

I am trying to create an ElasticSearch aggregation query which can generate sum or average of value in all my ingested documents.
The documents are of the format -
{
"weather":"cold",
"date_1":"2017/07/05",
"feedback":[
{
"date_2":"2017/08/07",
"value":28,
"comment":"not cold"
},{
"date_2":"2017/08/09",
"value":48,
"comment":"a bit chilly"
},{
"date_2":"2017/09/07",
"value":18,
"comment":"very cold"
}, ...
]
}
I am able to create a sum aggregation of all "feedback.value" using "date_1" by using the following request -
GET _search
{
"query": {
"query_string": {
"query": "cold"
}
},
"size": 0,
"aggs": {
"temperature": {
"date_histogram":{
"field" : "date_1",
"interval" : "month"
},
"aggs":{
"temperature_agg":{
"terms": {
"field": "feedback.value"
}
}
}
}
}
}
However, I need to generate the same query across all documents aggregate based on "feedback.date_2". I am not sure if ElasticSearch can resolve such aggregation or how to approach it. Any guidance would be helpful
[EDIT]
Mapping file( I only define the nested items, ES identifes other fields on its own)
{
"mappings": {
"catalog_item": {
"properties": {
"feedback":{
"type":"nested",
"properties":{
"date_2":{
"type": "date",
"format":"YYYY-MM-DD"
},
"value": {
"type": "float"
},
"comment": {
"type": "text"
}
}
}
}
}
}
}
You would need to make use of nested documents and sum aggregation.
Here's a working example:
Sample Mapping:
PUT test
{
"mappings": {
"doc": {
"properties": {
"feedback": {
"type": "nested"
}
}
}
}
}
Add Sample document:
PUT test/doc/1
{
"date_1": "2017/08/07",
"feedback": [
{
"date_2": "2017/08/07",
"value": 28,
"comment": "not cold"
},
{
"date_2": "2017/08/09",
"value": 48,
"comment": "a bit chilly"
},
{
"date_2": "2017/09/07",
"value": 18,
"comment": "very cold"
}
]
}
Calculate both the sum and average based on date_2.
GET test/_search
{
"size": 0,
"aggs": {
"temperature_aggregation": {
"nested": {
"path": "feedback"
},
"aggs": {
"temperature": {
"date_histogram": {
"field": "feedback.date_2",
"interval": "month"
},
"aggs": {
"sum": {
"sum": {
"field": "feedback.value"
}
},
"avg": {
"avg": {
"field": "feedback.value"
}
}
}
}
}
}
}
}

Sum and count aggregations over Elasticsearch fields

I am new to Elasticsearch and I am looking to perform certain aggregations over the fields from an Elasticsearch 5.x index. I have an index that contains the documents with fields langs (which have nested structure) and docLang. These are dynamically mapped fields. Following are the examples documents
DOC 1:
{
"_index":"A",
"_type":"document",
"_id":"1",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1,
"es":2,
"zh":3
},
"Y":{
"en":4,
"es":5,
"zh":6
}
},
"docLang": "en"
}
}
DOC 2:
{
"_index":"A",
"_type":"document",
"_id":"2",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1,
"es":2
},
"Y":{
"en":3,
"es":4
}
},
"docLang": "es"
}
}
DOC 3:
{
"_index":"A",
"_type":"document",
"_id":"2",
"_source":{
"text":"This is a test sentence.",
"langs":{
"X":{
"en":1
},
"Y":{
"en":2
}
},
"docLang": "en"
}
}
I want to perform sum aggregation over the langs field in a way that for each key (X/Y) and for each language, I can get the sum across all documents in an index. Also, I want to produce the counts of documents for each type of language from docLang field.
e.g.: For above 3 documents, sum aggregation over langs field would look like below:
"langs":{
"X":{
"en":3,
"es":4,
"zh":3
},
"Y":{
"en":9,
"es":9,
"zh":6
}
}
And the docLang count would look like below:
"docLang":{
"en" : 2,
"es" : 1
}
Also because of some production env restrictions, I cannot use scripts in Elasticsearch. So, I was wondering if it is possible to use just field aggregation type for above fields?
{
"size": 0,
"aggs": {
"X": {
"nested": {
"path": "langs.X"
},
"aggs": {
"X_sum_en": {
"sum": {
"field": "langs.X.en"
}
},
"X_sum_es": {
"sum": {
"field": "langs.X.es"
}
},
"X_sum_zh": {
"sum": {
"field": "langs.X.zh"
}
}
}
},
"Y": {
"nested": {
"path": "langs.Y"
},
"aggs": {
"Y_sum_en": {
"sum": {
"field": "langs.Y.en"
}
},
"Y_sum_es": {
"sum": {
"field": "langs.Y.es"
}
},
"Y_sum_zh": {
"sum": {
"field": "langs.Y.zh"
}
}
}
},
"sum_docLang": {
"terms": {
"field": "docLang.keyword",
"size": 10
}
}
}
}
Since you didn't mention, but I think it's important. I made X and Y as nested fields:
"langs": {
"properties": {
"X": {
"type": "nested",
"properties": {
"en": {
"type": "long"
},
"es": {
"type": "long"
},
"zh": {
"type": "long"
}
}
},
"Y": {
"type": "nested",
"properties": {
"en": {
"type": "long"
},
"es": {
"type": "long"
},
"zh": {
"type": "long"
}
}
}
}
}
But, if you fields are not nested at all and here I mean actually the nested field type in Elasticsearch, a simple aggregation like this one should be enough:
{
"size": 0,
"aggs": {
"X_sum_en": {
"sum": {
"field": "langs.X.en"
}
},
"X_sum_es": {
"sum": {
"field": "langs.X.es"
}
},
"X_sum_zh": {
"sum": {
"field": "langs.X.zh"
}
},
"Y_sum_en": {
"sum": {
"field": "langs.Y.en"
}
},
"Y_sum_es": {
"sum": {
"field": "langs.Y.es"
}
},
"Y_sum_zh": {
"sum": {
"field": "langs.Y.zh"
}
},
"sum_docLang": {
"terms": {
"field": "docLang.keyword",
"size": 10
}
}
}
}

Aggregates in Nest (Elastic) with filter having both nested and parent objects

I have a catalog of products that I want to calculate aggregates on. The trouble comes with trying to do nested aggregations with filter that has both nested and parent fields in it. Either it gives wrong counts or 0 hits. Here is a sample of my product object mapping:
"Products": {
"properties": {
"ProductID": {
"type": "long"
},
"ProductType": {
"type": "long"
},
"ProductName": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"Prices": {
"type": "nested",
"properties": {
"CurrencyType": {
"type": "integer"
},
"Cost": {
"type": "double"
}
}
}
}
}
Here is an example of the sql query that I am trying to replicate in elastic:
SELECT PRODPR.Cost AS PRODPR_Cost
,COUNT(PROD.ProdcutID) AS PROD_ProductID_Count
FROM Products PROD WITH (NOLOCK)
LEFT OUTER JOIN Prices PRODPR WITH (NOLOCK) ON (PRODPR.objectid = PROD.objectid)
WHERE PRODPR.CurrencyType = 4
AND PROD.ProductType IN (
11273
,11293
,11294
)
GROUP BY PRODPR.Cost
Elastic Search queries I came up with:
First One (following query returns correct counts with just CurrencyType as filter but when I add ProductType filter, it gives me wrong counts)
GET /IndexName/Products/_search
{
"aggs": {
"price_agg": {
"filter": {
"bool": {
**"must": [
{
"nested": {
"path": "Prices",
"filter": {
"term": {
"Prices.CurrencyType": "8"
}
}
}
},
{
"terms": {
"ProductType": [
"11273",
"11293",
"11294"
]
}
}
]**
}
},
"aggs": {
"price_nested_agg": {
"nested": {
"path": "Prices"
},
"aggs": {
"59316518_group_agg": {
"terms": {
"field": "Prices.Cost",
"size": 0
},
"aggs": {
"product_count": {
"reverse_nested": { },
"aggs": {
"ProductID_count_agg": {
"value_count": {
"field": "ProductID"
}
}
}
}
}
}
}
}
}
}
},
"size": 0
}
Second One (following query returns correct counts with just CurrencyType as filter but when I add ProductType filter, it gives me 0 hits):
GET /IndexName/Prodcuts/_search
{
"aggs": {
"price_agg": {
"nested": {
"path": "Prices"
},
"aggs": {
"currency_filter": {
"filter": {
"bool": {
"must": [
{
"term": {
"Prices.CurrrencyType": "4"
}
},
{
"terms": {
"ProductType": [
"11273",
"11293"
]
}
}
]
}
},
"aggs": {
"59316518_group_agg": {
"terms": {
"field": "Prices.Cost",
"size": 0
},
"aggs": {
"product_count": {
"reverse_nested": {},
"aggs": {
"ProductID_count_agg": {
"value_count": {
"field": "ProductID"
}
}
}
}
}
}
}
}
}
}
},
"size": 0
}
I have tried some more queries but the above two are the closest I came up with. Has anyone come across this use case? What am I doing wrong? Any help is appreciated. Thanks!

Resources