Elasticsearch Bucket sort on nested field - elasticsearch

I have a problem with a bucket sort on a nested field in Elasticsearch 7.1.0.
My index has the following mapping:
{
"mapping": {
"dynamic": "strict",
"properties": {
"created_at_timestamp": {
"type": "date"
},
"url": {
"type": "keyword"
},
"title": {
"type": "keyword"
},
"entities": {
"type": "nested",
"properties": {
"counter": {
"type": "long"
},
"metric": {
"type": "long"
},
"id": {
"type": "long"
},
"relevance": {
"type": "float"
},
"weighted_metric": {
"type": "float"
}
}
}
}
}
}
and I need to order these documents by "weighted_metric", filtered for a specific entity id. I wrote this query:
GET my_index/_search?size=0
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "entities",
"query": {
"term": {
"entities.id": "27374"
}
}
}
}
],
"must_not": [
{
"term": {
"title": {
"value": ""
}
}
}
]
}
},
"aggs": {
"by_url_and_title": {
"composite": {
"sources": [
{
"final_url": {
"terms": {
"field": "final_url"
}
}
},
{
"title": {
"terms": {
"field": "title"
}
}
}
]
},
"aggs": {
"sum_metric": {
"nested": {
"path": "entities"
},
"aggs": {
"weightedmetric": {
"filters": {
"filters": {
"new": {
"bool": {
"should": [
{
"term": {
"entities.id": "27374"
}
}
]
}
}
}
},
"aggs": {
"wmetric": {
"sum": {
"field": "entities.weighted_metric"
}
}
}
},
"w_sort": {
"bucket_sort": {
"sort": [
{
"weightedmetric.wmetric": {
"order": "desc"
}
}
],
"size": 10
}
}
}
}
}
}
}
}
And I get this error:
{
"error": {
"root_cause": [],
"type": "search_phase_execution_exception",
"reason": "",
"phase": "fetch",
"grouped": true,
"failed_shards": [],
"caused_by": {
"type": "class_cast_exception",
"reason": "org.elasticsearch.search.aggregations.bucket.nested.InternalNested cannot be cast to org.elasticsearch.search.aggregations.InternalMultiBucketAggregation"
}
},
"status": 503
}
If I don't try to order the buckets everything works fine.
Can someone help me with this query? I need to order the buckets by weighted_metric. Thanks.

Related

How to nested aggregate matched terms?

I've this mapping in my index:
{
"mappings": {
"properties": {
"uuid": {
"type": "keyword"
},
"last_visit": {
"type": "date"
},
"urls": {
"type": "nested",
"properties": {
"url": {
"type": "keyword"
},
"is_visited": {
"type": "boolean"
}
}
}
}
}
}
and hundreds of data like this:
This is my desired output when I search for *google.com and *facebook.com:
[
{
"uuid": "afa9ac03-0723-4d66-ae18-08a51e2973bd",
"urls": [
{
"is_visited": true,
"url": "https://www.google.com",
"last_visit": "2022-02-31"
},
{
"is_visited": false,
"url": "https://www.facebook.com",
"last_visit": "2022-02-03"
},
{
"is_visited": true,
"url": "https://www.twitter.com",
"last_visit": "2022-03-30"
}
]
},
{
"uuid": "4a1c695d-756b-4d9d-b3a0-cf524d955884",
"urls": [
{
"is_visited": true,
"url": "https://www.stackoverflow.com",
"last_visit": "2022-03-23"
},
{
"is_visited": false,
"url": "https://www.facebook.com",
"last_visit": "2022-02-02"
},
{
"is_visited": false,
"url": "https://drive.google.com",
"last_visit": "2022-05-01"
},
{
"is_visited": true,
"url": "https://www.google.com",
"last_visit": "2022-07-09"
}
]
}
]
and this is the query I wrote (based on an earlier question in which I did not explain the desired output well), focusing on *google.com, where I try to add the last_visit field to the output:
{
"query": {
"nested": {
"path": "urls",
"query": {
"bool": {
"should": [
{
"wildcard": {
"urls.url": {
"value": "*google.com"
}
}
},
{
"wildcard": {
"urls.url": {
"value": "*facebook.com"
}
}
}
]
}
}
}
},
"aggs": {
"agg_providers": {
"nested": {
"path": "urls"
},
"aggs": {
"google.com": {
"terms": {
"field": "urls.url",
"include": ".*google.com",
"size": 10
},
"aggs": {
"top_hits": {
"top_hits": {
"size": 1,
"_source": {
"includes": ["last_visit"]
}
}
}
}
},
"facebook.com": {
"terms": {
"field": "urls.url",
"include": ".*facebook.com",
"size": 10
}
}
}
}
}
}
The query above returns 2 different bucket lists that contain only key/doc_count entries instead of all the fields (is_visited, last_visit, uuid, etc.).
Thanks.

Elasticsearch Querying Double Nested Object, Match Multiple Rows in Query Within Parent

My data model is related to patient records. At the highest level is the Patient, then their information such as Lab Panels and the individual rows of the results of the panel. So it looks like this: {Patient:{Labs:[{Results:[{}]}]}}
I am able to successfully create the two nested objects Labs nested in Patient and Results nested in Labs, populate it, and query it. What I am unable to successfully do is create a query that constrains the results to a single Lab, and then match by more than one row in the Results object.
An example is attached, where I only want labs that are "Lipid Panel" and the results are HDL <= 46 and LDL >= 140.
Any suggestions?
Example Index
PUT localhost:9200/testpipeline
{
"aliases": {},
"mappings": {
"dynamic": "false",
"properties": {
"ageAtFirstEncounter": {
"type": "float"
},
"dateOfBirth": {
"type": "date"
},
"gender": {
"type": "keyword"
},
"id": {
"type": "float"
},
"labs": {
"type": "nested",
"properties": {
"ageOnDateOfService": {
"type": "float"
},
"date": {
"type": "date"
},
"encounterId": {
"type": "keyword"
},
"id": {
"type": "keyword"
},
"isEdVisit": {
"type": "boolean"
},
"labPanelName": {
"type": "keyword"
},
"labPanelNameId": {
"type": "float"
},
"labPanelSourceName": {
"type": "text",
"store": true
},
"personId": {
"type": "keyword"
},
"processingLogId": {
"type": "float"
},
"results": {
"type": "nested",
"properties": {
"dataType": {
"type": "keyword"
},
"id": {
"type": "float"
},
"labTestName": {
"type": "keyword"
},
"labTestNameId": {
"type": "float"
},
"resultAsNumber": {
"type": "float"
},
"resultAsText": {
"type": "keyword"
},
"sourceName": {
"type": "text",
"store": true
},
"unit": {
"type": "keyword"
}
}
}
}
},
"personId": {
"type": "keyword"
},
"processingLogId": {
"type": "float"
},
"race": {
"type": "keyword"
}
}
}
}
Example Document
PUT localhost:9200/testpipeline/_doc/274746
{
"id": 274746,
"personId": "10005786.000000",
"processingLogId": 51,
"gender": "Female",
"dateOfBirth": "1945-01-01T00:00:00",
"ageAtFirstEncounter": 76,
"labs": [
{
"isEdVisit": false,
"labPanelSourceName": "Lipid Panel",
"dataType": "LAB",
"ageOnDateOfService": 76.9041,
"results": [
{
"unit": "mg/dL",
"labTestNameId": 160,
"labTestName": "HDL",
"sourceName": "HDL",
"resultAsNumber": 46.0,
"resultAsText": "46",
"id": 2150284
},
{
"unit": "mg/dL",
"labTestNameId": 158,
"labTestName": "LDL",
"sourceName": "LDL",
"resultAsNumber": 144.0,
"resultAsText": "144.00",
"id": 2150286
}
],
"id": "9ab9ba84-580b-f2d2-4d32-25658ea5f1bf",
"sourceId": 2150278,
"personId": "10003783.000000",
"encounterId": "39617217.000000",
"processingLogId": 51,
"date": "2021-11-08T00:00:00"
}
],
"lastModified": "2022-03-24T10:21:29.8682784-05:00"
}
Example Query
POST localhost:9200/testpipeline/_search
{
"fields": [
"personId",
"processingLogId",
"id",
"gender",
"ageAtFirstDOS",
"dateOfBirth"
],
"from": 0,
"query": {
"bool": {
"should": [
{
"constant_score": {
"boost": 200,
"filter": {
"bool": {
"_name": "CriteriaFilterId:2068,CriteriaId:1,CriteriaClassId:1,Points:200,T5:False,SoftScore:200",
"should": [
{
"bool": {
"must": [
{
"nested": {
"path": "labs",
"inner_hits": {
"size": 3,
"name": "labs,CriteriaFilterId:2068,CriteriaId:1,CriteriaClassId:1,Points:200,T5:False,guid:8b41f346-2861-4099-b3c0-fcd6393c367b"
},
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match_phrase": {
"labs.labPanelSourceName": {
"_name": "CriteriaFilterId:2068,Pipeline.Labs.LabPanelSourceName,es_match_phrase=>'Lipid Panel' found in text",
"query": "Lipid Panel",
"slop": 100
}
}
},
{
"nested": {
"path": "labs.results",
"inner_hits": {
"size": 3,
"name": "labs.results,CriteriaFilterId:2068,CriteriaId:1,CriteriaClassId:1,Points:200,T5:False,guid:3564e83f-958b-4fe8-848e-f9edb5d7f3b2"
},
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"bool": {
"must": [
{
"range": {
"labs.results.resultAsNumber": {
"lte": 46
}
}
},
{
"term": {
"labs.results.labTestNameId": {
"value": 160
}
}
}
]
}
},
{
"bool": {
"must": [
{
"range": {
"labs.results.resultAsNumber": {
"gte": 140.0
}
}
},
{
"term": {
"labs.results.labTestNameId": {
"value": 158
}
}
}
]
}
}
],
"minimum_should_match": 2
}
}
]
}
}
}
}
]
}
}
]
}
}
}
}
]
}
}
]
}
}
}
}
],
"minimum_should_match": 1,
"filter": [
]
}
},
"size": 10,
"sort": [
{
"_score": {
"order": "desc"
}
},
{
"processingLogId": {
"order": "asc"
}
},
{
"personId": {
"order": "asc"
}
}
],
"_source": false
}

Elasticsearch nested query with aggregation using nested term doesn't return any bucket

I have an ES index with this mapping:
{
"_doc": {
"dynamic": "false",
"properties": {
"original": {
"properties":{
"id": {
"type": "keyword"
},
"purchaseStatus": {
"type": "keyword"
},
"marketCode": {
"type": "keyword"
},
"salesProfiles": {
"type": "nested",
"properties": {
"marketCode": {
"type": "keyword"
},
"purchaseStatus": {
"type": "keyword"
}
}
}
}
},
"recommended": {
"properties":{
"id": {
"type": "keyword"
},
"purchaseStatus": {
"type": "keyword"
},
"marketCode": {
"type": "keyword"
},
"salesProfiles": {
"type": "nested",
"properties": {
"marketCode": {
"type": "keyword"
},
"purchaseStatus": {
"type": "keyword"
}
}
}
}
},
"distance": {
"type": "double"
},
"rank": {
"type": "double"
},
"source": {
"properties": {
"application": {
"type": "keyword"
},
"platform": {
"type": "keyword"
}
}
},
"timestamp": {
"properties": {
"createdAt": {
"type": "date"
},
"updatedAt": {
"type": "date"
}
}
}
}
},
"_default_": {
"dynamic": "false"
}
}
and I need to obtain the recommended docs with salesProfiles.marketCode equal to original.marketCode but my query doesn't return any buckets:
GET index/_search
{
"aggs": {
"similarities": {
"filter": {
"bool": {
"must": [
{
"term": {
"original.storefrontId": "12345"
}
},
{
"nested": {
"path": "recommended.salesProfiles",
"query": {
"bool": {
"must": [
{
"match": {
"recommended.salesProfiles.purchaseStatus": "PAID"
}
}
]
}
}
}
}
]
}
},
"aggs": {
"markets": {
"nested": {
"path": "recommended.salesProfiles"
},
"aggs": {
"recommendedMarket": {
"terms": {
"field": "recommended.salesProfiles.marketCode",
"size": 100
}
}
}
}
}
}
},
"explain": false
}
Any suggestion would be really appreciated. Thanks in advance!
It's hard to debug this without any example docs, but I think this might work:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"term": {
"original.storefrontId": "12345"
}
},
{
"nested": {
"path": "recommended.salesProfiles",
"query": {
"bool": {
"must": [
{
"match": {
"recommended.salesProfiles.purchaseStatus": "PAID"
}
}
]
}
}
}
}
]
}
},
"aggs": {
"Profiles": {
"nested": {
"path": "recommended.salesProfiles"
},
"aggs": {
"by_term": {
"terms": {
"field": "recommended.salesProfiles.marketCode",
"size": 100
}
}
}
}
}
}
I don't think you can use "nested" under the filter agg without being under a nested aggregation, so I believe that's why you didn't get any docs.
I basically moved all the filtering to the query and just aggregated the terms later

ElasticSearch sorting nested with condition

With ElasticSearch I want to insert a condition to sort nested fields.
I have this mapping
{
"dario": {
"mappings": {
"agents": {
"properties": {
"applications": {
"type": "nested",
"properties": {
"companies": {
"type": "nested",
"properties": {
"active": {
"type": "integer"
},
"application_date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"application_date_month": {
"type": "date",
"format": "yyyy-MM"
},
"application_id": {
"type": "long"
},
"assigned_date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"buy_date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"date_month": {
"type": "date",
"format": "yyyy-MM"
},
"favorite": {
"type": "integer"
},
"id": {
"type": "long"
},
"notes": {
"type": "nested",
"properties": {
"date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"id": {
"type": "integer"
},
"note": {
"type": "string",
"analyzer": "standard"
}
}
},
"score": {
"type": "long"
},
"state": {
"type": "long"
},
"view": {
"type": "integer"
},
"visible": {
"type": "integer"
},
"visible_date": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
},
"count": {
"type": "integer"
},
"sectors": {
"type": "long"
}
}
}
}
}
}
}
}
I want to sort by the field applications.companies.buy_date, but if this is null, then I want to use applications.companies.date instead.
I tried with groovy script:
{
"size": 10,
"from": 0,
"sort": [
{
"_script": {
"script": "doc['applications.companies.buy_date'] != null ? doc['applications.companies.buy_date'].date.getMillisOfDay() : doc['applications.companies.date'].date.getMillisOfDay()",
"type": "number",
"nested_filter": {
"match": {
"applications.companies.id": 711
}
},
"order": "desc"
}
}
],
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"bool": {
"must": [
{
"nested": {
"path": "applications.companies",
"query": {
"bool": {
"must": [
{
"match": {
"applications.companies.active": 1
}
},
{
"match": {
"applications.companies.id": 711
}
},
{
"bool": {
"should": [
{
"exists": {
"field": "applications.companies.buy_date"
}
},
{
"match": {
"applications.companies.favorite": 1
}
}
]
}
}
]
}
}
}
}
]
}
}
]
}
}
]
}
}
}
but nothing changes. Any ideas?
UPDATE
I resolved the issue with this solution:
{
"size": 10,
"from": 0,
"_source": [
"id"
],
"sort": [
{
"_script": {
"script": {
"script": " if (doc['applications.companies.id'].value == 711) { return (doc['applications.companies.buy_date'].value > 0) ? doc['applications.companies.buy_date'].value : doc['applications.companies.date'].value; } else { return null; } ",
"lang": "groovy"
},
"type": "number",
"order": "desc",
"nested_path": "applications.companies",
"nested_filter": {
"match": {
"applications.companies.id": 711
}
}
}
}
],
"query": {
"bool": {
"filter": [
{
"bool": {
"must": [
{
"bool": {
"must": [
{
"nested": {
"path": "applications.companies",
"query": {
"bool": {
"must": [
{
"match": {
"applications.companies.active": 1
}
},
{
"match": {
"applications.companies.id": 711
}
},
{
"bool": {
"should": [
{
"exists": {
"field": "applications.companies.buy_date"
}
},
{
"match": {
"applications.companies.favorite": 1
}
}
]
}
}
]
}
}
}
}
]
}
}
]
}
}
]
}
}
}

ElasticSearch double nested sorting

I have documents which look like this (here is example):
{
"user": "xyz",
"state": "FINISHED",
"finishedTime": 1465566467161,
"jobCounters": {
"counterGroup": [
{
"counterGroupName": "org.apache.hadoop.mapreduce.FileSystemCounter",
"counter": [
{
"name": "FILE_BYTES_READ",
"mapCounterValue": 206509212380,
"totalCounterValue": 423273933523,
"reduceCounterValue": 216764721143
},
{
"name": "FILE_BYTES_WRITTEN",
"mapCounterValue": 442799895522,
"totalCounterValue": 659742824735,
"reduceCounterValue": 216942929213
},
{
"name": "HDFS_BYTES_READ",
"mapCounterValue": 207913352565,
"totalCounterValue": 207913352565,
"reduceCounterValue": 0
},
{
"name": "HDFS_BYTES_WRITTEN",
"mapCounterValue": 0,
"totalCounterValue": 89846725044,
"reduceCounterValue": 89846725044
}
]
},
{
"counterGroupName": "org.apache.hadoop.mapreduce.JobCounter",
"counter": [
{
"name": "TOTAL_LAUNCHED_MAPS",
"mapCounterValue": 0,
"totalCounterValue": 13394,
"reduceCounterValue": 0
},
{
"name": "TOTAL_LAUNCHED_REDUCES",
"mapCounterValue": 0,
"totalCounterValue": 720,
"reduceCounterValue": 0
}
]
}
]
}
}
Now I want to sort this data to get the top 15 documents on the basis of totalCounterValue where counter.name is FILE_BYTES_READ. I have tried nested sorting on this, but no matter which key name I write in counter.name, it always sorts on the basis of HDFS_BYTES_READ. Can anyone please help me with my query?
{
"_source": true,
"size": 15,
"query": {
"bool": {
"must": [
{
"term": {
"state": {
"value": "FINISHED"
}
}
},
{
"range": {
"startedTime": {
"gte": "now - 4d",
"lte": "now"
}
}
}
]
}
},
"sort": [
{
"jobCounters.counterGroup.counter.totalCounterValue": {
"order": "desc",
"nested_path": "jobCounters.counterGroup",
"nested_filter": {
"nested": {
"path": "jobCounters.counterGroup.counter",
"filter": {
"term": {
"jobCounters.counterGroup.counter.name": "file_bytes_read"
}
}
}
}
}
}
]}
This is the mapping for jobCounters we have created:
"jobCounters": {
"type": "nested",
"include_in_parent": true,
"properties" : {
"counterGroup": {
"type": "nested",
"include_in_parent": true,
"properties": {
"counterGroupName": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"counter" : {
"type": "nested",
"include_in_parent": true,
"properties": {
"reduceCounterValue": {
"type": "long"
},
"name": {
"type": "string",
"analyzer": "english",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"totalCounterValue": {
"type": "long"
},
"mapCounterValue": {
"type": "long"
}
}
}
}
}
}
}
I followed the nested sorting documentation of ElasticSearch and came up with this query, but I don't know why it is always sorting on the totalCounterValue of HDFS_BYTES_READ irrespective of jobCounters.counterGroup.counter.name's value.
You can try something like this:
curl -XGET 'http://localhost:9200/index/jobCounters/_search' -d '
{
"size": 15,
"query": {
"nested": {
"path": "jobCounters.counterGroup.counter",
"filter": {
"term": {
"jobCounters.counterGroup.counter.name": "file_bytes_read"
}
}
}
},
"sort": [
{
"jobCounters.counterGroup.counter.totalCounterValue": {
"order": "desc",
"nested_path": "jobCounters.counterGroup",
"nested_filter": {
"nested": {
"path": "jobCounters.counterGroup.counter",
"filter": {
"term": {
"jobCounters.counterGroup.counter.name": "file_bytes_read"
}
}
}
}
}
}
]
}
'
Read the end of this document. It explains that we have to repeat the same query in nested_filter too.

Resources