Elasticsearch mapping document for epoch in seconds - elasticsearch

As far as I understand, Elasticsearch only supports epoch timestamps in milliseconds. My data source is Couchbase, and the JSON documents in it have an insertEpoch field that is stored in seconds. I have been struggling to make my mapping document do the seconds-to-milliseconds conversion for me.
Here is my mapping doc:
{
"template" : "cb*",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_source" : {
"includes" : ["meta.*","doc.*"]
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "nested",
"include_in_all" : false,
"transform": {
"script": "ctx._source['insertEpoch'] = ctx._source['insertEpoch'] * 1000",
"params": {},
"lang": "groovy"
}
}
}
}
}
}
The transform isn't happening.
New mapping document:
{
"template" : "wheepl",
"order" : 10,
"mappings" : {
"couchbaseCheckpoint" : {
"_source" : {
"includes" : ["doc.*"]
},
"dynamic_templates": [
{
"store_no_index": {
"match": "*",
"mapping": {
"store" : "no",
"index" : "no",
"include_in_all" : false
}
}
}
]
},
"couchbaseDoc" : {
"_timestamp" : {
"enabled" : true,
"store" : true
},
"properties" : {
"meta" : {
"type" : "object",
"include_in_all" : false
},
"doc" : {
"type" : "object",
"include_in_all" : false,
"updateEpoch" : {
"type" : "date",
"format" : "date_time",
"numeric_resolution" : "seconds"
}
}
}
}
}
}
I don't even see the _timestamp field that I should be seeing!
Here's a Kibana screenshot:
Thanks

This works in 1.6, as per https://github.com/elastic/elasticsearch/pull/10420.
Even though the date is kept internally in milliseconds, you can index it in seconds and retrieve it in seconds, i.e. exactly as you indexed it.
I've tried out a simple test, to see this in action:
PUT /test_dates
{
"mappings": {
"test": {
"properties": {
"time_stamp": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}
Test data:
POST /test_dates/test/1
{
"time_stamp": "9231200"
}
Retrieving it:
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_dates",
"_type": "test",
"_id": "1",
"_score": 1,
"_source": {
"time_stamp": "9231200"
}
}
]
}
To prove it works, running this aggregation:
GET /test_dates/test/_search?search_type=count
{
"aggs": {
"NAME": {
"date_histogram": {
"field": "time_stamp",
"interval": "second",
"format": "yyyy-MM-dd"
}
}
}
}
returns
"aggregations": {
"NAME": {
"buckets": [
{
"key_as_string": "1970-04-17",
"key": 9231200000,
"doc_count": 1
}
]
}
}
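Also, your template is slightly wrong: the updateEpoch field definition must sit under a "properties" key inside doc rather than directly inside doc. It should be: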
"couchbaseDocument": {
"_timestamp": {
"enabled": true,
"store": true
},
"properties": {
"meta": {
"type": "object",
"include_in_all": false
},
"doc": {
"type": "object",
"include_in_all": false,
"properties": {
"updateEpoch": {
"type": "date",
"format": "date_time",
"numeric_resolution": "seconds"
}
}
}
}
}

Related

Using Metric Aggregation with Composite Aggregation

I have the following mapping for an index:
{
"test5" : {
"mappings" : {
"dynamic" : "false",
"properties" : {
"messageType" : {
"type" : "keyword"
},
"groupId" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"request" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"data" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"chargingPeriods" : {
"type" : "nested",
"include_in_root" : true,
"properties" : {
"endDateTime" : {
"type" : "date"
},
"power" : {
"type" : "double"
},
"startDateTime" : {
"type" : "date"
}
}
}
}
}
}
}
}
}
}
}
}
}
For my first use case, I want buckets in 2-minute intervals based on payload.request.data.chargingPeriods.startDateTime and groupId, filtered by messageType. Note that chargingPeriods is an array.
This query works for that use case:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Now I want metric aggregations computed on the composite buckets that are returned, so I tried this:
GET test5/_search
{
"size": 0,
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{ "sessionId": { "terms": { "field": "groupId"} } },
{
"date" : {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
According to the ES documentation (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html), this should work by computing the metric aggregation on each composite bucket.
But instead of the metric aggregation being computed per composite bucket, it is computed across all the power fields of the chargingPeriods array in the entire document.
How I created the index:
PUT /test5
{
"settings": {
"number_of_shards": 1
},
"mappings" : {
"dynamic" : "false",
"properties" : {
"groupId" : {
"type" : "keyword"
},
"messageType" : {
"type" : "keyword"
},
"payload" : {
"type" : "nested",
"include_in_root": true,
"properties": {
"request": {
"type":"nested",
"include_in_root":true,
"properties": {
"data": {
"type":"nested",
"include_in_root": true,
"properties": {
"chargingPeriods": {
"type": "nested",
"include_in_root": true,
"properties" : {
"endDateTime":{
"type": "date"
},
"power": {
"type": "double"
},
"startDateTime":{
"type": "date"
}
}
}
}
}
}
}
}
}
}
}
}
Test Data:
POST test5/_doc/testdocu1
{
"groupId": "563",
"messageType": "test",
"payload": {
"request": {
"data": {
"chargingPeriods": [
{
"endDateTime": "2022-10-13T17:42:25Z",
"power": 9.62857,
"startDateTime": "2022-10-13T17:41:55Z"
},
{
"endDateTime": "2022-10-13T17:42:55Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:25Z"
},
{
"endDateTime": "2022-10-13T17:43:25Z",
"power": 9.6491,
"startDateTime": "2022-10-13T17:42:55Z"
},
{
"endDateTime": "2022-10-13T17:43:55Z",
"power": 9.66963,
"startDateTime": "2022-10-13T17:43:25Z"
},
{
"endDateTime": "2022-10-13T17:44:25Z",
"power": 9.67128,
"startDateTime": "2022-10-13T17:43:55Z"
},
{
"endDateTime": "2022-10-13T17:44:55Z",
"power": 9.65079,
"startDateTime": "2022-10-13T17:44:25Z"
},
{
"endDateTime": "2022-10-13T17:45:25Z",
"power": 9.66492,
"startDateTime": "2022-10-13T17:44:55Z"
},
{
"endDateTime": "2022-10-13T17:45:55Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:25Z"
},
{
"endDateTime": "2022-10-13T17:46:25Z",
"power": 9.68544,
"startDateTime": "2022-10-13T17:45:55Z"
},
{
"endDateTime": "2022-10-13T17:46:55Z",
"power": 9.67434,
"startDateTime": "2022-10-13T17:46:25Z"
}
]
}
}
}
}
My output:
"aggregations" : {
"my_buckets" : {
"after_key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"buckets" : [
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:42:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:44:00"
},
"doc_count" : 4,
"metricAgg" : {
"value" : 9.68544
}
},
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:46:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
}
]
}
}
As you can see, it chose the max payload.request.data.chargingPeriods.power across all the elements, ignoring the composite buckets. For example, in this bucket:
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.68544
}
},
metricAgg should have been 9.62857.
It doesn't work the way you expect because you're aggregating nested data that is mapped with include_in_root. As a result, all the nested data is also copied into the root document as if it were not nested, and the relation between each startDateTime and its power is lost.
The other issue is that your composite aggregation combines nested data (payload...) with non-nested data (groupId); that won't work.
However, if you add the groupId field inside each element of your array, then you can make your query work like this:
GET test5/_search
{
"size": 0,
"aggs": {
"payload": {
"nested": {
"path": "payload"
},
"aggs": {
"request": {
"nested": {
"path": "payload.request"
},
"aggs": {
"data": {
"nested": {
"path": "payload.request.data"
},
"aggs": {
"charging": {
"nested": {
"path": "payload.request.data.chargingPeriods"
},
"aggs": {
"my_buckets": {
"composite": {
"sources": [
{
"sessionId": {
"terms": {
"field": "payload.request.data.chargingPeriods.groupId"
}
}
},
{
"date": {
"date_histogram": {
"field": "payload.request.data.chargingPeriods.startDateTime",
"fixed_interval": "2m",
"format": "MM/dd/yyyy - hh:mm:ss",
"order": "asc"
}
}
}
]
},
"aggregations": {
"metricAgg": {
"max": {
"field": "payload.request.data.chargingPeriods.power"
}
}
}
}
}
}
}
}
}
}
}
}
},
"query": {
"terms": {
"messageType": [
"test"
]
}
}
}
Results:
{
"key" : {
"sessionId" : "563",
"date" : "10/13/2022 - 05:40:00"
},
"doc_count" : 1,
"metricAgg" : {
"value" : 9.62857
}
},

Find distinct values in elasticsearch

Elasticsearch 7.10.0
Dynamic Mapping:
{
"mappings": {
"dynamic_templates": [{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
]
}
}
Kibana shows following mapping of the index:
{
"mappings": {
"_doc": {
"dynamic_templates": [
{
"integers": {
"match_mapping_type": "long",
"mapping": {
"type": "integer"
}
}
},
{
"strings": {
"match_mapping_type": "string",
"mapping": {
"fields": {
"raw": {
"type": "keyword"
}
},
"type": "text"
}
}
}
],
"properties": {
....filtered out other properties....
"Registry": {
"type": "text",
"fields": {
"raw": {
"type": "keyword"
}
}
},
....filtered out other properties....
}
}
}
}
GET /iptree_index_base/_search?filter_path=hits.total.value,took,hits.hits._source.Registry
{
"aggs": {
"values": {
"terms": { "field": "Registry.raw" }
}
},
"sort" : [
{"Registry.raw" : {"order" : "asc"}}
]
}
Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 19
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
},
..Rest of duplicate results filtered out
]
}
}
Desired Results:
{
"took" : 8,
"hits" : {
"total" : {
"value" : 2
},
"hits" : [
{
"_source" : {
"Registry" : "AFRINIC"
}
},
{
"_source" : {
"Registry" : "ARIN"
}
}
]
}
}
Registry.raw is a keyword. What am I missing?
You're not interested in hits, but in aggregated buckets. So the query you're looking for is this one:
GET /iptree_index_base/_search?filter_path=hits.total.value,took,aggregations.values.buckets.key
{
"size": 0,
"aggs": {
"values": {
"terms": {
"field": "Registry.raw",
"order": {
"_key": "asc"
}
}
}
}
}

elasticsearch find documents where the given number items in array has the same property value

First of all, I would like to show a simplified structure of the document.
{
"_id": "413123123",
"_source": {
"description": {
"firstLine": "this is my description",
"secondLine": "some value"
},
"InsertDetails": {
"Timestamp": "2020-06-12T11:14:36+0000"
},
"Links": [
{
"LinkDetails": {
"linkId": 2342,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 321313,
"type": "Link",
"dateCreation": "2012-08-21T08:42:09+0000",
"typeId": 404019,
"typeOfLink": "http"
}
},
{
"LinkDetails": {
"linkId": 1231,
"type": "Link",
"dateCreation": "2012-09-21T08:42:09+0000",
"typeId": 32323,
"typeOfLink": "https"
}
},
{
"LinkDetails": {
"linkId": 53434,
"type": "Link",
"dateCreation": "2012-11-21T08:42:09+0000",
"typeId": 123231,
"typeOfLink": "wss"
}
}
]
}
}
I have a problem forming a query that finds documents meeting the following requirements:
two items in the Links array have typeOfLink equal to http
the description string contains the word "this"
the found items are sorted by date, descending
The version of Elasticsearch is 2.3.2.
I've tried a query like this:
{
"query": {
"bool": {
"must": [
{
"bool": {
"must": [
{
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
],
"minimum_should_match": 2
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
The problem is that this query also returns documents that have only one item in the array with the given value. I've tried to modify this query in different ways, but without any luck.
Added mapping
{
"my_index": {
"mappings": {
"en": {
"properties": {
"InsertDetails": {
"properties": {
"Timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
},
"description": {
"properties": {
"firstLine": {
"type": "string"
},
"secondLine": {
"type": "string"
}
}
},
"Links": {
"properties": {
"LinkDetails": {
"properties": {
"linkId": {
"type": "long"
},
"type": {
"type": "string"
},
"dateCreation": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"typeOfLink": {
"type": "string"
},
"typeId": {
"type": "long"
}
}
}
}
}
}
}
}
}
}
First, you want to filter on a nested field (an array of objects).
To get coherent results, you have to map this field as a nested one.
https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
Then, you will have to use aggregations.
What you want is to aggregate only the "http" values of typeOfLink, and return results only if the aggregation contains at least 2 of them (min_doc_count: 2).
Your query will be a little more complicated:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "Links",
"query": {
"match": {
"Links.LinkDetails.typeOfLink": "http"
}
}
}
},
{
"match": {
"description.firstLine": "this"
}
}
]
}
},
"aggs": {
"links": {
"nested": {
"path": "Links"
},
"aggs": {
"http_only": {
"filter": {
"term": {
"Links.LinkDetails.typeOfLink.keyword": "http"
}
},
"aggs": {
"several_http": {
"terms": {
"field": "Links.LinkDetails.typeOfLink.keyword",
"min_doc_count": 2
}
,
"aggs": {
"complete_match": {
"top_hits": {
"size": 100
}
}
}
}
}
}
}
}
},
"sort": [
{
"InsertDetails.Timestamp": {
"order": "desc"
}
}
]
}
And your response will look like this:
"aggregations" : {
"links" : {
"doc_count" : 4,
"http_only" : {
"doc_count" : 2,
"several_http" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "http",
"doc_count" : 2,
"complete_match" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.98082924,
"hits" : [
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 0
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 2342,
"type" : "Link",
"dateCreation" : "2012-09-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
},
{
"_index" : "test3",
"_type" : "_doc",
"_id" : "ed1AkXQBD_dLYq-V78bD",
"_nested" : {
"field" : "Links",
"offset" : 1
},
"_score" : 0.98082924,
"_source" : {
"LinkDetails" : {
"linkId" : 321313,
"type" : "Link",
"dateCreation" : "2012-08-21T08:42:09+0000",
"typeId" : 404019,
"typeOfLink" : "http"
}
}
}
]
}
}
}
]
}
}
}
}
By playing with the given aggregation you should be able to do what you want.

Counting search results in ElasticSearch by a nested property

Here is a schema with a nested property.
{
"dynamic": "strict",
"properties" : {
"Id" : {
"type": "integer"
},
"Name_en" : {
"type": "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"normalizer": "cloudbuy_normalizer_alphanumeric"
},
"text" : {
"type" : "text",
"analyzer": "english"
}
}
},
"Menus" : {
"type" : "nested",
"properties" : {
"Id" : {
"type" : "integer"
},
"Name" : {
"type" : "keyword",
"normalizer": "normalizer_alphanumeric"
},
"AncestorsIds" : {
"type" : "integer"
}
}
}
}
}
And here is a document.
{
"Id": 12781279
"Name": "Thing of purpose made to fit",
"Menus": [
{
"Id": -571057,
"Name": "Top level menu",
"AncestorsIds": [
-571057
]
}
,
{
"Id": 1022313,
"Name": "Other",
"AncestorsIds": [
-571057
,
1022313
]
}
]
}
For any given query I need a list with two columns: the Menu.Id and the number of documents in the result set that have that Menu.Id in their Menus array.
How?
(Is there any documentation for aggs that isn't impenetrable?)
@Richard, does this query suit your needs?
POST yourindex/_search
{
"_source": "false",
"aggs":{
"menus": {
"nested": {
"path": "Menus"
},
"aggs":{
"menu_aggregation": {
"terms": {
"field": "Menus.Id",
"size": 10
}
}
}
}
}
Output :
"aggregations": {
"menus": {
"doc_count": 2,
"menu_aggregation": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": -571057,
"doc_count": 1
},
{
"key": 1022313,
"doc_count": 1
}
]
}
}
Here we specify a nested path and then aggregate on the menu Ids.
You can take a look at this documentation page : https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-nested-aggregation.html

How to filter by the size of an array in nested type?

Let's say I have the following type:
{
"2019-11-04": {
"mappings": {
"_doc": {
"properties": {
"labels": {
"type": "nested",
"properties": {
"confidence": {
"type": "float"
},
"created_at": {
"type": "date",
"format": "strict_date_optional_time||date_time||epoch_millis"
},
"label": {
"type": "keyword"
},
"updated_at": {
"type": "date",
"format": "strict_date_optional_time||date_time||epoch_millis"
},
"value": {
"type": "keyword",
"fields": {
"numeric": {
"type": "float",
"ignore_malformed": true
}
}
}
}
},
"params": {
"type": "object"
},
"type": {
"type": "keyword"
}
}
}
}
}
}
And I want to filter by the size/length of the labels array. I've tried the following (as the official docs suggest):
{
"query": {
"bool": {
"filter": {
"script": {
"script": {
"source": "doc['labels'].size > 10"
}
}
}
}
}
}
but I keep getting:
{
"error": {
"root_cause": [
{
"type": "script_exception",
"reason": "runtime error",
"script_stack": [
"org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:81)",
"org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:39)",
"doc['labels'].size > 10",
" ^---- HERE"
],
"script": "doc['labels'].size > 10",
"lang": "painless"
}
],
"type": "search_phase_execution_exception",
"reason": "all shards failed",
"phase": "query",
"grouped": true,
"failed_shards": [
{
"shard": 0,
"index": "2019-11-04",
"node": "kk5MNRPoR4SYeQpLk2By3A",
"reason": {
"type": "script_exception",
"reason": "runtime error",
"script_stack": [
"org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:81)",
"org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:39)",
"doc['labels'].size > 10",
" ^---- HERE"
],
"script": "doc['labels'].size > 10",
"lang": "painless",
"caused_by": {
"type": "illegal_argument_exception",
"reason": "No field found for [labels] in mapping with types []"
}
}
}
]
},
"status": 500
}
I'm afraid that is not possible, because labels is not a field that ES stores or, for that matter, creates an inverted index on.
doc['fieldname'] is only applicable to fields on which an inverted index is created, and Elasticsearch's Query DSL likewise only works on such fields; unfortunately, a nested type is not a field on which an inverted index is created.
Having said that, here are two ways of doing this.
For the sake of simplicity, I've created a sample mapping, sample documents and two possible solutions which may help you.
Mapping:
PUT my_sample_index
{
"mappings": {
"properties": {
"myfield": {
"type": "nested",
"properties": {
"label": {
"type": "keyword"
}
}
}
}
}
}
Sample Documents:
// single field inside 'myfield'
POST my_sample_index/_doc/1
{
"myfield": {
"label": ["New York", "LA", "Austin"]
}
}
// two fields inside 'myfield'
POST my_sample_index/_doc/2
{
"myfield": {
"label": ["London", "Leicester", "Newcastle", "Liverpool"],
"country": "England"
}
}
Solution 1: Using Script Fields (Managing at Application Level)
I have a workaround that gets you close to what you want: not exactly, but it lets you filter the results in your service layer or application.
POST my_sample_index/_search
{
"_source": "*",
"query": {
"bool": {
"must": [
{
"match_all": {}
}
]
}
},
"script_fields": {
"label_size": {
"script": {
"lang": "painless",
"source": "params['_source']['labels'].size() > 1"
}
}
}
}
Notice that in the response a separate field, label_size, is created with a true or false value.
A sample response is something like below:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "my_sample_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"myfield" : {
"label" : [
"New York",
"LA",
"Austin"
]
}
},
"fields" : {
"label_size" : [ <---- Scripted Field
false
]
}
},
{
"_index" : "my_sample_index",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"myfield" : {
"country" : "England",
"label" : [
"London",
"Leicester",
"Newcastle",
"Liverpool"
]
}
},
"fields" : { <---- Scripted Field
"label_size" : [
true <---- True because it has two fields 'labels' and 'country'
]
}
}
]
}
}
Note that only the second document makes sense, as it has two fields, i.e. country and label. However, if you only want the docs where label_size is true, that would have to be managed at your application layer.
Solution 2: Reindexing with labels.size using Script Processor
Create a new index as below:
PUT my_sample_index_temp
{
"mappings": {
"properties": {
"myfield": {
"type": "nested",
"properties": {
"label": {
"type": "keyword"
}
}
},
"labels_size":{ <---- New Field where we'd store the size
"type": "integer"
}
}
}
}
Create the below pipeline:
PUT _ingest/pipeline/set_labels_size
{
"description": "sets the value of labels size",
"processors": [
{
"script": {
"source": """
ctx.labels_size = ctx.myfield.size();
"""
}
}
]
}
Use the Reindex API to reindex from the my_sample_index index:
POST _reindex
{
"source": {
"index": "my_sample_index"
},
"dest": {
"index": "my_sample_index_temp",
"pipeline": "set_labels_size"
}
}
Verify the documents in my_sample_index_temp using GET my_sample_index_temp/_search
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "my_sample_index_temp",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"labels_size" : 1, <---- New Field Created
"myfield" : {
"label" : [
"New York",
"LA",
"Austin"
]
}
}
},
{
"_index" : "my_sample_index_temp",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"labels_size" : 2, <----- New Field Created
"myfield" : {
"country" : "England",
"label" : [
"London",
"Leicester",
"Newcastle",
"Liverpool"
]
}
}
}
]
}
}
Now you can simply use the labels_size field in your query, which is much easier and, not to mention, more efficient.
Hope this helps!
You can solve it with a custom score approach:
GET 2019-11-04/_search
{
"min_score": 0.1,
"query": {
"function_score": {
"query": {
"match_all": {}
},
"functions": [
{
"script_score": {
"script": {
"source": "params['_source']['labels'].length > 10 ? 1 : 0"
}
}
}
]
}
}
}

Resources