Elasticsearch painless scripting with dates - elasticsearch

I have a document which has a date field. I'd like to sort by documents by the this date ASC, but ones with a date in the past i'd like at the end.
In my end, it's like i want to assign the document value to a new value:
- If date is > "utc now", then assign value to whatever the date is
- If date is < "utc now", then assign value to max date
Then, i can sort by this field ASC.
So, it seems the only way to achieve this is with painless scripting.
This is what i've got so far, works.. but not sure if it's the correct approach.
GET /listings/_search
{
"track_total_hits": true,
"from": 0,
"query": {
"match_all": {}
},
"size": 48,
"sort": [
{
"_script" : {
"type": "string",
"script": {
"lang": "painless",
"source": "if (doc['auctionOn.utc'].size() == 0) { return params['maxTimestamp'].toString(); } else { long timestampDoc = doc['auctionOn.utc'].value.toInstant().toEpochMilli();long timestampNow = new Date().getTime();if (timestampDoc > timestampNow) { return timestampDoc.toString(); } else { return params['maxTimestamp'].toString(); } }",
"params": {
"maxTimestamp": 9223372036854776000
}
},
"order": "asc"
}
}
]
}
can someone please advise if this is the correct/performant approach?

Related

Elasticsearch conditional sort on date field

I am trying to sort an Elastic Search query result on a date field, registeredAt. However, registeredAt doesn't exist in all documents returned. In that case, I would want the sort to look for the date on an alternative field, invitedAt.
If we have 3 hits which look like this:
hits = [
{
id: 'hit2'
registeredAt: '2021-06-01T23:00:00.000Z',
invitedAt: '2021-05-31T23:00:00.000Z'
},
{
id: 'hit3'
invitedAt: '2021-05-31T23:00:00.000Z'
},
{
id: 'hit1'
invitedAt: '2021-06-04T23:00:00.000Z'
},
],
then I would want the sort to return them in order from most recent to least recent: [hit1, hit2, hit3].
In each document, the sort script should look for the registeredAt field and take that date as the sort value and, if that field does not exist, look at the value for invitedAt and take that as the sort value.
In that sense, hit1 does not have a registeredAt and has the most recent date for invitedAt and, as such, should come first. hit2 has a registeredAt field and the date for that field is more recent than the invitedAt date of hit3 (which doesn't have a registeredAt field.
I have written the query as such:
client.search({
index: 'users',
track_total_hits: true,
sort: {
_script: {
type: 'number',
script: {
lang: 'painless',
source:
"if (!doc.containsKey('registeredAt') || doc['registeredAt'].empty) { return doc['invitedAt'].value; } else { return doc['registeredAt'].value }",
},
order: 'desc',
},
},
body: {
from: skip,
size: limit,
query: {...},
},
})
The query runs without errors but the sorting does not work and the documents are returned in the order that they were indexed in.
I assume that registeredAt and invitedAt are date in the mapping.
This query should work. What I added is calling .getMillis() after getting the value.
{
"sort": [
{
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": """
if (!doc.containsKey('registeredAt') || doc['registeredAt'].empty) {
return doc['invitedAt'].value.getMillis();
}
else {
return doc['registeredAt'].value.getMillis();
}
"""
},
"order": "desc"
}
}
]
}
Edit: .getMillis() is depricated in version 7.x. .toInstant().toEpochMilli() should be used instead.
This is the query:
{
"sort": [
{
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": """
if (!doc.containsKey('registeredAt') || doc['registeredAt'].empty) {
return doc['invitedAt'].value.toInstant().toEpochMilli();
}
else {
return doc['registeredAt'].value.toInstant().toEpochMilli();
}
"""
},
"order": "desc"
}
}
]
}

On ElasticSearch, how to search based on the number of items in a field?

On Elasticsearch I have a field named Itinerary that can contain multiple values (from 1 up to 6), for example in the picture below there's 2 items in the field.
"Itinerary": [
{
"Carrier": "LH",
"Departure": "2021-07-04T06:55:00Z",
"Number": "1493",
"Arrival": "2021-07-04T08:40:00Z",
},
{
"Carrier": "LH",
"Departure": "2021-07-04T13:30:00Z",
"Number": "422",
"Arrival": "2021-07-04T16:05:00Z",
}
}
]
Is there a way to query the number of results that contains a certain amount of items in this particular field? Something like:
query: {
match: {
number_of_items_in_Itinerary_field : 4
}
}
You can achieve this by using a script query like this:
{
"query": {
"bool": {
"filter": {
"script": {
"script": {
"source": "doc['my_array_field'].length > params.param1",
"lang": "painless",
"params": {
"param1": 1
}
}
}
}
}
}
}
Here is more Information on script queries: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-query.html
and specifically on array types in painless scripts: https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-operators-array.html

Aggregations on nested documents with painless scripting

USING ELASTIC SEARCH 6.2
So I have a deeply nested document structure which has all the proper mapping (nested, text, keyword, etc). A sample document is as follows:
{
"type": "Certain Type",
"lineItems": [
{
"lineValue": 10,
"events": [
{
"name": "CREATED",
"timeStamp": "TIME VALUE"
},
{
"name": "ENDED",
"timeStamp": "TIME VALUE"
}
]
}
]
}
What I want to do is find out the average time required for all lines to go from CREATED to ENDED.
I created the following query
GET /_search
{
"size": 0,
"query": {
"match": {
"type": "Certain Type"
}
},
"aggs": {
"avg time": {
"nested": {
"path": "lineItems.events"
},
"aggs": {
"avg time": {
"avg": {
"script": {
"lang": "painless",
"source": """
long timeDiff = 0;
long fromTime = 0;
long toTime = 0;
if(doc['lineItems.events.name.keyword'] == "CREATED"){
fromTime = doc['lineItems.events.timeValue'].value.getMillis();
}
else if(doc['lineItems.events.name.keyword'] == "ENDED"){
toTime = doc['lineItems.events.timeValue'].value.getMillis();
}
timeDiff = toTime-fromTime;
return (timeDiff)
"""
}
}
}
}
}
}
}
The Result was that I got 0 as the aggregation result which is wrong.
Is there any way to achieve this?
Use doc[ in nested object script does not work as nested are a new document for elastic search.
Use params._source instead (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-script-fields.html). Note access to source would be really slow, if you have a lot of documents or if you need to request this query a lot, consider add this field on main document.
I consider all value exist, add if robustness test if needed, this should work.
long toTime = 0;
long fromTime = 0;
timeDiff = params['_source']['ENDED']
fromTime = params['_source']['CREATED']
return (toTime - fromTime);

How to get the latest value for the bucket in Elasticsearch?

I have a bunch of documents with just count field.
I'm trying to get the latest value for that field aggregated by date:
{
"query": {
"match_all": {}
},
"sort": "_timestamp",
"aggs": {
"result": {
"date_histogram": {
"field": "_timestamp",
"interval": "day",
"min_doc_count": 0
},
"aggs": {
"last_value": {
"scripted_metric": {
"params": {
"_agg": {
"last_value": 0
}
},
"map_script": "_agg.last_value = doc['count'].value",
"reduce_script": "return _aggs.last().last_value"
}
}
}
}
}
}
But the problem here is that documents fall into last_value aggregation not sorted by _timestamp, so I can't guarantee that the last value is really the last value.
So, my questions:
Is it possible to sort data by _timestamp when performing last_value aggregation?
Is there any better way to get the last value aggregated by day?
Looks like it is possible to tune scripted_metric aggregations a little bit to solve the first part of the question (sorting by _timestamp):
"last_value": {
"scripted_metric": {
"params": {
"_agg": {
"value": 0,
"timestamp": 0
}
},
"map_script": "_agg.value = doc['count'].value; _agg.timestamp = doc['_timestamp'].value",
"reduce_script": "value = 0; timestamp=0; for (a in _aggs) { if(a.timestamp > timestamp){ value = a.value; timestamp = a.timestamp} }; return value;"
}
}
But I continue to doubt that this is the best way to solve that

How to subtract aggregate min from aggreagate max(difference) in ES?

How to write an ES query to find the difference between max and min value of a field?
I am a newbee in elastic search,
In my case I feed lot of events along with session_id and time in to elastic search.
My event structure is
Event_name string `json:"Event_name"`
Client_id string `json:"Client_id"`
App_id string `json:"App_id"`
Session_id string `json:"Session_id"`
User_id string `json:"User_id"`
Ip_address string `json:"Ip_address"`
Latitude int64 `json:"Latitude"`
Longitude int64 `json:"Longitude"`
Event_time time.Time `json:"Time"`
I want to find the life time of a session_id based the feeded events.
For that I can retrive the maximum Event_time and minimum Event_time for a particular session_id by the following ES query.
{
"size": 0,
"query": {
"match": {
"Session_id": "dummySessionId"
}
},
"aggs": {
"max_time": {
"max": {
"field": "Time"
}
},
"min_time":{
"min": {
"field": "Time"
}
}
}
}
But what I exact want is (max_time - min_time)
How to write the ES query for the same????
Up to elasticsearch 1.1.1, this is not possible to do any arithmetic operation upon two aggregate function's reasult from elasticsearch side.
If you want then, you should do that from client side.
That is neither possible through scripts, as #eliasah suggests.
In the upcoming versions they may be add such facility.
in the 1.5.1 using the Scripted Metric Aggregation you can do this. Not sure about the performance, but it looks to work. This functionality is experimental and may be changed or removed completely in a future release.
POST test_time
POST test_time/data/1
{"Session_id":1234,"Event_time":"2014-01-01T12:00:00"}
POST test_time/data/3
{"Session_id":1234,"Event_time":"2014-01-01T14:00:00"}
GET /test_time/_search
{
"size": 0,
"aggs": {
"by_user": {
"terms": {
"field": "Session_id"
},
"aggs": {
"session_lenght_sec": {
"scripted_metric": {
"map_script": "_agg['v'] = doc['Event_time'].value",
"reduce_script": "min = null; max = null; for (a in _aggs) {if (min == null || a.v < min) { min = a.v}; if (max == null || a.v > max) { max = a.v }}; return (max-min)/1000"
}
}
}
}
}
}
###### RESPONSE #######
{
...,
"aggregations": {
"by_user": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1234,
"doc_count": 2,
"session_lenght_sec": {
"value": "7200"
}
}
]
}
}
}
This answer is bound to the Elasticsearch 7.8 version.
Followed up the #pippobaudos answer ahead. Elasticsearch has made some major changes since the answer.
the aggregation has a type 'scripted_metric' (click on the link to know more), which has new sub-attributes such as init_script, map_script, combine_script, reduce_script. Out of which, only init_script is optional. Following is the modified query.
"aggs": {
"cumulative":{
"scripted_metric": {
"init_script": {
"source": "state.stars = []"
},
"map_script": {
"source": "if (doc.containsKey('star_count')) { state.stars.add(doc['star_count'].value); }"
},
"combine_script": {
"source": "long min=9223372036854775807L,max=-9223372036854775808L; for (a in state.stars) {if ( a < min) { min = a;} if ( a > max) { max = a; }} return (max-min)"
},
"reduce_script": {
"source": "long max = -9223372036854775808L; for (a in states) { if (a != null && a > max){ max=a; } } return max "
}
}
}
}
Giving directly the query will not help you much, so I suggest you read the documentation about Script Fields and Scripting.

Resources