Elasticsearch need to find users having birthday in current week - elasticsearch

I'm a newbie in Elasticsearch.
I have a list of users in my index. Each document has a birthdate key, which is stored as a Unix timestamp.
Now I would like to find the users who have a birthday coming up this week, the same way it can be done in MySQL by comparing only the day and month.
I have tried setting the date format to yyyy-MM-dd, but I am still not able to get it working.
I then created a new key that stores the date in dd-MM format, and that worked for me with a range condition.
This is what I tried for the dd-MM format:
GET /demo/_search
{
"query": {
"range": {
"birth_date_format": {
"gte": "30-06",
"lte": "30-06",
"format": "dd-MM"
}
}
}
}
But I would like to search on the birth date stored as yyyy-MM-dd or as a timestamp. How can I do that? Otherwise, I can fall back to the dd-MM approach by adding the new key.

Explanation:
now-6d = subtracts 6 days from the current time
/w = rounds the result to a week boundary (up or down, depending on the range operator used)
This is called date math in Elasticsearch.
{
"_source":["birth_date_format"],
"query": {
"range": {
"birth_date_format": {
"gt": "now-6d/w",
"lt":"now+6d/w"
}
}
},
"size":100
}
This will do the trick for you.
Sample data:
[
{
"_source": {
"birth_date_format": "2020-06-23"
}
},
{
"_source": {
"birth_date_format": "2020-06-22"
}
},
{
"_source": {
"birth_date_format": "2020-06-21"
}
},
{
"_source": {
"birth_date_format": "2020-06-20"
}
},
{
"_source": {
"birth_date_format": "2020-06-19"
}
},
{
"_source": {
"birth_date_format": "2020-06-18"
}
},
{
"_source": {
"birth_date_format": "2020-06-17"
}
},
{
"_source": {
"birth_date_format": "2020-06-16"
}
},
{
"_source": {
"birth_date_format": "2020-06-15"
}
},
{
"_source": {
"birth_date_format": "2020-06-26"
}
},
{
"_source": {
"birth_date_format": "2020-06-28"
}
},
{
"_source": {
"birth_date_format": "2020-06-27"
}
},
{
"_source": {
"birth_date_format": "2020-06-29"
}
}
]
Output:
"hits": [
{
"_source": {
"birth_date_format": "2020-06-23"
}
},
{
"_source": {
"birth_date_format": "2020-06-22"
}
},
{
"_source": {
"birth_date_format": "2020-06-26"
}
},
{
"_source": {
"birth_date_format": "2020-06-28"
}
},
{
"_source": {
"birth_date_format": "2020-06-27"
}
}
]
UPDATE:
Elasticsearch stores dates as milliseconds since the epoch, and calculating that value requires the year.
The [built-in date formats](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats) don't include a format without a year.
Option 1:
You can keep indexing the birthday as you currently do.
You need to index the birth month as another field.
You need to index the birth day (day of the month) as another field.
You can then use these two fields to query.
There is no way to do this with scripting either, as we would need the current year to do the calculations.

Related

Search and aggregation on two indices

Two indexes are created with the dates.
First index mapping:
PUT /index_one
{
"mappings": {
"properties": {
"date_start": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSSZZ||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
}
}
}
}
Second index mapping:
PUT /index_two
{
"mappings": {
"properties": {
"date_end": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSSZZ||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
}
}
}
}
I need to find the documents whose dates fall within a certain range and compute the average of the differences between the dates.
I tried a request like this:
GET /index_one,index_two/_search?scroll=1m&q=[2021-01-01+TO+2021-12-31]&filter_path=aggregations,hits.total.value,hits.hits
{
"aggs": {
"filtered_dates": {
"filter": {
"bool": {
"must": [
{
"exists": {
"field": "date_start"
}
},
{
"exists": {
"field": "date_end"
}
}
]
}
},
"aggs": {
"avg_date": {
"avg": {
"script": {
"lang": "painless",
"source": "doc['date_end'].value.toInstant().toEpochMilli() - doc['date_begin'].value.toInstant().toEpochMilli()"
}
}
}
}
}
}
}
I get the following response to the request:
{
"hits": {
"total": {
"value": 16508
},
"hits": [
{
"_index": "index_one",
"_type": "_doc",
"_id": "93a34c5b-101b-45ea-9965-96a2e0446a28",
"_score": 1.0,
"_source": {
"date_begin": "2021-02-26 07:26:29.732+0300"
}
}
]
},
"aggregations": {
"filtered_dates": {
"meta": {},
"doc_count": 0,
"avg_date": {
"value": null
}
}
}
}
Can you please tell me if it is possible to make a query with search and aggregation over two indices in Elasticsearch? If so, how?
If you stored date_start on the document which contains date_end, it'd be much easier to figure out the average — check my answer to Store time related data in ElasticSearch.
Now, the script context operates on one single document at a time and has "no clue" about the other, potentially related docs. So if you don't store both dates at the same time in at least one doc, you'd need to somehow connect the docs nonetheless.
One option would be to use their ids:
POST index_one/_doc
{ "id":1, "date_start": "2021-01-01" }
POST index_two/_doc
{ "id":1, "date_end": "2021-12-31" }
POST index_one/_doc/2
{ "id":2, "date_start": "2021-01-01" }
POST index_two/_doc/2
{ "id":2, "date_end": "2021-01-31" }
After that, it's possible to:
Target multiple indices — as you already do.
Group the docs by their IDs and select only those groups that contain at least 2 docs (assuming the two docs represent the start & the end).
Obtain the min & max dates — essentially cherry-picking the date_start and date_end to be used later down the line.
Use a bucket_script aggregation to calculate their difference (in milliseconds).
Leverage a top-level average bucket aggregation to run over all the difference buckets and ... average them.
In concrete terms:
GET /index_one,index_two/_search?scroll=1m&q=[2021-01-01+TO+2021-12-31]&filter_path=aggregations,hits.total.value,hits.hits
{
"aggs": {
"grouped_by_id": {
"terms": {
"field": "id",
"min_doc_count": 2,
"size": 10
},
"aggs": {
"min_date": {
"min": {
"field": "date_start"
}
},
"max_date": {
"max": {
"field": "date_end"
}
},
"diff": {
"bucket_script": {
"buckets_path": {
"min": "min_date",
"max": "max_date"
},
"script": "params.max - params.min"
}
}
}
},
"avg_duration_across_the_board": {
"avg_bucket": {
"buckets_path": "grouped_by_id>diff",
"gap_policy": "skip"
}
}
}
}
If everything goes right, you'll end up with:
...
"aggregations" : {
"grouped_by_id" : {
...
},
"avg_duration_across_the_board" : {
"value" : 1.70208E10 <-- 17,020,800,000 milliseconds ~ 4,728 hrs
}
}
⚠️ Caveat: note that the 2nd level terms aggregation has an adjustable size. You'll probably need to increase it to cover more docs. But there are theoretical and practical limits as to how far it makes sense to increase it.
📖 Shameless plug: this was inspired in part by the chapter Aggregations & Buckets in my recently published Elasticsearch Handbook — containing lots of other real-world, non-trivial examples 🙌

conditionally query for fields in elasticsearch

I'm new to Elasticsearch, and before posting this question I googled for help, but I still don't understand how to write the query I need.
My problem is that I have a bunch of documents I want to query. Some of those documents have the field "DueDate" and some have "PlannedCompletionDate", but the two never exist in the same document. So I want to write a query that conditionally checks whichever field a document has and returns all matching documents.
For example, below I'm providing a sample document of each type. My query should return results from both kinds of documents, so I need a query that checks for field existence and returns the document.
"_source": {
...
"plannedCompleteDate": "2019-06-30T00:00:00.000Z",
...
}
"_source": {
...
"dueDate": "2019-07-26T07:00:00.000Z",
...
}
You can use range query with the combination of the boolean query to achieve your use case.
Adding a working example with index mapping, data, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"plannedCompleteDate": {
"type": "date",
"format": "yyyy-MM-dd"
},
"dueDate": {
"type": "date",
"format": "yyyy-MM-dd"
}
}
}
}
Index Data:
{
"plannedCompleteDate": "2019-05-30"
}
{
"plannedCompleteDate": "2020-06-30"
}
{
"dueDate": "2020-05-30"
}
Search Query:
{
"query": {
"bool": {
"should": [
{
"range": {
"plannedCompleteDate": {
"gte": "2020-01-01",
"lte": "2020-12-31"
}
}
},
{
"range": {
"dueDate": {
"gte": "2020-01-01",
"lte": "2020-12-31"
}
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "65808850",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"plannedCompleteDate": "2020-06-30"
}
},
{
"_index": "65808850",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"dueDate": "2020-05-30"
}
}
]

Elasticsearch - Trouble querying for exact date with range query

I have the following mapping definition in my events index:
{
"events": {
"mappings": {
"properties": {
"data": {
"properties": {
"reportDate": {
"type": "date",
"format": "M/d/YYYY"
}
}
}
}
}
}
And an example doc:
{
"_index": "events",
"_type": "_doc",
"_id": "12345",
"_version": 1,
"_seq_no": 90,
"_primary_term": 1,
"found": true,
"_source": {
"data": {
"reportDate": "12/4/2018",
}
}
}
My goal is query for docs with an exact data.reportDate of 12/4/2018, but when I run this query:
{
"query": {
"range": {
"data.reportDate": {
"lte": "12/4/2018",
"gte": "12/4/2018",
"format": "M/d/YYYY"
}
}
}
}
I instead get all of the docs that have a data.reportDate that is in the year 2018, not just 12/4/2018. I've tried setting relation to CONTAINS and WITHIN with no luck. Any ideas?
You need to change your date format from M/d/YYYY to M/d/yyyy. Refer to this ES official documentation to know more about date formats. You can even refer to this documentation to know about the difference between yyyy and YYYY
yyyy specifies the calendar year whereas YYYY specifies the year (of
“Week of Year”)
Adding a working example with index mapping, data, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"data": {
"properties": {
"reportDate": {
"type": "date",
"format": "M/d/yyyy"
}
}
}
}
}
}
Index Data:
{
"data": {
"reportDate": "12/3/2018"
}
}
{
"data": {
"reportDate": "12/4/2018"
}
}
{
"data": {
"reportDate": "12/5/2018"
}
}
Search Query:
{
"query": {
"bool": {
"must": {
"range": {
"data.reportDate": {
"lte": "12/4/2018",
"gte": "12/4/2018"
}
}
}
}
}
}
Search Result:
"hits": [
{
"_index": "65312594",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"data": {
"reportDate": "12/4/2018"
}
}
}
]

Elasticsearch sort based on element in array that satisfies filter

My types have a field which is an array of times in ISO 8601 format. I want to get all the listings which have a time on a certain day, and then order them by the earliest time they occur on that specific day. The problem is that my query is ordering based on the earliest time across all days.
You can reproduce the problem below.
curl -XPUT 'localhost:9200/listings?pretty'
curl -XPOST 'localhost:9200/listings/listing/_bulk?pretty' -d '
{"index": { } }
{ "name": "second on 6th (3rd on the 5th)", "times": ["2018-12-05T12:00:00","2018-12-06T11:00:00"] }
{"index": { } }
{ "name": "third on 6th (1st on the 5th)", "times": ["2018-12-05T10:00:00","2018-12-06T12:00:00"] }
{"index": { } }
{ "name": "first on the 6th (2nd on the 5th)", "times": ["2018-12-05T11:00:00","2018-12-06T10:00:00"] }
'
# because ES takes time to add them to index
sleep 2
echo "Query listings on the 6th!"
curl -XPOST 'localhost:9200/listings/_search?pretty' -d '
{
"sort": {
"times": {
"order": "asc",
"nested_filter": {
"range": {
"times": {
"gte": "2018-12-06T00:00:00",
"lte": "2018-12-06T23:59:59"
}
}
}
}
},
"query": {
"bool": {
"filter": {
"range": {
"times": {
"gte": "2018-12-06T00:00:00",
"lte": "2018-12-06T23:59:59"
}
}
}
}
}
}'
curl -XDELETE 'localhost:9200/listings?pretty'
Adding the above script to a .sh file and running it helps reproduce the issue. You'll see that the ordering is based on the 5th and not the 6th. Elasticsearch converts the times to an epoch_millis number for sorting; you can see the epoch number in the sort field in the hits object, e.g. 1544007600000. When doing an asc sort, it takes the smallest number in the array (order not important) and sorts based on that.
Somehow I need it to be ordered by the earliest time that occurs on the queried day, i.e. the 6th.
I'm currently using Elasticsearch 2.4, but even if someone can show me how it's done in the current version, that would be great.
Here is their doc on nested queries and scripting if that helps.
I think the problem here is that the nested sorting is meant for nested objects, not for arrays.
If you convert the document into one that uses an array of nested objects instead of the simple array of dates, then you can construct a nested filtered sort that works.
The following is for Elasticsearch 6.0 - they've changed the syntax a bit for 6.1 onwards, and I'm not sure how much of this works with 2.x:
Mappings:
PUT nested-listings
{
"mappings": {
"listing": {
"properties": {
"name": {
"type": "keyword"
},
"openTimes": {
"type": "nested",
"properties": {
"date": {
"type": "date"
}
}
}
}
}
}
}
Data:
POST nested-listings/listing/_bulk
{"index": { } }
{ "name": "second on 6th (3rd on the 5th)", "openTimes": [ { "date": "2018-12-05T12:00:00" }, { "date": "2018-12-06T11:00:00" }] }
{"index": { } }
{ "name": "third on 6th (1st on the 5th)", "openTimes": [ {"date": "2018-12-05T10:00:00"}, { "date": "2018-12-06T12:00:00" }] }
{"index": { } }
{ "name": "first on the 6th (2nd on the 5th)", "openTimes": [ {"date": "2018-12-05T11:00:00" }, { "date": "2018-12-06T10:00:00" }] }
So instead of the plain "times" array, we have an "openTimes" nested object, and each listing contains an array of openTimes.
Now the search:
POST nested-listings/_search
{
"sort": {
"openTimes.date": {
"order": "asc",
"nested_path": "openTimes",
"nested_filter": {
"range": {
"openTimes.date": {
"gte": "2018-12-06T00:00:00",
"lte": "2018-12-06T23:59:59"
}
}
}
}
},
"query": {
"nested": {
"path": "openTimes",
"query": {
"bool": {
"filter": {
"range": {
"openTimes.date": {
"gte": "2018-12-06T00:00:00",
"lte": "2018-12-06T23:59:59"
}
}
}
}
}
}
}
}
The main difference here is the slightly different query, since you need to use a "nested" query to filter on nested objects.
And this gives the following result:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "nested-listings",
"_type": "listing",
"_id": "vHH6e2cB28sphqox2Dcm",
"_score": null,
"_source": {
"name": "first on the 6th (2nd on the 5th)"
},
"sort": [
1544090400000
]
},
{
"_index": "nested-listings",
"_type": "listing",
"_id": "unH6e2cB28sphqox2Dcm",
"_score": null,
"_source": {
"name": "second on 6th (3rd on the 5th)"
},
"sort": [
1544094000000
]
},
{
"_index": "nested-listings",
"_type": "listing",
"_id": "u3H6e2cB28sphqox2Dcm",
"_score": null,
"_source": {
"name": "third on 6th (1st on the 5th)"
},
"sort": [
1544097600000
]
}
]
}
}
I don't think you can actually select a single value from an array in ES, so for sorting, you were always going to be sorting on all the results. The best you can do with a plain array is choose how you treat that array for sorting purposes (use lowest, highest, mean, etc).

How to store date range data in elastic search (aws) and search for a range?

I am trying to store hotel room availability in Elasticsearch. And then I need to
search for rooms that are available from one date until another date. I have come up with
two ways to store the availability data, and they are as follows:
Here the availability dictionary stores all dates, and the value of each date key is true or false, representing whether the room is available on
that day or not.
{
"_id": "khg2uo47tyhgjwebu7624787",
"room_type": "garden view",
"hotel_name": "Cool hotel",
"hotel_id": "jytu64r982u0299023",
"room_metadata1": 233,
"room_color": "black",
"availability": {
"2016-07-01": true,
"2016-07-02": true,
"2016-07-03": false,
"2016-07-04": true,
"2016-07-05": true,
"2016-07-06": null,
"2016-07-07": true,
"2016-07-08": true,
----
----
for 365 days
}
}
Here the availability array only stores the dates on which the room is available
{
"_id": "khg2uo47tyhgjwebu7624787",
"room_type": "garden view",
"hotel_name": "Cool hotel",
"hotel_id": "jytu64r982u0299023",
"room_metadata1": 535,
"room_color": "black",
"availability": ["2016-07-01", "2016-07-02", "2016-07-04", "2016-07-05", "2016-07-07", "2016-07-08"] ---for 365 days
}
}
I want to search for all rooms that are available from from_date until to_date, and that search should look into the availability dictionary or array. My date range may span up to 365 days.
How should I store this availability data so that I can perform the above search easily?
I could not find any way to search through a range of dates, so any suggestions?
Please note that the items
in availability may not be kept sorted. And I may have more than 100 million records to search through.
One way to model this would be with parent/child documents. Room documents would be parent documents and availability documents would be their child documents. For each room, there would be one availability document per date the room is available. Then, at query time, we can query for parent rooms which have one availability child document for each date in the searched interval (even disjoint ones).
Note that you'll need to make sure that as soon as a room is booked, you remove the corresponding child documents for each booked date.
Let's try this out. First create the index:
PUT /rooms
{
"mappings": {
"room": {
"properties": {
"room_num": {
"type": "integer"
}
}
},
"availability": {
"_parent": {
"type": "room"
},
"properties": {
"date": {
"type": "date",
"format": "date"
},
"available": {
"type": "boolean"
}
}
}
}
}
Then add some data
POST /rooms/_bulk
{"_index": { "_type": "room", "_id": 233}}
{"room_num": 233}
{"_index": { "_type": "availability", "_id": "20160701", "_parent": 233}}
{"date": "2016-07-01"}
{"_index": { "_type": "availability", "_id": "20160702", "_parent": 233}}
{"date": "2016-07-02"}
{"_index": { "_type": "availability", "_id": "20160704", "_parent": 233}}
{"date": "2016-07-04"}
{"_index": { "_type": "availability", "_id": "20160705", "_parent": 233}}
{"date": "2016-07-05"}
{"_index": { "_type": "availability", "_id": "20160707", "_parent": 233}}
{"date": "2016-07-07"}
{"_index": { "_type": "availability", "_id": "20160708", "_parent": 233}}
{"date": "2016-07-08"}
Finally, we can start querying. First, let's say we want to find a room that is available on 2016-07-01:
POST /rooms/room/_search
{
"query": {
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-01"
}
}
}
}
}
=> result: room 233
Then, let's try searching for a room available from 2016-07-01 to 2016-07-03
POST /rooms/room/_search
{
"query": {
"bool": {
"minimum_should_match": 3,
"should": [
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-01"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-02"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-03"
}
}
}
}
]
}
}
}
=> Result: No rooms
However, searching for a room available from 2016-07-01 to 2016-07-02 does yield room 233
POST /rooms/room/_search
{
"query": {
"bool": {
"minimum_should_match": 2,
"should": [
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-01"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-02"
}
}
}
}
]
}
}
}
=> Result: Room 233
We can also search for disjoint intervals, say from 2016-07-01 to 2016-07-02 + from 2016-07-04 to 2016-07-05
POST /rooms/room/_search
{
"query": {
"bool": {
"minimum_should_match": 4,
"should": [
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-01"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-02"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-04"
}
}
}
},
{
"has_child": {
"type": "availability",
"query": {
"term": {
"date": "2016-07-05"
}
}
}
}
]
}
}
}
=> Result: Room 233
And so on... The key point is to add one has_child query per date you need to check availability for and set minimum_should_match to the number of dates you're checking.
UPDATE
Another option would be to use a script filter, but with 100 million documents, I'm not certain it would scale that well.
In this scenario you can keep your original design (preferably the second one, because with the first one, you'll create too many unnecessary fields in your mapping) and the query would look like this:
POST /rooms/room/_search
{
"query": {
"bool": {
"filter": {
"script": {
"script": {
"inline": "def dates = doc.availability.sort(false); from = Date.parse('yyyy-MM-dd', from); to = Date.parse('yyyy-MM-dd', to); def days = to - from; def fromIndex = doc.availability.values.indexOf(from.time); def toIndex = doc.availability.values.indexOf(to.time); return days == (toIndex - fromIndex)",
"params": {
"from": "2016-07-01",
"to": "2016-07-04"
}
}
}
}
}
}
}
I am new and just learning ES. What are the disadvantages of this setup/mapping?
ciao..remco

Resources