I have an OpenSearch index with the following mapping (simplified):
PUT /house
{
"mappings": {
"properties": {
"house": { "type": "keyword" },
"people": {
"type": "nested",
"properties": {
"forename": { "type": "keyword" },
"surname": { "type": "keyword" }
}
}
}
}
}
I'd like to retrieve an aggregate where the bucket key is "[forename] [surname]".
Toy data:
PUT /house/_doc/1
{
"house": "house1",
"people": [
{ "forename": "Dave", "surname": "Daveson" },
{ "forename": "Jeff", "surname": "Jeffson" }
]
}
PUT /house/_doc/2
{
"house": "house1",
"people": [
{ "forename": "Dave", "surname": "Daveson" },
{ "forename": "Jeffs", "surname": "Jeffsons" }
]
}
The following doesn't return what I'd expect, and I can't figure out what object paths to put in the script to get it to work:
GET house/_search
{
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"people.name": {
"terms": {
"script": "[params._source['forename'], params._source['surname']].join(' ')"
}
}
}
}
},
"size": 0
}
Returns:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"people" : {
"doc_count" : 4,
"people.name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "null null",
"doc_count" : 4
}
]
}
}
}
}
Without script I can aggregate correctly on forename, surname or both, but using both I can't reliably "join" the results since they can be sorted only on the doc_count or key:
GET house/_search
{
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"people.forename": {
"terms": { "field": "people.forename" }
},
"people.surname": {
"terms": { "field": "people.surname" }
}
}
}
},
"size": 0
}
Returns:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"people" : {
"doc_count" : 4,
"people.surname" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Daveson",
"doc_count" : 2
},
{
"key" : "Jeffson",
"doc_count" : 1
},
{
"key" : "Jeffsons",
"doc_count" : 1
}
]
},
"people.forename" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Dave",
"doc_count" : 2
},
{
"key" : "Jeff",
"doc_count" : 1
},
{
"key" : "Jeffs",
"doc_count" : 1
}
]
}
}
}
}
You want this query, which reads the nested fields through doc values:
GET house/_search
{
"aggs": {
"people": {
"nested": {
"path": "people"
},
"aggs": {
"people.name": {
"terms": {
"script": "doc['people.forename'].value + ' ' + doc['people.surname'].value"
}
}
}
}
},
"size": 0
}
Results:
"aggregations" : {
"people" : {
"doc_count" : 4,
"people.name" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Dave Daveson",
"doc_count" : 2
},
{
"key" : "Jeff Jeffson",
"doc_count" : 1
},
{
"key" : "Jeffs Jeffsons",
"doc_count" : 1
}
]
}
}
}
Related
I have a index with documents like this
[
{
"customer_id" : "123",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-23"
...
},
{
"customer_id" : "123",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-24"
...
},
{
"customer_id" : "345",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-25"
...
}
]
I want to get the list of all documents from a specific country (e.g. USA), within a given time range, with at least 2 occurrences of the same customer_id.
With the above data, it should return
[
{
"customer_id" : "123",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-24"
...
}
]
Now, I tried the below ES query
POST /index_name/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"creation_date": {
"gte": "2021-06-23",
"lte": "2021-08-23"
}
}
},
{
"match": {
"country": "USA"
}
}
]
}
},
"aggs": {
"customer_agg": {
"terms": {
"field": "customer_id",
"min_doc_count": 2
}
}
}
}
The above query returns following result
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : 1.5587491,
"hits" : [...]
},
"aggregations" : {
"customer_agg" : {
"doc_count_error_upper_bound" : 1,
"sum_other_doc_count" : 1,
"buckets" : [
{
"key" : "customer_id",
"doc_count" : 2
}
]
}
}
I don't need the list of buckets in response, but only the list of documents satisfying the condition. How can I achieve it?
At first glance, I noticed that in the search query you are searching by a field called creation_timestamp, but in the mapping of the document you say that the date field you want to range-check is called creation_date.
I decided to test this locally on Elasticsearch 7.10 and here are the settings I used
PUT /test-index-v1
PUT /test-index-v1/_mapping
{
"properties": {
"customer_id": {
"type": "keyword"
},
"country": {
"type": "keyword"
},
"department": {
"type": "keyword"
},
"creation_date": {
"type": "date"
}
}
}
As you can see, I'm using the keyword type for these fields so that they can be used for sorting, aggregations, etc.
After I created the index I imported the documents you gave as an example
POST /test-index-v1/_doc
{
"customer_id" : "345",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-25"
}
POST /test-index-v1/_doc
{
"customer_id" : "123",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-25"
}
POST /test-index-v1/_doc
{
"customer_id" : "123",
"country": "USA",
"department": "IT",
"creation_date" : "2021-06-24"
}
Then I executed this search query including a must match on the customer_id as well:
POST /test-index-v1/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"creation_date": {
"gte": "2021-06-23",
"lte": "2021-08-23"
}
}
},
{
"match": {
"country": "USA"
}
},
{
"match": {
"customer_id": "123"
}
}
]
}
},
"aggs": {
"customer_agg": {
"terms": {
"field": "customer_id",
"min_doc_count": 2
}
}
}
}
This query will return you the search hits as well. Using only an aggregation the searchHits won't be returned.
Here is the response I received:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.6035349,
"hits" : [
{
"_index" : "test-index-v1",
"_type" : "_doc",
"_id" : "vbVD9HsBRVWFAvvZTW-l",
"_score" : 1.6035349,
"_source" : {
"customer_id" : "123",
"country" : "USA",
"department" : "IT",
"creation_date" : "2021-06-25"
}
},
{
"_index" : "test-index-v1",
"_type" : "_doc",
"_id" : "vrVD9HsBRVWFAvvZU29q",
"_score" : 1.6035349,
"_source" : {
"customer_id" : "123",
"country" : "USA",
"department" : "IT",
"creation_date" : "2021-06-24"
}
}
]
},
"aggregations" : {
"customer_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "123",
"doc_count" : 2
}
]
}
}
}
Hope this helps with your issue. Feel free to leave a comment if you have other questions regarding Elastic! :)
EDIT:
Regarding the grouping by customer_id in a certain date range I used this query:
POST /test-index-v1/_search
{
"aggs": {
"group_by_customer_id": {
"terms": {
"field": "customer_id"
},
"aggs": {
"dates_between": {
"filter": {
"range": {
"creation_date": {
"gte": "2020-06-23",
"lte": "2021-06-24"
}
}
}
}
}
}
}
}
And the response is:
"aggregations" : {
"group_by_customer_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "123",
"doc_count" : 2,
"dates_between" : {
"doc_count" : 1
}
},
{
"key" : "345",
"doc_count" : 1,
"dates_between" : {
"doc_count" : 0
}
}
]
}
}
I created an index like this:
{
"mappings":{
"properties":{
"#timestamp":{
"type":"date",
"doc_values":true
},
"event.category":{
"type":"keyword",
"index":true
},
"action":{
"type":"keyword",
"index":true
},
"success":{
"type":"boolean",
"index":true
},
"raw":{
"type":"text",
"index":false
}
}
}
}
Then I tried to use a bucket_script pipeline aggregation to calculate the success rate per action, with a search like this:
{
"size": 0,
"_source": false,
"query": {
"bool": {
"filter": [{
"term": {
"action": "login"
}
}
]
}
},
"aggs": {
"action_bucket": {
"terms": {
"field": "action",
"show_term_doc_count_error": true
},
"aggs": {
"total": {
"terms": {
"field": "action"
}
},
"action": {
"filter": {
"term": {
"success": true
}
},
"aggs": {
"success": {
"terms": {
"field": "action"
}
}
}
},
"action_success_rate": {
"bucket_script": {
"buckets_path": {
"no_total": "total.doc_count",
"no_success": "action>success.doc_count"
},
"script": "100 * params.no_success / params.no_total"
}
}
}
}
}
}
But the response contains no action_success_rate:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 15,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"action_bucket": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "login",
"doc_count": 15,
"doc_count_error_upper_bound": 0,
"total": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "login",
"doc_count": 15
}
]
},
"action": {
"doc_count": 9,
"success": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "login",
"doc_count": 9
}
]
}
}
}
]
}
}
}
How could I fix my search request body to obtain success rate?
Mapping:
{
"mappings":{
"properties":{
"#timestamp":{
"type":"date",
"doc_values":true
},
"event.category":{
"type":"keyword",
"index":true
},
"action":{
"type":"keyword",
"index":true
},
"success":{
"type":"boolean",
"index":true
},
"raw":{
"type":"text",
"index":false
}
}
}
}
Sample Data:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "idx0",
"_type" : "_doc",
"_id" : "E802dnkBHcKLV4rtwh3V",
"_score" : 1.0,
"_source" : {
"action" : "login",
"success" : true
}
},
{
"_index" : "idx0",
"_type" : "_doc",
"_id" : "FM02dnkBHcKLV4rtyx0r",
"_score" : 1.0,
"_source" : {
"action" : "login",
"success" : true
}
},
{
"_index" : "idx0",
"_type" : "_doc",
"_id" : "Fc02dnkBHcKLV4rt1x0t",
"_score" : 1.0,
"_source" : {
"action" : "login",
"success" : false
}
},
{
"_index" : "idx0",
"_type" : "_doc",
"_id" : "Fs04dnkBHcKLV4rtKR3P",
"_score" : 1.0,
"_source" : {
"action" : "logout",
"success" : false
}
},
{
"_index" : "idx0",
"_type" : "_doc",
"_id" : "F804dnkBHcKLV4rtNR3J",
"_score" : 1.0,
"_source" : {
"action" : "logout",
"success" : true
}
}
]
}
}
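Query (note: with the mapping above, where `action` is itself a `keyword` field, use `action` instead of `action.keyword`):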
{
"size": 0,
"aggs": {
"actions": {
"terms": {
"field": "action.keyword",
"size": 10
},
"aggs": {
"total": {
"value_count": {
"field": "action.keyword"
}
},
"success":{
"filter": {
"term": {
"success":true
}
},
"aggs": {
"success_cnt": {
"value_count": {
"field": "action.keyword"
}
}
}
},
"success_rate":{
"bucket_script": {
"buckets_path": {
"no_total":"total.value",
"no_success":"success>success_cnt.value"
},
"script": "(params.no_success/params.no_total)*100"
}
}
}
}
}
}
Response:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"actions" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "login",
"doc_count" : 3,
"total" : {
"value" : 3
},
"success" : {
"doc_count" : 2,
"success_cnt" : {
"value" : 2
}
},
"success_rate" : {
"value" : 66.66666666666666
}
},
{
"key" : "logout",
"doc_count" : 2,
"total" : {
"value" : 2
},
"success" : {
"doc_count" : 1,
"success_cnt" : {
"value" : 1
}
},
"success_rate" : {
"value" : 50.0
}
}
]
}
}
}
I'm trying to run a terms query on Elasticsearch and can't figure out how to limit the results to unique ones only.
Assuming this is the query.
"query": {
"bool": {
"must": [{
"terms": {
"id": [
"1",
"2",
"3"
],
"boost": 1.0
}
}],
"adjust_pure_negative": true,
"boost": 1.0
}
},
"aggs": {
"top-results": {
"terms": {
"field": "id"
},
"aggs": {
"test": {
"top_hits": {
"size": 1
}
}
}
}
}
Ideally I would like only 3 results returned, each one matching an id of 1, 2, or 3, but this query returns a lot more than that.
To mimic your scenario, I have indexed a set of 5 employee records with different salaries in Elasticsearch. I then fetch the salaries listed in the query, with one record (top hit) for each.
GET /employee/_doc/_search
{
"query": {
"bool": {
"should": [
{ "match": { "salary": 90000 }},
{ "match": { "salary": 80000 }}
]
}
},
"size" : 0,
"aggs": {
"salaries": {
"terms": {
"field": "salary",
"order": { "top_score": "desc" }
},
"aggs": {
"top_score": { "max": { "script": "_score" }},
"salary-num": { "top_hits": { "size": 1 }}
}
}
}
}
OUTPUT
{
...
"aggregations" : {
"salaries" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 80000,
"doc_count" : 2,
"top_score" : {
"value" : 1.0
},
"salary-num" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "employee",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"id" : 10,
"name" : "Lydia",
"dept" : "HR",
"salary" : 80000
}
}
]
}
}
},
{
"key" : 90000,
"doc_count" : 1,
"top_score" : {
"value" : 1.0
},
"salary-num" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "employee",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"id" : 20,
"name" : "Flora",
"dept" : "Accounts",
"salary" : 90000
}
}
]
}
}
}
]
}
}
}
How do I go about bucketing on a field and then aggregating all the values of a different field into an array? Here's a sample list.
{
"product": "xyz",
"action": "add",
"user": "bob"
},
{
"product": "xyz",
"action": "update",
"user": "bob"
},
{
"product": "xyz",
"action": "add",
"user": "alice"
},
{
"product": "xyz",
"action": "add",
"user": "eve"
},
{
"product": "xyz",
"action": "delete",
"user": "eve"
}
Expected output:
{
"buckets": [
{
"key": "add",
"doc_count": 3,
"user": ["bob", "alice", "eve"]
},
{
"key": "update",
"doc_count": 1,
"user": ["bob"]
},
{
"key": "delete",
"doc_count": 1,
"user": ["eve"]
}
]
}
How can I push the user values into an array in each bucket? Is there something similar to MongoDB's $push or $addToSet in Elasticsearch aggregations? I'd appreciate the help.
Here's the work-in-progress aggregation.
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
}
}
}
}
}
}
Would this do? I just chained one more Terms Aggregation, as shown below:
Aggregation Query:
POST <your_index_name>/_search
{
"size": 0,
"aggs": {
"product_filter": {
"filter": {
"term": {
"product": "xyz"
}
},
"aggs": {
"group_by_action": {
"terms": {
"field": "action",
"size":1000,
"order": {
"_count": "desc"
}
},
"aggs": {
"myUsers": {
"terms": {
"field": "user",
"size": 10
}
}
}
}
}
}
}
}
Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"product_filter" : {
"doc_count" : 5,
"group_by_action" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "add",
"doc_count" : 3,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "alice",
"doc_count" : 1
},
{
"key" : "bob",
"doc_count" : 1
},
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "delete",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "eve",
"doc_count" : 1
}
]
}
},
{
"key" : "update",
"doc_count" : 1,
"myUsers" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "bob",
"doc_count" : 1
}
]
}
}
]
}
}
}
}
I'm not sure if it is possible to have them in a single list as you've mentioned.
Hope this helps!
I have a database of products. Each product is composed of the fields: uuid, group_id, title, since, till.
since and till define interval of availability.
The intervals [since, till] are pairwise disjoint within each group_id, so no two products within one group have intersecting intervals.
I need to fetch a list of products that meets the following conditions:
the list should contain at most 1 product from each group
each product matches the given title
each product is current (since <= NOW <= till) OR if current product does not exist within its group, it should be the nearest product from the future (min(since) such that since >= NOW)
ES mapping:
{
"products": {
"mappings": {
"dynamic": "false",
"properties": {
"group_id": {
"type": "long",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"since": {
"type": "date",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"till": {
"type": "date",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
Is it possible to create such query in Elasticsearch?
Looking at your mapping, I've created sample documents, the query and its response as below:
Sample Documents:
POST product_index/_doc/1
{
"group_id": 1,
"title": "nike",
"since": "2020-01-01",
"till": "2020-03-31"
}
POST product_index/_doc/2
{
"group_id": 2,
"title": "nike",
"since": "2020-01-01",
"till": "2020-03-31"
}
POST product_index/_doc/3
{
"group_id": 3,
"title": "nike",
"since": "2020-03-15",
"till": "2020-03-31"
}
POST product_index/_doc/4
{
"group_id": 3,
"title": "nike",
"since": "2020-03-19",
"till": "2020-03-31"
}
As shown above, there are 4 documents in total: groups 1 and 2 have one document each, while group 3 has two documents, both with since >= now.
Query Request:
The summary of the query is below:
Bool
- Must
- Match title as nike
- Should
- clause 1 - since <= now <= till
- clause 2 - now <= since
Agg
- Terms on GroupId
- Top Hits (retrieve only the first document, since your requirement is at most one per group, sorted by since in ascending order)
Below is the actual query:
POST product_index/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"title": "nike"
}
},
{
"bool": {
"should": [
{ <--- since <=now <= till
"bool": {
"must": [
{
"range": {
"till": {
"gte": "now"
}
}
},
{
"range": {
"since": {
"lte": "now"
}
}
}
]
}
},
{ <---- since >= now
"bool": {
"must": [
{
"range": {
"since": {
"gte": "now"
}
}
}
]
}
}
]
}
}
]
}
},
"aggs": {
"my_groups": {
"terms": {
"field": "group_id.keyword",
"size": 10
},
"aggs": {
"my_docs": {
"top_hits": {
"size": 1, <--- Note this to return at most one document
"sort": [
{ "since": { "order": "asc"} <--- Sort to return the lowest value of since
}
]
}
}
}
}
}
}
Notice that I've made use of Terms Aggregation and Top Hits as its sub-aggregation.
Response:
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"my_groups" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "3",
"doc_count" : 2,
"my_docs" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "product_index",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"group_id" : 3,
"title" : "nike",
"since" : "2020-03-15",
"till" : "2020-03-31"
},
"sort" : [
1584230400000
]
}
]
}
}
},
{
"key" : "1",
"doc_count" : 1,
"my_docs" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "product_index",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"group_id" : 1,
"title" : "nike",
"since" : "2020-01-01",
"till" : "2020-03-31"
},
"sort" : [
1577836800000
]
}
]
}
}
},
{
"key" : "2",
"doc_count" : 1,
"my_docs" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "product_index",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"group_id" : 2,
"title" : "nike",
"since" : "2020-01-01",
"till" : "2020-03-31"
},
"sort" : [
1577836800000
]
}
]
}
}
}
]
}
}
}
Let me know if this helps!