I Have following data in ElasticSearch index some_index.
[ {
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 1,
"cart_status": "new",
"grandTotal": 12,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 1,
"cart_status": "paid",
"grandTotal": 12,
"event": "some_event",
"timestamp": "2022-12-02T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 2,
"cart_status": "new",
"grandTotal": 23,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 2,
"cart_status": "paid",
"grandTotal": 23,
"event": "some_event",
"timestamp": "2022-12-04T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 3,
"cart_status": "new",
"grandTotal": 17,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 3,
"cart_status": "new",
"grandTotal": 17,
"event": "some_event",
"timestamp": "2022-12-04T00:00:00.000Z"
}
}
}
]
What I want to get is sum of the grandTotals by the latest cart_statuses of each cart within a given time range.
Having the example above, the result for timestamp >= 2022-12-01 00:00:00 and timestamp<= 2022-12-03 00:00:00 should be something like
cart_status:new, sum grandTotal: 40 because within that time range latest status new have cart_id 3 and 2.
and cart_status:paid, sum grandTotal: 12 and this one because paid is the latest status of only cart_id=1.
What I tried is to use sub-aggregation on top_result, top_hits but ElasticSearch complains that "Aggregator [top_result] of type [top_hits] cannot accept sub-aggregations"
Besides I tried with collapse as well to get the latest by status, but according to docs there is also no possibility to aggregate over the results of collapse.
Can someone please help me solving this, it seems like a common calculation but not very trivial in ElasticSearch.
In SQL this is quite easy with window functions.
I want to avoid persisting intermediate data into another index. Because I need the dynamic query, as the users may want to get their calculations for any time range.
you can try the following way. meanwhile, for card_status, sum value will be 52 as it includes card_id 1 that has "new" as card status along with 2 and 3 for given timestamp.
Mappings:
PUT some_index
{
"mappings" : {
"properties": {
"timestamp" : {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||strict_date_optional_time ||epoch_millis"
},
"cart_id" : {
"type": "keyword"
},
"cart_status" : {
"type": "keyword"
},
"grand_total" : {
"type": "long"
},
"event":{
"type": "keyword"
}
}
}
}
Bulk Insert:
POST _bulk
{ "index" : { "_index" : "some_index", "_id" : "1" } }
{ "cart_id" : "1" , "grand_total":12, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "2" } }
{ "cart_id" : "1" , "grand_total":12, "cart_status" : "paid","timestamp":"2022-12-02T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "3" } }
{ "cart_id" : "2" , "grand_total":23, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "4" } }
{ "cart_id" : "2" , "grand_total":23, "cart_status" : "paid","timestamp":"2022-12-04T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "5" } }
{ "cart_id" : "3" , "grand_total":17, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "6" } }
{ "cart_id" : "3" , "grand_total":17, "cart_status" : "new","timestamp":"2022-12-04T00:00:00.000Z", "event" : "some_event"}
Query:
GET some_index/_search
{
"size":0,
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": "2022-12-01 00:00:00",
"lte": "2022-12-03 00:00:00"
}
}
}
]
}
},
"aggs": {
"card_status": {
"terms": {
"field": "cart_status"
},
"aggs": {
"grandTotal": {
"sum": {
"field": "grand_total"
}
}
}
}
}
}
Output:
{
"took": 86,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"card_status": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new",
"doc_count": 3,
"grandTotal": {
"value": 52
}
},
{
"key": "paid",
"doc_count": 1,
"grandTotal": {
"value": 12
}
}
]
}
}
}
Related
I have this query which return the correct result
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
]
}
}
}
and the result is below:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 3.6012557,
"hits" : [
{
"_index" : "person",
"_type" : "_doc",
"_id" : "70002",
"_score" : 3.6012557,
"_source" : {
"gender" : "Male",
"activeStatus" : "Inactive",
"deceased" : "No",
"nameDetails" : {
"name" : [
{
"nameValue" : {
"firstName" : "Fawsu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Primary Name"
},
{
"nameValue" : {
"firstName" : "Fausu",
"middleName" : "L.",
"surname" : "Pibba"
},
"nameType" : "Spelling Variation"
}
]
}
}
}
]
}
But when I add the filter for Gender, it returns no result
GET /person/_search
{
"query": {
"bool": {
"should": [
{
"fuzzy": {
"nameDetails.name.nameValue.surname": {
"value": "Pibba",
"fuzziness": "AUTO"
}
}
},
{
"fuzzy": {
"nameDetails.nameValue.firstName": {
"value": "Fawsu",
"fuzziness": "AUTO"
}
}
}
],
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
Even I just use filter, it return no result
GET /person/_search
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender": "Male"
}
}
]
}
}
}
You are not getting any search result, because you are using the term query (in the filter clause). Term query will return the document only if it has an exact match.
A standard analyzer is used when no analyzer is specified, which will tokenize Male to male. So either you can search for male instead of Male or use any of the below solutions.
If you have not defined any explicit index mapping, you need to add .keyword to the gender field. This uses the keyword analyzer instead of the standard analyzer (notice the ".keyword" after gender field). Try out this below query -
{
"query": {
"bool": {
"filter": [
{
"term": {
"gender.keyword": "Male"
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "66879128",
"_type": "_doc",
"_id": "1",
"_score": 0.0,
"_source": {
"gender": "Male",
"activeStatus": "Inactive",
"deceased": "No",
"nameDetails": {
"name": [
{
"nameValue": {
"firstName": "Fawsu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Primary Name"
},
{
"nameValue": {
"firstName": "Fausu",
"middleName": "L.",
"surname": "Pibba"
},
"nameType": "Spelling Variation"
}
]
}
}
}
]
If you have defined index mapping, then modify the mapping for gender field as shown below
{
"mappings": {
"properties": {
"gender": {
"type": "keyword"
}
}
}
}
I can't get a count of fields with a filtered document value.
I have this json
``
{
"took" : 6,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "net",
"_type" : "_doc",
"_id" : "RTHRTH",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-05-31 19:01:01",
"data" : [...]
{
"_index" : "net",
"_type" : "_doc",
"_id" : "LLLoIJBHHM",
"_score" : 1.0,
"_source" : {
"created_at" : "2020-06-23 15:11:59",
"data" : [...]
}
}
]
}
}
``
In the "data" field, there are more fields within other fields respectively.
I want to filter the most recent document, and then count a certain value in the most recent document.
This is my query:
`{
"query": {
"match": {
"name.keyword": "net"
}
},
"sort": [
{
"created_at.keyword": {
"order": "desc"
}
}
],
"size": 1,
"aggs": {
"CountValue": {
"terms": {
"field": "data.add.serv.desc.keyword",
"include": "nginx"
}
}
}
}`
And the output is:
`{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"CountValue" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "nginx",
"doc_count" : 2
}
]
}
}`
I suspect that doc_count is the number of documents the value appears in, not the number of times the value is repeated within the filtered document.
Any advice I will be very grateful!
Unless any of the fields under the path data.add.serv are of the nested type, the terms agg will produce per-whole-doc results, not per-field.
Exempli gratia:
POST example/_doc
{
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
produces doc_count==1.
When, however, specified as nested:
DELETE example
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
}
}
}
}
POST example/_doc
{"serv":[{"desc":"nginx"},{"desc":"nginx"},{"desc":"nginx"}]}
then
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
we end up with doc_count==3.
This has to do with the way non-nested array types are flattened and de-duplicated. At the end, you may need to reindex your collections after having applied the nested mapping.
EDIT
In order to only take the latest doc, you could do the following:
PUT example
{
"mappings": {
"properties": {
"serv": {
"type": "nested"
},
"created_at": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
}
}
}
}
then
POST example/_doc
{
"created_at" : "2020-05-31 19:01:01",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
POST example/_doc
{
"created_at" : "2020-06-23 15:11:59",
"serv": [
{
"desc": "nginx"
},
{
"desc": "nginx"
}
]
}
then use a terms agg of size 1, sorted by timestamp desc:
GET example/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "created_at",
"order": {
"_term": "desc"
},
"size": 1
},
"aggs": {
"NAME2": {
"nested": {
"path": "serv"
},
"aggs": {
"NAME": {
"terms": {
"field": "serv.desc.keyword"
}
}
}
}
}
}
}
}
So, I have an index in Elasticsearch 7.6, which has documents similar to this one:
{
"_index": "my-index",
"_type": "_doc",
"_id": "kjdskjwolsjj",
"_version": 1,
"_score": null,
"_source": {
"timestamp": "2018-04-22T20:11:35.0292586Z",
"batchId": "9c96d360-5549-4b3b-85c8-756330117bad",
"userId": "id-001-001",
"things": [
{
"id": 650055867,
"name": "green",
},
{
"id": 523,
"name": "eggs",
},
{
"id": 1269,
"name": "ham",
}
]
}
}
Of course, this is just one document of many in the index. I would like to create an aggregate bucket of all the "things" in my index, so that I could sub aggregate against that bucket.
My agg query looks like this:
{
"aggs": {
"all_things": {
"nested": {
"path": "_source.things"
}
}
}
}
(BTW ... if I used just "things" as the nested path, it complains "[nested] nested path [things] is not nested".)
Finally the result (using the Kibana console) is:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1408,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"all_things" : {
"doc_count" : 0
}
}
}
Could someone explain why I get no docs in my bucket? Or perhaps a decent way to create a bucket of all my "things"?
Thanks.
You've gotta index your things as nested:
PUT my-index
{
"mappings": {
"properties": {
"things": {
"type": "nested"
}
}
}
}
POST my-index/_doc
{
"timestamp": "2018-04-22T20:11:35.0292586Z",
"batchId": "9c96d360-5549-4b3b-85c8-756330117bad",
"userId": "id-001-001",
"things": [
{
"id": 650055867,
"name": "green"
},
{
"id": 523,
"name": "eggs"
},
{
"id": 1269,
"name": "ham"
}
]
}
Then and only then will your nested aggs work:
GET my-index/_search
{
"size": 0,
"aggs": {
"things_ids": {
"nested": {
"path": "things"
},
"aggs": {
"things_ids": {
"cardinality": {
"field": "things.id"
}
}
}
}
}
}
I'm attempting to do some elasticsearch query fu on a set of data I have.
I have a user document that is the parent to many child page view documents. I'm looking to return all users that have viewed a specific page an arbitrary amount of times (defined by user input box). So far, I've got a has_child query that will return me all the users that have a page view with certain ids. However, this will return those parents with all their children. Next, I've tried to write an aggregation on those query results, that will essentially do the same has_child query in aggregation form. Now, I have the right document count for my filtered child documents. I need to use this document count to go back and filter the parents. To explain the query in words, "return to me all the users that have viewed a specific page more than 4 times". It's possible that I may need to restructure my data. Any thoughts?
Here is my query thus far:
curl -XGET 'http://localhost:9200/development_users/_search?pretty=true' -d '
{
"query" : {
"has_child" : {
"type" : "page_view",
"query" : {
"terms" : {
"viewed_id" : [175,180]
}
}
}
},
"aggs" : {
"to_page_view": {
"children": {
"type" : "page_view"
},
"aggs" : {
"page_views_that_match" : {
"filter" : { "terms": { "viewed_id" : [175,180] } }
}
}
}
}
}'
This returns me a response like:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [ {
"_index" : "development_users",
"_type" : "user",
"_id" : "22548",
"_score" : 1.0,
"_source":{"id":22548,"account_id":1009}
} ]
},
"aggregations" : {
"to_page_view" : {
"doc_count" : 53,
"page_views_that_match" : {
"doc_count" : 2
}
}
}
}
Associated Mappings:
{
"development_users" : {
"mappings" : {
"page_view" : {
"dynamic" : "false",
"_parent" : {
"type" : "user"
},
"_routing" : {
"required" : true
},
"properties" : {
"created_at" : {
"type" : "date",
"format" : "date_time"
},
"id" : {
"type" : "integer"
},
"viewed_id" : {
"type" : "integer"
},
"time_on_page" : {
"type" : "integer"
},
"title" : {
"type" : "string"
},
"type" : {
"type" : "string"
},
"updated_at" : {
"type" : "date",
"format" : "date_time"
},
"url" : {
"type" : "string"
}
}
},
"user" : {
"dynamic" : "false",
"properties" : {
"account_id" : {
"type" : "integer"
},
"id" : {
"type" : "integer"
}
}
}
}
}
}
Okay, so this is kind of involved. I made a few simplifications to keep it straight in my head. First, I used this mapping:
PUT /test_index
{
"mappings": {
"page_view": {
"_parent": {
"type": "development_user"
},
"properties": {
"viewed_id": {
"type": "string"
}
}
},
"development_user": {
"properties": {
"id": {
"type": "string"
}
}
}
}
}
Then I added some data. In this little universe, I have three users and two pages. I want to find users who have viewed "page_a" at least twice, so if I construct the correct query only user 3 will be returned.
POST /test_index/development_user/_bulk
{"index":{"_type":"development_user","_id":1}}
{"id":"user_1"}
{"index":{"_type":"page_view","_parent":1}}
{"viewed_id":"page_a"}
{"index":{"_type":"development_user","_id":2}}
{"id":"user_2"}
{"index":{"_type":"page_view","_parent":2}}
{"viewed_id":"page_b"}
{"index":{"_type":"development_user","_id":3}}
{"id":"user_3"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_a"}
{"index":{"_type":"page_view","_parent":3}}
{"viewed_id":"page_b"}
To get that answer we'll use aggregations. Notice that I don't want documents returned (the normal way), but I do want to filter down the documents we analyze, because it will make things more efficient. So I use the same basic filter you had before.
So the aggregation tree starts with terms_parent_id which will just separate parent documents. Inside that I have children_page_view which filters the child documents down to the ones I want ("page_a"), and next to it in the hierarchy is bucket_selector_page_id_term_count which uses a bucket selector (you'll need ES 2.x) to filter the parent documents by those meeting the criterium, and then finally a top hits aggregation which shows us the documents that match the requirements.
POST /test_index/development_user/_search
{
"size": 0,
"query": {
"has_child": {
"type": "page_view",
"query": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
},
"aggs": {
"terms_parent_id": {
"terms": {
"field": "id"
},
"aggs": {
"children_page_view": {
"children": {
"type": "page_view"
},
"aggs": {
"filter_page_ids": {
"filter": {
"terms": {
"viewed_id": [
"page_a"
]
}
}
}
}
},
"bucket_selector_page_id_term_count": {
"bucket_selector": {
"buckets_path": {
"children_count": "children_page_view>filter_page_ids._count"
},
"script": "children_count >= 2"
}
},
"top_hits_users": {
"top_hits": {
"_source": {
"include": [
"id"
]
}
}
}
}
}
}
}
which returns:
{
"took": 14,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"terms_parent_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "user_3",
"doc_count": 1,
"children_page_view": {
"doc_count": 3,
"filter_page_ids": {
"doc_count": 2
}
},
"top_hits_users": {
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "development_user",
"_id": "3",
"_score": 1,
"_source": {
"id": "user_3"
}
}
]
}
}
}
]
}
}
}
Here's all the code I used:
http://sense.qbox.io/gist/43f24461448519dc884039db40ebd8e2f5b7304f
I have an index with the following documents (simplified):
{
"user" : "j.johnson",
"certifications" : [{
"certification_date" : "2013-02-09T00:00:00+03:00",
"previous_level" : "No Level",
"obtained_level" : "Junior"
}, {
"certification_date" : "2014-05-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}
]
}
I want just to have a flat list of all certifications passed by all users where certification_date > 2014-01-01. It should be a pretty large array like this:
[{
"certification_date" : "2014-09-08T00:00:00+03:00",
"previous_level" : "No Level",
"obtained_level" : "Junior"
}, {
"certification_date" : "2014-05-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}, {
"certification_date" : "2015-01-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}
...
]
It doesn't seems to be a hard task, but I wasn't able to find an easy way to do that.
I would do it with a parent/child relationship, though you will have to reorganize your data. I don't think you can get what you want with your current schema.
More concretely, I set up an index like this, with user as parent and certification as child:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"user": {
"properties": {
"user_name": { "type": "string" }
}
},
"certification":{
"_parent": { "type": "user" },
"properties": {
"certification_date": { "type": "date" },
"previous_level": { "type": "string" },
"obtained_level": { "type": "string" }
}
}
}
}
added some docs:
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"user","_id":1}}
{"user_name":"j.johnson"}
{"index":{"_index":"test_index","_type":"certification","_parent":1}}
{"certification_date" : "2013-02-09T00:00:00+03:00","previous_level" : "No Level","obtained_level" : "Junior"}
{"index":{"_index":"test_index","_type":"certification","_parent":1}}
{"certification_date" : "2014-05-26T00:00:00+03:00","previous_level" : "Junior","obtained_level" : "Middle"}
{"index":{"_index":"test_index","_type":"user","_id":2}}
{ "user_name":"b.bronson"}
{"index":{"_index":"test_index","_type":"certification","_parent":2}}
{"certification_date" : "2013-09-05T00:00:00+03:00","previous_level" : "No Level","obtained_level" : "Junior"}
{"index":{"_index":"test_index","_type":"certification","_parent":2}}
{"certification_date" : "2014-07-20T00:00:00+03:00","previous_level" : "Junior","obtained_level" : "Middle"}
Now I can just search certifications with a range filter:
POST /test_index/certification/_search
{
"query": {
"constant_score": {
"filter": {
"range": {
"certification_date": {
"gte": "2014-01-01"
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "certification",
"_id": "QGXHp7JZTeafWYzb_1FZiA",
"_score": 1,
"_source": {
"certification_date": "2014-05-26T00:00:00+03:00",
"previous_level": "Junior",
"obtained_level": "Middle"
}
},
{
"_index": "test_index",
"_type": "certification",
"_id": "yvO2A9JaTieI5VHVRikDfg",
"_score": 1,
"_source": {
"certification_date": "2014-07-20T00:00:00+03:00",
"previous_level": "Junior",
"obtained_level": "Middle"
}
}
]
}
}
This structure is still not completely flat the way you asked for, but I think this is as close as ES will let you get.
Here is the code I used:
http://sense.qbox.io/gist/3c733ec75e6c0856fa2772cc8f67bd7c00aba637