Make a flat array from Elasticsearch query results - elasticsearch

I have an index with the following documents (simplified):
{
"user" : "j.johnson",
"certifications" : [{
"certification_date" : "2013-02-09T00:00:00+03:00",
"previous_level" : "No Level",
"obtained_level" : "Junior"
}, {
"certification_date" : "2014-05-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}
]
}
I want just to have a flat list of all certifications passed by all users where certification_date > 2014-01-01. It should be a pretty large array like this:
[{
"certification_date" : "2014-09-08T00:00:00+03:00",
"previous_level" : "No Level",
"obtained_level" : "Junior"
}, {
"certification_date" : "2014-05-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}, {
"certification_date" : "2015-01-26T00:00:00+03:00",
"previous_level" : "Junior",
"obtained_level" : "Middle"
}
...
]
It doesn't seem to be a hard task, but I wasn't able to find an easy way to do that.

I would do it with a parent/child relationship, though you will have to reorganize your data. I don't think you can get what you want with your current schema.
More concretely, I set up an index like this, with user as parent and certification as child:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"user": {
"properties": {
"user_name": { "type": "string" }
}
},
"certification":{
"_parent": { "type": "user" },
"properties": {
"certification_date": { "type": "date" },
"previous_level": { "type": "string" },
"obtained_level": { "type": "string" }
}
}
}
}
added some docs:
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"user","_id":1}}
{"user_name":"j.johnson"}
{"index":{"_index":"test_index","_type":"certification","_parent":1}}
{"certification_date" : "2013-02-09T00:00:00+03:00","previous_level" : "No Level","obtained_level" : "Junior"}
{"index":{"_index":"test_index","_type":"certification","_parent":1}}
{"certification_date" : "2014-05-26T00:00:00+03:00","previous_level" : "Junior","obtained_level" : "Middle"}
{"index":{"_index":"test_index","_type":"user","_id":2}}
{ "user_name":"b.bronson"}
{"index":{"_index":"test_index","_type":"certification","_parent":2}}
{"certification_date" : "2013-09-05T00:00:00+03:00","previous_level" : "No Level","obtained_level" : "Junior"}
{"index":{"_index":"test_index","_type":"certification","_parent":2}}
{"certification_date" : "2014-07-20T00:00:00+03:00","previous_level" : "Junior","obtained_level" : "Middle"}
Now I can just search certifications with a range filter:
POST /test_index/certification/_search
{
"query": {
"constant_score": {
"filter": {
"range": {
"certification_date": {
"gte": "2014-01-01"
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "certification",
"_id": "QGXHp7JZTeafWYzb_1FZiA",
"_score": 1,
"_source": {
"certification_date": "2014-05-26T00:00:00+03:00",
"previous_level": "Junior",
"obtained_level": "Middle"
}
},
{
"_index": "test_index",
"_type": "certification",
"_id": "yvO2A9JaTieI5VHVRikDfg",
"_score": 1,
"_source": {
"certification_date": "2014-07-20T00:00:00+03:00",
"previous_level": "Junior",
"obtained_level": "Middle"
}
}
]
}
}
This structure is still not completely flat the way you asked for, but I think this is as close as ES will let you get.
Here is the code I used:
http://sense.qbox.io/gist/3c733ec75e6c0856fa2772cc8f67bd7c00aba637

Related

Aggregation on Latest Records Of same status in ElasticSearch

I Have following data in ElasticSearch index some_index.
[ {
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 1,
"cart_status": "new",
"grandTotal": 12,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 1,
"cart_status": "paid",
"grandTotal": 12,
"event": "some_event",
"timestamp": "2022-12-02T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 2,
"cart_status": "new",
"grandTotal": 23,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 2,
"cart_status": "paid",
"grandTotal": 23,
"event": "some_event",
"timestamp": "2022-12-04T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 3,
"cart_status": "new",
"grandTotal": 17,
"event": "some_event",
"timestamp": "2022-12-01T00:00:00.000Z"
}
}
},
{
"_index": "some_index",
"_source": {
"cart": {
"cart_id": 3,
"cart_status": "new",
"grandTotal": 17,
"event": "some_event",
"timestamp": "2022-12-04T00:00:00.000Z"
}
}
}
]
What I want to get is sum of the grandTotals by the latest cart_statuses of each cart within a given time range.
Having the example above, the result for timestamp >= 2022-12-01 00:00:00 and timestamp<= 2022-12-03 00:00:00 should be something like
cart_status:new, sum grandTotal: 40, because within that time range the carts whose latest status is new are cart_id 3 and cart_id 2,
and cart_status:paid, sum grandTotal: 12, because paid is the latest status of cart_id=1 only.
What I tried is to use sub-aggregation on top_result, top_hits but ElasticSearch complains that "Aggregator [top_result] of type [top_hits] cannot accept sub-aggregations"
Besides I tried with collapse as well to get the latest by status, but according to docs there is also no possibility to aggregate over the results of collapse.
Can someone please help me solving this, it seems like a common calculation but not very trivial in ElasticSearch.
In SQL this is quite easy with window functions.
I want to avoid persisting intermediate data into another index. Because I need the dynamic query, as the users may want to get their calculations for any time range.
You can try the following approach. Note that in the card_status aggregation, the sum for "new" will be 52 (not 40), because it also includes cart_id 1, which has a "new" record in the given time range, along with cart_ids 2 and 3.
Mappings:
PUT some_index
{
"mappings" : {
"properties": {
"timestamp" : {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||strict_date_optional_time ||epoch_millis"
},
"cart_id" : {
"type": "keyword"
},
"cart_status" : {
"type": "keyword"
},
"grand_total" : {
"type": "long"
},
"event":{
"type": "keyword"
}
}
}
}
Bulk Insert:
POST _bulk
{ "index" : { "_index" : "some_index", "_id" : "1" } }
{ "cart_id" : "1" , "grand_total":12, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "2" } }
{ "cart_id" : "1" , "grand_total":12, "cart_status" : "paid","timestamp":"2022-12-02T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "3" } }
{ "cart_id" : "2" , "grand_total":23, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "4" } }
{ "cart_id" : "2" , "grand_total":23, "cart_status" : "paid","timestamp":"2022-12-04T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "5" } }
{ "cart_id" : "3" , "grand_total":17, "cart_status" : "new","timestamp":"2022-12-01T00:00:00.000Z", "event" : "some_event"}
{ "index" : { "_index" : "some_index", "_id" : "6" } }
{ "cart_id" : "3" , "grand_total":17, "cart_status" : "new","timestamp":"2022-12-04T00:00:00.000Z", "event" : "some_event"}
Query:
GET some_index/_search
{
"size":0,
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": "2022-12-01 00:00:00",
"lte": "2022-12-03 00:00:00"
}
}
}
]
}
},
"aggs": {
"card_status": {
"terms": {
"field": "cart_status"
},
"aggs": {
"grandTotal": {
"sum": {
"field": "grand_total"
}
}
}
}
}
}
Output:
{
"took": 86,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"card_status": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "new",
"doc_count": 3,
"grandTotal": {
"value": 52
}
},
{
"key": "paid",
"doc_count": 1,
"grandTotal": {
"value": 12
}
}
]
}
}
}

Elasticsearch query showing weird behavior : bug?

To sum up things quickly, we are using Elasticsearch 6.8.4 and have documents with fields such as "statutPublicOuInterne" (public or internal state) or "identifiant" (identifier).
I cannot share the whole JSON (_source) for security reasons (corporate restrictions), but it looks like the following:
"_source": {
"dateCreation": "2020-11-05T16:31:28.404+01:00",
"dateDerModif": "2020-11-05T16:31:49.183+01:00",
"contenu": { ... }
"langue": "fr",
"observations": null,
"statutPublicOuInterne": "enAttenteTraitementCommissionTask",
"identifiant": "SFB-20201105-ELUH",
(...)
}
Some of the "statutPublicOuInterne" can have values such as "enAttenteTraitementCommissionTask" or "enCoursTraitementCommissionTask".
1st question: for some reason, when I search for statutPublicOuInterne=enCoursTraitementCommissionTask, it doesn't work, but if I search for statutPublicOuInterne=enCoursTraitementCommission (without "Task"), it works! That seems so weird to me and I really can't explain it.
2nd question: if I assume I need to search without the "Task" at the end, then searching for statutPublicOuInterne=enCoursTraitementCommission works but statutPublicOuInterne=enAttenteTraitementCommission doesn't work! (nor does statutPublicOuInterne=enAttenteTraitementCommissionTask work)
The query is as follows:
{
"query": {
"bool" : {
"must" : [
{
"match" : {
"statutPublicOuInterne" : {
"query" : "enAttenteTraitementCommission"
}
}
}
]
}
}
}
I just can't understand why it doesn't find anything, because if I search for this document with its "identifiant" field, then it works:
{
"query": {
"bool" : {
"must" : [
{
"match" : {
"identifiant" : {
"query" : "SFB-20201105-ELUH"
}
}
}
]
}
}
}
The response is:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 2.0283146,
"hits": [
{
"_index": "some-index",
"_type": "demandes",
"_id": "SFB-20201105-ELUH",
"_score": 2.0283146,
"_source": {
"dateCreation": "2020-11-05T16:31:28.404+01:00",
"dateDerModif": "2020-11-05T16:31:49.183+01:00",
"contenu": { ... }
"langue": "fr",
"observations": null,
"statutPublicOuInterne": "enAttenteTraitementCommissionTask",
"identifiant": "SFB-20201105-ELUH",
(...)
}
}
]
}
}
We can clearly see "statutPublicOuInterne": "enAttenteTraitementCommissionTask" in the response.
Am I missing something?
Many thanks in advance for your help!
Adding a working example with index data, mapping, search query, and search result
Index Mapping:
{
"mappings": {
"properties": {
"statutPublicOuInterne": {
"type": "text"
}
}
}
}
Index Data:
{
"dateCreation": "2020-11-05T16:31:28.404+01:00",
"dateDerModif": "2020-11-05T16:31:49.183+01:00",
"langue": "fr",
"observations": null,
"statutPublicOuInterne": "enAttenteTraitementCommissionTask",
"identifiant": "SFB-20201105-ELUH"
}
Search Query:
{
"query": {
"bool": {
"must": [
{
"match": {
"statutPublicOuInterne": {
"query": "enAttenteTraitementCommissionTask"
}
}
}
]
}
}
}
Search Result:
"hits": [
{
"_index": "64700803",
"_type": "_doc",
"_id": "1",
"_score": 0.2876821,
"_source": {
"dateCreation": "2020-11-05T16:31:28.404+01:00",
"dateDerModif": "2020-11-05T16:31:49.183+01:00",
"langue": "fr",
"observations": null,
"statutPublicOuInterne": "enAttenteTraitementCommissionTask",
"identifiant": "SFB-20201105-ELUH"
}
}
]

elasticsearch nested aggregation is empty

So, I have an index in Elasticsearch 7.6, which has documents similar to this one:
{
"_index": "my-index",
"_type": "_doc",
"_id": "kjdskjwolsjj",
"_version": 1,
"_score": null,
"_source": {
"timestamp": "2018-04-22T20:11:35.0292586Z",
"batchId": "9c96d360-5549-4b3b-85c8-756330117bad",
"userId": "id-001-001",
"things": [
{
"id": 650055867,
"name": "green",
},
{
"id": 523,
"name": "eggs",
},
{
"id": 1269,
"name": "ham",
}
]
}
}
Of course, this is just one document of many in the index. I would like to create an aggregate bucket of all the "things" in my index, so that I could sub aggregate against that bucket.
My agg query looks like this:
{
"aggs": {
"all_things": {
"nested": {
"path": "_source.things"
}
}
}
}
(BTW ... if I used just "things" as the nested path, it complains "[nested] nested path [things] is not nested".)
Finally the result (using the Kibana console) is:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1408,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"all_things" : {
"doc_count" : 0
}
}
}
Could someone explain why I get no docs in my bucket? Or perhaps a decent way to create a bucket of all my "things"?
Thanks.
You've gotta index your things as nested:
PUT my-index
{
"mappings": {
"properties": {
"things": {
"type": "nested"
}
}
}
}
POST my-index/_doc
{
"timestamp": "2018-04-22T20:11:35.0292586Z",
"batchId": "9c96d360-5549-4b3b-85c8-756330117bad",
"userId": "id-001-001",
"things": [
{
"id": 650055867,
"name": "green"
},
{
"id": 523,
"name": "eggs"
},
{
"id": 1269,
"name": "ham"
}
]
}
Then and only then will your nested aggs work:
GET my-index/_search
{
"size": 0,
"aggs": {
"things_ids": {
"nested": {
"path": "things"
},
"aggs": {
"things_ids": {
"cardinality": {
"field": "things.id"
}
}
}
}
}
}

Elasticsearch advanced autocomplete

I want to autocomplete user input with Elasticsearch. There are tons of tutorials out there on how to do so, but none go into the really detailed stuff.
The last issue I'm having with my query is that it should score Results that are not real "autocompletions" lower. Example:
IS:
I type: "Bed"
I find: "Bed", "Bigbed", "Fancy Bed", "Bed Frame"
WANT:
I type: "Bed"
I find: "Bed", "Bed Frame", [other "Bed XXX" results], "Fancy Bed", "Bigbed"
So I want Elasticsearch to first complete "to the right", if that makes sense, and then use results that have words in front of the match.
I've tried the completion suggester. It doesn't do other things I want, and it also has the same issue.
In German there are lots of examples of words like Bigbed (which isn't a real word in English, I know), and I don't want those words as high results. But since they match more closely than Bed Frame (because that is 2 tokens), they show up so high.
This is my query currently:
POST autocompletion/_search?pretty
{
"query": {
"function_score": {
"query": {
"match": {
"keyword": {
"query": "Bed",
"fuzziness": 1,
"minimum_should_match": "100%"
}
}
},
"field_value_factor": {
"field": "bias",
"factor": 1
}
}
}
}
If you use elasticsearch completion suggester, as explained at https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html, when querying like:
{
"suggest": {
"song-suggest" : {
"prefix" : "bed",
"completion" : {
"field" : "suggest"
}
}
}
}
You will get:
{
"took": 13,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": 0.0,
"hits": []
},
"suggest": {
"song-suggest": [
{
"text": "bed",
"offset": 0,
"length": 3,
"options": [
{
"text": "Bed",
"_index": "autocomplete",
"_type": "_doc",
"_id": "1",
"_score": 34.0,
"_source": {
"suggest": {
"input": [
"Bed"
],
"weight": 34
}
}
},
{
"text": "Bed Frame",
"_index": "autocomplete",
"_type": "_doc",
"_id": "3",
"_score": 34.0,
"_source": {
"suggest": {
"input": [
"Bed Frame"
],
"weight": 34
}
}
}
]
}
]
}
}
If you want to use the search API instead, you can use 2 queries:
prefix query "bed ****"
with a term starting by "bed"
Here the mapping:
{
"mappings": {
"_doc" : {
"properties" : {
"suggest" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
}
}
}
}
Here the search query:
{
"query" : {
"bool" : {
"must" : [
{
"match" : {
"suggest" : "Bed"
}
}
],
"should" : [
{
"prefix" : {
"suggest.keyword" : "Bed"
}
}
]
}
}
}
The should clause will boost documents starting with "Bed". Et voilà!

Elasticsearch Query - Return all documents that do not have a corresponding document

I have an index that contains documents who have a status. These are initially imported with a job and their status is set to 0.
For simplicity:
{
"_uid" : 1234
"id" : 1
"name" : "someName",
"status" : 0
}
Then another import job runs and extends these objects by iterating over each object with status=0. Each object that is extended gets the status 1.
{
"_uid" : 1234
"id" : 1
"name" : "someName",
"newProperty" : "someValue",
"status" : 1
}
(Note the unchanged _uid. It's the same object)
Now I have a third import job that takes all objects with status one, takes their ID (the ID!!! Not their _uid!) and creates a new object with the same ID, but different UID:
{
"_uid" : 5678
"id" : 1
"completelyDifferentProperty" : "someValue"
"status" : 2
}
So now, for each ID, I have two objects: One with status = 1, One with status = 2.
For the last job I need to make sure that it only picks objects with status =1 that DO NOT YET have a corresponding status=2 object.
So I need a query to the effect of
"Get all objects where status == 1 for which no status == 2 object with the same ID exists".
I have a feeling aggregations might help me but I haven't gotten it figured out yet.
You can do it fairly easily with a parent/child relationship. This is sort of a special-case use of the capability, but I think it could be used to solve your problem.
To test it out, I set up an index like this, with parent_doc type and a child_doc type (I only included the properties necessary to set up the capability; it doesn't hurt to add more in your documents):
PUT /test_index
{
"mappings": {
"parent_doc": {
"_id": {
"path": "id"
},
"properties": {
"id": {
"type": "long"
},
"_uid": {
"type": "long"
},
"status": {
"type": "integer"
}
}
},
"child_doc": {
"_parent": {
"type": "parent_doc"
},
"_id": {
"path": "id"
},
"properties": {
"id": {
"type": "long"
},
"_uid": {
"type": "long"
},
"status": {
"type": "long"
}
}
}
}
}
Then I added four docs: three parents and one child. There is one document that has "status": 1 that doesn't have a corresponding child document.
POST /test_index/_bulk
{"index":{"_type":"parent_doc"}}
{"_uid":1234,"id":1,"name":"someName","newProperty":"someValue","status":0}
{"index":{"_type":"parent_doc"}}
{"_uid":1234,"id":2,"name":"someName","newProperty":"someValue","status":1}
{"index":{"_type":"child_doc","_parent":2}}
{"_uid":5678,"id":2,"completelyDifferentProperty":"someValue","status":2}
{"index":{"_type":"parent_doc"}}
{"_uid":4321,"id":3,"name":"anotherName","newProperty":"anotherValue","status":1}
We can find the document we want like this; notice we are querying only the parent_doc type, and that our conditions are that status is 1 and no child (at all) exists:
POST /test_index/parent_doc/_search
{
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"term": {
"status": 1
}
},
{
"not": {
"filter": {
"has_child": {
"type": "child_doc",
"query": {
"match_all": {}
}
}
}
}
}
]
}
}
}
}
}
This returns:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "parent_doc",
"_id": "3",
"_score": 1,
"_source": {
"_uid": 4321,
"id": 3,
"name": "anotherName",
"newProperty": "anotherValue",
"status": 1
}
}
]
}
}
Here's all the code I used to test it:
http://sense.qbox.io/gist/d1a0267087d6e744b991de5cdec1c31d947ebc13

Resources