Elasticsearch merging documents in response

I have data in 3 indexes. I want to generate an invoice report using information from the other indexes. For example, the following are sample documents from each index:
Users index
{
"_id": "userId1",
"name": "John"
}
Invoice index
{
"_id": "invoiceId1",
"userId": "userId1",
"cost": "10000",
"startdate": "",
"enddate": ""
}
Orders index
{
"_id": "orderId1",
"userId": "userId1",
"productName": "Mobile"
}
I want to generate an invoice report by combining information from these three indexes as follows:
{
"_id": "invoiceId1",
"userName": "John",
"productName": "Mobile",
"cost": "10000",
"startdate": "",
"enddate": ""
}
How do I write an Elasticsearch query that returns a response combining information from documents in other indexes?

You cannot do query-time joins in Elasticsearch and will need to denormalize your data in order to efficiently retrieve and group it.
Having said that, you could:
leverage the multi-target syntax and query multiple indices at once
use an OR query on the id and userId -- since either of those is referenced at least once in any of your docs
and then trivially join your data through a map/reduce tool called scripted metric aggregations
Quick side note: you won't be able to use the _id keyword inside your docs because it's reserved.
Assuming your docs and indices are structured as follows:
POST users_index/_doc
{"id":"userId1","name":"John"}
POST invoices_index/_doc
{"id":"invoiceId1","userId":"userId1","cost":"10000","startdate":"","enddate":""}
POST orders_index/_doc
{"id":"orderId1","userId":"userId1","productName":"Mobile"}
Here's what the scripted metric aggregation could look like:
POST users_index,invoices_index,orders_index/_search
{
"size": 0,
"query": {
"bool": {
"should": [
{
"term": {
"id.keyword": {
"value": "userId1"
}
}
},
{
"term": {
"userId.keyword": {
"value": "userId1"
}
}
}
]
}
},
"aggs": {
"group_by_invoiceId": {
"scripted_metric": {
"init_script": "state.users = []; state.invoices = []; state.orders = []",
"map_script": """
def source = params._source;
if (source.containsKey("name")) {
// we're dealing with the users index
state.users.add(source);
} else if (source.containsKey("cost")) {
// we're dealing with the invoices index
state.invoices.add(source);
} else if (source.containsKey("productName")) {
// we're dealing with the orders index
state.orders.add(source);
}
""",
"combine_script": """
def non_empty_state = [:];
for (entry in state.entrySet()) {
if (entry != null && entry.getValue().size() > 0) {
non_empty_state[entry.getKey()] = entry.getValue();
}
}
return non_empty_state;
""",
"reduce_script": """
def final_invoices = [];
def all_users = [];
def all_invoices = [];
def all_orders = [];
// flatten all resources
for (state in states) {
for (kind_entry in state.entrySet()) {
def map_kind = kind_entry.getKey();
if (map_kind == "users") {
all_users.addAll(kind_entry.getValue());
} else if (map_kind == "invoices") {
all_invoices.addAll(kind_entry.getValue());
} else if (map_kind == "orders") {
all_orders.addAll(kind_entry.getValue());
}
}
}
// iterate the invoices and enrich them
for (invoice_entry in all_invoices) {
def invoiceId = invoice_entry.id;
def userId = invoice_entry.userId;
def userName = all_users.stream().filter(u -> u.id == userId).findFirst().get().name;
def productName = all_orders.stream().filter(o -> o.userId == userId).findFirst().get().productName;
def cost = invoice_entry.cost;
def startdate = invoice_entry.startdate;
def enddate = invoice_entry.enddate;
final_invoices.add([
'id': invoiceId,
'userName': userName,
'productName': productName,
'cost': cost,
'startdate': startdate,
'enddate': enddate
]);
}
return final_invoices;
"""
}
}
}
}
which would return:
{
...
"aggregations" : {
"group_by_invoiceId" : {
"value" : [
{
"cost" : "10000",
"enddate" : "",
"id" : "invoiceId1",
"userName" : "John",
"startdate" : "",
"productName" : "Mobile"
}
]
}
}
}
Summing up, there are workarounds to achieve query-time joins. At the same time, scripts like this shouldn't be used in production because they can get painfully slow.
Instead, this aggregation should be emulated outside of Elasticsearch after the query resolves and returns the index-specific hits.
BTW, I set size: 0 to return just the aggregation results, so increase this parameter if you want to get some actual hits too.
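If you do end up joining outside of Elasticsearch, the join itself is straightforward in your application. Here's a minimal Python sketch, assuming you run the same multi-index query as above (with a larger size) and collect the hits' _source objects into a list; all names here are illustrative only:
# Client-side join of the multi-index hits (illustrative sketch only).
def build_invoice_reports(hits):
    users = {h["id"]: h for h in hits if "name" in h}                      # users_index docs
    orders_by_user = {h["userId"]: h for h in hits if "productName" in h}  # orders_index docs
    invoices = [h for h in hits if "cost" in h]                            # invoices_index docs
    reports = []
    for inv in invoices:
        user = users.get(inv["userId"], {})
        order = orders_by_user.get(inv["userId"], {})
        reports.append({
            "id": inv["id"],
            "userName": user.get("name"),
            "productName": order.get("productName"),
            "cost": inv["cost"],
            "startdate": inv["startdate"],
            "enddate": inv["enddate"],
        })
    return reports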

Related

ElasticSearch aggregation - buckets/scripted_metric

I'm trying to calculate a rate function, rate(val, ts) = (v2 - v1) / (t2 - t1), for each document in my index.
My mappings are of the form: { "name": "keyword", "value": "double", "timestamp": "integer" }.
So for example, if I have 2 documents in my index:
doc1: { "name": "name1", "value": 5, "timestamp": 2 }
doc2: { "name": "name1", "value": 10, "timestamp": 3 }
I need to get result(ts=3) = (10 - 5) / (3 - 2).
Is there any way to do this in Elasticsearch?
I tried to write my own metric script in this form:
GET test1/_search
{
"size":15,
"aggs":{
"sum_the_hard_way": {
"scripted_metric": {
"init_script": {
"source": "state.values = []; state.timestamps = [];"
},
"map_script": {
"source": "state.values.add(doc['value'].value);state.timestamps.add(doc['timestamp'].value);"
},
"combine_script": {
"source": "def rates = []; for ( int i = 0; i <= state.values.size()-1 ; i++ ) { rate[i+1] = (state.value[i+1]- state.value[i])/(state.timestamp[i+1]- state.timestamp[i]);} return values"
},
"reduce_script": {
"source": "def vals = []; for (a in states) { vals.add(a) } return vals"
}
}
}
}
}
But it doesn't work; I got:
"reason" : "index_out_of_bounds_exception: Index 0 out of bounds for length
Thank you in advance!
TLDR;
Some typos slipped into your code.
I hope I have fixed them below.
GET /so_agg_painless/_search
{
"size":15,
"aggs":{
"sum_the_hard_way": {
"scripted_metric": {
"init_script": {
"source": """
state.values = [];
state.timestamps = [];
"""
},
"map_script": {
"source": """
state.values.add(doc['value'].value);
state.timestamps.add(doc['timestamp'].value);
"""
},
"combine_script": {
"source": """
def rates = [];
for ( int i = 0; i <= state.values.size()-2 ; i++ ) {
def nom = (state.values[i+1]- state.values[i]);
def denom = (state.timestamps[i+1]- state.timestamps[i]);
rates.add(nom/denom);
}
return rates
"""
},
"reduce_script": {
"source": """
def vals = [];
for (a in states) {
vals.add(a)
}
return vals
"""
}
}
}
}
}
Hey, you have many typos in your code; that's why it is failing.
But I think the logic is right, so kudos to you, you were super close.
You need to pay attention to those trailing s characters in your code (state.values, state.timestamps, rates).
Also, as you may see in the code I posted, use the triple quotes """ so you can indent the code; it makes it way easier to read and fix.
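If it helps to sanity-check the combine step, here's the same pairwise rate logic in plain Python (just a sketch; it assumes the values are already in timestamp order, which the map phase of a scripted metric doesn't guarantee, so you may want to sort the value/timestamp pairs first):
# Pairwise rate: (v[i+1] - v[i]) / (t[i+1] - t[i]), assuming timestamp order.
def rates(values, timestamps):
    return [
        (values[i + 1] - values[i]) / (timestamps[i + 1] - timestamps[i])
        for i in range(len(values) - 1)
    ]
print(rates([5, 10], [2, 3]))  # [5.0], i.e. result(ts=3) = (10 - 5) / (3 - 2)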

Elasticsearch ingest pipeline: how to recursively modify values in a HashMap

Using an ingest pipeline, I want to iterate over a HashMap and remove underscores from all string values (where underscores exist), leaving underscores in the keys intact. Some values are arrays that must further be iterated over to do the same modification.
In the pipeline, I use a function to traverse and modify the values of a Collection view of the HashMap.
PUT /_ingest/pipeline/samples
{
"description": "preprocessing of samples.json",
"processors": [
{
"script": {
"tag": "remove underscore from sample_tags values",
"source": """
void findReplace(Collection collection) {
collection.forEach(element -> {
if (element instanceof String) {
element.replace('_',' ');
} else {
findReplace(element);
}
return true;
})
}
Collection samples = ctx.samples;
samples.forEach(sample -> { //sample.sample_tags is a HashMap
Collection sample_tags = sample.sample_tags.values();
findReplace(sample_tags);
return true;
})
"""
}
}
]
}
When I simulate the pipeline ingestion, I find the string values are not modified. Where am I going wrong?
POST /_ingest/pipeline/samples/_simulate
{
"docs": [
{
"_index": "samples",
"_id": "xUSU_3UB5CXFr25x7DcC",
"_source": {
"samples": [
{
"sample_tags": {
"Entry_A": [
"A_hyphentated-sample",
"sample1"
],
"Entry_B": "A_multiple_underscore_example",
"Entry_C": [
"sample2",
"another_example_with_underscores"
],
"Entry_E": "last_example"
}
}
]
}
}
]
}
// Result
{
"docs" : [
{
"doc" : {
"_index" : "samples",
"_type" : "_doc",
"_id" : "xUSU_3UB5CXFr25x7DcC",
"_source" : {
"samples" : [
{
"sample_tags" : {
"Entry_E" : "last_example",
"Entry_C" : [
"sample2",
"another_example_with_underscores"
],
"Entry_B" : "A_multiple_underscore_example",
"Entry_A" : [
"A_hyphentated-sample",
"sample1"
]
}
}
]
},
"_ingest" : {
"timestamp" : "2020-12-01T17:29:52.3917165Z"
}
}
}
]
}
Here is a modified version of your script that will work on the data you provided:
PUT /_ingest/pipeline/samples
{
"description": "preprocessing of samples.json",
"processors": [
{
"script": {
"tag": "remove underscore from sample_tags values",
"source": """
String replaceString(String value) {
return value.replace('_',' ');
}
void findReplace(Map map) {
map.keySet().forEach(key -> {
if (map[key] instanceof String) {
map[key] = replaceString(map[key]);
} else {
map[key] = map[key].stream().map(this::replaceString).collect(Collectors.toList());
}
});
}
ctx.samples.forEach(sample -> {
findReplace(sample.sample_tags);
return true;
});
"""
}
}
]
}
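You can verify it with the same _simulate request from your question.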
The result looks like this:
{
"samples" : [
{
"sample_tags" : {
"Entry_E" : "last example",
"Entry_C" : [
"sample2",
"another example with underscores"
],
"Entry_B" : "A multiple underscore example",
"Entry_A" : [
"A hyphentated-sample",
"sample1"
]
}
}
]
}
You were on the right path, but you were working on copies of the values and weren't setting the modified values back onto the document context ctx, which is eventually returned from the pipeline. This means you'll need to keep track of the current iteration indexes (for the array lists as well as for the hash maps and everything in between) so that you can then target the fields' positions in the deeply nested context.
Here's an example taking care of strings and (string-only) array lists. You'll need to extend it to handle hash maps (and other types), and then perhaps extract the whole process into a separate function. But AFAIK you cannot return multiple data types in Java, so it may be challenging...
PUT /_ingest/pipeline/samples
{
"description": "preprocessing of samples.json",
"processors": [
{
"script": {
"tag": "remove underscore from sample_tags values",
"source": """
ArrayList samples = ctx.samples;
for (int i = 0; i < samples.size(); i++) {
def sample = samples.get(i).sample_tags;
for (def entry : sample.entrySet()) {
def key = entry.getKey();
def val = entry.getValue();
def replaced_val;
if (val instanceof String) {
replaced_val = val.replace('_',' ');
} else if (val instanceof ArrayList) {
replaced_val = new ArrayList();
for (int j = 0; j < val.size(); j++) {
replaced_val.add(val[j].replace('_',' '));
}
}
// else if (val instanceof HashMap) {
// do your thing
// }
// crucial part
ctx.samples[i].sample_tags[key] = replaced_val;
}
}
"""
}
}
]
}
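If it helps to reason about the recursion outside of Painless, here's a rough Python equivalent of a fully recursive version (a sketch only, not something the ingest pipeline can run):
# Recursively replace underscores in string values (not keys) of nested dicts/lists.
def replace_underscores(value):
    if isinstance(value, str):
        return value.replace("_", " ")
    if isinstance(value, list):
        return [replace_underscores(v) for v in value]
    if isinstance(value, dict):
        return {k: replace_underscores(v) for k, v in value.items()}
    return value  # leave numbers, booleans, None untouched
doc = {"sample_tags": {"Entry_B": "A_multiple_underscore_example",
                       "Entry_A": ["A_hyphentated-sample", "sample1"]}}
doc["sample_tags"] = replace_underscores(doc["sample_tags"])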

Painless scripting initialize new array

I'm trying to add or update a nested object in Elasticsearch using a script. The script below works fine if integrations is already an array, but when it is null, the script throws a null pointer exception. How do I initialize ctx._source.integrations to an empty array if its value is null? (Something like the equivalent of JavaScript's myObject.integrations = myObject.integrations ?? [].)
POST /products/_update/VFfrnQrKlC5bwdfdeaQ7
{
"script": {
"source": """
ctx._source.integrations.removeIf(i -> i.id == params.integration.id);
ctx._source.integrations.add(params.integration);
ctx._source.integrationCount = ctx._source.integrations.length;
""",
"params": {
"integration": {
"id": "dVTV8GjHj8pXFnlYUUlI",
"from": true,
"to": false,
"vendor": "sfeZWDpZXlF5Qa8mUsiF",
"targetProduct": {
"id": "nyILhphvCrGYm53cfaOx",
"name": "Test Product",
"categoryIds": []
}
}
}
}
}
OK, I think this does the trick:
if (ctx._source.integrations == null) {
ctx._source.integrations = new ArrayList();
}
Is there a shorthand for this, like in the JS example?
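If I remember correctly, Painless supports the Elvis operator (?:) for exactly this kind of null-coalescing, so something along the lines of ctx._source.integrations = ctx._source.integrations ?: new ArrayList(); may work as a one-liner, though I haven't verified it on your version.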

Elasticsearch pre-processing to remove null fields as part of ingest

I have a use case where an API I'm calling to retrieve data to put into Elasticsearch is returning nulls.
I need to write an ingest pipeline that uses processors to remove all null fields before writing the data into Elasticsearch. The processors may or may not use Painless scripting.
Here is a sample payload that I currently get from the API:
{
"master_desc": "TESTING PART",
"date_added": "2019-10-24T09:30:03",
"master_no": {
"master_no": 18460110,
"barcode": "NLSKYTEST1-1",
"external_key": null,
"umid": null
}
}
The pipeline should ideally insert the document as:
{
"master_desc": "TESTING PART",
"date_added": "2019-10-24T09:30:03",
"master_no": {
"master_no": 18460110,
"barcode": "NLSKYTEST1-1"
}
}
Note: the fields are dynamic, so I can't write a processor that checks for nulls against a defined set of fields.
Thanks!
Null fields are neither indexed nor searchable. I have written the pipeline below to remove such fields. Please test it against all of your scenarios before using it. After posting documents using this pipeline, you won't be able to search for null fields using "exists".
Pipeline:
PUT _ingest/pipeline/remove_null_fields
{
"description": "Remove any null field",
"processors": [
{
"script": {
"source": """
// return list of fields with null values
def loopAllFields(def x){
def ret=[];
if(x instanceof Map){
for (entry in x.entrySet()) {
if (entry.getKey().indexOf("_")==0) {
continue;
}
def val=entry.getValue();
if( val instanceof HashMap ||
val instanceof Map ||
val instanceof ArrayList)
{
def list=[];
if(val instanceof ArrayList)
{
def index=0;
// Call for each object in arraylist
for(v in val)
{
list=loopAllFields(v);
for(item in list)
{
ret.add(entry.getKey()+"["+index+"]."+ item);
}
index++;
}
}
else
{
list =loopAllFields(val);
}
if(list.size()==val.size())
{
ret.add(entry.getKey());
}
else{
for(item in list)
{
ret.add(entry.getKey()+"."+ item);
}
}
}
if(val==null)
{
ret.add(entry.getKey());
}
}
}
return ret;
}
/* remove fields from source; recursively deletes fields which are part of other fields */
def removeField(def ctx, def fieldname)
{
def pos=fieldname.indexOf(".");
if(pos>0)
{
def str=fieldname.substring(0,pos);
if(str.indexOf('[')>0 && str.indexOf(']')>0)
{
def s=str.substring(0,str.indexOf('['));
def i=str.substring(str.indexOf('[')+1,str.length()-1);
removeField(ctx[s][Integer.parseInt(i)],fieldname.substring(pos+1,fieldname.length()));
}
else
{
if(ctx[str] instanceof Map)
{
removeField(ctx[str],fieldname.substring(pos+1,fieldname.length()));
}
}
}else{
ctx.remove(fieldname);
}
return ctx;
}
def list=[];
list=loopAllFields(ctx);
for(item in list)
{
removeField(ctx,item);
}
"""
}
}
]
}
Post Document:
POST index8/_doc?pipeline=remove_null_fields
{
"master_desc": "TESTING PART",
"ddd":null,
"date_added": "2019-10-24T09:30:03",
"master_no": {
"master_no": 18460110,
"barcode": "NLSKYTEST1-1",
"external_key": null,
"umid": null
}
}
Result:
"hits" : [
{
"_index" : "index8",
"_type" : "_doc",
"_id" : "06XAyXEBAWHHnYGOSa_M",
"_score" : 1.0,
"_source" : {
"date_added" : "2019-10-24T09:30:03",
"master_no" : {
"master_no" : 18460110,
"barcode" : "NLSKYTEST1-1"
},
"master_desc" : "TESTING PART"
}
}
]
@Jaspreet, the script almost worked. It didn't, however, eliminate empty objects, empty arrays or empty values. Here is a doc I tried to index:
{
"master_desc": "TESTING PART",
"date_added": "2019-10-24T09:30:03",
"master_no": {
"master_no": 18460110,
"barcode": "NLSKYTEST1-1",
"external_key": null,
"umid": null
},
"remote_sync_state": "",
"lib_title_footage": [],
"prj_no": {
"prj_no": null,
"prj_desc": null
}
}
The above returned:
{
"master_desc": "TESTING PART",
"date_added": "2019-10-24T09:30:03",
"master_no": {
"master_no": 18460110,
"barcode": "NLSKYTEST1-1"
},
"remote_sync_state": "",
"lib_title_footage": [ ],
"prj_no": { }
}
I tried updating the script to check for these patterns as well but unfortunately got a compile error.
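For what it's worth, here is the pruning logic, extended to the empty-string/empty-array/empty-object cases you mention, sketched in Python; this is only to illustrate the recursion you'd need to reproduce in Painless, not something you can paste into the pipeline:
# Recursively drop None values, empty strings, empty lists and empty dicts.
def prune(value):
    if isinstance(value, dict):
        cleaned = {k: prune(v) for k, v in value.items()}
        return {k: v for k, v in cleaned.items() if v not in (None, "", [], {})}
    if isinstance(value, list):
        cleaned = [prune(v) for v in value]
        return [v for v in cleaned if v not in (None, "", [], {})]
    return value
doc = {"master_desc": "TESTING PART", "remote_sync_state": "",
       "lib_title_footage": [], "prj_no": {"prj_no": None, "prj_desc": None}}
print(prune(doc))  # {'master_desc': 'TESTING PART'}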

Group data in Elasticsearch with the same value on two keys

I have just started to learn about Elasticsearch and am facing a problem with group aggregation. I have a data set in Elasticsearch like:
[{
srcIP : "10.0.11.12",
dstIP : "19.67.78.91",
totalMB : "0.25"
},{
srcIP : "10.45.11.62",
dstIP : "19.67.78.91",
totalMB : "0.50"
},{
srcIP : "13.67.52.91",
dstIP : "10.0.11.12",
totalMB : "0.75"
},{
srcIP : "10.23.64.12",
dstIP : "10.45.11.62",
totalMB : "0.25"
}]
I just want to group data by srcIP and sum the field totalMB, but I also want one more thing: while grouping on srcIP, match that value against dstIP and also sum the totalMB for the dstIP side.
The output should be like this:
buckets : [{
key : "10.0.11.12",
total_GB_SrcIp :{
value : "0.25"
},
total_GB_dstIP :{
value : "0.75"
}
},
{
key : "10.45.11.62",
total_MB_SrcIp :{
value : "0.50"
},
total_MB_dstIP :{
value : "0.25"
}
}]
I have done a normal aggregation for one key but couldn't figure out the final query for my problem.
Query:
GET /index*/_search
{
"size": 0,
"aggs": {
"group_by_srcIP": {
"terms": {
"field": "srcIP",
"size": 100,
"order": {
"total_MB_SrcIp": "desc"
}
},
"aggs": {
"total_MB_SrcIp": {
"sum": {
"field": "TotalMB"
}
}
}
}
}
}
Hope you understand my problem on the basis of the sample output.
Thanks in advance.
As per my understanding, you need a sum aggregation on a field (totalMB) with respect to distinct values in two other fields (srcIP, dstIP).
AFAIK, Elasticsearch is not that good at aggregating over the values of multiple fields, unless you combine those fields at ingestion time or combine them on the application side. (I may be wrong here, though.)
I gave it a try to get the required output using a scripted_metric aggregation. (Please read about it if you don't know what it is or how it works.)
I experimented with a Painless script to do the following in the aggregation:
pick srcIP, dstIP & totalMB from each doc
populate a cross-mapping like IP -> { (src : totalMBs), (dst : totalMBs) } in a map
return this map as the result of the aggregation
Here is the actual search query with aggregation:
GET /testIndex/testType/_search
{
"size": 0,
"aggs": {
"ip-addr": {
"scripted_metric": {
"init_script": "params._agg.addrs = []",
"map_script": "def lst = []; lst.add(doc.srcIP.value); lst.add(doc.dstIP.value); lst.add(doc.totalMB.value); params._agg.addrs.add(lst);",
"combine_script": "Map ipMap = new HashMap(); for(entry in params._agg.addrs) { def srcIp = entry.get(0); def dstIp = entry.get(1); def mbs = entry.get(2); if(ipMap.containsKey(srcIp)) {def srcMbSum = mbs + ipMap.get(srcIp).get('srcMB'); ipMap.get(srcIp).put('srcMB',srcMbSum); } else {Map types = new HashMap(); types.put('srcMB', mbs); types.put('dstMB', 0.0); ipMap.put(srcIp, types); } if(ipMap.containsKey(dstIp)) {def dstMbSum = mbs + ipMap.get(dstIp).get('dstMB'); ipMap.get(dstIp).put('dstMB',dstMbSum); } else {Map types = new HashMap(); types.put('srcMB', 0.0); types.put('dstMB', mbs); ipMap.put(dstIp, types); } } return ipMap;",
"reduce_script": "Map resultMap = new HashMap(); for(ipMap in params._aggs) {for(entry in ipMap.entrySet()) {def ip = entry.getKey(); def srcDestMap = entry.getValue(); if(resultMap.containsKey(ip)) {Map types = new HashMap(); types.put('srcMB', srcDestMap.get('srcMB') + resultMap.get(ip).get('srcMB')); types.put('dstMB', srcDestMap.get('dstMB') + resultMap.get(ip).get('dstMB')); resultMap.put(ip, types); } else {resultMap.put(ip, srcDestMap); } } } return resultMap;"
}
}
}
}
Here are experiment details:
Index mapping:
GET testIndex/_mapping
{
"testIndex": {
"mappings": {
"testType": {
"dynamic": "true",
"_all": {
"enabled": false
},
"properties": {
"dstIP": {
"type": "ip"
},
"srcIP": {
"type": "ip"
},
"totalMB": {
"type": "double"
}
}
}
}
}
}
Sample input:
POST testIndex/testType
{
"srcIP" : "10.0.11.12",
"dstIP" : "19.67.78.91",
"totalMB" : "0.25"
}
POST testIndex/testType
{
"srcIP" : "10.45.11.62",
"dstIP" : "19.67.78.91",
"totalMB" : "0.50"
}
POST testIndex/testType
{
"srcIP" : "13.67.52.91",
"dstIP" : "10.0.11.12",
"totalMB" : "0.75"
}
POST testIndex/testType
{
"srcIP" : "10.23.64.12",
"dstIP" : "10.45.11.62",
"totalMB" : "0.25"
}
Query output:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0,
"hits": []
},
"aggregations": {
"ip-addr": {
"value": {
"13.67.52.91": {
"srcMB": 0.75,
"dstMB": 0
},
"10.23.64.12": {
"srcMB": 0.25,
"dstMB": 0
},
"10.45.11.62": {
"srcMB": 0.5,
"dstMB": 0.25
},
"19.67.78.91": {
"srcMB": 0,
"dstMB": 0.75
},
"10.0.11.12": {
"srcMB": 0.25,
"dstMB": 0.75
}
}
}
}
}
Here is a readable version of the aggregation for better understanding.
"scripted_metric": {
"init_script": "params._agg.addrs = []",
"map_script": """
def lst = [];
lst.add(doc.srcIP.value);
lst.add(doc.dstIP.value);
lst.add(doc.totalMB.value);
params._agg.addrs.add(lst);
""",
"combine_script": """
Map ipMap = new HashMap();
for(entry in params._agg.addrs) {
def srcIp = entry.get(0);
def dstIp = entry.get(1);
def mbs = entry.get(2);
if(ipMap.containsKey(srcIp)) {
def srcMbSum = mbs + ipMap.get(srcIp).get('srcMB');
ipMap.get(srcIp).put('srcMB',srcMbSum);
} else {
Map types = new HashMap();
types.put('srcMB', mbs);
types.put('dstMB', 0.0);
ipMap.put(srcIp, types);
}
if(ipMap.containsKey(dstIp)) {
def dstMbSum = mbs + ipMap.get(dstIp).get('dstMB');
ipMap.get(dstIp).put('dstMB',dstMbSum);
} else {
Map types = new HashMap();
types.put('srcMB', 0.0);
types.put('dstMB', mbs);
ipMap.put(dstIp, types);
}
}
return ipMap;
""",
"reduce_script": """
Map resultMap = new HashMap();
for(ipMap in params._aggs) {
for(entry in ipMap.entrySet()) {
def ip = entry.getKey();
def srcDestMap = entry.getValue();
if(resultMap.containsKey(ip)) {
Map types = new HashMap();
types.put('srcMB', srcDestMap.get('srcMB') + resultMap.get(ip).get('srcMB'));
types.put('dstMB', srcDestMap.get('dstMB') + resultMap.get(ip).get('dstMB'));
resultMap.put(ip, types);
} else {
resultMap.put(ip, srcDestMap);
}
}
}
return resultMap;
"""
}
However, prior to going in depth, I would suggest you test it out on some sample data and check whether it works. Scripted metric aggregations do have a considerable impact on query performance.
One more thing: to get the required key strings in the aggregation result, replace all occurrences of 'srcMB' & 'dstMB' in the script with 'total_GB_SrcIp' & 'total_GB_DstIp' as per your needs.
Hope this helps you or someone else.
FYI, I tested this on ES v5.6.11.
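Two closing notes. First, on newer Elasticsearch versions (6.4 onwards, if I recall correctly) the scripted metric contexts use state / states instead of params._agg / params._aggs, so the scripts above would need that substitution. Second, if you want to double-check the expected sums outside of Elasticsearch, the same cross-mapping can be sketched in a few lines of Python over the raw documents (for verification only):
# Client-side sketch of the src/dst cross-mapping, to verify the expected sums.
from collections import defaultdict
docs = [
    {"srcIP": "10.0.11.12", "dstIP": "19.67.78.91", "totalMB": 0.25},
    {"srcIP": "10.45.11.62", "dstIP": "19.67.78.91", "totalMB": 0.50},
    {"srcIP": "13.67.52.91", "dstIP": "10.0.11.12", "totalMB": 0.75},
    {"srcIP": "10.23.64.12", "dstIP": "10.45.11.62", "totalMB": 0.25},
]
totals = defaultdict(lambda: {"srcMB": 0.0, "dstMB": 0.0})
for d in docs:
    totals[d["srcIP"]]["srcMB"] += d["totalMB"]
    totals[d["dstIP"]]["dstMB"] += d["totalMB"]
print(dict(totals))  # e.g. '10.0.11.12': {'srcMB': 0.25, 'dstMB': 0.75}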
