dynamic data struct (nested JSON) with dependency conditionals in Go [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 4 years ago.
I'm trying to create dynamically nested JSON in Go. I understand that Go is statically typed and that there are various ways to create dynamic objects (interfaces), and I'm wondering if there is a way to tackle my dependency mapping in nested JSON.
Sample JSON:
[
  {
    "display": "Environment",
    "field": "test_env",
    "value": {
      "display": "staging",
      "value": "s"
    },
    "type": "drop-down",
    "data": [
      {
        "display": "version",
        "field": "test_version",
        "value": {
          "display": "1.1.9",
          "value": "1.1.9"
        },
        "type": "drop-down",
        "data": [
          {
            "display": "DataCenter",
            "field": "test_dc",
            "value": {
              "display": "washington",
              "value": "wa"
            },
            "type": "drop-down",
            "data": [
              {
                "display": "Secondary",
                "field": "test_secondary_dc",
                "value": {
                  "display": "miami",
                  "value": "mi"
                },
                "type": "drop-down",
                "data": [
                  {
                    "display": "Size",
                    "field": "test_size",
                    "value": {
                      "display": "small",
                      "value": "s"
                    }
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  },
  {
    "display": "Environment",
    "field": "test_env",
    "value": {
      "display": "production",
      "value": "p"
    },
    "type": "drop-down",
    "data": [
      {
        "display": "version",
        "field": "test_version",
        "value": {
          "display": "1.1.9",
          "value": "1.1.9"
        },
        "type": "drop-down",
        "data": [
          {
            "display": "DataCenter",
            "field": "test_dc",
            "value": {
              "display": "miami",
              "value": "mi"
            },
            "type": "drop-down",
            "data": [
              {
                "display": "Secondary",
                "field": "test_secondary_dc",
                "value": {
                  "display": "washington",
                  "value": "wa"
                },
                "type": "drop-down",
                "data": [
                  {
                    "display": "Size",
                    "field": "test_size",
                    "value": {
                      "display": "medium",
                      "value": "m"
                    }
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  }
]
Sample code:
package main

import (
    "fmt"
    "reflect"
)

// struct definitions //

type RootElem struct {
    RDisplay string `json:"display"`
    RField   string `json:"field"`
    RType    string `json:"type"`
    RData    RSlice `json:"data"`
    RValue   RValue `json:"value"`
}

type RValue struct {
    Display string `json:"display"`
    Evalue  string `json:"value"`
}

type Vars struct {
    Env      string `json:"environment"`
    Version  string `json:"version"`
    Zone     string `json:"zone"`
    PDcenter string `json:"primary_dc"`
    SDcenter string `json:"secondary_dc,omitempty"`
    Size     string `json:"size"`
}

type RSlice []RootElem

// valueFactory maps a display value to its short code for the given element type.
func valueFactory(etype, evalue string) string {
    switch etype {
    case "ENVIRONMENT":
        return environmentValue(evalue)
    case "VERSION":
        return versionValue(evalue)
    case "ZONE":
        return zoneValue(evalue)
    case "PRIMARYDC":
        return primaryValue(evalue)
    case "SECONDARYDC":
        return secondaryValue(evalue)
    case "SIZE":
        return sizeValue(evalue)
    default:
        return "Specifying a type we don't have."
    }
}

func sizeValue(sz string) string {
    switch sz {
    case "Small":
        return "s"
    case "Medium":
        return "m"
    case "Large":
        return "l"
    default:
        return "This is not a valid size value"
    }
}

func environmentValue(env string) string {
    switch env {
    case "Production":
        return "p"
    case "staging":
        return "s"
    default:
        return "This is not a valid environment value"
    }
}

func versionValue(ver string) string {
    switch ver {
    case "1.1.9":
        return "1.1.9"
    default:
        return "This is not a valid version value"
    }
}

func zoneValue(zone string) string {
    switch zone {
    case "BLACK":
        return "Black"
    case "GREEN":
        return "Green"
    default:
        return "This is not a valid zone value"
    }
}

func primaryValue(pdc string) string {
    switch pdc {
    case "Washington":
        return "wa"
    case "Miami":
        return "mi"
    default:
        return "This is not a valid primary data center value"
    }
}

func secondaryValue(sdc string) string {
    switch sdc {
    case "Washington":
        return "wa"
    case "Miami":
        return "mi"
    default:
        return "This is not a valid secondary data center value"
    }
}

// dataGeneric builds a RootElem skeleton with an empty value.
func dataGeneric(display, field, etype string) (relm RootElem) {
    relm.RDisplay = display
    relm.RField = field
    relm.RValue.Display = ""
    relm.RValue.Evalue = ""
    relm.RType = etype
    return relm
}

func dataEnvironment() RootElem {
    return dataGeneric("Environment", "test_env", "dropdown")
}

func dataVersion() RootElem {
    return dataGeneric("Version", "test_version", "dropdown")
}

func dataZone() RootElem {
    return dataGeneric("Zone", "test_zone", "dropdown")
}

func dataPrimary() RootElem {
    return dataGeneric("Primary Data Center", "test_dc", "dropdown")
}

func dataSecondary() RootElem {
    return dataGeneric("Secondary Data Center", "test_secondary_dc", "dropdown")
}

func dataSize() RootElem {
    return dataGeneric("size", "test_size", "dropdown")
}

func dataFactory(etype string) RootElem {
    switch etype {
    case "ENVIRONMENT":
        return dataEnvironment()
    case "VERSION":
        return dataVersion()
    case "ZONE":
        return dataZone()
    case "PRIMARYDC":
        return dataPrimary()
    case "SECONDARYDC":
        return dataSecondary()
    case "SIZE":
        return dataSize()
    }
    return RootElem{}
}

func main() {
    // sample element //
    elment := Vars{
        Env:      "Production",
        Version:  "1.1.9",
        Zone:     "GREEN",
        PDcenter: "Washington",
        SDcenter: "Miami",
        Size:     "Small",
    }
    Dict := []string{"ENVIRONMENT", "VERSION", "ZONE", "PRIMARYDC", "SECONDARYDC", "SIZE"}
    var newData, finalElem RootElem
    for i := 0; i < reflect.ValueOf(elment).NumField(); i++ {
        currentElement := reflect.ValueOf(elment).Field(i).Interface()
        currentElemType := Dict[i]
        newData = dataFactory(currentElemType)
        newData.RValue.Display = currentElement.(string)
        newData.RValue.Evalue = valueFactory(currentElemType, currentElement.(string))
        if finalElem.RDisplay == "" {
            finalElem = newData
        } else {
            if len(finalElem.RData) == 0 {
                finalElem.RData = append(finalElem.RData, newData)
            } else {
                if len(finalElem.RData[0].RData) == 0 {
                    finalElem.RData[0].RData = append(finalElem.RData[0].RData, newData)
                } else {
                    if len(finalElem.RData[0].RData[0].RData) == 0 {
                        finalElem.RData[0].RData[0].RData = append(finalElem.RData[0].RData[0].RData, newData)
                    } else {
                        if len(finalElem.RData[0].RData[0].RData[0].RData) == 0 {
                            finalElem.RData[0].RData[0].RData[0].RData = append(finalElem.RData[0].RData[0].RData[0].RData, newData)
                        } else {
                            finalElem.RData[0].RData[0].RData[0].RData[0].RData = append(finalElem.RData[0].RData[0].RData[0].RData[0].RData, newData)
                        }
                    }
                }
            }
        }
    }
    fmt.Println("final element", finalElem)
}
Is there a way to write a recursive function for creating dynamically nested JSON in Go?
Thanks.

I don't know exactly what you are trying to achieve. I ran your application, and you are building a tree from the flat structure; why, and what your original plan was, is not clear.
Yet your application's ever-growing if tree is always doing the same thing with the last appended RootElem, and it can be written as follows. As you can see, the if structure is now independent of NumField():
    var appendHere *RootElem
    for i := 0; i < reflect.ValueOf(elment).NumField(); i++ {
        [ ... stuff deleted ... ]
        if finalElem.RDisplay == "" {
            finalElem = newData
            appendHere = &finalElem
        } else {
            appendHere.RData = append(appendHere.RData, newData)
            appendHere = &(appendHere.RData[0])
        }
    }
    fmt.Println("final element", finalElem)
}
Would have written this as comment, but the answer is too large for comments.
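For the recursive part of the question: since each level of the output is just a RootElem whose RData slice holds the next level, the whole chain can also be built by one small recursive function. This is a minimal sketch, assuming the RootElem/RSlice types and the dataFactory/valueFactory helpers from the question; the buildNested name and the explicit list of (type, value) pairs are illustrative choices, not part of the original code:

// buildNested turns an ordered list of (element type, display value) pairs
// into the nested chain: the first pair becomes the current node and the
// remaining pairs become its single RData child, recursively.
func buildNested(pairs [][2]string) RSlice {
    if len(pairs) == 0 {
        return nil
    }
    etype, evalue := pairs[0][0], pairs[0][1]
    node := dataFactory(etype)
    node.RValue.Display = evalue
    node.RValue.Evalue = valueFactory(etype, evalue)
    node.RData = buildNested(pairs[1:]) // recurse on the remaining pairs
    return RSlice{node}
}

Called as, for example:

tree := buildNested([][2]string{
    {"ENVIRONMENT", "Production"},
    {"VERSION", "1.1.9"},
    {"ZONE", "GREEN"},
    {"PRIMARYDC", "Washington"},
    {"SECONDARYDC", "Miami"},
    {"SIZE", "Small"},
})
out, _ := json.MarshalIndent(tree[0], "", "  ") // requires "encoding/json"
fmt.Println(string(out))

this produces the same nesting as finalElem without any depth-specific if branches, and without reflection.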

Related

Elasticsearch ingest pipeline: how to recursively modify values in a HashMap

Using an ingest pipeline, I want to iterate over a HashMap and remove underscores from all string values (where underscores exist), leaving underscores in the keys intact. Some values are arrays that must further be iterated over to do the same modification.
In the pipeline, I use a function to traverse and modify the values of a Collection view of the HashMap.
PUT /_ingest/pipeline/samples
{
  "description": "preprocessing of samples.json",
  "processors": [
    {
      "script": {
        "tag": "remove underscore from sample_tags values",
        "source": """
          void findReplace(Collection collection) {
            collection.forEach(element -> {
              if (element instanceof String) {
                element.replace('_',' ');
              } else {
                findReplace(element);
              }
              return true;
            })
          }
          Collection samples = ctx.samples;
          samples.forEach(sample -> { // sample.sample_tags is a HashMap
            Collection sample_tags = sample.sample_tags.values();
            findReplace(sample_tags);
            return true;
          })
        """
      }
    }
  ]
}
When I simulate the pipeline ingestion, I find the string values are not modified. Where am I going wrong?
POST /_ingest/pipeline/samples/_simulate
{
  "docs": [
    {
      "_index": "samples",
      "_id": "xUSU_3UB5CXFr25x7DcC",
      "_source": {
        "samples": [
          {
            "sample_tags": {
              "Entry_A": [
                "A_hyphentated-sample",
                "sample1"
              ],
              "Entry_B": "A_multiple_underscore_example",
              "Entry_C": [
                "sample2",
                "another_example_with_underscores"
              ],
              "Entry_E": "last_example"
            }
          }
        ]
      }
    }
  ]
}
Result:
{
  "docs" : [
    {
      "doc" : {
        "_index" : "samples",
        "_type" : "_doc",
        "_id" : "xUSU_3UB5CXFr25x7DcC",
        "_source" : {
          "samples" : [
            {
              "sample_tags" : {
                "Entry_E" : "last_example",
                "Entry_C" : [
                  "sample2",
                  "another_example_with_underscores"
                ],
                "Entry_B" : "A_multiple_underscore_example",
                "Entry_A" : [
                  "A_hyphentated-sample",
                  "sample1"
                ]
              }
            }
          ]
        },
        "_ingest" : {
          "timestamp" : "2020-12-01T17:29:52.3917165Z"
        }
      }
    }
  ]
}
Here is a modified version of your script that will work on the data you provided:
PUT /_ingest/pipeline/samples
{
  "description": "preprocessing of samples.json",
  "processors": [
    {
      "script": {
        "tag": "remove underscore from sample_tags values",
        "source": """
          String replaceString(String value) {
            return value.replace('_',' ');
          }
          void findReplace(Map map) {
            map.keySet().forEach(key -> {
              if (map[key] instanceof String) {
                map[key] = replaceString(map[key]);
              } else {
                map[key] = map[key].stream().map(this::replaceString).collect(Collectors.toList());
              }
            });
          }
          ctx.samples.forEach(sample -> {
            findReplace(sample.sample_tags);
            return true;
          });
        """
      }
    }
  ]
}
The result looks like this:
{
  "samples" : [
    {
      "sample_tags" : {
        "Entry_E" : "last example",
        "Entry_C" : [
          "sample2",
          "another example with underscores"
        ],
        "Entry_B" : "A multiple underscore example",
        "Entry_A" : [
          "A hyphentated-sample",
          "sample1"
        ]
      }
    }
  ]
}
You were on the right path, but you were working on copies of values and weren't setting the modified values back onto the document context ctx, which is eventually returned from the pipeline. This means you'll need to keep track of the current iteration indexes (for the array lists as well as for the hash maps and everything in between) so that you can then target the fields' positions in the deeply nested context.
Here's an example taking care of strings and (string-only) array lists. You'll need to extend it to handle hash maps (and other types) and then perhaps extract the whole process into a separate function. But AFAIK you cannot return multiple data types in Java, so it may be challenging...
PUT /_ingest/pipeline/samples
{
  "description": "preprocessing of samples.json",
  "processors": [
    {
      "script": {
        "tag": "remove underscore from sample_tags values",
        "source": """
          ArrayList samples = ctx.samples;
          for (int i = 0; i < samples.size(); i++) {
            def sample = samples.get(i).sample_tags;
            for (def entry : sample.entrySet()) {
              def key = entry.getKey();
              def val = entry.getValue();
              def replaced_val;
              if (val instanceof String) {
                replaced_val = val.replace('_',' ');
              } else if (val instanceof ArrayList) {
                replaced_val = new ArrayList();
                for (int j = 0; j < val.length; j++) {
                  replaced_val.add(val[j].replace('_',' '));
                }
              }
              // else if (val instanceof HashMap) {
              //   do your thing
              // }

              // crucial part: write the modified value back into the nested context
              ctx.samples[i].sample_tags[key] = replaced_val;
            }
          }
        """
      }
    }
  ]
}

Elasticsearch pre-processing to remove null fields as part of ingest

I have a use case where an API I'm calling to retrieve data to put into Elasticsearch is returning nulls.
I need to write an ingest pipeline that uses processors to remove all null fields before writing it into Elasticsearch. Processors may or may not use Painless scripting.
Here is a sample payload that I currently get from the API:
{
  "master_desc": "TESTING PART",
  "date_added": "2019-10-24T09:30:03",
  "master_no": {
    "master_no": 18460110,
    "barcode": "NLSKYTEST1-1",
    "external_key": null,
    "umid": null
  }
}
The pipeline should ideally insert the document as:
{
  "master_desc": "TESTING PART",
  "date_added": "2019-10-24T09:30:03",
  "master_no": {
    "master_no": 18460110,
    "barcode": "NLSKYTEST1-1"
  }
}
Note, the fields are dynamic, so I can't write a processor that checks for nulls against a defined set of fields.
Thanks!
Null fields are neither indexed nor searchable. I have written the pipeline below to remove such fields. Please test it before using it on all of your scenarios. After posting documents using this pipeline, you won't be able to search null fields using "exists".
Pipeline:
PUT _ingest/pipeline/remove_null_fields
{
  "description": "Remove any null field",
  "processors": [
    {
      "script": {
        "source": """
          // return list of fields with null values
          def loopAllFields(def x) {
            def ret = [];
            if (x instanceof Map) {
              for (entry in x.entrySet()) {
                if (entry.getKey().indexOf("_") == 0) {
                  continue;
                }
                def val = entry.getValue();
                if (val instanceof HashMap ||
                    val instanceof Map ||
                    val instanceof ArrayList) {
                  def list = [];
                  if (val instanceof ArrayList) {
                    def index = 0;
                    // call for each object in the arraylist
                    for (v in val) {
                      list = loopAllFields(v);
                      for (item in list) {
                        ret.add(entry.getKey() + "[" + index + "]." + item);
                      }
                      index++;
                    }
                  } else {
                    list = loopAllFields(val);
                  }
                  if (list.size() == val.size()) {
                    ret.add(entry.getKey());
                  } else {
                    for (item in list) {
                      ret.add(entry.getKey() + "." + item);
                    }
                  }
                }
                if (val == null) {
                  ret.add(entry.getKey());
                }
              }
            }
            return ret;
          }

          /* remove fields from source; recursively deletes fields which are part of other fields */
          def removeField(def ctx, def fieldname) {
            def pos = fieldname.indexOf(".");
            if (pos > 0) {
              def str = fieldname.substring(0, pos);
              if (str.indexOf('[') > 0 && str.indexOf(']') > 0) {
                def s = str.substring(0, str.indexOf('['));
                def i = str.substring(str.indexOf('[') + 1, str.length() - 1);
                removeField(ctx[s][Integer.parseInt(i)], fieldname.substring(pos + 1, fieldname.length()));
              } else {
                if (ctx[str] instanceof Map) {
                  removeField(ctx[str], fieldname.substring(pos + 1, fieldname.length()));
                }
              }
            } else {
              ctx.remove(fieldname);
            }
            return ctx;
          }

          def list = [];
          list = loopAllFields(ctx);
          for (item in list) {
            removeField(ctx, item);
          }
        """
      }
    }
  ]
}
Post Document:
POST index8/_doc?pipeline=remove_null_fields
{
  "master_desc": "TESTING PART",
  "ddd": null,
  "date_added": "2019-10-24T09:30:03",
  "master_no": {
    "master_no": 18460110,
    "barcode": "NLSKYTEST1-1",
    "external_key": null,
    "umid": null
  }
}
Result:
"hits" : [
{
"_index" : "index8",
"_type" : "_doc",
"_id" : "06XAyXEBAWHHnYGOSa_M",
"_score" : 1.0,
"_source" : {
"date_added" : "2019-10-24T09:30:03",
"master_no" : {
"master_no" : 18460110,
"barcode" : "NLSKYTEST1-1"
},
"master_desc" : "TESTING PART"
}
}
]
@Jaspreet, the script almost worked. It didn't, however, eliminate empty objects, empty arrays or empty values. Here is a doc I tried to index:
{
  "master_desc": "TESTING PART",
  "date_added": "2019-10-24T09:30:03",
  "master_no": {
    "master_no": 18460110,
    "barcode": "NLSKYTEST1-1",
    "external_key": null,
    "umid": null
  },
  "remote_sync_state": "",
  "lib_title_footage": [],
  "prj_no": {
    "prj_no": null,
    "prj_desc": null
  }
}
The above returned:
{
  "master_desc": "TESTING PART",
  "date_added": "2019-10-24T09:30:03",
  "master_no": {
    "master_no": 18460110,
    "barcode": "NLSKYTEST1-1"
  },
  "remote_sync_state": "",
  "lib_title_footage": [],
  "prj_no": {}
}
I tried updating the script to have the condition check for these patterns but unfortunately got a compile error.

Group Data on elastic search with same value on two key

I have just started to learn about Elasticsearch and am facing a problem with group aggregation. I have a data set in Elasticsearch like:
[{
  srcIP: "10.0.11.12",
  dstIP: "19.67.78.91",
  totalMB: "0.25"
}, {
  srcIP: "10.45.11.62",
  dstIP: "19.67.78.91",
  totalMB: "0.50"
}, {
  srcIP: "13.67.52.91",
  dstIP: "10.0.11.12",
  totalMB: "0.75"
}, {
  srcIP: "10.23.64.12",
  dstIP: "10.45.11.62",
  totalMB: "0.25"
}]
I just want to group data on the basis of srcIP and sum the field totalMB, but I want to add one more thing: when grouping on srcIP, it should also match the srcIP value against dstIP values and sum the totalMB for dstIP as well.
The output should be like this:
buckets: [{
  key: "10.0.11.12",
  total_GB_SrcIp: {
    value: "0.25"
  },
  total_GB_dstIP: {
    value: "0.75"
  }
}, {
  key: "10.45.11.62",
  total_MB_SrcIp: {
    value: "0.50"
  },
  total_MB_dstIP: {
    value: "0.25"
  }
}]
I have done a normal aggregation for one key but couldn't work out the final query for my problem.
Query:
GET /index*/_search
{
  "size": 0,
  "aggs": {
    "group_by_srcIP": {
      "terms": {
        "field": "srcIP",
        "size": 100,
        "order": {
          "total_MB_SrcIp": "desc"
        }
      },
      "aggs": {
        "total_MB_SrcIp": {
          "sum": {
            "field": "TotalMB"
          }
        }
      }
    }
  }
}
Hope you understand my problem on the basis of the sample output.
Thanks in advance.
As per my understanding, you need a sum aggregation on a field (totalMB) with respect to distinct values in two other fields (srcIP, dstIP).
AFAIK, Elasticsearch is not that good at aggregating on values of multiple fields, unless you combine those fields together using some document ingestion or combine them on the application side itself. (I may be wrong here, though.)
I gave it a try to get the required output using a scripted_metric aggregation. (Please read about it if you don't know what it is or how it works.)
I experimented with a Painless script to do the following in the aggregation:
pick srcIp, dstIp & totalMB from each doc
populate a cross-mapping like IP -> { (src : totalMBs), (dst : totalMBs) } in a map
return this map as the result of the aggregation
Here is the actual search query with aggregation:
GET /testIndex/testType/_search
{
"size": 0,
"aggs": {
"ip-addr": {
"scripted_metric": {
"init_script": "params._agg.addrs = []",
"map_script": "def lst = []; lst.add(doc.srcIP.value); lst.add(doc.dstIP.value); lst.add(doc.totalMB.value); params._agg.addrs.add(lst);",
"combine_script": "Map ipMap = new HashMap(); for(entry in params._agg.addrs) { def srcIp = entry.get(0); def dstIp = entry.get(1); def mbs = entry.get(2); if(ipMap.containsKey(srcIp)) {def srcMbSum = mbs + ipMap.get(srcIp).get('srcMB'); ipMap.get(srcIp).put('srcMB',srcMbSum); } else {Map types = new HashMap(); types.put('srcMB', mbs); types.put('dstMB', 0.0); ipMap.put(srcIp, types); } if(ipMap.containsKey(dstIp)) {def dstMbSum = mbs + ipMap.get(dstIp).get('dstMB'); ipMap.get(dstIp).put('dstMB',dstMbSum); } else {Map types = new HashMap(); types.put('srcMB', 0.0); types.put('dstMB', mbs); ipMap.put(dstIp, types); } } return ipMap;",
"reduce_script": "Map resultMap = new HashMap(); for(ipMap in params._aggs) {for(entry in ipMap.entrySet()) {def ip = entry.getKey(); def srcDestMap = entry.getValue(); if(resultMap.containsKey(ip)) {Map types = new HashMap(); types.put('srcMB', srcDestMap.get('srcMB') + resultMap.get(ip).get('srcMB')); types.put('dstMB', srcDestMap.get('dstMB') + resultMap.get(ip).get('dstMB')); resultMap.put(ip, types); } else {resultMap.put(ip, srcDestMap); } } } return resultMap;"
}
}
}
}
Here are experiment details:
Index mapping:
GET testIndex/_mapping
{
  "testIndex": {
    "mappings": {
      "testType": {
        "dynamic": "true",
        "_all": {
          "enabled": false
        },
        "properties": {
          "dstIP": {
            "type": "ip"
          },
          "srcIP": {
            "type": "ip"
          },
          "totalMB": {
            "type": "double"
          }
        }
      }
    }
  }
}
Sample input:
POST testIndex/testType
{
  "srcIP": "10.0.11.12",
  "dstIP": "19.67.78.91",
  "totalMB": "0.25"
}

POST testIndex/testType
{
  "srcIP": "10.45.11.62",
  "dstIP": "19.67.78.91",
  "totalMB": "0.50"
}

POST testIndex/testType
{
  "srcIP": "13.67.52.91",
  "dstIP": "10.0.11.12",
  "totalMB": "0.75"
}

POST testIndex/testType
{
  "srcIP": "10.23.64.12",
  "dstIP": "10.45.11.62",
  "totalMB": "0.25"
}
Query output:
{
  "took": 3,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 4,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "ip-addr": {
      "value": {
        "13.67.52.91": {
          "srcMB": 0.75,
          "dstMB": 0
        },
        "10.23.64.12": {
          "srcMB": 0.25,
          "dstMB": 0
        },
        "10.45.11.62": {
          "srcMB": 0.5,
          "dstMB": 0.25
        },
        "19.67.78.91": {
          "srcMB": 0,
          "dstMB": 0.75
        },
        "10.0.11.12": {
          "srcMB": 0.25,
          "dstMB": 0.75
        }
      }
    }
  }
}
Here is a readable version of the query for better understanding:
"scripted_metric": {
"init_script": "params._agg.addrs = []",
"map_script": """
def lst = [];
lst.add(doc.srcIP.value);
lst.add(doc.dstIP.value);
lst.add(doc.totalMB.value);
params._agg.addrs.add(lst);
""",
"combine_script": """
Map ipMap = new HashMap();
for(entry in params._agg.addrs) {
def srcIp = entry.get(0);
def dstIp = entry.get(1);
def mbs = entry.get(2);
if(ipMap.containsKey(srcIp)) {
def srcMbSum = mbs + ipMap.get(srcIp).get('srcMB');
ipMap.get(srcIp).put('srcMB',srcMbSum);
} else {
Map types = new HashMap();
types.put('srcMB', mbs);
types.put('dstMB', 0.0);
ipMap.put(srcIp, types);
}
if(ipMap.containsKey(dstIp)) {
def dstMbSum = mbs + ipMap.get(dstIp).get('dstMB');
ipMap.get(dstIp).put('dstMB',dstMbSum);
} else {
Map types = new HashMap();
types.put('srcMB', 0.0);
types.put('dstMB', mbs);
ipMap.put(dstIp, types);
}
}
return ipMap;
""",
"reduce_script": """
Map resultMap = new HashMap();
for(ipMap in params._aggs) {
for(entry in ipMap.entrySet()) {
def ip = entry.getKey();
def srcDestMap = entry.getValue();
if(resultMap.containsKey(ip)) {
Map types = new HashMap();
types.put('srcMB', srcDestMap.get('srcMB') + resultMap.get(ip).get('srcMB'));
types.put('dstMB', srcDestMap.get('dstMB') + resultMap.get(ip).get('dstMB'));
resultMap.put(ip, types);
} else {
resultMap.put(ip, srcDestMap);
}
}
}
return resultMap;
"""
}
However, prior to going in depth, I would suggest you test it out on some sample data and check whether it works. Scripted metric aggregations have a considerable impact on query performance.
One more thing: to get the required key strings in the aggregation result, replace all occurrences of 'srcMB' and 'dstMB' in the script with 'total_GB_SrcIp' and 'total_GB_DstIp' as per your need.
Hope this helps you or someone else.
FYI, I tested this on ES v5.6.11.

How to take the shortest distance per person (with multiple addresses) to an origin point and sort on that value

I have People documents in my Elasticsearch index, and each person has multiple addresses; each address has a lat/lon point associated.
I'd like to geo sort all the people by proximity to a specific origin location; however, multiple locations per person complicates this. What has been decided is to take the shortest distance per person to the origin point and use that number as the sort value.
Example of my people index roughed out in 'pseudo-JSON' showing a couple of person documents each having multiple addresses:
person {
  name: John Smith
  addresses [
    { lat: 43.5234, lon: 32.5432, 1 Main St. }
    { lat: 44.983, lon: 37.3432, 2 Queen St. W. }
    { ... more addresses ... }
  ]
}
person {
  name: Jane Doe
  addresses [
    ... she has a bunch of addresses too ...
  ]
}
... many more people docs each having multiple addresses like above ...
Currently I'm using an Elasticsearch script field with an inline Groovy script, like so. It calculates meters from the origin for each address, shoves all those meter distances into an array per person, and picks the minimum number from the array, making it the sort value.
string groovyShortestDistanceMetersSortScript = string.Format(
    "[doc['geo1'].distance({0}, {1}), doc['geo2'].distance({0}, {1})].min()",
    origin.Latitude,
    origin.Longitude);

var shortestMetersSort = new SortDescriptor<Person>()
    .Script(sd => sd
        .Type("number")
        .Script(script => script
            .Inline(groovyShortestDistanceMetersSortScript)
        )
        .Order(SortOrder.Ascending)
    );
Although this works, I wonder if using a scripted field might be more expensive or too complex at query time, and whether there is a better way to achieve the desired sort order by indexing the data differently and/or by using aggregations, maybe even doing away with the script field altogether.
Any thoughts and guidance are appreciated. I'm sure somebody else has run into this same requirement (or similar) and has found a different or better solution.
I'm using the NEST API in this code sample but will gladly accept answers in Elasticsearch JSON format because I can port those into NEST code.
When sorting on distance from a specified origin where the field being sorted on contains a collection of values (in this case geo_point types), we can specify how a value should be collected from the collection using the sort_mode. In this case, we can specify a sort_mode of "min" to use the nearest location to the origin as the sort value. Here's an example
public class Person
{
    public string Name { get; set; }
    public IList<Address> Addresses { get; set; }
}

public class Address
{
    public string Name { get; set; }
    public GeoLocation Location { get; set; }
}

void Main()
{
    var pool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
    var indexName = "people";
    var connectionSettings = new ConnectionSettings(pool)
        .InferMappingFor<Person>(m => m.IndexName(indexName));
    var client = new ElasticClient(connectionSettings);

    if (client.IndexExists(indexName).Exists)
        client.DeleteIndex(indexName);

    client.CreateIndex(indexName, c => c
        .Settings(s => s
            .NumberOfShards(1)
            .NumberOfReplicas(0)
        )
        .Mappings(m => m
            .Map<Person>(mm => mm
                .AutoMap()
                .Properties(p => p
                    .Nested<Address>(n => n
                        .Name(nn => nn.Addresses.First().Location)
                        .AutoMap()
                    )
                )
            )
        )
    );

    var people = new[] {
        new Person {
            Name = "John Smith",
            Addresses = new List<Address>
            {
                new Address
                {
                    Name = "Buckingham Palace",
                    Location = new GeoLocation(51.501476, -0.140634)
                },
                new Address
                {
                    Name = "Empire State Building",
                    Location = new GeoLocation(40.748817, -73.985428)
                }
            }
        },
        new Person {
            Name = "Jane Doe",
            Addresses = new List<Address>
            {
                new Address
                {
                    Name = "Eiffel Tower",
                    Location = new GeoLocation(48.858257, 2.294511)
                },
                new Address
                {
                    Name = "Uluru",
                    Location = new GeoLocation(-25.383333, 131.083333)
                }
            }
        }
    };

    client.IndexMany(people);

    // call refresh for testing (avoid in production)
    client.Refresh("people");

    var towerOfLondon = new GeoLocation(51.507313, -0.074308);

    client.Search<Person>(s => s
        .MatchAll()
        .Sort(so => so
            .GeoDistance(g => g
                .Field(f => f.Addresses.First().Location)
                .PinTo(towerOfLondon)
                .Ascending()
                .Unit(DistanceUnit.Meters)
                // Take the minimum address location distance from
                // our target location, The Tower of London
                .Mode(SortMode.Min)
            )
        )
    );
}
This creates the following search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "_geo_distance": {
        "addresses.location": [
          {
            "lat": 51.507313,
            "lon": -0.074308
          }
        ],
        "order": "asc",
        "mode": "min",
        "unit": "m"
      }
    }
  ]
}
which returns
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : null,
    "hits" : [ {
      "_index" : "people",
      "_type" : "person",
      "_id" : "AVcxBKuPlWTRBymPa4yT",
      "_score" : null,
      "_source" : {
        "name" : "John Smith",
        "addresses" : [ {
          "name" : "Buckingham Palace",
          "location" : {
            "lat" : 51.501476,
            "lon" : -0.140634
          }
        }, {
          "name" : "Empire State Building",
          "location" : {
            "lat" : 40.748817,
            "lon" : -73.985428
          }
        } ]
      },
      "sort" : [ 4632.035195223564 ]
    }, {
      "_index" : "people",
      "_type" : "person",
      "_id" : "AVcxBKuPlWTRBymPa4yU",
      "_score" : null,
      "_source" : {
        "name" : "Jane Doe",
        "addresses" : [ {
          "name" : "Eiffel Tower",
          "location" : {
            "lat" : 48.858257,
            "lon" : 2.294511
          }
        }, {
          "name" : "Uluru",
          "location" : {
            "lat" : -25.383333,
            "lon" : 131.083333
          }
        } ]
      },
      "sort" : [ 339100.6843074794 ]
    } ]
  }
}
The value returned in the sort array for each hit is the minimum distance in the sort unit specified (in our case, metres) from the specified point (The Tower of London) and the addresses for each person.
Per the guidelines in the Sorting by Distance documentation, it can often make more sense to score by distance, which can be achieved by using a function_score query with a decay function.
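As a rough illustration of that suggestion, here is a sketch of a function_score query with a gauss decay function against the same people index used above. The offset, scale and multi_value_mode values are illustrative assumptions rather than tuned recommendations, and with the nested addresses mapping above you may additionally need to wrap the function in a nested query:

GET /people/_search
{
  "query": {
    "function_score": {
      "query": { "match_all": {} },
      "functions": [
        {
          "gauss": {
            "addresses.location": {
              "origin": { "lat": 51.507313, "lon": -0.074308 },
              "offset": "500m",
              "scale": "2km"
            },
            "multi_value_mode": "min"
          }
        }
      ]
    }
  }
}

Here multi_value_mode: min mirrors the sort example: each person is scored by their nearest address to the origin, with the score decaying the farther away it is.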

Setting up React in combination with NVD3

I am fairly new to React (and front-end work in general). How can I set up React to work with NVD3?
Currently I am trying to implement the example code of react-nvd3, but get the following error thrown by the d3 function d3.transform():
Unexpected value translate(145, NaN) parsing transform attribute.
g.setAttribute("transform", string);
My code is as follows:
var TestData = [{
  key: "Cumulative Return",
  values: [
    { "label": "A", "value": -29.765957771107 },
    { "label": "B", "value": 0 },
    { "label": "C", "value": 32.807804682612 },
    { "label": "D", "value": 196.45946739256 },
    { "label": "E", "value": 0.19434030906893 },
    { "label": "F", "value": -98.079782601442 },
    { "label": "G", "value": -13.925743130903 },
    { "label": "H", "value": -5.1387322875705 }
  ]
}];
var App = React.createClass({
  render: function() {
    return (
      <NVD3Chart id="chart" type="discreteBarChart" datum={TestData} x="test" y="test"/>
    );
  }
});

ReactDOM.render(
  <App/>,
  document.getElementById('container')
);
See https://jsfiddle.net/x2wuko8g/2/. I guess it has something to do with the format of TestData, but I can't figure it out.
Help is much appreciated!
I think the problem is your x and y props are misconfigured.
<NVD3Chart id="chart" type="discreteBarChart" datum={TestData} x="test" y="test"/>
You set x to test and y to test but those fields are not present in your data.
Try changing x to label and y to value in this way:
<NVD3Chart id="chart" type="discreteBarChart" datum={TestData} x="label" y="value"/>
If you pass a string (like in this case) the library will look up in each item of the data for a key with that string.
I hope this helps.
