Jolt - change format of elasticsearch response - elasticsearch

This should be a no-brainer, as Jolt was built mainly to transform ES and MongoDB responses, but I am unable to figure it out.
I want to parse an ES response and print only selected fields. For instance, I want to transform the response to
{
"time" : 63,
"totalhits":100,
"0";{ response1.field1,response1.field2},
"1";{ response2.field1,response2.field2},
"2";{ response3.field1,response3.field2},
}
{
"took" : 63,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1000,
"max_score" : null,
"hits" : [ {
"_index" : "bank",
"_type" : "account",
"_id" : "0",
"sort": [0],
"_score" : null,
"_source" : {"account_number":0,"balance":16623,"firstname":"Bradshaw","lastname":"Mckenzie","age":29,"gender":"F","address":"244 Columbus Place","employer":"Euron","email":"bradshawmckenzie#euron.com","city":"Hobucken","state":"CO"}
}, {
"_index" : "bank",
"_type" : "account",
"_id" : "1",
"sort": [1],
"_score" : null,
"_source" : {"account_number":1,"balance":39225,"firstname":"Amber","lastname":"Duke","age":32,"gender":"M","address":"880 Holmes Lane","employer":"Pyrami","email":"amberduke#pyrami.com","city":"Brogan","state":"IL"}
}, ...
]
}
}
The spec file I have so far is:
[
{
"operation": "shift",
"spec": {
"hits": {
"*": {
"*": "&"
}
}
}
}
]

I figured it out:
[
{
"operation": "shift",
"spec": {
"took": "took",
"hits": {
"total": "total_hits",
"hits": {
"*": {
"_source": {
"country": "Response[&2].country",
"city": "Response[&2].city",
"year": "Response[&2].year"
}
}
}
}
}
}
]

Related

Get specific fields only elasticsearch

I need some help regarding querying in elasticsearch.
So basically, the api looks something like this:
{
"took": 58,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1020900,
"max_score": 1,
"hits": [
{
"_index": "index-20192029",
"_type": "_doc",
"_id": "urn:22291760",
"_score": 1,
"_source": {
"user_id": 1234567,
"document": [
{
"documentType": "application/pdf",
"documentUrl": "http://somethingxyz1234.pdf"
},
{
"documentType": "application/xml",
"documentUrl": "http://somethingxyz1234.xml"
}
], .....
How do I get only the URL of the document whose type is XML?
I tried doing
"_source": ["user_id", "document.documentType", "document.documentUrl"],
"query": {
"bool": {
"match": { "document.documentType" :"application/xml"}
}
}
But that also included the PDF.
I just want documentUrl to return only the URL of the XML document.
Thanks
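If document is mapped as a nested field, you can use inner_hits to return only the nested documents that match the query. (The example below matches application/pdf; use application/xml as the value to get the XML URL instead.)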
GET test/_search
{
"query": {
"nested": {
"path": "document",
"query": {
"term": {
"document.documentType": {
"value": "application/pdf"
}
}
},
"inner_hits": {}
}
}
}
Results:
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "pv36jIIB-X7q7ErxEhyg",
"_score" : 0.6931471,
"_source" : {
"document" : [
{
"documentType" : "application/pdf",
"documentUrl" : "http://somethingxyz1234.pdf"
},
{
"documentType" : "application/xml",
"documentUrl" : "http://somethingxyz1234.xml"
}
]
},
"inner_hits" : {
"document" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.6931471,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "pv36jIIB-X7q7ErxEhyg",
"_nested" : {
"field" : "document",
"offset" : 0
},
"_score" : 0.6931471,
"_source" : {
"documentType" : "application/pdf",
"documentUrl" : "http://somethingxyz1234.pdf"
}
}
]
}
}
}
}
]

Getting incorrect inner hits from parent child relationship when combined with boolean query

Getting incorrect inner hits from parent child relationship when combined with boolean query
Hi Everyone
I am getting incorrect inner hits results when combining a parent-child query with a boolean query. To reproduce the issue, I create this index:
PUT /my-index-000001
{
"mappings": {
"_routing": {
"required": true
},
"properties": {
"parentProperty": {
"type": "text"
},
"childProperty": {
"type": "text"
},
"id": {
"type": "integer"
},
"myJoinField": {
"type": "join",
"relations": {
"parent": "mychild"
}
}
}
}
}
Then I add these three documents (the document with ID "1" is the parent of the other two documents):
POST /my-index-000001/_doc/1?routing=1
{
"id": 1,
"parentProperty": "a parent document",
"myJoinField": "parent"
}
POST /my-index-000001/_doc/2?routing=1
{
"id": 2,
"childProperty": "queensland civil administration",
"myJoinField": {
"name":"mychild",
"parent":"1"
}
}
POST /my-index-000001/_doc/3?routing=1
{
"id": 3,
"childProperty": "beautiful weather",
"myJoinField": {
"name":"mychild",
"parent":"1"
}
}
Now the index is set up with three documents. I am looking for all child documents that meet this boolean query: [childProperty contains either "queensland civil" or both "beautiful" and "nothing"].
I expect Elasticsearch to return only the child document with ID "2", since the child document with ID "3" does not contain the term "nothing".
The translated version of this query is as follows:
GET /my-index-000001/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"has_child": {
"inner_hits": {
"name": "opr1"
},
"query": {
"query_string": {
"analyzer": "stop",
"query": "childProperty:(\"queensland civil\")"
}
},
"type": "mychild"
}
},
{
"bool": {
"must": [
{
"has_child": {
"inner_hits": {
"name": "opr2"
},
"query": {
"query_string": {
"query": "childProperty:(beautiful)"
}
},
"type": "mychild"
}
},
{
"has_child": {
"inner_hits": {
"name": "opr3"
},
"query": {
"query_string": {
"query": "childProperty:(nothing)"
}
},
"type": "mychild"
}
}
]
}
}
]
}
}
}
The result returned from Elasticsearch is as follows:
{
"took" : 24,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "my-index-000001",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_routing" : "1",
"_source" : {
"id" : 1,
"parentProperty" : "a parent document",
"myJoinField" : "parent"
},
"inner_hits" : {
"opr1" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.2814486,
"hits" : [
{
"_index" : "my-index-000001",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.2814486,
"_routing" : "1",
"_source" : {
"id" : 2,
"childProperty" : "queensland civil administration",
"myJoinField" : {
"name" : "mychild",
"parent" : "1"
}
}
}
]
}
},
"opr2" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.7549127,
"hits" : [
{
"_index" : "my-index-000001",
"_type" : "_doc",
"_id" : "3",
"_score" : 0.7549127,
"_routing" : "1",
"_source" : {
"id" : 3,
"childProperty" : "beautiful weather",
"myJoinField" : {
"name" : "mychild",
"parent" : "1"
}
}
}
]
}
},
"opr3" : {
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
}
}
]
}
}
As you can see in the result, Elasticsearch returns both child documents, which clearly contradicts what I have written in the "must" section of the query.
But if I rewrite the query as follows, it returns ONLY the expected document (the document with ID "2"):
GET /my-index-000001/_search
{
"query": {
"bool": {
"must": [
{
"has_child": {
"inner_hits": {
"name": "opr1"
},
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"query_string": {
"query": "childProperty:(\"queensland civil\")"
}
},
{
"bool": {
"must": [
{
"query_string": {
"query": "childProperty:(beautiful)"
}
},
{
"query_string": {
"query": "childProperty:(weather1)"
}
}
]
}
}
]
}
},
"type": "mychild"
}
}
]
}
}
}
here is the correct result:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "my-index-000001",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_routing" : "1",
"_source" : {
"id" : 1,
"parentProperty" : "a parent document",
"myJoinField" : "parent"
},
"inner_hits" : {
"opr1" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.2814486,
"hits" : [
{
"_index" : "my-index-000001",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.2814486,
"_routing" : "1",
"_source" : {
"id" : 2,
"childProperty" : "queensland civil administration",
"myJoinField" : {
"name" : "mychild",
"parent" : "1"
}
}
}
]
}
}
}
}
]
}
}
I would appreciate it if someone could tell me what I did wrong in the first query, or whether this is the default behavior in Elasticsearch when it comes to parent/child relationships.

How can I search the specific value in the _source from elasticSearch inquired result?

I'm collecting logs in Elasticsearch, and I look up the results with a query.
When I run the following query
GET test/_search
{
"query": {
"match_all":{
}
}
}
the result is returned as follows.
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 100,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "test",
"_id" : "1a2b3c4d5e6f",
"_score" : 1.0,
"_source" : {
"team" : "Marketing"
"number" : "3"
"name" : "Mark"
}
},
{
"_index" : "test",
"_id" : "1a2b3c4d5e66",
"_score" : 1.0,
"_source" : {
"team" : "HR"
"number" : "1"
"name" : "John"
}
},
........
But I want the result to be returned as below (only a specific value from each hit):
{
"name": "Mark"
},
{
"name": "John"
},
So, how can I query for only a specific value within the hits?
Thanks.
You could simply use the source filtering feature of ES, so in your case, your query will look like this:
{
"_source": "name",
"query": {
"match_all": {}
}
}
And it returns search results like
"hits": [
{
"_index": "64214413",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"name": "Mark"
}
},
{
"_index": "64214413",
"_type": "_doc",
"_id": "2",
"_score": 1.0,
"_source": {
"name": "John"
}
}
]

Elasticsearch Query Returning Zero Results for Substring

I created my first AWS ElasticSearch cluster and uploaded some data to it (shown below).
When I search for a domain such as example.com, I get zero results.
Is this a search query or indexing issue?
# curl -XGET -u username:password 'https://xxxxx.us-east-1.es.amazonaws.com/hosts/_search?q=example.com&pretty=true'
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
I confirmed that a match_all query does return all the records.
match_all
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "hosts",
"_type" : "_doc",
"_id" : "KK0PcnMBqk4TBzxZPeGU",
"_score" : 1.0,
"_source" : {
"name" : "mail.stackoverflow.com",
"type" : "a",
"value" : "10.0.0.3"
}
},
{
"_index" : "hosts",
"_type" : "_doc",
"_id" : "J60PcnMBqk4TBzxZPeGU",
"_score" : 1.0,
"_source" : {
"name" : "ns1.guardian.co.uk",
"type" : "a",
"value" : "10.0.0.2"
}
},
{
"_index" : "hosts",
"_type" : "_doc",
"_id" : "Ka0PcnMBqk4TBzxZPeGU",
"_score" : 1.0,
"_source" : {
"name" : "test.example.com",
"type" : "a",
"value" : "10.0.0.4"
}
}
]
}
}
Bulk Upload Command
curl -XPUT -u username:password https://xxxxx.us-east-1.es.amazonaws.com/_bulk --data-binary @bulk.json -H 'Content-Type: application/json'
bulk.json
{ "index" : { "_index": "hosts" } }
{"name":"ns1.guardian.co.uk","type":"a","value":"10.0.0.2"}
{ "index" : { "_index": "hosts" } }
{"name":"mail.stackoverflow.com","type":"a","value":"10.0.0.3"}
{ "index" : { "_index": "hosts" } }
{"name":"test.example.com","type":"a","value":"10.0.0.4"}
You can use the Path hierarchy tokenizer that takes a hierarchical value like a filesystem path, splits on the path separator, and emits a term for each component in the tree.
Index Mapping:
{
"settings": {
"analysis": {
"analyzer": {
"path-analyzer": {
"type": "custom",
"tokenizer": "path-tokenizer"
}
},
"tokenizer": {
"path-tokenizer": {
"type": "path_hierarchy",
"delimiter": ".",
"reverse": "true"
}
}
}
},
"mappings": {
"properties": {
"name": {
"type": "text",
"analyzer": "path-analyzer",
"search_analyzer": "keyword"
}
}
}
}
Analyze API
In the index mapping above, reverse is set to true, which will emit the tokens in reverse order (reverse is set to false by default).
POST /hosts/_analyze
{
"analyzer": "path-analyzer",
"text": "test.example.com"
}
This will produce three tokens:
{
"tokens": [
{
"token": "test.example.com",
"start_offset": 0,
"end_offset": 16,
"type": "word",
"position": 0
},
{
"token": "example.com",
"start_offset": 5,
"end_offset": 16,
"type": "word",
"position": 0
},
{
"token": "com",
"start_offset": 13,
"end_offset": 16,
"type": "word",
"position": 0
}
]
}
Search Query:
{
"query": {
"term": {
"name": "example.com"
}
}
}
Search Result:
"hits": [
{
"_index": "hosts",
"_type": "_doc",
"_id": "d67gdHMBcF4W0YVjq8ed",
"_score": 1.3744103,
"_source": {
"name": "test.example.com",
"type": "a",
"value": "10.0.0.4"
}
}
]

Elasticsearch: How to optimize the source parameter in a script function?

I have the following data in an Elasticsearch index called products
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "products",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"prod_id" : 1,
"currency" : "USD",
"price" : 1
}
},
{
"_index" : "products",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"prod_id" : 2,
"currency" : "INR",
"price" : 60
}
},
{
"_index" : "products",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"prod_id" : 3,
"currency" : "EUR",
"price" : 2
}
},
{
"_index" : "products",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"prod_id" : 5,
"currency" : "MYR",
"price" : 1
}
}
]
}
}
I am sorting the data based on the price field.
I have the following script to do so:
GET products/_search
{
"query": {
"function_score": {
"query": {
"match_all": {}
},
"functions": [{
"script_score": {
"script": {
"params": {
"USD": 1,
"SGD": 0.72,
"MYR": 0.24,
"INR": 0.014,
"EUR": 1.12
},
"source": "doc['price'].value * (doc.currency.value == 'eur'? params.EUR : doc.currency.value == 'myr' ? params.MYR : doc.currency.value == 'inr' ? params.INR : 1)"
}
}
}]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}
Because the currency field in the products index is of type text,
it is indexed with the standard analyzer, which converts it to lower case.
I wish to optimise this part of the script, as I may end up with 20-30 currencies:
"source": "doc['price'].value * (doc.currency.value == 'eur'? params.EUR : doc.currency.value == 'myr' ? params.MYR : doc.currency.value == 'inr' ? params.INR : 1)"
I was able to optimize the source script with the following working solution -
GET products/_search
{
"query": {
"function_score": {
"query": {
"match_all": {}
},
"functions": [{
"script_score": {
"script": {
"params": {
"USD": 1,
"SGD": 0.72,
"MYR": 0.24,
"INR": 0.014,
"EUR": 1.12
},
"source": "doc['price'].value * params[doc['currency.keyword'].value]"
}
}
}]
}
},
"sort": [
{
"_score": {
"order": "desc"
}
}
]
}

Resources