Elasticsearch: generating terms from array using script - elasticsearch

Would love an explanation of why this happens and how to correct it.
Here's a snippet of the source document:
{
"created_time":1412988495000,
"tags":{
"items":[
{
"tag_type":"Placement",
"tag_id":"id1"
},
{
"tag_type":"Product",
"tag_id":"id2"
}
]
}
}
The following terms aggregation:
"aggs":{
"tags":{
"terms":{
"script":"doc['tags'].value != null ? doc['tags.items.tag_type'].value + ':' + doc['tags.items.tag_id'].value : ''",
"size":2000,
"exclude":{
"pattern":"null:null"
}
}
}
}
returns:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Placement:id2",
"doc_count":1
}
]
...when you would expect:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Product:id2",
"doc_count":1
}
]

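The reason is that an array of inner objects is flattened into independent multi-valued fields (tags.items.tag_type and tags.items.tag_id), so the pairing between each tag_type and its tag_id is lost by the time the script runs. I would probably go with a nested type. I don't know all the details of your setup, but here is a proof of concept, at least. I took out the "items" property because I didn't need that many layers, and just used "tags" as the nested type. It could be added back in if needed, I think.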
So I set up an index with a "nested" property:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"created_time": {
"type": "date"
},
"tags": {
"type": "nested",
"properties": {
"tag_type": {
"type": "string",
"index": "not_analyzed"
},
"tag_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then added a couple of docs (notice that the structure differs slightly from yours):
PUT /test_index/doc/1
{
"created_time": 1412988495000,
"tags": [
{
"tag_type": "Placement",
"tag_id": "id1"
},
{
"tag_type": "Product",
"tag_id": "id2"
}
]
}
PUT /test_index/doc/2
{
"created_time": 1412988475000,
"tags": [
{
"tag_type": "Type3",
"tag_id": "id3"
},
{
"tag_type": "Type4",
"tag_id": "id3"
}
]
}
Now a scripted terms aggregation inside a nested aggregation seems to do the trick:
POST /test_index/_search?search_type=count
{
"query": {
"match_all": {}
},
"aggs": {
"tags": {
"nested": { "path": "tags" },
"aggs":{
"tag_vals": {
"terms": {
"script": "doc['tag_type'].value+':'+doc['tag_id'].value"
}
}
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"tags": {
"doc_count": 4,
"tag_vals": {
"buckets": [
{
"key": "Placement:id1",
"doc_count": 1
},
{
"key": "Product:id2",
"doc_count": 1
},
{
"key": "Type3:id3",
"doc_count": 1
},
{
"key": "Type4:id3",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/4ceaf8693f85ff257c2fd0639ba62295f2e5e8c5

Related

Elasticsearch null_pointer_exception with top_hits aggregation

When having a nested top_hits aggregation inside a nested terms aggregation inside a children aggregation, I'm getting a null_pointer_exception. I expect to get a valid response.
Steps to reproduce:
create mapping
PUT http://localhost:9200/test
{
"mappings": {
"doc": {
"properties": {
"docType": {
"type": "text"
},
"userId": {
"type": "long"
},
"userName": {
"type": "text"
},
"title": {
"type": "text"
},
"joinField": {
"type": "join",
"relations": {
"post": "comment"
}
}
}
}
}
}
insert example post
PUT http://localhost:9200/test/doc/1
{
"joinField": {
"name": "post"
},
"docType": "post",
"title": "Example Post"
}
insert comment
PUT http://localhost:9200/test/doc/2?routing=1
{
"joinField": {
"name": "comment",
"parent": "1"
},
"userId": 22,
"userName": "John Doe",
"title": "Random comment",
"docType": "comment"
}
Perform search
POST http://localhost:9200/test/doc/_search
{
"aggs": {
"to-comment": {
"children": {
"type": "comment"
},
"aggs": {
"by-user": {
"terms": {
"field": "userId"
},
"aggs": {
"data": {
"top_hits": {
"size": 1
}
}
}
}
}
}
},
"query": {
"bool": {
"filter": [
{
"term": {
"docType": "post"
}
}
]
}
}
}
Response:
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 4,
"skipped": 0,
"failed": 1,
"failures": [
{
"shard": 3,
"index": "test",
"node": "0RbF1bIbRO-yN5C1m-HXPA",
"reason": {
"type": "null_pointer_exception",
"reason": null
}
}
]
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
},
"aggregations": {
"to-comment": {
"doc_count": 0,
"by-user": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
}
It works if I remove the query, but in the actual hits I want to get all the posts only. It also works if I remove the terms aggregations, but I want to filter the posts by other queries (e.g. match on title).
It seems that this is a bug in Elasticsearch. The bug has been reported and will hopefully be fixed soon (https://github.com/elastic/elasticsearch/issues/37650).
If you have any alternative solutions on how to build a similar aggregation, please let me know.
Edit: You can use the painless scripting language for a work-around:
"script": {
"lang": "painless",
"source": "params._source.userName"
}

Elastic Search: Aggregation sum on a particular field

I am new to Elasticsearch and would like some help.
Basically, I have about 2 million documents in my Elasticsearch index, and the documents look like this:
{
"_index": "flipkart",
"_type": "PSAD_ThirdParty",
"_id": "430001_MAM_2016-02-04",
"_version": 1,
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 206,
"bookedNumbers": 15000,
"ut": 1454567400000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
I want to write an aggregation query which satisfies below conditions:
1) First query based on "_index", "_type" and "pliId".
2) Do aggregation sum on metrics.value based on metrics.id = "Metric1".
Basically I need to query records based on some fields and aggregate sum on a particular metrics value based on metrics id.
Could you please help me get my query right?
Your metrics field needs to be of type nested:
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
}
}
}
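If you want the term filter to match "Metric1" exactly, including the upper-case letter, then the id field needs to be not_analyzed, as in the mapping above; an analyzed string field would be indexed in lower case and the term filter would not match.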
Then, if you only want metrics.id = "Metric1" aggregations, you need something like this:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"pliId": 206
}
}
]
}
}
}
},
"aggs": {
"by_metrics": {
"nested": {
"path": "metrics"
},
"aggs": {
"metric1_only": {
"filter": {
"bool": {
"must": [
{
"term": {
"metrics.id": {
"value": "Metric1"
}
}
}
]
}
},
"aggs": {
"by_metric_id": {
"terms": {
"field": "metrics.id"
},
"aggs": {
"total_delivery": {
"sum": {
"field": "metrics.value"
}
}
}
}
}
}
}
}
}
}
Created new index:
Method : PUT ,
URL : http://localhost:9200/google/
Body:
{
"mappings": {
"PSAD_Primary": {
"properties": {
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
},
"value": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then I inserted some 200 thousand documents and then ran the query, and it worked.
Response:
{
"took": 34,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "google",
"_type": "PSAD_Primary",
"_id": "383701291_MAM_2016-01-06",
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 221244,
"bookedNumbers": 15000,
"ut": 1452061800000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
]
},
"aggregations": {
"by_metrics": {
"doc_count": 3,
"metric1_only": {
"doc_count": 1,
"by_metric_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Metric1",
"doc_count": 1,
"total_delivery": {
"value": 70
}
}
]
}
}
}
}
}

Not able to aggregate on nested fields in elasticsearch

I have set a field to nested and now I am not able to aggregate on it.
Sample document -
{
"attributes" : [
{ "name" : "snake" , "type" : "reptile" },
{ "name" : "cow" , "type" : "mamal" }
]
}
The attributes field is nested.
The following terms aggregation does not work on it:
{
"aggs" : {
"terms" : { "field" : "attributes.name" }
}
}
How can I do the aggregation in elasticsearch?
Use a nested aggregation.
As a simple example, I created an index with a nested property matching what you posted:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"attributes": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
Then added your document:
PUT /test_index/doc/1
{
"attributes": [
{ "name": "snake", "type": "reptile" },
{ "name": "cow", "type": "mammal" }
]
}
Now I can get "attributes.name" terms as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_attributes": {
"nested": {
"path": "attributes"
},
"aggs": {
"name_terms": {
"terms": {
"field": "attributes.name"
}
}
}
}
}
}
...
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_attributes": {
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cow",
"doc_count": 1
},
{
"key": "snake",
"doc_count": 1
}
]
}
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/0e3ed9c700f240e523be08a27551707d4448a9df

Elasticsearch - How to get min/max/avg of set of nested documents

Given the following mapping and documents in Elasticsearch, how would I get the min/max/avg of a set of nested documents that match a certain condition? For instance, how would I get the min age of pets that are dogs? My filter gets the correct people that have dogs, but how do I make the min then calculate against the correct nested documents?
(1) Mapping
{
"myIndex": {
"mappings": {
"person": {
"properties": {
"name": {
"type": "string"
},
"pets": {
"type": "nested",
"properties": {
"age": {
"type": "long"
},
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
}
(2) Data
{
"name": "bob",
"pets": [
{
"type": "dog",
"name": "wolfie",
"age": 20
},
{
"type": "cat",
"name": "kitty",
"age": 6
}
]
}
{
"name": "bill",
"pets": [
{
"type": "fish",
"name": "goldie",
"age": 2
},
{
"type": "cat",
"name": "meowie",
"age": 18
}
]
}
(3) Query and aggregation
{
"query": {
"filtered": {
"filter": {
"nested": {
"path": "pets",
"filter" : {
"terms": {
"pets.type": ["dog"]
}
}
}
}
}
},
"aggs": {
"minage": {
"nested": {
"path": "pets"
},
"aggs": {
"minage": {
"min": {
"field": "age"
}
}
}
}
}
}
I think you can get what you want with a combination of filter aggregation and the nested filter's join option.
This code worked for me:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"person": {
"properties": {
"name": {
"type": "string"
},
"pets": {
"type": "nested",
"properties": {
"age": {
"type": "long"
},
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
PUT /test_index/person/1
{
"name": "bob",
"pets": [
{
"type": "dog",
"name": "wolfie",
"age": 20
},
{
"type": "cat",
"name": "kitty",
"age": 6
}
]
}
PUT /test_index/person/2
{
"name": "bill",
"pets": [
{
"type": "fish",
"name": "goldie",
"age": 2
},
{
"type": "cat",
"name": "meowie",
"age": 18
}
]
}
PUT /test_index/person/3
{
"name": "john",
"pets": [
{
"type": "dog",
"name": "oldie",
"age": 25
}
]
}
POST /test_index/_search?search_type=count
{
"aggs": {
"minage_1": {
"nested": {
"path": "pets"
},
"aggs": {
"minage_2": {
"filter": {
"nested": {
"path": "pets",
"filter": {
"terms": {
"pets.type": [
"dog"
]
}
},
"join": false
}
},
"aggs": {
"min_age_3": {
"min": {
"field": "age"
}
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0,
"hits": []
},
"aggregations": {
"minage_1": {
"doc_count": 5,
"minage_2": {
"doc_count": 2,
"min_age_3": {
"value": 20
}
}
}
}
}

ElasticSearch - Dot in field name of nested object

I have data of this form:
{
"workers": {
"worker.1": {
"jobs": 1234
},
},
"total_jobs": 1234
}
and I'm trying to deal with having the "dot" in the field-name. I tried this mapping:
{
"worker_stats": {
"properties": {
"workers": {
"type": "object",
"properties": {
"worker.1": {
"type": "nested",
"index_name": "worker_1",
"properties": {
"jobs": {
"type": "integer"
}
}
}
}
},
"total_jobs": {
"type": "integer"
}
}
}
}
but when I fetch my mapping, the index_name is nowhere to be seen, and when I add a document, the field name still has the dot.
Ultimately, I'm just trying to do some aggregations:
{
"query": {
"filtered": {
"query": {
"match_all": {}
}
}
},
"aggs": {
"worker1_stats": {
"aggs": {
"stats": {
"stats": {
"field": "workers.worker.1.jobs"
}
}
},
"nested": {
"path": "workers.worker.1"
}
}
}
}
but the dot interferes.
What can I do to deal with this dot? Is there a way to use a script instead of a field? (Is my use of nested even correct?)
I think you can use an index_name, path, and type : object in your mapping to change the name of that field during indexing.
Here is my example:
PUT /twitter/
{
"settings" : {
"number_of_shards" : 5,
"number_of_replicas" : 0
},
"mappings": {
"tweet":{
"properties": {
"desc.youbet":{"type":"object","path":"just_name",
"properties": {
"one": {
"type": "integer", "index_name":"one"
}
}
}
}
}
}
}
PUT /twitter/tweet/1
{
"name":"chicken",
"desc.youbet":{
"one":1,
}
}
PUT /twitter/tweet/2
{
"name":"chicken",
"desc.youbet":{
"one":1,
}
}
You can now use the short name one (instead of the dotted path under desc.youbet) to do operations on and search for that field in your document, so this:
POST /twitter/tweet/_search
{
"query": {"match_all": {}},
"aggs":{
"stats": {
"stats": {"field": "one"}
}
}, "size":0
}
Results in something like this:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"stats": {
"count": 2,
"min": 1,
"max": 1,
"avg": 1,
"sum": 2
}
}
}

Resources