ElasticSearch - Dot in field name of nested object - elasticsearch

I have data of this form:
{
"workers": {
"worker.1": {
"jobs": 1234
},
},
"total_jobs": 1234
}
and I'm trying to deal with having the "dot" in the field-name. I tried this mapping:
{
"worker_stats": {
"properties": {
"workers": {
"type": "object",
"properties": {
"worker.1": {
"type": "nested",
"index_name": "worker_1",
"properties": {
"jobs": {
"type": "integer"
}
}
}
}
},
"total_jobs": {
"type": "integer"
}
}
}
}
but when I fetch my mapping, the index_name is nowhere to be seen, and when I add a document, it's still got the dot.
Ultimately, I'm just trying to do some aggregations:
{
"query": {
"filtered": {
"query": {
"match_all": {}
}
}
},
"aggs": {
"worker1_stats": {
"aggs": {
"stats": {
"stats": {
"field": "workers.worker.1.jobs"
}
}
},
"nested": {
"path": "workers.worker.1"
}
}
}
}
but the dot interferes.
What can I do to deal with this dot? Is there a way to use script instead of field? (Is my use of nested even correct?)

I think you can use index_name, path, and type: object in your mapping to change the name of that field during indexing.
Here is my example:
PUT /twitter/
{
"settings" : {
"number_of_shards" : 5,
"number_of_replicas" : 0
},
"mappings": {
"tweet":{
"properties": {
"desc.youbet":{"type":"object","path":"just_name",
"properties": {
"one": {
"type": "integer", "index_name":"one"
}
}
}
}
}
}
}
PUT /twitter/tweet/1
{
"name":"chicken",
"desc.youbet":{
"one":1,
}
}
PUT /twitter/tweet/2
{
"name":"chicken",
"desc.youbet":{
"one":1,
}
}
You can now use one to run operations on, and search for, what was desc.youbet.one in your document, so this:
POST /twitter/tweet/_search
{
"query": {"match_all": {}},
"aggs":{
"stats": {
"stats": {"field": "one"}
}
}, "size":0
}
Results in something like this:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"stats": {
"count": 2,
"min": 1,
"max": 1,
"avg": 1,
"sum": 2
}
}
}

Related

How to get the parent document in a nested top_hits aggregation?

This is my document/mapping with a nested prices array:
{
"name": "Foobar",
"type": 1,
"prices": [
{
"date": "2016-03-22",
"price": 100.41
},
{
"date": "2016-03-23",
"price": 200.41
}
]
}
Mapping:
{
"properties": {
"name": {
"index": "not_analyzed",
"type": "string"
},
"type": {
"type": "byte"
},
"prices": {
"type": "nested",
"properties": {
"date": {
"format": "dateOptionalTime",
"type": "date"
},
"price": {
"type": "double"
}
}
}
}
}
I use a top_hits aggregation to get the min price of the nested price array. I also have to filter the prices by date. Here is the query and the response:
POST /index/type/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"prices": {
"nested": {
"path": "prices"
},
"aggs": {
"date_filter": {
"filter": {
"range": {
"prices.date": {
"gte": "2016-03-21"
}
}
},
"aggs": {
"min": {
"top_hits": {
"sort": {
"prices.price": {
"order": "asc"
}
},
"size": 1
}
}
}
}
}
}
}
}
Response:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"prices": {
"doc_count": 4,
"date_filter": {
"doc_count": 4,
"min": {
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "4225796ALL2016061541031",
"_nested": {
"field": "prices",
"offset": 0
},
"_score": null,
"_source": {
"date": "2016-03-22",
"price": 100.41
},
"sort": [
100.41
]
}
]
}
}
}
}
}
}
Is there a way to get the parent source document (or some fields from it) with _id="4225796ALL2016061541031" in the response (e.g. name)? A second query is not an option.
Instead of using aggregations, use a nested query with inner_hits, like this:
{
"query": {
"nested": {
"path": "prices",
"query": {
"range": {
"prices.date": {
"gte": "2016-03-21"
}
}
},
"inner_hits": {
"sort": {
"prices.price": {
"order": "asc"
}
},
"size": 1
}
}
}
}
Fetch the parent document's data from _source and the matching nested data from inner_hits.
Hope it helps

How to get analyzed word count by Elasticsearch?

I would like to count each token analyzed.
First, I tried following codes:
mapping:
{
"docs": {
"mappings": {
"doc": {
"dynamic": "false",
"properties": {
"text": {
"type": "string",
"analyzer": "kuromoji"
}
}
}
}
}
}
query:
{
"query": {
"match_all": {}
},
"aggs": {
"word-count": {
"terms": {
"field": "text",
"size": "1000"
}
}
},
"size": 0
}
After inserting my data, I queried my index and got the following result:
{
"took": 41,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 10000,
"max_score": 0,
"hits": []
},
"aggregations": {
"word-count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 36634,
"buckets": [
{
"key": "はい",
"doc_count": 4734
},
{
"key": "いただく",
"doc_count": 2440
},
...
]
}
}
}
Unfortunately, the terms aggregation provides only a doc_count, which is not a word count. So I think the way to get an approximate word count is to use _index['text']['TERM'].df() and _index['text']['TERM'].ttf().
Maybe the approximate word count is the following equation:
WordCount = doc_count['TERM'] / _index['text']['TERM'].df() * _index['text']['TERM'].ttf()
'TERM' is the key in buckets. I tried to write a scripted metric aggregation, but I didn't know how to get the keys in buckets.
{
"query": {
"match_all": {}
},
"aggs": {
"doc-count": {
"terms": {
"field": "text",
"size": "1000"
}
},
"aggs": {
"word-count": {
"scripted_metric": {
// ???
}
}
}
},
"size": 0
}
How can I get keys in buckets?
If it is impossible, how can I get an analyzed word count?
You can try with the token count data type. Simply add a sub-field of that type to your text field:
{
"docs": {
"mappings": {
"doc": {
"dynamic": "false",
"properties": {
"text": {
"type": "string",
"analyzer": "kuromoji",
"fields": {
"nb_tokens": {
"type": "token_count",
"analyzer": "kuromoji"
}
}
}
}
}
}
}
}
Then you can use text.nb_tokens in your aggregation.
You can try dynamic scripting, though this will affect performance:
{
"query": {
"match_all": {}
},
"aggs": {
"word-count": {
"terms": {
"script": "_source.text",
"size": "1000"
}
}
},
"size": 0
}

ElasticSearch: Aggregations of URLs keeps splitting field

I'm trying to write an elasticsearch query that groups all blogs with the same blog domain (wordpress.com, blog.com, etc). This is what my query looks like:
{
"engagements": [
"blogs"
],
"query": {
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"range": {
"weight": {
"gte": 120,
"lte": 150
}
}
}
]
}
}
}
},
"facets": {
"my_facet": {
"terms": {
"field": "blog_domain" <-------------------------------------
}
}
}
},
"api": "_search"
}
However, it's returning this:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
...
]
},
"facets": {
"my_facet": {
"_type": "terms",
"missing": 0,
"total": 21,
"other": 3,
"terms": [
{
"term": "http",
"count": 3
},
{
"term": "noblepig.com",
"count": 2
},
{
"term": "hawaiian",
"count": 2
},
{
"term": "dream",
"count": 2
},
{
"term": "dessert",
"count": 2
},
{
"term": "2015",
"count": 2
},
{
"term": "05",
"count": 2
},
{
"term": "www.bt",
"count": 1
},
{
"term": "photos",
"count": 1
},
{
"term": "images.net",
"count": 1
}
]
}
}
}
This isn't what I want.
Right now my database has three records:
"http://www.bt-images.net/8-cute-photos-cats/",
"http://noblepig.com/2015/05/hawaiian-dream-dessert/",
"http://noblepig.com/2015/05/hawaiian-dream-dessert/"
I want it to return something like:
"facets": {
"my_facet": {
"_type": "terms",
"missing": 0,
"total": 21,
"other": 3,
"terms": [
{
"term": "http://noblepig.com/2015/05/hawaiian-dream-dessert/",
"count": 2
},
{
"term": "http://www.bt-images.net/8-cute-photos-cats/",
"count": 1
},
How would I do this? I looked it up and saw people recommending mappings but I don't know where to put that in this query and my table has 100 million records so it's too late to do that. If you have suggestions, could you please paste the whole query?
The same happens when I use aggs:
{
"engagements": [
"blogs"
],
"query": {
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"range": {
"weight": {
"gte": 13,
"lte": 75
}
}
}
]
}
}
}
},
"aggs": {
"blah": {
"terms": {
"field": "blog_domain"
}
}
}
},
"api": "_search"
}
The right way to do this is to have a different mapping for that field. You can change the mapping on the fly by adding a sub-field to blog_domain, but you cannot change the documents that were already indexed. The mapping change will take effect only for new documents.
Just for the sake of mentioning this, your blog_domain should look like this:
"blog_domain": {
"type": "string",
"fields": {
"notAnalyzed": {
"type": "string",
"index": "not_analyzed"
}
}
}
meaning it should have a sub-field (in my sample it is called notAnalyzed) and in your aggregation you should use blog_domain.notAnalyzed.
But, if you don't want to or can't make this change, there is a way but I believe it's slower: using scripted aggregation. Something like this:
{
"aggs": {
"blah": {
"terms": {
"script": "_source.blog_domain",
"size": 10
}
}
}
}
And you need to enable dynamic scripting, if you don't have it enabled.
If you use Elasticsearch 5.x, you could use the mapping below:
PUT your_index
{
"mappings": {
"your_type": {
"properties": {
"blog_domain": {
"type": "keyword",
"index": "not_analyzed"
}
}
}
}
}

Not able to aggregate on nested fields in elasticsearch

I have set a field to nested and now I am not able to aggregate on it.
Sample document -
{
"attributes" : [
{ "name" : "snake" , "type" : "reptile" },
{ "name" : "cow" , "type" : "mamal" }
]
}
attributes field is nested.
The following terms query is not working on this:
{
"aggs" : {
"terms" : { "field" : "attributes.name" }
}
}
How can I do the aggregation in elasticsearch?
Use a nested aggregation.
As a simple example, I created an index with a nested property matching what you posted:
PUT /test_index
{
"mappings": {
"doc": {
"properties": {
"attributes": {
"type": "nested",
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
}
}
}
}
}
}
}
Then added your document:
PUT /test_index/doc/1
{
"attributes": [
{ "name": "snake", "type": "reptile" },
{ "name": "cow", "type": "mammal" }
]
}
Now I can get "attributes.name" terms as follows:
POST /test_index/_search?search_type=count
{
"aggs": {
"nested_attributes": {
"nested": {
"path": "attributes"
},
"aggs": {
"name_terms": {
"terms": {
"field": "attributes.name"
}
}
}
}
}
}
...
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"nested_attributes": {
"doc_count": 2,
"name_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "cow",
"doc_count": 1
},
{
"key": "snake",
"doc_count": 1
}
]
}
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/0e3ed9c700f240e523be08a27551707d4448a9df

Elasticsearch: generating terms from array using script

Would love an explanation of why this happens and how to correct it.
Here's a snippet of the source document:
{
"created_time":1412988495000,
"tags":{
"items":[
{
"tag_type":"Placement",
"tag_id":"id1"
},
{
"tag_type":"Product",
"tag_id":"id2"
}
]
}
}
The following terms aggregation:
"aggs":{
"tags":{
"terms":{
"script":"doc['tags'].value != null ? doc['tags.items.tag_type'].value + ':' + doc['tags.items.tag_id'].value : ''",
"size":2000,
"exclude":{
"pattern":"null:null"
}
}
}
}
returns:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Placement:id2",
"doc_count":1
}
]
...when you would expect:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Product:id2",
"doc_count":1
}
]
I would probably go with a nested type. I don't know all the details of your setup, but here is a proof of concept, at least. I took out the "items" property because I didn't need that many layers, and just used "tags" as the nested type. It could be added back in if needed, I think.
So I set up an index with a "nested" property:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"created_time": {
"type": "date"
},
"tags": {
"type": "nested",
"properties": {
"tag_type": {
"type": "string",
"index": "not_analyzed"
},
"tag_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then added a couple of docs (notice that the structure differs slightly from yours):
PUT /test_index/doc/1
{
"created_time": 1412988495000,
"tags": [
{
"tag_type": "Placement",
"tag_id": "id1"
},
{
"tag_type": "Product",
"tag_id": "id2"
}
]
}
PUT /test_index/doc/2
{
"created_time": 1412988475000,
"tags": [
{
"tag_type": "Type3",
"tag_id": "id3"
},
{
"tag_type": "Type4",
"tag_id": "id3"
}
]
}
Now a scripted terms aggregation inside a nested aggregation seems to do the trick:
POST /test_index/_search?search_type=count
{
"query": {
"match_all": {}
},
"aggs": {
"tags": {
"nested": { "path": "tags" },
"aggs":{
"tag_vals": {
"terms": {
"script": "doc['tag_type'].value+':'+doc['tag_id'].value"
}
}
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"tags": {
"doc_count": 4,
"tag_vals": {
"buckets": [
{
"key": "Placement:id1",
"doc_count": 1
},
{
"key": "Product:id2",
"doc_count": 1
},
{
"key": "Type3:id3",
"doc_count": 1
},
{
"key": "Type4:id3",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/4ceaf8693f85ff257c2fd0639ba62295f2e5e8c5

Resources