Elastic Search: Aggregation sum on a particular field - elasticsearch

I am new to elastic search and requesting some help.
Basically I have some 2 million documents in my elastic search and the documents look like below:
{
"_index": "flipkart",
"_type": "PSAD_ThirdParty",
"_id": "430001_MAM_2016-02-04",
"_version": 1,
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 206,
"bookedNumbers": 15000,
"ut": 1454567400000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
I want to write an aggregation query which satisfies below conditions:
1) First query based on "_index", "_type" and "pliId".
2) Do aggregation sum on metrics.value based on metrics.id = "Metric1".
Basically I need to query records based on some fields and aggregate sum on a particular metrics value based on metrics id.
Please can you help me in getting my query right.

Your metrics field needs to be of type nested:
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
}
}
}
If you want Metric1 to match, meaning upper-case letter, then as you see above the id needs to be not_analyzed.
Then, if you only want metrics.id = "Metric1" aggregations, you need something like this:
{
"query": {
"filtered": {
"filter": {
"bool": {
"must": [
{
"term": {
"pliId": 206
}
}
]
}
}
}
},
"aggs": {
"by_metrics": {
"nested": {
"path": "metrics"
},
"aggs": {
"metric1_only": {
"filter": {
"bool": {
"must": [
{
"term": {
"metrics.id": {
"value": "Metric1"
}
}
}
]
}
},
"aggs": {
"by_metric_id": {
"terms": {
"field": "metrics.id"
},
"aggs": {
"total_delivery": {
"sum": {
"field": "metrics.value"
}
}
}
}
}
}
}
}
}
}

Created new index:
Method : PUT ,
URL : http://localhost:9200/google/
Body:
{
"mappings": {
"PSAD_Primary": {
"properties": {
"metrics": {
"type": "nested",
"properties": {
"id": {
"type": "string",
"index": "not_analyzed"
},
"value": {
"type": "integer",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then I inserted some 200 thousand documents and than ran the query and it worked.
Response:
{
"took": 34,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "google",
"_type": "PSAD_Primary",
"_id": "383701291_MAM_2016-01-06",
"_score": 1,
"_source": {
"metrics": [
{
"id": "Metric1",
"value": 70
},
{
"id": "Metric2",
"value": 90
},
{
"id": "Metric3",
"value": 120
}
],
"primary": true,
"ticketId": 1,
"pliId": 221244,
"bookedNumbers": 15000,
"ut": 1452061800000,
"startDate": 1451629800000,
"endDate": 1464589800000,
"tz": "EST"
}
}
]
},
"aggregations": {
"by_metrics": {
"doc_count": 3,
"metric1_only": {
"doc_count": 1,
"by_metric_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Metric1",
"doc_count": 1,
"total_delivery": {
"value": 70
}
}
]
}
}
}
}
}

Related

Elasticsearch - Nested field sorting

I have an index defined by the following :
{
"mappings": {
"properties": {
"firstName": {
"type": "keyword"
},
"lastName": {
"type": "keyword"
},
"affiliations": {
"type": "nested",
"properties": {
"organisation": {
"type": "keyword"
},
"team": {
"type": "keyword"
},
"dateBeginning": {
"type": "date",
"format": "yyyy-MM-dd"
},
"dateEnding": {
"type": "date",
"format": "yyyy-MM-dd"
},
"country": {
"type": "keyword"
}
}
}
}
}
}
Basically, for each researcher (researchers is how I named my index) I want to sort the the affiliations by dateBeginning, in descending order. I've read about inner hits in the EL official doc, and not being exactly sure how it works I've tried this for researcher with _id : 3 :
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
},
"inner_hits": {
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations",
"filter": {
"term": { "_id": 3 }
}
}
}
}
]
}
}
}
}
And it doesn't really work.
Having two affiliation for researchers with _id : 3, with one dateBeginning set on 2015-06-30, and the other on 2017-06-30. So I've tried this also :
{
"sort" : [
{
"affiliations.dateBeginning" : {
"order" : "desc",
"nested": {
"path": "affiliations"
}
}
}
],
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": { "_id": 3 }
}
}
}
}
And it doesn't sort the affiliations by dateBeginning.
I've also tried to do it with the SQL API (since I'm more familiar with SQL language), and still, I can't get the data I want.
So I'm quite new to ElasticSearch, I'm using version 7.10, and I don't know what else to do.
Any suggestions about what I'm doing wrong here ?
EDIT
here's an example of a document from that index:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [{
"_index": "researchers",
"_type": "_doc",
"_id": "3",
"_score": 1.0,
"_source": {
"firstName": "Kimmich",
"lastName": "Yoshua",
"affiliations": [{
"organisation": "University of Ottawa",
"team": "Neural Network Elite Team",
"dateBeginning": "2015-06-30",
"datEnding": "2017-01-31",
"country": "Canada"
},
{
"organisation": "University of Montréal",
"team": "Picture processing team",
"dateBeginning": "2017-06-30",
"dateEnding": null,
"country": "Canada"
}
]
}
}]
}
}
Once you're inside the nested query, the inner hits don't need the extra nested query. Remove it and the sort will work properly:
{
"query": {
"nested": {
"path": "affiliations",
"query": {
"match": {
"_id": 3
}
},
"inner_hits": {
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc"
}
}
]
}
}
}
}
Note that this wouldn't sort the top-level hits -- only the inner hits.
But you can sort on the top level by the values of affiliations.dateBeginning like so:
POST researchers/_search
{
"sort": [
{
"affiliations.dateBeginning": {
"order": "desc",
"nested_path": "affiliations"
}
}
]
}
but note that the syntax is now slightly different: instead of path we're saying nested_path.

Query only those documents where image field is not empty

I have the following mapping **(dynamic strict on the type)**
"created": {
"type": "date"
},
"images": {
"properties": {
"checksum": {
"type": "text",
"index": false
},
"path": {
"type": "text",
"index": false
},
"url": {
"type": "text",
"index": false
}
}
},
I want to query documents where there is a image present
I tried couple of combinations but no luck so far.
This is the last i tried
POST catalog/_search
{
"query": {
"script": {
"script": "doc['images'].values.length > 0"
}
}
}
POST catalog/_search
{
"query": {
"script": {
"script": "doc['images.url'].values.length > 0"
}
}
}
But here it says that field data is not true for text fields. Is there anyway I can do this without changing my mapping.
Ideally this should give me all the records where there is no images. But this is returning all records
POST catalog/_search
{
"query": {
"bool": {
"must_not": [
{
"exists": {
"field": "images"
}
}
]
}
}
}
Here is the example document in which there is a image.
{
"_index": "catalog-2018-03-03",
"_type": "product",
"_id": "151755703145e27e4983a0bd1b70be44",
"_score": 1,
"_source": {
"merchant": {
"link": "http://shophive.com/",
"name": "shophive"
},
"images": [],
"updated": "2018-03-18T13:06:33.583480",
"name": "Plantronics Savi Talk",
"created": "2018-03-18T13:06:33.583459",
"url": "http://www.shophive.com/plantronics-savi-talk",
"price": {
"new": 24999,
"old": 24999,
"discount_percent": 0
},
"category": {
"level_1": {
"url": "computers/tablets/networking",
"name": "Computers/Tablets & Networking "
},
"level_2": {
"url": "tablets/ebook-readers",
"name": "Tablets & eBook Readers"
}
}
}
}
Updated
With the below query I am expecting that elasticsearch would return the documents in which image is missing
POST catalog/product/_search
{
"query": {
"bool": {
"must_not": [
{
"exists": {
"field": "images"
}
}
]
}
}
}
But the result i receive is all the documents in my index and apparently every document has one image. Here is the example document i get with above query
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 20967,
"max_score": 1,
"hits": [
{
"_index": "catalog-2018-03-03",
"_type": "product",
"_id": "151755703145e27e4983a0bd1b70be44",
"_score": 1,
"_source": {
"merchant": {
"link": "http://shophive.com/",
"name": "shophive"
},
"images": [
{
"url": "http://www.shophive.com/media/catalog/product/cache/1/small_image/165x/9df78eab33525d08d6e5fb8d27136e95/p/l/plantronics_savi_talk.jpg",
"path": "full/8e3587bd2b6107f0beafa9b1ba05f476539be0a8.jpg",
"checksum": "fa74ade23c8e80e9590d48d4e59b6b64"
}
],
"updated": "2018-03-18T13:06:33.583480",
"name": "Plantronics Savi Talk",
"created": "2018-03-18T13:06:33.583459",
"url": "http://www.shophive.com/plantronics-savi-talk",
"price": {
"new": 24999,
"old": 24999,
"discount_percent": 0
},
"category": {
"level_1": {
"url": "computers/tablets/networking",
"name": "Computers/Tablets & Networking "
},
"level_2": {
"url": "tablets/ebook-readers",
"name": "Tablets & eBook Readers"
}
}
}
}
}
}
You should leave out the the square brackets in the query as you only have one clause
POST /catalog/_search
{
"query": {
"bool": {
"must_not": {
"exists": {
"field": "images"
}
}
}
}
}
This returns the docs with out images for me and if you need only those that have images
POST /catalog/_search
{
"query": {
"exists": {
"field": "images"
}
}
}

How to get the parent document in a nested top_hits aggregation?

This is my document/mapping with a nested prices array:
{
"name": "Foobar",
"type": 1,
"prices": [
{
"date": "2016-03-22",
"price": 100.41
},
{
"date": "2016-03-23",
"price": 200.41
}
]
}
Mapping:
{
"properties": {
"name": {
"index": "not_analyzed",
"type": "string"
},
"type": {
"type": "byte"
},
"prices": {
"type": "nested",
"properties": {
"date": {
"format": "dateOptionalTime",
"type": "date"
},
"price": {
"type": "double"
}
}
}
}
}
I use a top_hits aggregation to get the min price of the nested price array. I also have to filter the prices by date. Here is the query and the response:
POST /index/type/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"prices": {
"nested": {
"path": "prices"
},
"aggs": {
"date_filter": {
"filter": {
"range": {
"prices.date": {
"gte": "2016-03-21"
}
}
},
"aggs": {
"min": {
"top_hits": {
"sort": {
"prices.price": {
"order": "asc"
}
},
"size": 1
}
}
}
}
}
}
}
}
Response:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": [
]
},
"aggregations": {
"prices": {
"doc_count": 4,
"date_filter": {
"doc_count": 4,
"min": {
"hits": {
"total": 4,
"max_score": null,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "4225796ALL2016061541031",
"_nested": {
"field": "prices",
"offset": 0
},
"_score": null,
"_source": {
"date": "2016-03-22",
"price": 100.41
},
"sort": [
100.41
]
}
]
}
}
}
}
}
}
Is there a way to get the parent source document (or some fields from it) with _id="4225796ALL2016061541031" in the response (e.g. name)? A second query is not an option.
Instead of applying aggregations use query and inner_hits like :
{
"query": {
"nested": {
"path": "prices",
"query": {
"range": {
"prices.date": {
"gte": "2016-03-21"
}
}
},
"inner_hits": {
"sort": {
"prices.price": {
"order": "asc"
}
},
"size": 1
}
}
}
}
Fetch data of parent_documentdata from _source and actual data from inner_hits.
Hope it helps

Configuring elasticsearch to search and filter with has many / belongs to relationship

I have a Product model where each product has many skus.
I need to be able to search and filter via elasticsearch across both models, but not quite sure how to go about it. I'm currently uploading to elasticsearch in this format:
[{
id: 1
title: 'Product 1'
image: 'image1.jpg'
skus: [{
id: 1
material: 'cotton'
quantity: 4
},{
id: 2
material: 'polyester'
quantity: 22
}]
},{
...
}]
I can search the title just fine, but I am unsure as to how I could do something like
Search for title 'foobar' and filter by material 'cotton' and quantity > 5
Is this possible with elasticsearch?
Edit
I am open to uploading in a different format or using multiple indices.
I think the parent/child relationship is what you're looking for.
As a quick example, I can set up an index with a parent type and child type like this:
PUT /test_index
{
"mappings": {
"product": {
"properties": {
"id": {
"type": "long"
},
"image": {
"type": "string"
},
"title": {
"type": "string"
}
}
},
"sku": {
"_parent": {
"type": "product"
},
"properties": {
"id": {
"type": "long"
},
"material": {
"type": "string"
},
"quantity": {
"type": "long"
}
}
}
}
}
Then add a parent document and two child documents:
POST /test_index/_bulk
{"index":{"_type":"product","_id":1}}
{"id": 1,"title": "Product1","image": "image1.jpg"}
{"index":{"_type":"sku", "_id":1,"_parent":1}}
{"id": 1,"material": "cotton","quantity": 4}
{"index":{"_type":"sku","_id":2,"_parent":1}}
{"id": 2,"material": "polyester","quantity": 22}
Now if I search for a "product" with "title": "Product1" that has a child "sku" with "material": "cotton" and "quantity" greater than 5, I won't find one:
POST /test_index/product/_search
{
"query": {
"filtered": {
"query": {
"match": {
"title": "Product1"
}
},
"filter": {
"has_child": {
"type": "sku",
"filter": {
"bool": {
"must": [
{
"term": {
"material": "cotton"
}
},
{
"range": {
"quantity": {
"gt": 5
}
}
}
]
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
But if I search for a "product" with "title": "Product1" that has a child "sku" with "material": "polyester" and "quantity" greater than 5, I will find one:
POST /test_index/product/_search
{
"query": {
"filtered": {
"query": {
"match": {
"title": "Product1"
}
},
"filter": {
"has_child": {
"type": "sku",
"filter": {
"bool": {
"must": [
{
"term": {
"material": "polyester"
}
},
{
"range": {
"quantity": {
"gt": 5
}
}
}
]
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.4054651,
"hits": [
{
"_index": "test_index",
"_type": "product",
"_id": "1",
"_score": 1.4054651,
"_source": {
"id": 1,
"title": "Product1",
"image": "image1.jpg"
}
}
]
}
}
Here is some code I used for testing:
http://sense.qbox.io/gist/d1989a28372ac9daae335d585601c11818b2fa11

Elasticsearch: generating terms from array using script

Would love an explanation of why this happens and how to correct it.
Here's a snippet of the source document:
{
"created_time":1412988495000,
"tags":{
"items":[
{
"tag_type":"Placement",
"tag_id":"id1"
},
{
"tag_type":"Product",
"tag_id":"id2"
}
]
}
}
The following terms aggregation:
"aggs":{
"tags":{
"terms":{
"script":"doc['tags'].value != null ? doc['tags.items.tag_type'].value + ':' + doc['tags.items.tag_id'].value : ''",
"size":2000,
"exclude":{
"pattern":"null:null"
}
}
}
}
returns:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Placement:id2",
"doc_count":1
}
]
...when you would expect:
"buckets":[
{
"key":"Placement:id1",
"doc_count":1
},
{
"key":"Product:id2",
"doc_count":1
}
]
I would probably go with a nested type. I don't know all the details of your setup, but here is a proof of concept, at least. I took out the "items" property because I didn't need that many layers, and just used "tags" as the nested type. It could be added back in if needed, I think.
So I set up an index with a "nested" property:
DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"created_time": {
"type": "date"
},
"tags": {
"type": "nested",
"properties": {
"tag_type": {
"type": "string",
"index": "not_analyzed"
},
"tag_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
}
Then added a couple of docs (notice that the structure differs slightly from yours):
PUT /test_index/doc/1
{
"created_time": 1412988495000,
"tags": [
{
"tag_type": "Placement",
"tag_id": "id1"
},
{
"tag_type": "Product",
"tag_id": "id2"
}
]
}
PUT /test_index/doc/2
{
"created_time": 1412988475000,
"tags": [
{
"tag_type": "Type3",
"tag_id": "id3"
},
{
"tag_type": "Type4",
"tag_id": "id3"
}
]
}
Now a scripted terms aggregation inside a nested aggregation seems to do the trick:
POST /test_index/_search?search_type=count
{
"query": {
"match_all": {}
},
"aggs": {
"tags": {
"nested": { "path": "tags" },
"aggs":{
"tag_vals": {
"terms": {
"script": "doc['tag_type'].value+':'+doc['tag_id'].value"
}
}
}
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"tags": {
"doc_count": 4,
"tag_vals": {
"buckets": [
{
"key": "Placement:id1",
"doc_count": 1
},
{
"key": "Product:id2",
"doc_count": 1
},
{
"key": "Type3:id3",
"doc_count": 1
},
{
"key": "Type4:id3",
"doc_count": 1
}
]
}
}
}
}
Here is the code I used:
http://sense.qbox.io/gist/4ceaf8693f85ff257c2fd0639ba62295f2e5e8c5

Resources