Search by internal field in Elasticsearch - elasticsearch

Structure:
{
.................
"mp": "CAR",
"nPhoto": 1,
"items": [
{
"availableQuantity": 3,
},
{
"availableQuantity": 0,
},
{
"availableQuantity": 0,
}
],
............................
}
}
If I filter by mp field, I generate the following query:
GET catalog/_search
{
"from" : 0,
"size" : 0,
"aggregations" : {
"brand" : {
"filter" : {
"bool" : {
"must" : {
"term" : {
"mp" : "CAR"
}
}
}
},
"aggregations" : {
"photosQuantity" : { "sum" : { "field" : "nPhoto" } }
}
}
}
}
But how to generate query if you need to filter by field availableQuantity, where availableQuantity > 0 at least one of the items?

What you probably want is nested query in filter part.
something along line of this:
{
"from": 0,
"size": 0,
"aggregations": {
"brand": {
"filter": {
"nested": {
"path": "items",
"query": {
"range": {
"items.availableQuantity": {
"gte": 0
}
}
}
}
},
"aggregations": {
"photosQuantity": {
"sum": {
"field": "nPhoto"
}
}
}
}
}
}

Related

How to count number of fields inside nested field? - Elasticsearch

I did the following mapping. I would like to count the number of products in each nested field "products" (for each document separately). I would also like to do a histogram aggregation, so that I would know the number of specific bucket sizes.
PUT /receipts
{
"mappings": {
"properties": {
"id" : {
"type": "integer"
},
"user_id" : {
"type": "integer"
},
"date" : {
"type": "date"
},
"sum" : {
"type": "double"
},
"products" : {
"type": "nested",
"properties": {
"name" : {
"type" : "text"
},
"number" : {
"type" : "double"
},
"price_single" : {
"type" : "double"
},
"price_total" : {
"type" : "double"
}
}
}
}
}
}
I've tried this query, but I get the number of all the products instead of number of products for each document separately.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products"
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count" : 6552,
"bucket_size" : {
"value" : 0
}
}
}
UPDATE
Now I have this code where I make separate buckets for each id and count the number of products inside them.
GET /receipts/_search
{
"query": {
"match_all": {}
},
"size" : 0,
"aggs": {
"terms":{
"terms":{
"field": "_id"
},
"aggs": {
"nested": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
}
}
Result of the query:
"aggregations" : {
"terms" : {
"doc_count_error_upper_bound" : 5,
"sum_other_doc_count" : 490,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"nested" : {
"doc_count" : 21,
"bucket_size" : {
"value" : 21
}
}
},
{
"key" : "10",
"doc_count" : 1,
"nested" : {
"doc_count" : 5,
"bucket_size" : {
"value" : 5
}
}
},
{
"key" : "100",
"doc_count" : 1,
"nested" : {
"doc_count" : 12,
"bucket_size" : {
"value" : 12
}
}
},
...
Is is possible to group these values (21, 5, 12, ...) into buckets to make a histogram of them?
products is only the path to the array of individual products, not an aggregatable field. So you'll need to use it on one of your product's field -- such as the number:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"field": "products.number"
}
}
}
}
}
}
Note that is a product has no number, it'll not contribute to the total count. It's therefore best practice to always include an ID in each of them and then aggregate on that field.
Alternatively you could use a script to account for missing values. Luckily value_count does not deduplicate -- meaning if two products are alike and/or have empty values, they'll still be counted as two:
GET receipts/_search
{
"size": 0,
"aggs": {
"terms": {
"nested": {
"path": "products"
},
"aggs": {
"bucket_size": {
"value_count": {
"script": {
"source": "doc['products.number'].toString()"
}
}
}
}
}
}
}
UPDATE
You could also use a nested composite aggregation which'll give you the histogrammed product count w/ the corresponding receipt id:
GET /receipts/_search
{
"size": 0,
"aggs": {
"my_aggs": {
"nested": {
"path": "products"
},
"aggs": {
"composite_parent": {
"composite": {
"sources": [
{
"receipt_id": {
"terms": {
"field": "_id"
}
}
},
{
"product_number": {
"histogram": {
"field": "products.number",
"interval": 1
}
}
}
]
}
}
}
}
}
}
The interval is modifiable.

Elasticsearch Aggregation Query Builder

I am new to working with aggregations in Elasticsearch. I am on version 5.3 of Elasticsearch. I have a query to do a terms aggregation and also a filter, but when I try to use AggregationsBuilder to build the same query, I can't get it to look the same as the manual query. The manual query works and it is:
{
"size": 0,
"aggs": {
"data": {
"nested": {
"path": "data"
},
"aggs": {
"my_filters": {
"filter": {
"bool": {
"must": [
{
"term": {
"data.genre": "sci-fi"
}
}
]
}
},
"aggs": {
"my_agg_field": {
"terms": {
"field": "data.director.keyword"
}
}
}
}
}
}
}
}
But in my code, I need to use AggregationBuilder to create the query. So I use the following:
AggregationBuilder aggregation =
AggregationBuilders
.nested("data", "data")
.subAggregation(
AggregationBuilders.filter("filters", query)
.subAggregation(
AggregationBuilders
.terms("bucket_field").field("data.directors.keyword")
)
);
With the AggregationsBuilder, I get the following query:
{
"aggs": {
"filters" : {
"filter" : {
"nested" : {
"query" : {
"bool" : {
"must" : [
{
"match" : {
"data.genre" : {
"query" : "sci-fi",
"operator" : "OR"
}
}
}
]
}
},
"path" : "data"
}
},
"aggregations" : {
"bucket_field" : {
"terms" : {
"field" : "data.director.keyword"
}
}
}
}
}
}
This aggregation query returns results but doesn't aggregate the buckets like the manual query does and just returns:
"aggregations": {
"filters": {
"doc_count": 62,
"bucket_field": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
},
I need help converting my manual query into an AggregationsBuilder query. Thanks!

aggregating properties in elastic search

I have an indexed entry that has optional properties. So, for example, I have entries like this
{
"id":1
"field1":"XYZ"
},
{
"id":2
"field2":"XYZ"
},
{
"id":3
"field1":"XYZ"
}
I would like to make an aggregation that will tell me how many entries I have with field1 and field2 populated.
The expected result should be:
{
"field1":2
"field2":1
}
Is this even possible with elasticsaerch?
Yes, you can do it like this:
POST myindex/_search
{
"size": 0,
"aggs": {
"field_exists": {
"filters": {
"filters": {
"field1": {
"exists": {
"field": "field1"
}
},
"field2": {
"exists": {
"field": "field2"
}
}
}
}
}
}
}
You'll get an answer like this one:
"aggregations" : {
"field_exists" : {
"buckets" : {
"field1" : {
"doc_count" : 2
},
"field2" : {
"doc_count" : 1
}
}
}
}

Elastic Search: Selecting multiple vlaues in aggregates

In Elastic Search I have the following index with 'allocated_bytes', 'total_bytes' and other fields:
{
"_index" : "metrics-blockstore_capacity-2017_06",
"_type" : "datapoint",
"_id" : "AVzHwgsi9KuwEU6jCXy5",
"_score" : 1.0,
"_source" : {
"timestamp" : 1498000001000,
"resource_guid" : "2185d15c-5298-44ac-8646-37575490125d",
"allocated_bytes" : 1.159196672E9,
"resource_type" : "machine",
"total_bytes" : 1.460811776E11,
"machine" : "2185d15c-5298-44ac-8646-37575490125d"
}
I have the following query to
1)get a point for 30 minute interval using date-histogram
2)group by field on resource_guid.
3)max aggregate to find the max value.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValue": {
"max": {
"field": "allocated_bytes"
}
}
}
},
"sumUnique": {
"sum_bucket": {
"buckets_path": "groupByField>maxValue"
}
}
}
}
}
}
But with this query I am able to get only allocated_bytes, but I need to have both allocated_bytes and total_bytes at the result point.
Following is the result from the above query:
{
"key_as_string" : "2017-06-20T21:00:00.000Z",
"key" : 1497992400000,
"doc_count" : 9,
"groupByField" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "2185d15c-5298-44ac-8646-37575490125d",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
}, {
"key" : "c3513cdd-58bb-4f8e-9b4c-467230b4f6e2",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156165632E9
}
}, {
"key" : "eff13403-9737-4d08-9dca-fb6c12c3a6fa",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
} ]
},
"sumUnique" : {
"value" : 3.468529664E9
}
}
I do need both allocated_bytes and total_bytes. How do I get multiple fields( allocated_bytes, total_bytes) for each point?
For example:
"sumUnique" : {
"Allocatedvalue" : 3.468529664E9,
"TotalValue" : 9.468529664E9
}
or like this:
"allocatedBytessumUnique" : {
"value" : 3.468529664E9
}
"totalBytessumUnique" : {
"value" : 9.468529664E9
},
You can just add another aggregation:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValueAllocated": {
"max": {
"field": "allocated_bytes"
}
},
"maxValueTotal": {
"max": {
"field": "total_bytes"
}
}
}
},
"sumUniqueAllocatedBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueAllocated"
}
},
"sumUniqueTotalBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueTotal"
}
}
}
}
}
}
I hope you are aware that sum_bucket calculates sibling aggregations only, in this case gives sum of max values, not the sum of total_bytes. If you want to get sum of total_bytes you can use sum aggregation

Post filter on subaggregation in elasticsearch

I am trying to run a post filter on the aggregated data, but it is not working as i expected. Can someone review my query and suggest if i am doing anything wrong here.
"query" : {
"bool" : {
"must" : {
"range" : {
"versionDate" : {
"from" : null,
"to" : "2016-04-22T23:13:50.000Z",
"include_lower" : false,
"include_upper" : true
}
}
}
}
},
"aggregations" : {
"associations" : {
"terms" : {
"field" : "association.id",
"size" : 0,
"order" : {
"_term" : "asc"
}
},
"aggregations" : {
"top" : {
"top_hits" : {
"from" : 0,
"size" : 1,
"_source" : {
"includes" : [ ],
"excludes" : [ ]
},
"sort" : [ {
"versionDate" : {
"order" : "desc"
}
} ]
}
},
"disabledDate" : {
"filter" : {
"missing" : {
"field" : "disabledDate"
}
}
}
}
}
}
}
STEPS in the query:
Filter by indexDate less than or equal to a given date.
Aggregate based on formId. Forming buckets per formId.
Sort in descending order and return top hit result per bucket.
Run a subaggregation filter after the sort subaggregation and remove all the documents from buckets where disabled date is not null.(Which is not working)
The whole purpose of post_filter is to run after aggregations have been computed. As such, post_filter has no effect whatsoever on aggregation results.
What you can do in your case is to apply a top-level filter aggregation so that documents with no disabledDate are not taken into account in aggregations, i.e. consider only documents with disabledDate.
{
"query": {
"bool": {
"must": {
"range": {
"versionDate": {
"from": null,
"to": "2016-04-22T23:13:50.000Z",
"include_lower": true,
"include_upper": true
}
}
}
}
},
"aggregations": {
"with_disabled": {
"filter": {
"exists": {
"field": "disabledDate"
}
},
"aggs": {
"form.id": {
"terms": {
"field": "form.id",
"size": 0
},
"aggregations": {
"top": {
"top_hits": {
"size": 1,
"_source": {
"includes": [],
"excludes": []
},
"sort": [
{
"versionDate": {
"order": "desc"
}
}
]
}
}
}
}
}
}
}
}

Resources