How to use transform {"type": "flatten"}, and get data buckets in vega-lite - elasticsearch

I am trying to create a visualize vega-lite/v2.json with transform ("type": "flatten").
but it's not working.
data
code:
{
$schema: https://vega.github.io/schema/vega-lite/v2.json
data: {
url: {
index: elk_map_statistic-*
body: {
size: 0,
"query": {
"bool": {
"must": [
{"range": {
"timestamp": {
"gte": "now-6m",
"lt": "now"
}
}
]
}
},
"aggs": {
"group_by_host": {
"terms": {
"field": "DestBuilding.keyword",
"size": 20000
},
"aggs" : {
"DestRoom" : {
"terms" : {
"field" : "DestRoom.keyword",
"size" : 500
}
},
"avg" : { "avg" : { "field" : "avg" } },
"types_count" : { "value_count" : { "field" : "count" } }
}
}
}
}
}
format: {property: "aggregations.group_by_host.buckets"}
},
"transform": [
{
"type": "flatten",
"fields": ["brand.buckets"],
"as": ["val"]
}
],
mark: rect
"encoding": {
"x": {"field": "val.key", "type": "ordinal"},
"y": {"field": "key", "type": "ordinal"}
},
}
How to use ("type": "flatten"), if I want to get data in buckets?
I found an example for vega:https://github.com/vega/vega/issues/1155

Related

elastic - query multiple levels on nested object in inner_hits

i have a huge nested object which has lots of levels
i want to create a query which will return only the leaf / some object in the middle,
and the query is supposed to query multiple levels in the tree.
for example:
my DB is saving the whole company structure.
company -> wards -> employees -> working hours
i want to make a query that will return only the working hours of the employees in ward 2 which started later than 3pm this month
i tried to use inner_hits - but to no use.
as requested, sample document and expected result:
company:[{
properties:{companyId: 112}
ward:[{
properties: {wardId: 223}
employee:{
properties: {employeeId: 334},
workingHours: [
{ date: "1.1.2021", numOfHours: 4},
{ date: "1.2.2021", numOfHours: 7}
]
}]
}]
}]
the query:
I need to return the working hours of date "1.2.21" , of employee 334, of ward 223. and only the working hours, not the whole tree.
expected result:
4 or { date: "1.1.2021", numOfHours: 4} , whatever is simpler
hope its clear now
You need to add inner_hits to all nested queries
You can either parse entire result to get matched working hours(from inner hits) o can use response filtering to remove additional data
Mapping
PUT index123
{
"mappings": {
"properties": {
"company": {
"type": "nested",
"properties": {
"ward": {
"type": "nested",
"properties": {
"employee": {
"type": "nested",
"properties": {
"workingHours": {
"type": "nested",
"properties": {
"date": {
"type": "date"
}
}
}
}
}
}
}
}
}
}
}
}
Data
"_index" : "index123",
"_type" : "_doc",
"_id" : "9gGYI3oBt-MOenya6BcN",
"_score" : 1.0,
"_source" : {
"company" : [
{
"companyId" : 112,
"ward" : [
{
"wardId" : 223,
"employee" : {
"employeeId" : 334,
"workingHours" : [
{
"date" : "2021-01-01",
"numOfHours" : 4
},
{
"date" : "2021-01-02",
"numOfHours" : 7
}
]
}
}
]
}
]
}
}
Query
GET index123/_search?filter_path=hits.hits.inner_hits.ward.hits.hits.inner_hits.employee.hits.hits.inner_hits.workingHours.hits.hits._source
{
"query": {
"nested": {
"inner_hits": {
"name":"ward"
},
"path": "company.ward",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.wardId": {
"value": 223
}
}
},
{
"nested": {
"inner_hits": {
"name":"employee"
},
"path": "company.ward.employee",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.employee.employeeId": {
"value":334
}
}
},
{
"nested": {
"inner_hits": {
"name":"workingHours"
},
"path": "company.ward.employee.workingHours",
"query": {
"range": {
"company.ward.employee.workingHours.date": {
"gte": "2021-01-01",
"lte": "2021-01-01"
}
}
}
}
}
]
}
}
}
}
]
}
}
}
}
}
Result
{
"hits" : {
"hits" : [
{
"inner_hits" : {
"ward" : {
"hits" : {
"hits" : [
{
"inner_hits" : {
"employee" : {
"hits" : {
"hits" : [
{
"inner_hits" : {
"workingHours" : {
"hits" : {
"hits" : [
{
"_source" : {
"date" : "2021-01-01",
"numOfHours" : 4
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
Update:
Query with company ID
GET index123/_search?filter_path=hits.hits.inner_hits.company.hits.hits.inner_hits.ward.hits.hits.inner_hits.employee.hits.hits.inner_hits.workingHours.hits.hits._source
{
"query": {
"nested": {
"path": "company",
"inner_hits": {
"name": "company"
},
"query": {
"bool": {
"must": [
{
"term": {
"company.companyId": {
"value": 112
}
}
},
{
"nested": {
"inner_hits": {
"name": "ward"
},
"path": "company.ward",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.wardId": {
"value": 223
}
}
},
{
"nested": {
"inner_hits": {
"name": "employee"
},
"path": "company.ward.employee",
"query": {
"bool": {
"must": [
{
"term": {
"company.ward.employee.employeeId": {
"value": 334
}
}
},
{
"nested": {
"inner_hits": {
"name": "workingHours"
},
"path": "company.ward.employee.workingHours",
"query": {
"range": {
"company.ward.employee.workingHours.date": {
"gte": "2021-01-01",
"lte": "2021-01-01"
}
}
}
}
}
]
}
}
}
}
]
}
}
}
}
]
}
}
}
}
}

Elasticsearch - get all nested objects of all documents

Let's imagine Elasticsearch index where each document represents a country. Country has cities field, which is defined as nested.
Sample mapping (simplified for brevity of this example):
{
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"cities": {
"type": "nested",
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
// other properties are omitted for brevity
}
}
}
}
The documents which I'm inserting to the index look like this:
{
"name": "Slovakia",
"cities": [
{
"name": "Bratislava"
},
{
"name": "Zilina"
},
...
]
}
{
"name": "Czech Republic",
"cities": [
{
"name": "Praha"
},
{
"name": "Brno"
},
...
]
}
Is it possible to compose a query which returns all cities (over all countries) and supports sorting & pagination? In response, I'd like to have the complete nested objects + some fields of the parent object (so that I can display which country the city belongs to).
The first returned page (response) would contain 10 cities from Czech Republic, the second page would contain 10 cities where four of them are (the last ones) from Czech Republic and six are from Slovakia.
I was looking into composite aggregation, but I don't know how add country name to sources:
{
"query": {
"match_all": {}
},
"aggs": {
"nested_aggs": {
"nested": {
"path": "cities"
},
"aggs": {
"by_name": {
"composite": {
"sources": [
{
"cityName": {
"terms": {
"field": "cities.name.keyword",
"order": "asc"
}
}
}
]
}
}
}
}
}
}
Is it possible to compose such query without modifying the Elasticsearch mapping?
All members of composite aggregations need to be defined under the same context — you cannot intermix nested and non-nested contexts.
The easiest option would be to first aggregate on the countries and then on the cities:
{
"size": 0,
"aggs": {
"by_country": {
"terms": {
"field": "name.keyword",
"size": 10
},
"aggs": {
"nested_cities": {
"nested": {
"path": "cities"
},
"aggs": {
"by_cities": {
"terms": {
"field": "cities.name.keyword",
"size": 10
}
}
}
}
}
}
}
}
If you do have the option of changing the mapping, you can leverage the include_in_root feature which'll enable you to perform composite aggs such as:
{
"size": 0,
"aggs": {
"by_name": {
"composite": {
"sources": [
{
"countryName": {
"terms": {
"field": "name.keyword",
"order": "asc"
}
}
},
{
"cityName": {
"terms": {
"field": "cities.name.keyword",
"order": "asc"
}
}
}
]
}
}
}
}
which can be easily paginated.
Here's what the result would look like:
...
"aggregations" : {
"by_name" : {
"after_key" : {
"countryName" : "Slovakia",
"cityName" : "Zilina"
},
"buckets" : [
{
"key" : {
"countryName" : "Czech Republic",
"cityName" : "Brno"
},
"doc_count" : 1
},
{
"key" : {
"countryName" : "Czech Republic",
"cityName" : "Praha"
},
"doc_count" : 1
},
{
"key" : {
"countryName" : "Slovakia",
"cityName" : "Bratislava"
},
"doc_count" : 1
},
{
"key" : {
"countryName" : "Slovakia",
"cityName" : "Zilina"
},
"doc_count" : 1
}
]
}
}

ElasticSearch Aggregation Filter (not nested) Array

I have mapping like that:
PUT myindex1/_mapping
{
"properties": {
"program":{
"properties":{
"rounds" : {
"properties" : {
"id" : {
"type" : "keyword"
},
"name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
And my example docs:
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000000", "name":"Test1"},
{"id":"00000000-0000-0000-0000-000000000001", "name":"Fact2"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000002", "name":"Test3"},
{"id":"00000000-0000-0000-0000-000000000003", "name":"Fact4"}
]
}
}
POST myindex1/_doc
{
"program": {
"rounds":[
{"id":"00000000-0000-0000-0000-000000000004", "name":"Test5"},
{"id":"00000000-0000-0000-0000-000000000005", "name":"Fact6"}
]
}
}
Purpose: get only names of rounds that filtered as wildcard by user.
Aggregation query:
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must":[
{
"wildcard": {
"program.rounds.name": "*test*"
}
}
]
}
}
}
},
"size": 0
}
This aggregation returns all 6 names, but I need only Test1,Test3,Test5. Also tried include": "/tes.*/i" regex pattern for terms, but ignore case does not work.
Note: I'm note sure abount nested type, because I don't interested in association between Id and Name (at least for now).
ElasticSearch version: 7.7.0
If you want to only aggregate specific rounds based on a condition on the name field, then you need to make rounds nested, otherwise all name values end up in the same field.
Your mapping needs to be changed to this:
PUT myindex1/
{
"mappings": {
"properties": {
"program": {
"properties": {
"rounds": {
"type": "nested", <--- add this
"properties": {
"id": {
"type": "keyword"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
}
}
And then your query needs to change to this:
GET myindex1/_search
{
"size": 0,
"query": {
"nested": {
"path": "program.rounds",
"query": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"aggs": {
"rounds": {
"nested": {
"path": "program.rounds"
},
"aggs": {
"name_filter": {
"filter": {
"wildcard": {
"program.rounds.name": "*Test*"
}
},
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"order": {
"_key": "asc"
}
}
}
}
}
}
}
}
}
And the result will be:
"aggregations" : {
"rounds" : {
"doc_count" : 6,
"name_filter" : {
"doc_count" : 3,
"names" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Test1",
"doc_count" : 1
},
{
"key" : "Test3",
"doc_count" : 1
},
{
"key" : "Test5",
"doc_count" : 1
}
]
}
}
}
}
UPDATE:
Actually, you can achieve what you want without introducing nested types with the following query. You were close, but the include pattern was wrong
GET myindex1/_search
{
"aggs": {
"result": {
"aggs": {
"names": {
"terms": {
"field": "program.rounds.name.keyword",
"size": 10000,
"include": "[Tt]est.*",
"order": {
"_key": "asc"
}
}
}
},
"filter": {
"bool": {
"must": [
{
"wildcard": {
"program.rounds.name": "*Test*"
}
}
]
}
}
}
},
"size": 0
}

How to calculate average rating of each products in Elasticsearch

I have products index with following mapping:
{
"products" : {
"mappings" : {
"properties" : {
"#timestamp" : {
"type" : "date"
},
"name":{
"type": "text"
},
"price" : {
"type" : "integer"
},
"product_review_rel" : {
"type" : "join",
"eager_global_ordinals" : true,
"relations" : {
"product" : "review"
}
},
"rate" : {
"type" : "integer"
}
}
}
}
}
This index contains products and reviews, as you can see at product_review_rel field.
Products contain name, price, ... fields.
Reviews contain rate, ... fields.
I want to get average rating of each products. How to do that?
Another question, is it possible to return average rating from products returned by the following query in the same request:
{
"query": {
"nested": {
"path": "translations",
"query": {
"multi_match": {
"query": "kem chống nắng",
"fields": [
"name"
],
"analyzer":"vi_analyzer"
}
}
}
}
}
Update 1: Composite aggregation
GET products/_search
{
"query": {
"nested": {
"path": "translations",
"query": {
"multi_match": {
"query": "kem chống nắng",
"fields": [
"translations.name",
"translations.description"
],
"analyzer": "vi_analyzer"
}
}
}
},
"aggs": {
"products": {
"composite": {
"sources": [
{
"id": {
"terms": {
"field": "_id"
}
}
}
]
},
"aggs": {
"reviews": {
"children": {
"type": "review"
},
"aggs": {
"rating": {
"avg": {
"field": "rate"
}
}
}
}
}
}
}
}```

Elastic Search: Selecting multiple vlaues in aggregates

In Elastic Search I have the following index with 'allocated_bytes', 'total_bytes' and other fields:
{
"_index" : "metrics-blockstore_capacity-2017_06",
"_type" : "datapoint",
"_id" : "AVzHwgsi9KuwEU6jCXy5",
"_score" : 1.0,
"_source" : {
"timestamp" : 1498000001000,
"resource_guid" : "2185d15c-5298-44ac-8646-37575490125d",
"allocated_bytes" : 1.159196672E9,
"resource_type" : "machine",
"total_bytes" : 1.460811776E11,
"machine" : "2185d15c-5298-44ac-8646-37575490125d"
}
I have the following query to
1)get a point for 30 minute interval using date-histogram
2)group by field on resource_guid.
3)max aggregate to find the max value.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValue": {
"max": {
"field": "allocated_bytes"
}
}
}
},
"sumUnique": {
"sum_bucket": {
"buckets_path": "groupByField>maxValue"
}
}
}
}
}
}
But with this query I am able to get only allocated_bytes, but I need to have both allocated_bytes and total_bytes at the result point.
Following is the result from the above query:
{
"key_as_string" : "2017-06-20T21:00:00.000Z",
"key" : 1497992400000,
"doc_count" : 9,
"groupByField" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ {
"key" : "2185d15c-5298-44ac-8646-37575490125d",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
}, {
"key" : "c3513cdd-58bb-4f8e-9b4c-467230b4f6e2",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156165632E9
}
}, {
"key" : "eff13403-9737-4d08-9dca-fb6c12c3a6fa",
"doc_count" : 3,
"maxValue" : {
"value" : 1.156182016E9
}
} ]
},
"sumUnique" : {
"value" : 3.468529664E9
}
}
I do need both allocated_bytes and total_bytes. How do I get multiple fields( allocated_bytes, total_bytes) for each point?
For example:
"sumUnique" : {
"Allocatedvalue" : 3.468529664E9,
"TotalValue" : 9.468529664E9
}
or like this:
"allocatedBytessumUnique" : {
"value" : 3.468529664E9
}
"totalBytessumUnique" : {
"value" : 9.468529664E9
},
You can just add another aggregation:
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": 1497992400000,
"lte": 1497996000000
}
}
}
]
}
},
"aggregations": {
"groupByTime": {
"date_histogram": {
"field": "timestamp",
"interval": "30m",
"order": {
"_key": "desc"
}
},
"aggregations": {
"groupByField": {
"terms": {
"size": 1000,
"field": "resource_guid"
},
"aggregations": {
"maxValueAllocated": {
"max": {
"field": "allocated_bytes"
}
},
"maxValueTotal": {
"max": {
"field": "total_bytes"
}
}
}
},
"sumUniqueAllocatedBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueAllocated"
}
},
"sumUniqueTotalBytes": {
"sum_bucket": {
"buckets_path": "groupByField>maxValueTotal"
}
}
}
}
}
}
I hope you are aware that sum_bucket calculates sibling aggregations only, in this case gives sum of max values, not the sum of total_bytes. If you want to get sum of total_bytes you can use sum aggregation

Resources