how to group by terms count in elasticsearch - elasticsearch

I have the following document mapping
{
properties: {
id: {
type: 'keyword'
},
rel: {
type: 'nested',
properties: {
type: {
type: 'keyword'
},
...
}
}
}
}
In the end I want to plot an x-y chart where the x axis is the count of type t1 and the y axis is the count of type t2, so the following documents
{ id: 1, rel: [ { type: t1, ... }, { type: t1, ... }, { type: t2, ... }] }
{ id: 2, rel: [ { type: t1, ... }, { type: t1, ... }] }
{ id: 3, rel: [ { type: t1, ... }, { type: t1, ... }] }
will map to three (x, y) points, (2, 1), (2, 0) and (2, 0), and I'm going to plot them on the x-y plane like this
^
|
| 1
+---2-->
Right now I use the following aggregation
{
"_source": false,
"aggregations": {
"g1": {
"terms": {
"field": "id",
"size": 10000
},
"aggregations": {
"rel": {
"nested": {
"path": "rel"
},
"aggregations": {
"filter-t1": {
"filter": {
"terms": {
"rel.type": [
"t1"
]
}
}
},
"filter-t2": {
"filter": {
"terms": {
"rel.type": [
"t2"
]
}
}
}
}
}
}
}
}
}
to get the following result
{
"aggregations": {
"g1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1",
"doc_count": 1,
"rel": {
"doc_count": 4942,
"filter-t1": {
"doc_count": 6
},
"filter-t2": {
"doc_count": 20
}
}
},
{
"key": "2",
"doc_count": 1,
"rel": {
"doc_count": 3039,
"filter-t1": {
"doc_count": 6
},
"filter-t2": {
"doc_count": 11
}
}
}
...
and calculate the number of documents at each coordinate in the API layer.
The problem is that the total number of documents is in the millions, so querying them all in a single request doesn't work. I can't find a way to paginate aggregations either; from and size seem to work only for _source.
Is there a way to achieve what I want in elasticsearch?

Related

how to get document above average value in elasticsearch

After grouping by device ID, I want to find, for each device, the number of documents whose value is greater than or equal to the average value of that device's data.
doc = [
{ deviceId: 1, data: {temp:1} },
{ deviceId: 1, data: {temp:2} },
{ deviceId: 1, data: {temp:3} },
{ deviceId: 1, data: {temp:4} },
{ deviceId: 1, data: {temp:5} },
{ deviceId: 2, data: {temp:1} },
{ deviceId: 2, data: {temp:2} },
{ deviceId: 2, data: {temp:3} },
{ deviceId: 2, data: {temp:4} },
{ deviceId: 2, data: {temp:5} },
{ deviceId: 3, data: {temp:1} },
{ deviceId: 3, data: {temp:2} },
{ deviceId: 3, data: {temp:3} },
{ deviceId: 3, data: {temp:4} },
{ deviceId: 3, data: {temp:5} },
];
"The desired result is"
result = aggregations :{
clusters:{
...
bucket:[
{ key:"1",
doc_count: 5,
avgData: {value: 3.0}
above_avgDataValue : {
doc_count : 2 // === data.temp > 3
}
}
]
}
}
Below is the aggregation I tried
_search {
size:0,
query:{
bool:{
filter:[
terms:{deviceId:[1,2]}
]
}
},
aggs:{
cluster:{
terms:{field:deviceId}
},
aggs:{
"avgData" : {"avg": {"field":"temp"}}
}
}
};
please help
TL;DR
I don't think this is possible with a single query.
But you could work around the issue in two steps:
1. Get the average per deviceId.
2. Get the number of docs at or above the average for each deviceId.
Workaround
To get the average per deviceId the following query should work.
GET /73034730/_search
{
"size": 0,
"aggs": {
"avg_per_fields": {
"terms": {
"field": "deviceId",
"size": 10
},
"aggs": {
"avg": {
"avg": {
"field": "data.temp"
}
}
}
}
}
}
Then you could do the following query
GET /73034730/_search
{
"query": {
"bool": {
"should": [
{
"bool": {
"must": [
{
"match": {
"deviceId": "1"
}
},
{
"range": {
"data.temp": {
"gte": 3
}
}
}
]
}
},
{
"bool": {
"must": [
{
"match": {
"deviceId": "2"
}
},
{
"range": {
"data.temp": {
"gte": 3.33333335
}
}
}
]
}
},
{
"bool": {
"must": [
{
"match": {
"deviceId": "3"
}
},
{
"range": {
"data.temp": {
"gte": 3
}
}
}
]
}
}
],
"minimum_should_match": 1
}
},
"size": 0,
"aggs": {
"avg_per_fields": {
"terms": {
"field": "deviceId",
"size": 10
}
}
}
}
which, given the dataset you have, should give you
{
...
"aggregations": {
"avg_per_fields": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 3
},
{
"key": 2,
"doc_count": 3
},
{
"key": 3,
"doc_count": 3
}
]
}
}
}

Nested Aggregation in Nest Elastic Search

In my Elasticsearch documents I have CityId, RootId, RootName and Price. Now I have to find the top 7 roots in a city under the following conditions:
Name and id of the root, with its minimum price in the city.
Top 7 roots: the roots that have the largest number of entries in the city.
For example:
CityId RootId RootName Price
11 1 ABC 90
11 1 ABC 100
11 2 DEF 80
11 2 DEF 90
11 2 DEF 60
answer for CityId =11:-
RootId RootName Price
2 DEF 60
1 ABC 90
I am not familiar with the NEST syntax, so I am adding a working example in JSON format.
Index Mapping:
{
"mappings":{
"properties":{
"listItems":{
"type":"nested"
}
}
}
}
Index Data:
{
"RootId": 2,
"CityId": 11,
"RootName": "DEF",
"listItems": [
{
"Price": 60
},
{
"Price": 90
},
{
"Price": 80
}
]
}
{
"RootId": 1,
"CityId": 11,
"RootName": "ABC",
"listItems": [
{
"Price": 100
},
{
"Price": 90
}
]
}
Search Query:
{
"size": 0,
"aggs": {
"id_terms": {
"terms": {
"field": "RootId"
},
"aggs": {
"nested_entries": {
"nested": {
"path": "listItems"
},
"aggs": {
"min_position": {
"min": {
"field": "listItems.Price"
}
}
}
}
}
}
}
}
Search Result:
"aggregations": {
"id_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 1,
"doc_count": 1,
"nested_entries": {
"doc_count": 2,
"min_position": {
"value": 90.0
}
}
},
{
"key": 2,
"doc_count": 1,
"nested_entries": {
"doc_count": 3,
"min_position": {
"value": 60.0
}
}
}
]
}
}
.Query(query => query.Bool(bQuery => bQuery.Filter(
fQuery => fQuery.Terms(ter => ter.Field(f => f.CityId).Terms(cityId))
)))
.Aggregations(agg => agg.Terms("group_by_rootId", st => st.Field(o => o.RootId)
.Order(TermsOrder.CountDescending)
.Aggregations(childAgg => childAgg.Min("min_price_in_group", m =>m.Field(p=>p.Price))
.TopHits("stocks", t11 => t11
.Source(sfd => sfd.Includes(fd => fd.Fields(Constants.IncludedFieldsFromElastic)))
.Size(1)
)
)
)
)
.Size(_popularStocksCount)
.From(0)
.Take(0);

Scripted Aggregation Calculation with unexpected results

First off, I am somewhat new to ES / SK, and even more so to aggregations.
Here is my aggs structure:
aggs: {
all_budgets: {
sum: {
field: :amount
}
},
all_forecasts: {
sum: {
field: :forecast_total
}
},
all_variance: {
sum: {
script: "doc['forecast_total'].value - doc['amount'].value"
}
},
all_variance_p: {
sum: {
script: "(doc['forecast_total'].value - doc['amount'].value) / doc['amount'].value"
}
}
}
I am basically trying to get the total of all my budgets, total spent and then the over / under vs the budget and the corresponding %. Here is my output:
{
"all_forecasts": {
"doc_count": 2,
"value": 173604.0
},
"all_budgets": {
"doc_count": 2,
"value": 185437.0
},
"all_variance_p": {
"doc_count": 2,
"value": "0.33694326595832774"
},
"all_variance": {
"doc_count": 2,
"value": -11833.0
}
}
The "0.33694326595832774" value is wrong - should be "-0.06408106257" (i.e. -11833.0 / 185437.0) . The first two scripts work and I suspect I just don't understand how these scripts work.
Try casting your numbers to the same datatype before you divide. For instance: ((float)(doc['forecast_total'].value) - (float)(doc['amount'].value)) / (float)(doc['amount'].value).
In response to your comments about accessing the final values of all_budgets and all_forecasts for further calculation: you can use a bucket script aggregation, which can access a parent aggregation's metrics using a buckets path. Since it requires buckets to work on, you will need to add a parent aggregation, e.g. a date_histogram, which splits documents into a given interval (year, month, day, etc.).
{
"size": 0,
"aggs": {
"year_interval": {
"date_histogram": {
"field": "timestamp",
"interval": "year"
},
"aggs": {
"all_budgets": {
"sum": {
"field": "amount"
}
},
"all_forecasts": {
"sum": {
"field": "forecast"
}
},
"all_variance": {
"bucket_script": {
"buckets_path": {
"total_forecast":"all_forecasts",
"total_budget":"all_budgets"
},
"script": "params.total_forecast-params.total_budget"
}
},
"all_variance_p": {
"bucket_script": {
"buckets_path": {
"variance":"all_variance",
"budget":"all_budgets"
},
"script": "params.variance/params.budget"
}
}
}
}
}
}

Inner hits on grandparents still not working

I have problems retrieving the inner_hits of my "grandparent" items.
Retrieving parents from a child query works fine, but I can't get it to also return the ones one more level up.
Any ideas?
The known issue for this should be fixed by now (2.3), and the workarounds are written for nested objects, not parent/child hierarchy data, so I can't get them to work for me.
Code in Sense-format:
POST /test/child/_search
{
"query": {
"has_parent": {
"type": "parent",
"query": {
"has_parent": {
"type": "grandparent",
"query": {
"match_all": {}
},
"inner_hits": {}
}
},
"inner_hits": {}
}
}
}
PUT /test/child/3?parent=2&routing=1
{
"id": 3,
"name": "child",
"parentid": 2
}
PUT /test/parent/2?parent=1&routing=1
{
"id": 2,
"name": "parent",
"parentid": 1
}
PUT /test/grandparent/1
{
"id": 1,
"name": "grandparent"
}
PUT /test
{
"mappings": {
"grandparent": {},
"parent": {
"_parent": {
"type": "grandparent"
}
},
"child": {
"_parent": {
"type": "parent"
}
}
}
}
This is sample code for finding the grandparent:
const filterPath = `hits.hits.inner_hits.activity.hits.hits.inner_hits.user.hits.hits._source*,
hits.hits.inner_hits.activity.hits.hits.inner_hits.user.hits.hits.inner_hits.fofo.hits.hits._source*`;
const source = ['id', 'name', 'thumbnail'];
const { body } = await elasticWrapper.client.search({
index: ElasticIndex.UserDataFactory,
filter_path: filterPath,
_source: source,
body: {
from,
size,
query: {
bool: {
must: [
{
match: {
relation_type: ElasticRelationType.Like,
},
},
{
has_parent: {
parent_type: ElasticRelationType.Post,
query: {
bool: {
must: [
{
term: {
id: {
value: req.params.id,
},
},
},
{
has_parent: {
parent_type: ElasticRelationType.User,
query: {
bool: {
must: [
{
exists: {
field: 'id',
},
},
],
should: [
{
has_child: {
type: ElasticRelationType.Follower,
query: {
bool: {
minimum_should_match: 1,
should: [
{
match: {
follower:
req.currentUser?.id,
},
},
{
match: {
following:
req.currentUser?.id,
},
},
],
},
},
inner_hits: {
_source: [
'follower',
'following',
'status',
],
},
},
},
],
},
},
inner_hits: {
_source: ['id', 'name', 'thumbnail'],
},
},
},
],
},
},
inner_hits: {},
},
},
],
},
},
sort: [
{
createdAt: {
order: 'desc',
},
},
],
},
});

Ratio with elasticsearch

I have a list of customers with this structure:
{
"name" : "Toya Romano",
"hungry" : false,
"date" : 1420090500020
}
I would like to get the ratio of people who are hungry. How can I do it with an ElasticSearch query? I am running ES 2.3.
Rather a hacky approach because of this issue, but this should work:
{
"size": 0,
"aggs": {
"whatever": {
"filters": {
"filters": [{}]
},
"aggs": {
"all_people": {
"filter": {}
},
"hungry_count": {
"filter": {
"term": {
"hungry": true
}
}
},
"hungry_ratio": {
"bucket_script": {
"buckets_path": {
"total_hungry": "hungry_count._count",
"all": "all_people._count"
},
"script": "total_hungry/all"
}
}
}
}
}
}
With the result like this:
"buckets": [
{
"doc_count": 5,
"all_people": {
"doc_count": 5
},
"hungry_count": {
"doc_count": 3
},
"hungry_ratio": {
"value": 0.6
}
}
]

Resources