get buckets count in elasticsearch aggregations - elasticsearch

I am using elasticsearch to search a database with a lot of duplicates.
I am using field collapsing and it works; however, it returns the number of hits (including duplicates) and not the number of buckets.
"aggs": {
"uniques": {
"terms": {
"field": "guid"
},
"aggs": {
"jobs": { "top_hits": { "_source": "title", "size": 1 }}
}
}
}
I can count the buckets by making another request using cardinality (but it only returns count, not the documents):
{
"aggs" : {
"uniques" : {
"cardinality" : {
"field" : "guid"
}
}
}
}
Is there a way to return both requests (buckets + total bucket count) in one search?
Thanks

You can combine both of these aggregations into 1 request.
{
"aggs" : {
"uniques" : {
"cardinality" : {
"field" : "guid"
}
},
"uniquesTerms": {
"terms": {
"field": "guid"
},
"aggs": {
"jobs": { "top_hits": { "_source": "title", "size": 1 }}
}
}
}
}

Related

Elastic Search - Pagination on Aggregations

I have an index and I query an aggregation. Instead of returning the whole aggregation at once, I want it returned in chunks, i.e. in small blocks. Is it possible to do so in Elasticsearch?
Try to use Bucket sort
POST /sales/_search
{
"size": 0,
"aggs" : {
"sales_per_month" : {
"date_histogram" : {
"field" : "date",
"interval" : "month"
},
"aggs": {
"total_sales": {
"sum": {
"field": "price"
}
},
"sales_bucket_sort": {
"bucket_sort": {
"sort": [
{"total_sales": {"order": "desc"}}
],
"size": 3,
"from": 10
}
}
}
}
}
}

Elasticsearch. Using term aggregation, return values where doc count is less than some value

I want to group values by a field (account id in my case) using a terms aggregation and return only the buckets where doc_count is less than some value.
I can specify min_doc_count parameter, but there is no max_doc_count. So I'm looking for a way to simulate this behavior. One of my many tries is this, but it doesn't work.
{
"size": 0,
"aggs": {
"by_account": {
"terms": {
"field": "accountId"
},
"aggs": {
"by_account_filtered": {
"bucket_selector": {
"buckets_path": {
"totalDocs": "_count"
},
"script": "params.totalDocs < 10000"
}
}
}
}
}
}
What am I doing wrong?
The bucket_selector aggregation needs to be nested (since it is a parent-type pipeline aggregation) and must be a sibling of the metric aggregation that it uses to filter buckets.
So we use a top-level terms aggregation, then use a nested value_count aggregation to expose the bucket doc_count to the sibling bucket_selector aggregation.
try this :
{
"size": 0,
"aggs": {
"by_account": {
"terms": {
"field": "accountId"
},
"aggs": {
"by_account_number": {
"value_count" : {
"field" : "accountId"
}
},
"by_account_filtered": {
"bucket_selector": {
"buckets_path": {
"totalDocs": "by_account_number"
},
"script": "params.totalDocs < 10000"
}
}
}
}
}
}
EDIT : If you want to get the lowest account doc_count
{
"size": 0,
"aggs": {
"by_account": {
"terms": {
"field": "accountId",
"order" : { "_count" : "asc" },
"size": 100
},
"aggs": {
"by_account_number": {
"value_count" : {
"field" : "accountId"
}
},
"by_account_filtered": {
"bucket_selector": {
"buckets_path": {
"totalDocs": "by_account_number"
},
"script": "params.totalDocs < 10000"
}
}
}
}
}
}

Elasticsearch one record for one matching query

I have an Elasticsearch index that contains a large number of records. There is a field username, and I want to get the latest post of each username by passing comma-separated values, for example:
john,shahid,mike,jolie
and I want the latest post for each username. How can I do this? I can do it by passing one username at a time, but that results in many HTTP requests. I want to do it in one request.
You could use a filtered terms aggregation coupled with a top_hits one in order to achieve what you need:
{
"size": 0,
"query": {
"bool": {
"filter": {
"terms": {
"username": [ "john", "shahid", "mike", "jolie" ]
}
}
}
},
"aggs": {
"usernames": {
"filter": {
"terms": {
"username": [ "john", "shahid", "mike", "jolie" ]
}
},
"aggs": {
"by_username": {
"terms": {
"field": "username"
},
"aggs": {
"top1": {
"top_hits": {
"size": 1,
"sort" : {"created_date" : "desc"}
}
}
}
}
}
}
}
}
This query can give you all the posts of these 4 ids sorted by post_date in descending order. You can process on that data to get the result.
{
"sort" : [
{ "post_date" : {"order" : "desc"}}
],
"query" : {
"filtered" : {
"filter" : {
"terms" : {
"username" : ["john","shahid","mike","jolie"]
}
}
}
}
}

Count how many documents have an attribute or are missing that attribute in Elasticsearch

How can I write a single Elasticsearch query that will count how many documents either have a value for a field or are missing that field?
This query successfully counts the docs missing the field:
POST localhost:9200/<index_name_here>/_search
{
"size": 0,
"aggs" : {
"Missing_Field" : {
"missing": { "field": "group_doc_groupset_id" }
}
}
}
This query does the opposite, counting documents NOT missing the field:
POST localhost:9200/<index_name_here>/_search
{
"size": 0,
"aggs" : {
"Not_Missing_Field" : {
"exists": { "field": "group_doc_groupset_id" }
}
}
}
How can I write one that combines both? For example, this yields a syntax error:
POST localhost:9200/<index_name_here>/_search
{
"size": 0,
"aggs" : {
"Missing_Field_Or_Not" : {
"missing": { "field": "group_doc_groupset_id" },
"exists": { "field": "group_doc_groupset_id" }
}
}
}
GET indexname/_search?size=0
{
"aggs": {
"a1": {
"missing": {
"field": "status"
}
},
"a2": {
"filter": {
"exists": {
"field": "status"
}
}
}
}
}
As per new Elastic search recommendation in the docs:
GET {your_index_name}/_search #or _count, to see just the value
{
"query": {
"bool": {
"must_not": { # here can be also "must"
"exists": {
"field": "{field_to_be_searched}"
}
}
}
}
}
Edit: _count returns the exact number of matching documents. With _search, if there are more than 10k matching documents, the total is shown as:
"hits" : {
"total" : {
"value" : 10000, # 10k
"relation" : "gte" # Greater than or equal to
}

sub field aggregation group by order by in elasticsearch

I am unable to find the correct syntax to get an aggregation of a sub object ordered by a count field.
A good example of this is a twitter document:
{
"properties" : {
"id" : {
"type" : "long"
},
"message" : {
"type" : "string"
},
"user" : {
"type" : "object",
"properties" : {
"id" : {
"type" : "long"
},
"screenName" : {
"type" : "string"
},
"followers" : {
"type" : "long"
}
}
}
}
}
How would I go about getting the Top Influencers for a given set of tweets? This would be a unique list of the top 10 "user" objects ordered by the "user.followers" field.
I have tried using top_hits but get an exception:
org.elasticsearch.common.breaker.CircuitBreakingException: [FIELDDATA]
Data too large, data for [user.id]
"aggs": {
"top-influencers": {
"terms": {
"field": "user.id",
"order": {
"top_hit": "desc"
}
},
"aggs": {
"top_tags_hits": {
"top_hits": {}
},
"top_hit": {
"max": {
"field": "user.followers"
}
}
}
}
}
I can get almost what I want using the "sort" field on the query (no aggregation), however if a user has multiple tweets then they will appear twice in the result. I need to be able to group by the sub object "user" and only return each user once.
---UPDATE---
I have managed to get a list of the top users returned in very good time. Unfortunately, it still isn't unique. Also, the docs say top_hits is designed to be a sub-aggregation, but I am using it as a top-level aggregation.
"aggs": {
"top_influencers": {
"top_hits": {
"sort": [
{
"user.followers": {
"order": "desc"
}
}
],
"_source": {
"include": [
"user.id",
"user.screenName",
"user.followers"
]
},
"size": 10
}
}
}
Try this:
{
"aggs": {
"GroupByType": {
"terms": {
"field": "user.id",
"size": 10000
},
"aggs": {
"Group": {
"top_hits":{
"size":1,
"_source": {
"includes": ["user.id", "user.screenName", "user.followers"]
},
"sort":[{
"user.followers": {
"order": "desc"
}
}]
}
}
}
}
}
}
You can then take the top 10 results of this query. Note that a normal search in Elasticsearch only goes up to 10000 records.

Resources