Elasticsearch distinct records in order with pagination

How do I get distinct records after aggregation on a terms field, in order and with pagination? So far I have this:
{
"query": {
"bool": {
"filter": [
{
"terms": {
"user_id.keyword": [
"user#domain.com"
]
}
},
{
"range": {
"creation_time": {
"gte": "2019-02-04T19:00:00.000Z",
"lte": "2019-05-04T19:00:00.000Z"
}
}
}
],
"should": [
{
"wildcard": {
"operation": "*sol*"
}
},
{
"wildcard": {
"object_id": "*sol*"
}
},
{
"wildcard": {
"user_id": "*sol*"
}
},
{
"wildcard": {
"user_type": "*sol*"
}
},
{
"wildcard": {
"client_ip": "*sol*"
}
},
{
"wildcard": {
"country": "*sol*"
}
},
{
"wildcard": {
"workload": "*sol*"
}
}
]
}
},
"aggs": {
"user_ids": {
"terms": {
"field": "country.keyword",
"include": ".*United.*"
}
}
},
"from": 0,
"size": 10,
"sort": [
{
"creation_time": {
"order": "desc"
}
}
]
}
I looked into this and some people say it's possible by using composite aggregations or by using partitions, but I am not sure how to actually achieve this.
I also looked into bucket_sort but I can't seem to get it to work:
"my_bucket_sort": {
"bucket_sort": {
"sort": [
{
"user_ids": {
"order": "desc"
}
}
],
"size": 3
}
}
I am a noob at this. Kindly help me out. Thanks.

As the field is country, which presumably doesn't have a high cardinality, you could set size to a sufficiently high number to return all countries in a single request:
"aggs": {
"user_ids": {
"terms": {
"field": "country.keyword",
"include": ".*United.*",
"size": 10000
}
}
}
Alternatively, for a high-cardinality field, you could filter the aggregation first and then use partitioning to page through the values:
{
"size": 0,
"aggs": {
"user_ids": {
"filter": {
"wildcard" : { "country" : ".*United.*" }
},
"aggs": {
"countries": {
"terms": {
"field": "country.keyword",
"include": {
"partition": 0,
"num_partitions": 20
},
"size": 10000
}
}
}
}
}
}
where you would increase the value of partition with each request, from 0 up to 19 (num_partitions - 1).
See the Elastic documentation on the terms aggregation for further details.
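Since the question also mentions composite aggregations: those give true pagination over the buckets themselves via an after key, which may be the cleanest option if the list of values grows. A minimal sketch (the source name countries and the after value are illustrative; note that composite sources do not support the include pattern, so any "United" filtering would have to move into the query instead):
{
  "size": 0,
  "aggs": {
    "countries": {
      "composite": {
        "size": 10,
        "sources": [
          { "country": { "terms": { "field": "country.keyword" } } }
        ],
        "after": { "country": "United Kingdom" }
      }
    }
  }
}
The first request omits after; every response carries an after_key, and passing it back as after returns the next page of buckets.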

Related

Elasticsearch Last document of multiple term queries

I need to get the last document of each interface. I have played around with different queries but I can't get the desired result; below is my last attempt.
Can you help me get the last document of each interface where the field throughput exists?
Thanks
GET /interface-2021.11/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"interface_name.keyword": {
"value": "Gi0/0/2 on (EXT-01)"
}
}
},
{
"term": {
"interface_name.keyword": {
"value": "Gi0/0/1 on (EXT-02)"
}
}
},
{
"term": {
"interface_name.keyword": {
"value": "Ethernet1/61 on (DC-01)"
}
}
},
{
"term": {
"interface_name.keyword": {
"value": "Ethernet1/17 on (DC-02)"
}
}
}
],
"minimum_should_match": 1,
"filter": [
{
"exists": {
"field": "throughput"
}
}
]
}
},
"aggs": {
"top_date": {
"top_hits": {
"sort": [
{
"#timestamp": {
"order": "desc"
}
}
]
}
}
}
}
Good job, you're on the right path! You just need to aggregate by interface_name.keyword and get the top hit for each interface.
Here is the query that will work as you expect:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"interface_name.keyword": [
"Gi0/0/2 on (EXT-01)",
"Gi0/0/1 on (EXT-02)",
"Ethernet1/61 on (DC-01)",
"Ethernet1/17 on (DC-02)"
]
}
},
{
"exists": {
"field": "throughput"
}
}
]
}
},
"aggs": {
"interfaces": {
"terms": {
"field": "interface_name.keyword"
},
"aggs": {
"top_date": {
"top_hits": {
"sort": [
{
"#timestamp": {
"order": "desc"
}
}
]
}
}
}
}
}
}
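One detail worth noting (not shown in the answer above): top_hits returns up to 3 hits per bucket by default, so if you only want the single most recent document per interface, you can add a size of 1 inside top_hits, roughly:
"top_date": {
  "top_hits": {
    "size": 1,
    "sort": [
      { "@timestamp": { "order": "desc" } }
    ]
  }
}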

ElasticSearch - Aggregation result not matching total hits

I have a query like the one below. It returns 320 results for this condition:
{
"size": "5000",
"sort": [
{
"errorDateTime": {
"order": "desc"
}
}
],
"query": {
"bool": {
"must": [
{
"range": {
"errorDateTime": {
"gte": "2021-04-07T20:08:20.516",
"lte": "2021-04-08T00:08:20.516"
}
}
},
{
"bool": {
"should": [
{
"match": {
"businessFunction": "PriceUpdate"
}
},
{
"match": {
"businessFunction": "PriceFeedIntegration"
}
},
{
"match": {
"businessFunction": "StoreConnectivity"
}
},
{
"match": {
"businessFunction": "Transaction"
}
},
{
"match": {
"businessFunction": "SalesSummary"
}
}
]
}
}
]
}
},
"aggs": {
"genres_and_store": {
"terms": {
"field": "storeId"
},
"aggs": {
"genres_and_error": {
"terms": {
"field": "errorCode"
},
"aggs": {
"genres_and_business": {
"terms": {
"field": "businessFunction"
}
}
}
}
}
}
}
}
However, the aggregation results are not matching. Many stores are not returned in the aggregation even though I can see them in the query results. What am I missing? My schema looks like this:
{
"errorDescription": "FTP Service unable to connect to Store to list the files for Store 12345",
"errorDateTime": "2021-04-07T21:01:15.040546",
"readBy": [],
"errorCode": "e004",
"businessFunction": "TransactionError",
"storeId": "12345"
}
Please let me know if I am writing the query wrong. I want to aggregate per store, per errorCode and per businessFunction.
If no size param is set in the terms aggregation, it returns only the top 10 terms by default, ordered by their doc_count. You need to add the size param to the terms aggregation to cover all the matching terms.
Try out the below query
{
"size": "5000",
"sort": [
{
"errorDateTime": {
"order": "desc"
}
}
],
"query": {
"bool": {
"must": [
{
"range": {
"errorDateTime": {
"gte": "2021-04-07T20:08:20.516",
"lte": "2021-04-08T00:08:20.516"
}
}
},
{
"bool": {
"should": [
{
"match": {
"businessFunction": "PriceUpdate"
}
},
{
"match": {
"businessFunction": "PriceFeedIntegration"
}
},
{
"match": {
"businessFunction": "StoreConnectivity"
}
},
{
"match": {
"businessFunction": "Transaction"
}
},
{
"match": {
"businessFunction": "SalesSummary"
}
}
]
}
}
]
}
},
"aggs": {
"genres_and_store": {
"terms": {
"field": "storeId",
"size": 100 // note this
},
"aggs": {
"genres_and_error": {
"terms": {
"field": "errorCode"
},
"aggs": {
"genres_and_business": {
"terms": {
"field": "businessFunction"
}
}
}
}
}
}
}
}
I was missing the size parameter inside the terms aggregation and was getting only the default 10 buckets:
"aggs": {
"genres_and_store": {
"terms": {
"field": "storeId",
"size": 1000
},
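If the exact number of distinct stores is unknown, a cardinality aggregation on the same field gives an (approximate) distinct count that you can use to choose a safe size for the terms aggregation; a sketch reusing the field from the question (the aggregation name is arbitrary):
{
  "size": 0,
  "aggs": {
    "distinct_stores": {
      "cardinality": {
        "field": "storeId"
      }
    }
  }
}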

ElasticSearch query with prefix for aggregation

I am trying to add a prefix condition for my ES query in a "must" clause.
My current query looks something like this:
body = {
"query": {
"bool": {
"must":
{ "term": { "article_lang": 0 }}
,
"filter": {
"range": {
"created_time": {
"gte": "now-3h"
}
}
}
}
},
"aggs": {
"articles": {
"terms": {
"field": "article_id.keyword",
"order": {
"score": "desc"
},
"size": 1000
},
"aggs": {
"score": {
"sum": {
"field": "score"
}
}
}
}
}
}
I need to add a mandatory condition to my query to filter articles whose id starts with "article-".
So far, I have tried this:
{
"query": {
"bool": {
"should": [
{ "term": { "article_lang": 0 }},
{ "prefix": { "article_id": {"value": "article-"} }}
],
"filter": {
"range": {
"created_time": {
"gte": "now-3h"
}
}
}
}
},
"aggs": {
"articles": {
"terms": {
"field": "article_id.keyword",
"order": {
"score": "desc"
},
"size": 1000
},
"aggs": {
"score": {
"sum": {
"field": "score"
}
}
}
}
}
}
I am fairly new to ES. From the documentation online, I know that "should" is used for "OR" conditions and "must" for "AND". This query returns some data, but per that condition the results can contain articles that either have article_lang = 0 or have an ID starting with article-. When I use "must", it doesn't return anything.
I am certain there are articles with IDs starting with this prefix because, currently, we iterate through these results to filter out such articles. What am I missing here?
In your prefix query, you need to use the article_id.keyword field, not article_id. Also, you should prefer filter over must since you're simply doing yes/no matching (aka filters)
{
"query": {
"bool": {
"filter": [ <-- change this
{
"term": {
"article_lang": 0
}
},
{
"prefix": {
"article_id.keyword": { <-- and this
"value": "article-"
}
}
}
],
"filter": {
"range": {
"created_time": {
"gte": "now-3h"
}
}
}
}
},
"aggs": {
"articles": {
"terms": {
"field": "article_id.keyword",
"order": {
"score": "desc"
},
"size": 1000
},
"aggs": {
"score": {
"sum": {
"field": "score"
}
}
}
}
}
}
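As to why the original prefix on article_id misses: assuming article_id is a text field with the default standard analyzer, a value like article-123 is indexed as the separate tokens article and 123, so no indexed token starts with article-, while the keyword sub-field keeps the full untouched value. You can check the tokenization with the _analyze API (the sample value is illustrative):
GET /_analyze
{
  "analyzer": "standard",
  "text": "article-123"
}
This returns the tokens article and 123, neither of which matches the prefix article-.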

ElasticSearch aggregations using filter and without it

I'm building a product list page with filters. There are a lot of filters, and the data for them is computed in ES with aggregation functions.
The simplest example is min/max price:
{
"size": 0,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"term": {
"shop_id": 44
}
},
{
"term": {
"CategoryId": 36898
}
},
{
"term": {
"products_status": 1
}
},
{
"term": {
"availability": 3
}
}
]
}
}
}
},
"aggs": {
"min_price": {
"min": {
"field": "products_price"
}
},
"max_price": {
"max": {
"field": "products_price"
}
}
}
}
So this request returns the minimum and maximum price according to the rules in the filter (CategoryId 36898, shop_id 44, etc.).
It works perfectly.
The question is: is it possible to update this request and also get aggregations without the filters? Or is it possible to return aggregation data with another filter in the same request?
So I want:
min_price and max_price for filtered data (query1)
and min_price and max_price for unfiltered data (or filtered data with query 2)?
You can use the global option on the aggregations so that the filters provided in the query block are not applied to them.
For example, for your query, use the following JSON input:
{
"size": 0,
"query": {
"filtered": {
"query": {
"match_all": {}
},
"filter": {
"bool": {
"must": [
{
"term": {
"shop_id": 44
}
},
{
"term": {
"CategoryId": 36898
}
},
{
"term": {
"products_status": 1
}
},
{
"term": {
"availability": 3
}
}
]
}
}
}
},
"aggs": {
"min_price": {
"min": {
"field": "products_price"
}
},
"max_price": {
"max": {
"field": "products_price"
}
},
"without_filter_min": {
"global": {},
"aggs": {
"price_value": {
"min": {
"field": "products_price"
}
}
}
},
"without_filter_max": {
"global": {},
"aggs": {
"price_value": {
"max": {
"field": "products_price"
}
}
}
}
}
}
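To cover the other half of the question (aggregation data with another filter in the same request), a filter aggregation can also be nested under global, so the min/max are computed over a different document set than the main query. A sketch to be added alongside the other entries in the top-level aggs block (the aggregation names and the term filter here are only placeholders):
"with_other_filter": {
  "global": {},
  "aggs": {
    "available_only": {
      "filter": {
        "term": { "availability": 3 }
      },
      "aggs": {
        "min_price": { "min": { "field": "products_price" } },
        "max_price": { "max": { "field": "products_price" } }
      }
    }
  }
}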

Query elasticsearch with multiple numeric ranges

{
"query": {
"filtered": {
"query": {
"match": {
"log_path": "message_notification.log"
}
},
"filter": {
"numeric_range": {
"time_taken": {
"gte": 10
}
}
}
}
},
"aggs": {
"distinct_user_ids": {
"cardinality": {
"field": "user_id"
}
}
}
}
I have to run this query 20 times, as I want to know the notification times above each of the following thresholds: [10, 30, 60, 120, 240, 300, 600, 1200, ...]. Right now, I am running a loop and making 20 queries to fetch this.
Is there a saner way to query Elasticsearch once and get the ranges that fall into these thresholds?
What you probably want is a range aggregation.
Here is a possible query; you can add more ranges or alter them:
{
"size": 0,
"query": {
"match": {
"log_path": "message_notification.log"
}
},
"aggs": {
"intervals": {
"range": {
"field": "time_taken",
"ranges": [
{
"to": 50
},
{
"from": 50,
"to": 100
},
{
"from": 100
}
]
},
"aggs": {
"distinct_user_ids": {
"cardinality": {
"field": "user_id"
}
}
}
}
}
}
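If what you actually need are the cumulative "above each threshold" counts that the original loop of gte queries produced (rather than disjoint intervals), note that the ranges in a range aggregation may overlap and a document is counted in every range it matches, so you can declare one open-ended from range per threshold. A sketch using the first few thresholds from the question:
"aggs": {
  "intervals": {
    "range": {
      "field": "time_taken",
      "ranges": [
        { "from": 10 },
        { "from": 30 },
        { "from": 60 },
        { "from": 120 }
      ]
    },
    "aggs": {
      "distinct_user_ids": {
        "cardinality": {
          "field": "user_id"
        }
      }
    }
  }
}
Each bucket then carries its own distinct_user_ids cardinality, giving the distinct-user count above every threshold in a single request.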
