Query unsold products from ElasticSearch - elasticsearch

I'm new to Elasticsearch and need some help...
I have an index called sales containing records of this type:
{product.id : 12 , sales.datetime : October 12th 2019},
{product.id : 13 , sales.datetime : October 12th 2019},
{product.id : 14 , sales.datetime : October 13th 2019},
{product.id : 14 , sales.datetime : October 14th 2019},
{product.id : 14 , sales.datetime : October 14th 2019},
{product.id : 13 , sales.datetime : October 18th 2019},
...
I would like to retrieve the products that have remained unsold since October 12th 2019 (i.e. products with no sale after that date).
I tried to filter on sales.datetime with:
"range" => [
"datetime" => [
"lt" => "October 12th 2019"
]
]
But obviously, this type of query returns unexpected values. For example, in this case it retrieves product.id : [12,13], but as you can see, product.id 13 was sold again on October 18th 2019...

Mappings:
PUT sales
{
"mappings": {
"properties": {
"id":{
"type": "keyword"
},
"date":{
"type": "date",
"format": "MM-dd-yyyy"
}
}
}
}
Data:
[
{
"_index" : "sales",
"_type" : "_doc",
"_id" : "I-6Y7W0B_-hMjUaqpQH4",
"_score" : 1.0,
"_source" : {
"id" : 12,
"date" : "10-12-2019"
}
},
{
"_index" : "sales",
"_type" : "_doc",
"_id" : "JO6Y7W0B_-hMjUaqtwEL",
"_score" : 1.0,
"_source" : {
"id" : 13,
"date" : "10-12-2019"
}
},
{
"_index" : "sales",
"_type" : "_doc",
"_id" : "Je6Y7W0B_-hMjUaqxgEH",
"_score" : 1.0,
"_source" : {
"id" : 13,
"date" : "10-18-2019"
}
}
]
Query: for each id, get the max date over all of its documents, and the max date over only those documents dated on or before 10-12-2019.
If both values are the same, keep the bucket.
GET sales/_search
{
"size": 0,
"aggs": {
"transactionId": {
"terms": {
"field": "id",
"size": 10000
},
"aggs": {
"maxDate": {
"max": {
"field": "date"
}
},
"pending_status": {
"filter": {
"range": {
"date": {
"lte": "10-12-2019"
}
}
},
"aggs": {
"filtered_maxdate": {
"max": {
"field": "date"
}
}
}
},
"buckets_latest_status_pending": {
"bucket_selector": {
"buckets_path": {
"filtereddate": "pending_status>filtered_maxdate",
"maxDate": "maxDate"
},
"script": "params.filtereddate==params.maxDate"
}
}
}
}
}
}
Response:
[
{
"key" : "12",
"doc_count" : 1,
"pending_status" : {
"doc_count" : 1,
"filtered_maxdate" : {
"value" : 1.5708384E12,
"value_as_string" : "10-12-2019"
}
},
"maxDate" : {
"value" : 1.5708384E12,
"value_as_string" : "10-12-2019"
}
}
]
EDIT 1:
You can use the [top_hits](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-top-hits-aggregation.html) aggregation to get all documents in a bucket
GET sales/_search
{
"size": 0,
"aggs": {
"transactionId": {
"terms": {
"field": "id",
"size": 10000
},
"aggs": {
"maxDate": {
"max": {
"field": "date"
}
},
"pending_status": {
"filter": {
"range": {
"date": {
"lte": "10-12-2019"
}
}
},
"aggs": {
"filtered_maxdate": {
"max": {
"field": "date"
}
}
}
},
"buckets_latest_status_pending": {
"bucket_selector": {
"buckets_path": {
"filtereddate": "pending_status>filtered_maxdate",
"maxDate": "maxDate"
},
"script": "params.filtereddate==params.maxDate"
}
},
"top_hits":{ ---> top hits to get all documents under a bucket
"top_hits": {
"size": 10
}
}
}
}
}
}
For pagination you can use a composite aggregation, or the terms aggregation's "include" option with partitions.

Let us take a step back and understand what we are trying to do.
First, we want to filter the documents. Secondly, we want the filter logic to depend on the values of other documents, which is essentially a self-join scenario. I don't think that is possible with the way your documents are ingested. You can read more on join here.
As the simplest solution, in order to achieve what you are looking for, you would need to change the document structure to something like the one below. Of course, you could also choose to use the nested type, but I think the structure below is a much simpler solution.
Mapping:
PUT sales
{
"mappings": {
"properties": {
"id":{
"type": "keyword"
},
"date":{
"type": "date",
"format": "MM-dd-yyyy"
}
}
}
}
Sample Documents:
POST sales/_doc/1
{
"id" : 12 ,
"date" : ["10-12-2019", "10-18-2019"] <--- Note this
}
POST sales/_doc/2
{
"id" : 13 ,
"date" : ["10-12-2019"]
}
POST sales/_doc/3
{
"id" : 14 ,
"date" : ["10-10-2019", "10-14-2019"]
}
Query:
POST sales/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"date": {
"lte": "10-12-2019"
}
}
}
],
"must_not": [
{
"range": {
"date": {
"gt": "10-12-2019"
}
}
}
]
}
}
}
Note how the query has been constructed in a very simplistic fashion using must and must_not clauses.
Response:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "sales",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"id" : 13,
"date" : [
"10-12-2019"
]
}
}
]
}
}
Note that in the response, you get only the document you are looking for.
Hope that helps!

Related

ElasticSearch aggregation for filtering users who had both events

I've written a query which correctly returns the 6000+ events my users had:
GET /<app_logs-2022.11.23*>/_search
{
"query": {
"bool": {
"should": [
{
"term": {
"context.identity.type": "login"
}
},
{
"term": {
"context.identity.type": "login_error"
}
}
],
"minimum_should_match": 1
}
},
"_source": [
"context.identity.user_id",
"context.identity.type"
],
"size": 3
}
And i get such set of data
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 15,
"successful" : 15,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6001,
"max_score" : 10.722837,
"hits" : [
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "bb469377-0618-49a6-a643-1201dc84c829",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "8f4e82a0-f333-4096-bfb6-767fed924093",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "72562ad0-4f35-4624-8776-8b555dea851e",
"type" : "login_error"
}
}
}
},
{
"_index" : "app_logs-2022.11.23-7",
"_type" : "app",
"_id" : "7090be5a-8b53-4723-a1ac-223476a000f1",
"_score" : 10.722837,
"_source" : {
"context" : {
"identity" : {
"user_id" : "75bcb301-1cee-4b3b-aa1b-adbe4c011388",
"type" : "login_error"
}
}
}
}
]
}
}
But I can't figure out how to get the number of users who had both login and login_error events. I've tried the cardinality aggregation, terms and several more, but all of them just split the types into buckets showing a sum and don't group by user. I want to find how many users had problems first but then managed to log in in the end.
The best I've managed to achieve is to get buckets by user_id and output the cardinality by type for each of them:
"aggs": {
"results": {
"terms": {
"field": "context.identity.user_id",
"size": 300
},
"aggs": {
"events": {
"cardinality": {
"field": "context.identity.type"
}
}
}
}
}
I created an example based on the sentence "I want to find how many users had problems first but then managed to log in in the end."
It works like this:
1. Make aggs based on user_id.
2. Make sub-aggs by type.
3. Ignore the buckets that don't contain login_error.
#put mapping
PUT test_stack_login
{
"mappings": {
"properties": {
"context.identity.user_id": {
"type": "keyword"
},
"context.identity.type": {
"type": "keyword"
}
}
}
}
#put example docs
POST test_stack_login/_bulk?refresh&pretty
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":1,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":2,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":3,"context.identity.type":"login"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login_error"}
{"index":{}}
{"context.identity.user_id":4,"context.identity.type":"login"}
#Run the query
GET test_stack_login/_search
{
"size": 0,
"aggs": {
"NAME": {
"terms": {
"field": "context.identity.user_id",
"size": 1000
},
"aggs": {
"context_identity_type": {
"terms": {
"field": "context.identity.type",
"size": 10
}
},
"login_error_exist": {
"bucket_selector": {
"buckets_path": {
"var1": "context_identity_type['login_error']>_count"
},
"script": "params.var1 != null"
}
}
}
}
}
}
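#the result will look like the buckets shown below
You will get the user_ids containing both login and login_error values in the context.identity.type field. The keys in the response give you the user_ids that have logged in unsuccessfully at least once and successfully at least once. (Note that the bucket_selector itself only checks for the presence of login_error, so this assumes every such user also has a login event, as in the sample data.)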
"buckets" : [
{"key" : "1" ...},
{"key" : "4" ...}
]

global sorting across different buckets after aggregation in elasticsearch

a sample in my document is as shown below.
{"rackName" : "rack005", "roomName" : "roomB", "power" : 132, "timestamp" : 1594540106208}
What I want to do is get the latest data of each rack in a given room and then sort the results by power.
With the code below I got close to my target, but I'm losing my mind over the last step, which is sorting my data across different buckets by the field 'power'.
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"timestamp": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
-----------------------------------result-------------------------------------------------------
"aggregations" : {
"rk_ag" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "rack003",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "0FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack003",
"roomName" : "roomB",
"power" : 115,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack004",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "1FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack004",
"roomName" : "roomB",
"power" : 108,
"timestamp" : 1594540117492
},
"sort" : [
1594540117492
]
}
]
}
}
},
{
"key" : "rack005",
"doc_count" : 4,
"latest" : {
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "power",
"_type" : "_doc",
"_id" : "2FXVQnMB8DPB7H9t6U0E",
"_score" : null,
"_source" : {
"rackName" : "rack005",
"roomName" : "roomB",
"power" : 118,
"timestamp" : 1594540114492
},
"sort" : [
1594540114492
]
}
]
}
}
}
]
}
}
You're sorting by timestamp instead of power. Try this instead:
GET /power/_search
{
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
You can sort by multiple fields too.
Adding to @Joe's answer: as he mentioned, you can use multiple fields in the sort.
Below query would give you what you are looking for:
POST my_rack_index/_search
{
"size": 0,
"query": {
"term": {
"roomName.keyword": {
"value": "roomB"
}
}
},
"aggs": {
"rk_ag": {
"terms": {
"field": "rackName"
},
"aggs": {
"latest": {
"top_hits": {
"sort": [ <---- Note this part
{
"timestamp": {
"order": "desc"
}
},
{
"power": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
So now, for every rack, even if you have two documents with the same rackName and the exact same power, the one with the latest timestamp shows up in the response.
The sort works as follows: documents are first sorted by timestamp, and power is then used only to order documents whose timestamps are equal, so the ordering by timestamp stays intact.

SQl equivalent Correlated query for ElasticSearch Aggregation

I have a use case for writing an aggregation which if written in SQL can be achieved using correlated queries.
I have an index called listings where the properties/columns are ListDate, ListPrice, SoldDate, SoldPrice, OffMarketDate.
ListDate is not nullable, but SoldDate,SoldPrice, OffMarketDate can be nullable.
I want to aggregate stats from the above index based on the following requirement.
I want to have monthly stats, which I see can be achieved with a DateHistogramAggregation.
For each month from the DateHistogramAggregation, I want to find the listings as follows:
Example: For Jan 2019, get all the listings where (ListDate< Feb 1st, 2019) and (SoldDate is null or SoldDate<Jan 1st, 2019) and (OffMarketDate is null or OffMarketDate< Jan 1st, 2019)
Then run the aggregation function for those lists each month.
I appreciate any suggestions to implement this use case. Thanks in advance for the help.
Please see the details below for how you can approach this problem:
Mapping:
PUT listings
{
"mappings": {
"properties": {
"listDate":{
"type": "date"
},
"listPrice":{
"type": "long"
},
"soldDate":{
"type": "date"
},
"soldPrice": {
"type": "long"
},
"offMarketDate": {
"type": "date"
}
}
}
}
Note that I've constructed the above mapping looking at your question.
Sample Documents:
POST listings/_doc/1
{
"listDate": "2020-01-01",
"listPrice": "100.00",
"soldDate": "2019-12-25",
"soldPrice": "120.00",
"offMarketDate": "2019-12-20"
}
POST listings/_doc/2
{
"listDate": "2020-01-01",
"listPrice": "100.00",
"soldDate": "2019-12-24",
"soldPrice": "122.00",
"offMarketDate": "2019-12-20"
}
POST listings/_doc/3
{
"listDate": "2020-01-25",
"listPrice": "120.00",
"soldDate": "2020-01-30",
"soldPrice": "140.00",
"offMarketDate": "2020-01-26"
}
POST listings/_doc/4
{
"listDate": "2020-01-25",
"listPrice": "120.00",
"soldDate": "2020-02-02",
"soldPrice": "135.00",
"offMarketDate": "2020-01-26"
}
POST listings/_doc/5
{
"listDate": "2020-01-25",
"listPrice": "120.00"
}
POST listings/_doc/6
{
"listDate": "2020-02-02",
"listPrice": "120.00"
}
Note how I've not added the soldDate and offMarketDate in the docs 5 and 6 as that would be better option than having it with null value.
Request Query:
So I've come up with the below query for your use-case.
Also for the sake of aggregation, let's say I've calculated the total soldPrice for the docs having
listDate in the month of Jan 2020 AND
(soldDate either null OR soldDate before the month of Jan 2020) AND.
(offMarketDate either null OR offMarketDate before the month of Jan 2020).
Below is the query:
POST listings/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"listDate": {
"gte": "2020-01-01",
"lte": "2020-02-01"
}
}
},
{
"bool": {
"should": [
{
"range": {
"soldDate": {
"lte": "2020-01-01"
}
}
},
{
"bool": {
"must_not": [
{
"exists": {
"field": "soldDate"
}
}
]
}
}
],
"minimum_should_match": 1
}
},
{
"bool": {
"should": [
{
"range": {
"offMarketDate": {
"lte": "2020-01-01"
}
}
},
{
"bool": {
"must_not": [
{
"exists": {
"field": "offMarketDate"
}
}
]
}
}
],
"minimum_should_match": 1
}
}
]
}
},
"aggs": {
"my_histogram": {
"date_histogram": {
"field": "listDate",
"calendar_interval": "month"
},
"aggs": {
"total_sales_price": {
"sum": {
"field": "soldPrice"
}
}
}
}
}
}
The query above is very easily readable and self explanatory. I'd suggest reading about the below different queries which I've made use of:
Bool Query
Range Query
Field Exists Query to verify if field exists or not.
Date Histogram Aggregation
Sum Metrics Aggregation
Response:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 3.0,
"hits" : [
{
"_index" : "listings",
"_type" : "_doc",
"_id" : "1",
"_score" : 3.0,
"_source" : {
"listDate" : "2020-01-01",
"listPrice" : "100.00",
"soldDate" : "2019-12-25",
"soldPrice" : "120.00",
"offMarketDate" : "2019-12-20"
}
},
{
"_index" : "listings",
"_type" : "_doc",
"_id" : "2",
"_score" : 3.0,
"_source" : {
"listDate" : "2020-01-01",
"listPrice" : "100.00",
"soldDate" : "2019-12-24",
"soldPrice" : "122.00",
"offMarketDate" : "2019-12-20"
}
},
{
"_index" : "listings",
"_type" : "_doc",
"_id" : "5",
"_score" : 1.0,
"_source" : {
"listDate" : "2020-01-25",
"listPrice" : "120.00"
}
}
]
},
"aggregations" : {
"my_histogram" : {
"buckets" : [
{
"key_as_string" : "2020-01-01T00:00:00.000Z",
"key" : 1577836800000,
"doc_count" : 3,
"total_sales_price" : {
"value" : 242.0
}
}
]
}
}
}
As expected, documents 1,2 and 5 are showing up with the correct aggregated sum of soldPrice.
Hope that helps!

Elasticsearch filter by multiple fields in an object which is in an array field

The goal is to filter products with multiple prices.
The data looks like this:
{
"name":"a",
"price":[
{
"membershipLevel":"Gold",
"price":"5"
},
{
"membershipLevel":"Silver",
"price":"50"
},
{
"membershipLevel":"Bronze",
"price":"100"
}
]
}
I would like to filter by membershipLevel and price. For example, if I am a silver member and query price range 0-10, the product should not appear, but if I am a gold member, the product "a" should appear. Is this kind of query supported by Elasticsearch?
You need to make use of nested datatype for price and make use of nested query for your use case.
Please see the below mapping, sample document, query and response:
Mapping:
PUT my_price_index
{
"mappings": {
"properties": {
"name":{
"type":"text"
},
"price":{
"type":"nested",
"properties": {
"membershipLevel":{
"type":"keyword"
},
"price":{
"type":"double"
}
}
}
}
}
}
Sample Document:
POST my_price_index/_doc/1
{
"name":"a",
"price":[
{
"membershipLevel":"Gold",
"price":"5"
},
{
"membershipLevel":"Silver",
"price":"50"
},
{
"membershipLevel":"Bronze",
"price":"100"
}
]
}
Query:
POST my_price_index/_search
{
"query": {
"nested": {
"path": "price",
"query": {
"bool": {
"must": [
{
"term": {
"price.membershipLevel": "Gold"
}
},
{
"range": {
"price.price": {
"gte": 0,
"lte": 10
}
}
}
]
}
},
"inner_hits": {} <---- Do note this.
}
}
}
The above query means, I want to return all the documents having price.price range from 0 to 10 and price.membershipLevel as Gold.
Notice that I've made use of inner_hits. The reason is that, even though price is a nested field, ES returns the entire parent document in the response rather than only the nested document to which the query clause applies.
In order to find the exact nested doc that was matched, you need to make use of inner_hits.
Below is how the response would return.
Response:
{
"took" : 128,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9808291,
"hits" : [
{
"_index" : "my_price_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.9808291,
"_source" : {
"name" : "a",
"price" : [
{
"membershipLevel" : "Gold",
"price" : "5"
},
{
"membershipLevel" : "Silver",
"price" : "50"
},
{
"membershipLevel" : "Bronze",
"price" : "100"
}
]
},
"inner_hits" : {
"price" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.9808291,
"hits" : [
{
"_index" : "my_price_index",
"_type" : "_doc",
"_id" : "1",
"_nested" : {
"field" : "price",
"offset" : 0
},
"_score" : 1.9808291,
"_source" : {
"membershipLevel" : "Gold",
"price" : "5"
}
}
]
}
}
}
}
]
}
}
Hope this helps!
Let me show you how to do it using nested fields together with the query and filter contexts. I will take your example to show you how to define the index mapping, index sample documents, and write the search query.
It's important to note the include_in_parent param in the Elasticsearch mapping, which allows us to query these nested fields without using a nested query.
Please refer to the Elasticsearch documentation about it:
If true, all fields in the nested object are also added to the parent
document as standard (flat) fields. Defaults to false.
Index Def
{
"mappings": {
"properties": {
"product": {
"type": "nested",
"include_in_parent": true
}
}
}
}
Index sample docs
{
"product": {
"price" : 5,
"membershipLevel" : "Gold"
}
}
{
"product": {
"price" : 50,
"membershipLevel" : "Silver"
}
}
{
"product": {
"price" : 100,
"membershipLevel" : "Bronze"
}
}
Search query to show Gold with price range 0-10
{
"query": {
"bool": {
"must": [
{
"match": {
"product.membershipLevel": "Gold"
}
}
],
"filter": [
{
"range": {
"product.price": {
"gte": 0,
"lte" : 10
}
}
}
]
}
}
}
Result
"hits": [
{
"_index": "so-60620921-nested",
"_type": "_doc",
"_id": "1",
"_score": 1.0296195,
"_source": {
"product": {
"price": 5,
"membershipLevel": "Gold"
}
}
}
]
Search query to exclude Silver, with same price range
{
"query": {
"bool": {
"must": [
{
"match": {
"product.membershipLevel": "Silver"
}
}
],
"filter": [
{
"range": {
"product.price": {
"gte": 0,
"lte" : 10
}
}
}
]
}
}
}
Above query doesn't return any result as there isn't any matching result.
P.S :- this SO answer might help you to understand nested fields and query on them in detail.
You have to use nested fields and a nested query to achieve this: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-nested-query.html
Define your price property with type "nested" and then you will be able to filter by every property of the nested object.

elasticsearch groupby and filter by regex condition

It's a bit hard for me to define the question as I'm not very experienced with Elasticsearch. I'm focusing the question on my specific problem:
Assuming I have the following records:
{
id: 1
name: bla1_1.aaa
},
{
id: 1
name: bla1_2.bbb
},
{
id: 2
name: bla2_1.aaa
},
{
id: 2
name: bla2_2.aaa
}
What I want is to GET all the ids that have all of their names ending with aaa.
I was thinking about group by id and then do a regex query like so: *\.aaa so that all the name must satisfy the regex query.
On this particular example I would get id: 2 back.
How do I do it?
Let me know if there's anything I need to add to clarify the question.
A regexp query can be used.
The pattern .* matches any character any number of times, including zero.
A terms aggregation will give you the unique "ids" and the number of docs under each of them.
Mapping :
PUT regex
{
"mappings": {
"properties": {
"id":{
"type":"integer"
},
"name":{
"type":"text",
"fields": {
"keyword":{
"type":"keyword"
}
}
}
}
}
}
Data:
"hits" : [
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "olQXjW0BywGFQhV7k84P",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "o1QXjW0BywGFQhV7us6B",
"_score" : 1.0,
"_source" : {
"id" : 1,
"name" : "bla1_2.bbb"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pFQXjW0BywGFQhV77c6J",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_1.aaa"
}
},
{
"_index" : "regex",
"_type" : "_doc",
"_id" : "pVQYjW0BywGFQhV7Dc6F",
"_score" : 1.0,
"_source" : {
"id" : 2,
"name" : "bla2_2.aaa"
}
}
]
Query:
GET regex/_search
{
"size":0,
"query": {
"regexp": {
"name.keyword": {
"value": ".*.aaa" ---> name ending with .aaa
}
}
},
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
}
}
}
}
Result:
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"unique_ids" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 2, ---> 2 doc under id 2
"doc_count" : 2
},
{
"key" : 1, ----> 1 doc under id 1
"doc_count" : 1
}
]
}
}
Edit:
Use a bucket selector to keep only the buckets where the total count of docs for an id equals the count of docs for that id matching the regex:
GET regex/_search
{
"size": 0,
"aggs": {
"unique_ids": {
"terms": {
"field": "id",
"size": 10
},
"aggs": {
"totalCount": { ---> to get total count of id(all docs)
"value_count": {
"field": "id"
}
},
"filter_agg": {
"filter": {
"bool": {
"must": [
{
"regexp": {
"name.keyword": ".*.aaa"
}
}
]
}
},
"aggs": {
"finalCount": { -->total count of docs matching regex
"value_count": {
"field": "id"
}
}
}
},
"mybucket_selector": { ---> include buckets where totalcount==finalcount
"bucket_selector": {
"buckets_path": {
"FinalCount": "filter_agg>finalCount",
"TotalCount": "totalCount"
},
"script": "params.FinalCount==params.TotalCount"
}
}
}
}
}
}

Resources