Elasticsearch cardinality multiple fields

Elasticsearch cardinality multiple fields - elasticsearch

I have an index which is as follows:
{
"_index" : "r2332",
"_type" : "_doc",
"_id" : "Vl81o3oBs8vUbHSMCZVQ",
"_score" : 1.0,
"_source" : {
"maid" : "d8ee3c5e-babb-4777-9cba-17fb0cd8e8a9",
"date" : "2021-06-09",
"hour" : 5,
"site_id" : 1035
}
},
{
"_index" : "r2332",
"_type" : "_doc",
"_id" : "Xl81o3oBs8vUbHSMCZVQ",
"_score" : 1.0,
"_source" : {
"maid" : "d8ee3c5e-babb-4777-9cba-17fb0cd8e8a9",
"date" : "2021-06-09",
"hour" : 5,
"site_id" : 1897
}
}
I am trying to get the unique count across maid, date. I am able to aggregate with one field maid but not both. The following are the codes that I tried.
Trial 1:
{
"size": 0,
"query": {
"bool": {
"filter": [{
"terms": {
"site_id": [7560, 7566]
}
}, {
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}]
}
},
"runtime_mappings": {
"type_and_promoted": {
"type": "keyword",
"script": "emit(doc['maid'].value + ' ' + doc['date'].value)"
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"reverse_nested": {},
"uv": {
"cardinality": {
"field": "runtime_mappings"
}
}
}
}
}
}
}
}
}
}
This is giving an empty output.
Trial 2:
{
"size": 0,
"query": {
"bool": {
"filter": [{
"terms": {
"site_id": [7560, 7566]
}
}, {
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"uv": {
"cardinality": {
"script": "doc['maid'].value + '#' +doc'date'].value"
}
}
}
}
}
}
}
}
}
}
This gives me syntax error at doc['maid'].value. How do I effectively combine two fields for cardinality. I am using Elasticsearch 7.13.2.
The mapping of my index is as follows:
"r2332" : {
"mappings" : {
"dynamic" : "false",
"properties" : {
"date" : {
"type" : "date"
},
"hour" : {
"type" : "integer"
},
"maid" : {
"type" : "keyword"
},
"reach" : {
"type" : "integer"
},
"site_id" : {
"type" : "keyword"
}
}
}
}
}

Modify your second query as shown below. You need to use the maid.keyword instead of the maid field in the cardinality aggregation to avoid the search_phase_execution_exception
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"terms": {
"site_id": [
7560,
7566
]
}
},
{
"range": {
"date": {
"gte": "2021-09-01",
"lte": "2021-09-15"
}
}
}
]
}
},
"aggs": {
"group_by": {
"terms": {
"field": "site_id",
"size": 100
},
"aggs": {
"bydate": {
"terms": {
"field": "date",
"size": 100
},
"aggs": {
"byhour": {
"terms": {
"field": "hour",
"size": 24
},
"aggs": {
"uv": {
"cardinality": {
"script": "doc['maid.keyword'].value + '#' +doc['date'].value"
}
}
}
}
}
}
}
}
}
}

Related

Sorting & querying a query based on a date field that may not exist

Requirement: I want to execute a query & sort on a date field that may not exist. The records date field doesn't exist should all be included first then the records that's date field value is less then 1600230168278 will be included there after. Order will be first those records that doesn't exist date field then date ascending
Mapping & sample data:
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"date": {
"type": "date"
},
"name": {
"type": "text"
}
}
}
}
}
PUT my_index/_doc/1
{
"date": 1546300800000
}
PUT my_index/_doc/2
{
"date": 1577836800000
}
PUT my_index/_doc/3
{
"date": 1609459200000
}
PUT my_index/_doc/4
{
"name": "Arif Mahmud Rana"
}
My Query:
{
"query": {
"bool": {
"must": {
"function_score": {
"functions": [
{
"filter": {
"exists": {
"field": "date"
}
},
"weight": 0.5
}
],
"query": {
"match_all": {}
}
}
},
"filter": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"bool": {
"must": [
{
"exists": {
"field": "date"
}
},
{
"range": {
"date": {
"lt": 1600230168278
}
}
}
]
}
},
{
"bool": {
"must_not": {
"exists": {
"field": "date"
}
}
}
}
]
}
}
}
},
"sort": [
{
"_score": "desc"
},
{
"date": "asc"
}
],
"size": 100
}
Result of query:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : null,
"hits" : [
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "4",
"_score" : 1.0,
"_source" : {
"name" : "Arif Mahmud Rana"
},
"sort" : [
1.0,
9223372036854775807
]
},
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.5,
"_source" : {
"date" : 1546300800000
},
"sort" : [
0.5,
1546300800000
]
},
{
"_index" : "my_index",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.5,
"_source" : {
"date" : 1577836800000
},
"sort" : [
0.5,
1577836800000
]
}
]
}
}
This works fine for this simple index with less data but when working on large index my elastic node goes down.
Elastic version: 6.8.5
Actual index: 3048140(docs.count), 1073559(docs.deleted), 1.3gb(store.size) & 1.3gb(pri.store.size)
Any help or idea will be great TIA.

I believe custom scoring on all docs not having date field in large index causing the problem.
Here's a way it could be done to achieve your usecase using missing to define sort criteria for docs with missing sorting field.
GET test/_search
{"query":{"match_all":{}}}
PUT /test
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"age": { "type": "integer" }
}
}
}
POST test/_doc
{
"name": "shahin",
"age": 234
}
POST test/_doc
{
"name": "karim",
"age": 235
}
POST test/_doc
{
"name": "rahim"
}
POST test/_search
{
"query": {
"bool": {
"should": [
{
"bool": {
"must":
{
"range": {
"age": {
"lt": 250
}
}
}
}
},
{
"bool": {
"must_not": {
"exists": {
"field": "age"
}
}
}
}
]
}
},
"sort": [
{ "age" : {"missing" : "_first", "order": "asc"}}
],
"size": 100
}

I added some optimization that may help other. I was indexing my production index from this index. I had to search/query then loop over those data & index in my production index. Here is my production query.
GET /my_index/_search?filter_path=hits.hits._id,hits.hits._source
{
"query": {
"bool": {
"filter": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"range": {
"lastModified": {
"lte": 1600314822988
}
}
},
{
"bool": {
"must_not": {
"exists": {
"field": "lastModified"
}
}
}
}
]
}
}
}
},
"sort": [
{
"indexed": {
"order": "asc",
"missing": "_first"
}
},
{
"lastModified": {
"order": "asc",
"missing": "_first"
}
}
],
"size": 100
}
I used filter over should as my query doesn't need score on matched items. Also I used filter_path to get only required fields. After adding this optimizations my query was at least 4 seconds faster.

How to do a range aggregation in elasticsearch

enter image description here
When aggregating by the field userguid
{
"_source": false,
"aggregations": {
"range_userGuid": {
"terms": {
"field": "userGuid"
}
}
}
}
I get the result
"aggregations" : {
"range_userGuid" : {
"doc_count_error_upper_bound" : 151,
"sum_other_doc_count" : 2424145,
"buckets" : [
{
"key" : 803100110976,
"doc_count" : 1
},
{
"key" : 813110447915,
"doc_count" : 10
},
{
"key" : 803100110306,
"doc_count" : 101
},
{
"key" : 2123312,
"doc_count" : 300
},
{
"key" : 3452342,
"doc_count" : 9999
},
]
}
}
Now I want to get the range from the aggs result. For example (0-100],(100-1000],>1000, and get the count of users. The expect result:
[
{
"from": 0,
"to": 100,
"count": 2 <---- 2 users, 803100110976 and 813110447915
},
{
"from": 100,
"to": "1000",
"count": 2 <---- 803100110306 and 2123312
},
{
"from": 1001,
"count": 1 <---- 3452342
}
]
The bucket size of aggs about 150000, how do I write such query?

You can use the range aggregation to achieve what you expect:
POST /test/_search
{
"size": 0,
"aggs": {
"range_userGuid": {
"range": {
"field": "userGuid",
"ranges": [
{
"from": 0,
"to": 100
},
{
"from": 100,
"to": 200
},
{
"from": 200,
"to": 1000
},
{
"from": 1000
}
]
}
}
}
}
UPDATE: Adapting this answer to your need:
POST index/_search
{
"size": 0,
"aggs": {
"users_0_100": {
"terms": {
"field": "userGuid",
"size": 1000
},
"aggs": {
"0_100": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount < 100"
}
}
}
},
"users_100_200": {
"terms": {
"field": "userGuid",
"size": 1000
},
"aggs": {
"100_200": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 100 && params.docCount < 200"
}
}
}
},
"users_200_1000": {
"terms": {
"field": "userGuid",
"size": 1000
},
"aggs": {
"200_1000": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 200 && params.docCount < 1000"
}
}
}
},
"users_1000": {
"terms": {
"field": "userGuid",
"size": 1000
},
"aggs": {
"1000": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount >= 1000"
}
}
}
}
}
}

group by nested and non nested fields in es

Hi i am trying to do group by nested and non nested fields.I want to do group by on 1 non nested fields(from_district) ,1 nested field(truck_number) and max on nested field(truck_number.score).
Requirement -: to get max score of each truck in all districts if truck is present in that district for a given sp_id
eg-:
District1 ,truck1, 0.9,
District2 ,truck1, 0.8,
District1 ,truck2, 1.8,
District2 ,truck3, 0.7,
District3 ,truck4, 1.7
Below is my mapping
{
"sp_ranked_indent" : {
"mappings" : {
"properties" : {
"from_district" : {
"type" : "keyword"
},
"sp_id" : {
"type" : "long"
},
"to_district" : {
"type" : "keyword"
},
"truck_ranking_document" : {
"type" : "nested",
"properties" : {
"score" : {
"type" : "float"
},
"truck_number" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
}
}
}
}
}
Below is the query that i tried but it is not grouping by nested and non nested field and also the max truck score is incorrect
{
"size": 0,
"query": {
"terms": {
"sp_id": [650128],
"boost": 1.0
}
},
"aggregations": {
"NESTED_AGG": {
"nested": {
"path": "truck_ranking_document"
},
"aggregations": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"truck_numer": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
},
"fromdistrictagg": {
"reverse_nested": {},
"aggregations": {
"fromDistrict": {
"terms": {
"field": "from_district",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [{
"_count": "desc"
}, {
"_key": "asc"
}]
}
}
}
}
}
}
}
}

I think this can be done using term and nested aggregation. Below query will produce output in follwing format
District1
Truck1
Max score
Truck2
Max score
Truck3
Max score
District2
Truck1
Max score
Truck2
Max score
Truck3
Max score
Query:
{
"query": {
"terms": {
"sp_id": [
1
]
}
},
"aggs": {
"district": {
"terms": {
"field": "from_district",
"size": 10
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
},
"min_bucket_selector": {
"bucket_selector": {
"buckets_path": {
"count": "trucks>truck_no._bucket_count"
},
"script": {
"inline": "params.count != 0"
}
}
}
}
}
}
}
Result:
"aggregations" : {
"district" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "District1",
"doc_count" : 1,
"trucks" : {
"doc_count" : 2,
"truck_no" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1",
"doc_count" : 1,
"max_score" : {
"value" : 2.0
}
},
{
"key" : "3",
"doc_count" : 1,
"max_score" : {
"value" : 3.0
}
}
]
}
}
}
]
}
Composite Aggregation
Composite aggregation response contains an after_key
"after_key" : {
"district" : "District4"
}
you need to use the after parameter to retrieve the next results
{
"aggs": {
"my_buckets": {
"composite": {
"size": 100,
"sources": [
{
"district": {
"terms": {
"field": "from_district"
}
}
}
]
},
"aggs": {
"trucks": {
"nested": {
"path": "truck_ranking_document"
},
"aggs": {
"truck_no": {
"terms": {
"field": "truck_ranking_document.truck_number.keyword",
"size": 10
},
"aggs": {
"max_score": {
"max": {
"field": "truck_ranking_document.score"
}
},
"select": {
"bucket_selector": {
"buckets_path": {
"score": "max_score"
},
"script": "if(params.score>0) return true;"
}
}
}
}
}
}
}
}
}
}

How to do two nested aggregations in elasticsearch?

City and home type are two nested objects in the following document mapping:
"mappings" : {
"home_index_doc" : {
"properties" : {
"city" : {
"type" : "nested",
"properties" : {
"country" : {
"type" : "nested",
"properties" : {
"name" : {
"type" : "text"
}
}
},
"name" : {
"type" : "keyword"
}
}
},
"home_type" : {
"type" : "nested",
"properties" : {
"name" : {
"type" : "keyword"
}
}
},
...
}
}
}
I am trying to do the following aggregation:
Take all present documents and show all home_types per city.
I imagine it should look similar to:
"aggregations": {
"all_cities": {
"buckets": [
{
"key": "Tokyo",
"doc_count": 12,
"home_types": {
"buckets": [
{
"key": "apartment",
"doc_count": 5
},
{
"key": "house",
"doc_count": 12
}
]
}
},
{
"key": "New York",
"doc_count": 1,
"home_types": {
"buckets": [
{
"key": "house",
"doc_count": 1
}
]
}
}
]
}
}
After trying gazzilion aproaches and combinations, I've made it that far with Kibana:
GET home-index/home_index_doc/_search
{
"size": 0,
"aggs": {
"all_cities": {
"nested": {
"path": "city"
},
"aggs": {
"city_name": {
"terms": {
"field": "city.name"
}
}
}
},
"aggs": {
"all_home_types": {
"nested": {
"path": "home_type"
},
"aggs": {
"home_type_name": {
"terms": {
"field": "home_type.name"
}
}
}
}
}
}
}
and I get the following exception:
"type": "unknown_named_object_exception",
"reason": "Unknown BaseAggregationBuilder [all_home_types]",

You need to use reverse_nested in order to jump out of the city nested type back at the root level and do another nested aggregation for the home_type nested type. Basically, like this:
{
"size": 0,
"aggs": {
"all_cities": {
"nested": {
"path": "city"
},
"aggs": {
"city_name": {
"terms": {
"field": "city.name"
},
"aggs": {
"by_home_types": {
"reverse_nested": {},
"aggs": {
"all_home_types": {
"nested": {
"path": "home_type"
},
"aggs": {
"home_type_name": {
"terms": {
"field": "home_type.name"
}
}
}
}
}
}
}
}
}
}
}
}

Elasticsearch 2.4 - Combining Date Range Filter with Date Histogram

I have query :
{
"size": 0,
"aggs": {
"data_bulanan" : {
"date_histogram" : {
"field" : "creation_date",
"interval" : "month",
"format": "MMMM"
},
"aggs": {
"SvcCharge" : {
"sum": {
"field": "service_tax"
}
},
"GvtTax" : {
"sum": {
"field": "government_tax"
}
},
"Discount" : {
"sum" : {
"field": "discount"
}
}
}
}
}
}
And i want to add the date range from date to date . for example from 2014-01-01 to 2015-01-01 .
How to add it?
thank you

If you'd like to limit the resultset to the specified dates above, you need to combine a query to your aggregation, you can do it like so
{
"size": 0,
"query": {
"bool": {
"filter": {
"range": {
"creation_date": {
"gte": "2014-01-01",
"lte": "2015-01-01"
}
}
}
}
},
"aggs": {
"data_bulanan": {
"date_histogram": {
"field": "creation_date",
"interval": "month",
"format": "MMMM"
},
"aggs": {
"SvcCharge": {
"sum": {
"field": "service_tax"
}
},
"GvtTax": {
"sum": {
"field": "government_tax"
}
},
"Discount": {
"sum": {
"field": "discount"
}
}
}
}
}
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Elasticsearch cardinality multiple fields - elasticsearch

Related

Sorting & querying a query based on a date field that may not exist

How to do a range aggregation in elasticsearch

group by nested and non nested fields in es

How to do two nested aggregations in elasticsearch?

Elasticsearch 2.4 - Combining Date Range Filter with Date Histogram

Categories

Resources