Elasticsearch Painless query exception

I am using Elasticsearch 6.2, which uses Painless for inline scripting. One of the fields in my doc has the mapping:
"gap_days": {"type": "integer"}
I have a Painless script for search, and the relevant lines are:
int gap = 10; // initialize to a default value
if (doc.containsKey('gap_days')) {
    if (doc['gap_days'].value != null) {
        gap = doc['gap_days'].value;
    }
}
But this keeps throwing an error:
script_stack: [
  "gap = doc['gap_days'].value; } } ",
  " ^---- HERE"
],
caused_by: {
  reason: "cannot convert MethodHandle(Longs)long to (Object)int",
  type: "wrong_method_type_exception"
},
reason: "runtime error"
I looked at all the unique doc['gap_days'] values in the index, and as you can see, all of them are integers in all documents:
"aggregations": {
"uniq_gaps": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 342,
"buckets": [
{
"key": 0,
"doc_count": 31607
},
{
"key": 365,
"doc_count": 15119
},
{
"key": 5,
"doc_count": 2639
},
{
"key": 21,
"doc_count": 1784
},
{
"key": 14,
"doc_count": 1229
},
{
"key": 3,
"doc_count": 1073
},
{
"key": 7,
"doc_count": 979
},
{
"key": 2,
"doc_count": 728
},
{
"key": 4,
"doc_count": 291
},
{
"key": 10,
"doc_count": 170
}
]
}
}
So why does it throw an exception saying cannot convert MethodHandle(Longs)long to (Object)int, and why does my script stop working? Any idea how to fix this problem?
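The error message itself points at the likely cause: in Painless, doc values for an integer-mapped field are exposed as long, so assigning doc['gap_days'].value to an int local is a narrowing conversion that fails at runtime. A minimal sketch of a possible fix, declaring the variable as long (or casting explicitly); note too that .value on a numeric field does not return null for missing values, so .size() is the usual emptiness check:
long gap = 10; // default value; long matches what integer doc values return
if (doc.containsKey('gap_days') && doc['gap_days'].size() != 0) {
    gap = doc['gap_days'].value;
}
// Or, to keep an int, cast explicitly:
// int gap = (int) doc['gap_days'].value;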

Related

NEST aggregation results are null, however the data is there in the debugger

I'm working on aggregations in NEST. So far everything has worked well, but now, when I try to access nested fields through .children, the result is null, even though the debugger shows the data correctly.
If I POST this query through Postman, I get the following results:
{
"took": 50,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 9,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"filter#CollarSize": {
"meta": {},
"doc_count": 9,
"nested#VariantsProperties": {
"doc_count": 53,
"sterms#CollarSize": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "CollarSize",
"doc_count": 39,
"sterms#banana": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "15",
"doc_count": 7
},
{
"key": "16",
"doc_count": 7
},
{
"key": "17",
"doc_count": 6
},
{
"key": "18",
"doc_count": 6
},
{
"key": "LAR",
"doc_count": 2
},
{
"key": "MED",
"doc_count": 2
},
{
"key": "SML",
"doc_count": 2
},
{
"key": "X.L",
"doc_count": 2
},
{
"key": "XXL",
"doc_count": 2
},
{
"key": "15.5",
"doc_count": 1
},
{
"key": "16.5",
"doc_count": 1
},
{
"key": "XXXL",
"doc_count": 1
}
]
}
},
{
"key": "Colour",
"doc_count": 14,
"sterms#banana": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Blue",
"doc_count": 7
},
{
"key": "White",
"doc_count": 7
}
]
}
}
]
}
},
"sterms#CollarSize": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
}
Is there a way to get inside the child "CollarSize"? I've tried different combinations of .nested, .children, .terms, and .filter, but none of them seems to work.
You can get the "CollarSize" terms, and the "banana" terms for each, with:
var response = client.Search<object>(/** your query here **/);
var collarSizeSignificantTermsAgg = response.Aggregations.Filter("CollarSize").Nested("VariantsProperties").Terms("CollarSize");
foreach (var bucket in collarSizeSignificantTermsAgg.Buckets)
{
    Console.WriteLine(bucket.Key);
    var bananaSigTerms = bucket.Terms("banana");
    foreach (var subBucket in bananaSigTerms.Buckets)
    {
        Console.WriteLine($"key: {subBucket.Key}, doc_count: {subBucket.DocCount}");
    }
}
which prints
CollarSize
key: 15, doc_count: 7
key: 16, doc_count: 7
key: 17, doc_count: 6
key: 18, doc_count: 6
key: LAR, doc_count: 2
key: MED, doc_count: 2
key: SML, doc_count: 2
key: X.L, doc_count: 2
key: XXL, doc_count: 2
key: 15.5, doc_count: 1
key: 16.5, doc_count: 1
key: XXXL, doc_count: 1
Colour
key: Blue, doc_count: 7
key: White, doc_count: 7
Here's a full example, using InMemoryConnection to stub the response:
// Requires: using System; using System.Text; using Elasticsearch.Net; using Nest;
private static void Main()
{
    var defaultIndex = "my_index";
    var pool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
    var json = @"{
""took"": 50,
""timed_out"": false,
""_shards"": {
""total"": 1,
""successful"": 1,
""skipped"": 0,
""failed"": 0
},
""hits"": {
""total"": {
""value"": 9,
""relation"": ""eq""
},
""max_score"": null,
""hits"": []
},
""aggregations"": {
""filter#CollarSize"": {
""meta"": { },
""doc_count"": 9,
""nested#VariantsProperties"": {
""doc_count"": 53,
""sterms#CollarSize"": {
""doc_count_error_upper_bound"": 0,
""sum_other_doc_count"": 0,
""buckets"": [
{
""key"": ""CollarSize"",
""doc_count"": 39,
""sterms#banana"": {
""doc_count_error_upper_bound"": 0,
""sum_other_doc_count"": 0,
""buckets"": [
{
""key"": ""15"",
""doc_count"": 7
},
{
""key"": ""16"",
""doc_count"": 7
},
{
""key"": ""17"",
""doc_count"": 6
},
{
""key"": ""18"",
""doc_count"": 6
},
{
""key"": ""LAR"",
""doc_count"": 2
},
{
""key"": ""MED"",
""doc_count"": 2
},
{
""key"": ""SML"",
""doc_count"": 2
},
{
""key"": ""X.L"",
""doc_count"": 2
},
{
""key"": ""XXL"",
""doc_count"": 2
},
{
""key"": ""15.5"",
""doc_count"": 1
},
{
""key"": ""16.5"",
""doc_count"": 1
},
{
""key"": ""XXXL"",
""doc_count"": 1
}
]
}
},
{
""key"": ""Colour"",
""doc_count"": 14,
""sterms#banana"": {
""doc_count_error_upper_bound"": 0,
""sum_other_doc_count"": 0,
""buckets"": [
{
""key"": ""Blue"",
""doc_count"": 7
},
{
""key"": ""White"",
""doc_count"": 7
}
]
}
}
]
}
},
""sterms#CollarSize"": {
""doc_count_error_upper_bound"": 0,
""sum_other_doc_count"": 0,
""buckets"": []
}
}
}
}
";
    var settings = new ConnectionSettings(pool, new InMemoryConnection(Encoding.UTF8.GetBytes(json)))
        .DefaultIndex(defaultIndex);
    var client = new ElasticClient(settings);
    var response = client.Search<object>(s => s);
    var collarSizeSignificantTermsAgg = response.Aggregations.Filter("CollarSize").Nested("VariantsProperties").Terms("CollarSize");
    foreach (var bucket in collarSizeSignificantTermsAgg.Buckets)
    {
        Console.WriteLine(bucket.Key);
        var bananaSigTerms = bucket.Terms("banana");
        foreach (var subBucket in bananaSigTerms.Buckets)
        {
            Console.WriteLine($"key: {subBucket.Key}, doc_count: {subBucket.DocCount}");
        }
    }
}

Power Query Convert Records in List to Columns

I have a list of JSON documents that I'm trying to convert into rows in Power Query. I'm struggling, though, as the values I need are records, in a list, inside a column of the record. Anything that starts getting close to what I need gets horrendously complex 😕
A single record looks like this:
{
"key_as_string": "2020-02-25T23:00:00.000Z",
"key": 1582671600000,
"doc_count": 1086187,
"attack_types": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "attack-sqli",
"doc_count": 380989
},
{
"key": "attack-protocol",
"doc_count": 8195
},
{
"key": "attack-xss",
"doc_count": 1216
},
{
"key": "attack-rce",
"doc_count": 258
},
{
"key": "attack-disclosure",
"doc_count": 157
},
{
"key": "attack-lfi",
"doc_count": 24
},
{
"key": "attack-generic",
"doc_count": 17
},
{
"key": "attack-rfi",
"doc_count": 2
}
]
}
}
And I'm trying to turn it into this:
The 2nd row shown here is just an example of what a 2nd record would look like, for clarity.
Any help is greatly appreciated!
I started with this JSON, saved in a file called test.json, to ensure I would see how things worked with two records:
[{
"key_as_string": "2020-02-25T23:00:00.000Z",
"key": 1582671600000,
"doc_count": 1086187,
"attack_types": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "attack-sqli",
"doc_count": 380989
},
{
"key": "attack-protocol",
"doc_count": 8195
},
{
"key": "attack-xss",
"doc_count": 1216
},
{
"key": "attack-rce",
"doc_count": 258
},
{
"key": "attack-disclosure",
"doc_count": 157
},
{
"key": "attack-lfi",
"doc_count": 24
},
{
"key": "attack-generic",
"doc_count": 17
},
{
"key": "attack-rfi",
"doc_count": 2
}
]
}
},
{
"key_as_string": "2020-02-25T22:00:00.000Z",
"key": 158267000000,
"doc_count": 1086186,
"attack_types": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "attack-sqli",
"doc_count": 384419
},
{
"key": "attack-protocol",
"doc_count": 2046
},
{
"key": "attack-xss",
"doc_count": 1504
},
{
"key": "attack-rce",
"doc_count": 198
},
{
"key": "attack-disclosure",
"doc_count": 120
},
{
"key": "attack-lfi",
"doc_count": 16
},
{
"key": "attack-generic",
"doc_count": 200
},
{
"key": "attack-rfi",
"doc_count": 2
}
]
}
}]
Then I used the GUI to derive this M code, which seems to work:
let
    Source = Json.Document(File.Contents("MYFILEPATH\test.json")),
    #"Converted to Table" = Table.FromList(Source, Splitter.SplitByNothing(), null, null, ExtraValues.Error),
    #"Expanded Column1" = Table.ExpandRecordColumn(#"Converted to Table", "Column1", {"key_as_string", "key", "doc_count", "attack_types"}, {"key_as_string", "key", "doc_count", "attack_types"}),
    #"Expanded attack_types" = Table.ExpandRecordColumn(#"Expanded Column1", "attack_types", {"doc_count_error_upper_bound", "sum_other_doc_count", "buckets"}, {"doc_count_error_upper_bound", "sum_other_doc_count", "buckets"}),
    #"Expanded buckets" = Table.ExpandListColumn(#"Expanded attack_types", "buckets"),
    #"Expanded buckets1" = Table.ExpandRecordColumn(#"Expanded buckets", "buckets", {"key", "doc_count"}, {"key.1", "doc_count.1"}),
    #"Pivoted Column" = Table.Pivot(#"Expanded buckets1", List.Distinct(#"Expanded buckets1"[key.1]), "key.1", "doc_count.1"),
    #"Removed Other Columns" = Table.SelectColumns(#"Pivoted Column",{"key", "attack-sqli", "attack-protocol", "attack-xss", "attack-rce", "attack-disclosure", "attack-lfi", "attack-generic", "attack-rfi"}),
    #"Sorted Rows" = Table.Sort(#"Removed Other Columns",{{"key", Order.Descending}})
in
    #"Sorted Rows"
Just cut and paste the above M code into your Advanced Editor, replacing MYFILEPATH with your file path.
I got this result:

Histogram aggregation OR something else?

Which aggregation should I use when I want the same functionality as histogram, but specifying only the number of buckets instead of the interval?
Something like: give me aggs for price, split into 5 buckets...
I don't want to run a min+max aggregation and calculate the 5 intervals myself before sending my query, because that means one extra round trip to the server: first ask for min+max, then send the actual query.
STANDARD HISTOGRAM AGGS QUERY:
"aggs":{
"prices":{
"histogram": {
"field": "variants.priceVat.d1",
"interval": 500
}
}
}
STANDARD RESULT (min 10, max 850 = 2 buckets, because interval is 500):
"prices": {
"doc_count": 67,
"prices": {
"buckets": [
{
"key": 10,
"doc_count": 56
},
{
"key": 500,
"doc_count": 13
}
]
}
}
WHAT I WANT (five buckets with the range computed automatically: min 10, max 850 = a bucket interval of 168):
"prices": {
"doc_count": 67,
"prices":{
"buckets": [
{
"key": 10,
"doc_count": 42
},
{
"key": 178,
"doc_count": 10
},
{
"key": 346,
"doc_count": 4
},
{
"key": 514,
"doc_count": 7
},
{
"key": 682,
"doc_count": 2
}
]
}
}
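Later Elasticsearch releases (7.9+) added a variable_width_histogram aggregation that takes a target bucket count instead of an interval. Assuming such a version, a sketch would be (bucket bounds are chosen dynamically by clustering, so they won't be exactly even 168-wide steps):
"aggs": {
  "prices": {
    "variable_width_histogram": {
      "field": "variants.priceVat.d1",
      "buckets": 5
    }
  }
}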

How to use aggregations with Elastic Search

I'm using Elasticsearch to create a search filter, and I need to find all the distinct values saved in the "cambio" field of the database.
The values are saved as follows: "Manual de 5 marchas" or "Manual de 6 marchas"....
I created this query to return all saved values:
GET /crawler10/crawler-vehicles10/_search
{
  "size": 0,
  "aggregations": {
    "my_agg": {
      "terms": {
        "field": "cambio"
      }
    }
  }
}
But when I run it, the returned values look like this:
"aggregations": {
"my_agg": {
"doc_count_error_upper_bound": 2,
"sum_other_doc_count": 2613,
"buckets": [
{
"key": "de",
"doc_count": 2755
},
{
"key": "marchas",
"doc_count": 2714
},
{
"key": "manual",
"doc_count": 2222
},
{
"key": "modo",
"doc_count": 1097
},
{
"key": "5",
"doc_count": 1071
},
{
"key": "d",
"doc_count": 1002
},
{
"key": "n",
"doc_count": 1002
},
{
"key": "automática",
"doc_count": 935
},
{
"key": "com",
"doc_count": 919
},
{
"key": "6",
"doc_count": 698
}
]
}
}
Aggregations are based on the mapping type of the saved field. The field type for cambio seems to be analyzed (the default for string fields), so the terms aggregation returns individual tokens rather than whole values. Please create an index where the cambio field is mapped as not_analyzed.
You can create the index with a PUT request as below (if your ES version is less than 5), and then you will need to re-index your data into the crawler10 index.
PUT crawler10
{
  "mappings": {
    "crawler-vehicles10": {
      "properties": {
        "cambio": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}
For ES v5 or greater:
PUT crawler10
{
  "mappings": {
    "crawler-vehicles10": {
      "properties": {
        "cambio": {
          "type": "keyword"
        }
      }
    }
  }
}
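As an aside (this assumes the index was created with dynamic mapping on ES 5+): dynamically mapped strings get a keyword sub-field by default, so you may be able to aggregate on the existing sub-field without re-indexing at all:
"aggregations": {
  "my_agg": {
    "terms": {
      "field": "cambio.keyword"
    }
  }
}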

ElasticSearch - Get Statistics on Aggregation results

I have the following simple aggregation:
GET index1/type1/_search
{
  "size": 0,
  "aggs": {
    "incidentID": {
      "terms": {
        "field": "incidentID",
        "size": 5
      }
    }
  }
}
Results are:
"aggregations": {
"incidentID": {
"buckets": [
{
"key": "0A631EB1-01EF-DC28-9503-FC28FE695C6D",
"doc_count": 233
},
{
"key": "DF107D2B-CA1E-85C9-E01A-C966DC6F7051",
"doc_count": 226
},
{
"key": "60B8955F-38FD-8DFE-D374-4387668C8368",
"doc_count": 220
},
{
"key": "B787868A-F72E-63DC-D837-B3A864D9FFC6",
"doc_count": 174
},
{
"key": "C597EC5F-C60F-F3BA-61CB-4990F12C1893",
"doc_count": 174
}
]
}
}
What I want to do is get statistics on the "doc_count" values returned. I want:
Min Value
Max Value
Average
Standard Deviation
No, this is not currently possible; here is the issue tracking support for it:
https://github.com/elasticsearch/elasticsearch/issues/8110
Obviously, it is possible to do this client-side if you are able to pull the full list of all buckets into memory.
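For readers on later versions: the linked issue was eventually addressed by pipeline aggregations (available since ES 2.0). Assuming such a version, an extended_stats_bucket sibling aggregation can consume the terms buckets' doc counts through the special _count buckets_path, returning min, max, avg, and std_deviation in a single query. A sketch (doc_count_stats is just an illustrative name):
GET index1/type1/_search
{
  "size": 0,
  "aggs": {
    "incidentID": {
      "terms": {
        "field": "incidentID",
        "size": 5
      }
    },
    "doc_count_stats": {
      "extended_stats_bucket": {
        "buckets_path": "incidentID>_count"
      }
    }
  }
}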
