Elasticsearch data does not match mapping

I am migrating Elasticsearch prod data from v1.4.3 to v5.5, for which I am using reindex. When I try to reindex the old ES index into the new ES index, reindexing fails with an exception: Failed Reason: mapper [THROUGHPUT_ROWS_PER_SEC] cannot be changed from type [long] to [float]. Failed Type: illegal_argument_exception
ES mapping for the task_history index in ES v1.4.3:
{
"task_history": {
"mappings": {
"task_run_hist": {
"_all": {
"enabled": false
},
"_routing": {
"required": true,
"path": "org_id"
},
"properties": {
"RUN_TIME_IN_MINS": {
"type": "double"
},
"THROUGHPUT_ROWS_PER_SEC": {
"type": "long"
},
"account_name": {
"type": "string",
"index": "not_analyzed",
"store": true
}
}
}
}
}
}
ES mapping for the task_history index in ES v5.5 (this mapping gets created as part of reindexing):
{
"task_history": {
"mappings": {
"task_run_hist": {
"_all": {
"enabled": false
},
"_routing": {
"required": true
},
"properties": {
"RUN_TIME_IN_MINS": {
"type": "float"
},
"THROUGHPUT_ROWS_PER_SEC": {
"type": "long"
},
"account_name": {
"type": "keyword",
"store": true
}
}
}
}
}
}
Sample data
{
"_index": "task_history",
"_type": "task_run_hist",
"_id": "1421955143",
"_score": 1,
"_source": {
"RUN_TIME_IN_MINS": 0.47,
"THROUGHPUT_ROWS_PER_SEC": 46,
"org_id": "xxxxxx",
"account_name": "Soma Acc1"
}
},
{
"_index": "task_history",
"_type": "task_run_hist",
"_id": "1421943738",
"_score": 1,
"_source": {
"RUN_TIME_IN_MINS": 1.02,
"THROUGHPUT_ROWS_PER_SEC": 65.28,
"org_id": "yyyyyy",
"account_name": "Choma Acc1"
}
}
Two questions:
1. How is Elasticsearch 1.4.3 saving float values when the mapping type for THROUGHPUT_ROWS_PER_SEC is long?
2. If it's a data issue in the old ES, how can I remove all float values before starting the reindexing process?
For the 2nd option I am trying to list all documents having float values using the query below, so that I can verify once and delete them, but the query still lists documents whose THROUGHPUT_ROWS_PER_SEC is a non-floating number.
Note: Groovy scripting is enabled
GET task_history/task_run_hist/_search?size=100
{
"filter": {
"script": {
"script": "doc['THROUGHPUT_ROWS_PER_SEC'].value % 1 == 0"
}
}
}
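A side note on why this query matches everything: since THROUGHPUT_ROWS_PER_SEC is mapped as long, its doc values are truncated to whole numbers at index time, so doc['THROUGHPUT_ROWS_PER_SEC'].value % 1 == 0 holds for every document. A sketch that reads the original value from _source instead (assuming Groovy search scripts can access _source here; this is slower), with the condition flipped to != 0 so that only the float documents are listed:
GET task_history/task_run_hist/_search?size=100
{
"filter": {
"script": {
"script": "_source.THROUGHPUT_ROWS_PER_SEC % 1 != 0"
}
}
}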
Updated with solution provided by Val
When I try the script below in reindexing, I get a runtime error, listed below. Any clue on what is going wrong here? I added an additional condition to convert RUN_TIME_IN_MINS to float, as your original script surfaced an error on the RUN_TIME_IN_MINS field: mapper [RUN_TIME_IN_MINS] cannot be changed from type [long] to [float]
POST _reindex?wait_for_completion=false
{
"source": {
"remote": {
"host": "http://esip:15000"
},
"index": "task_history"
},
"dest": {
"index": "task_history"
},
"script": {
"inline": "if (ctx._source.THROUGHPUT_ROWS_PER_SEC % 1 != 0) { ctx.op = 'noop' } ctx._source.RUN_TIME_IN_MINS = (float) ctx._source.RUN_TIME_IN_MINS;",
"lang": "painless"
}
}
Runtime error
{
"completed": true,
"task": {
"node": "wZOzypYlSayIRlhp9y3lVA",
"id": 645528,
"type": "transport",
"action": "indices:data/write/reindex",
"status": {
"total": 18249521,
"updated": 4691,
"created": 181721,
"deleted": 0,
"batches": 37,
"version_conflicts": 0,
"noops": 67076,
"retries": {
"bulk": 0,
"search": 0
},
"throttled_millis": 0,
"requests_per_second": -1,
"throttled_until_millis": 0
},
"description": """
reindex from [host=esip port=15000 query={
"match_all" : {
"boost" : 1.0
}
}][task_history] updated with Script{type=inline, lang='painless', idOrCode='if (ctx._source.THROUGHPUT_ROWS_PER_SEC % 1 != 0) { ctx.op = 'noop' } ctx._source.RUN_TIME_IN_MINS = (float) ctx._source.RUN_TIME_IN_MINS;', options={}, params={}} to [task_history]
""",
"start_time_in_millis": 1502336063507,
"running_time_in_nanos": 93094657751,
"cancellable": true
},
"error": {
"type": "script_exception",
"reason": "runtime error",
"script_stack": [],
"script": "if (ctx._source.THROUGHPUT_ROWS_PER_SEC % 1 != 0) { ctx.op = 'noop' } ctx._source.RUN_TIME_IN_MINS = (float) ctx._source.RUN_TIME_IN_MINS;",
"lang": "painless",
"caused_by": {
"type": "null_pointer_exception",
"reason": null
}
}
}
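The null_pointer_exception is most likely caused by documents that are missing THROUGHPUT_ROWS_PER_SEC (or RUN_TIME_IN_MINS), since the script dereferences both fields unconditionally. A sketch of the same reindex call with null guards added (assumption: missing fields are the source of the NPE):
POST _reindex?wait_for_completion=false
{
"source": {
"remote": {
"host": "http://esip:15000"
},
"index": "task_history"
},
"dest": {
"index": "task_history"
},
"script": {
"inline": "if (ctx._source.THROUGHPUT_ROWS_PER_SEC != null && ctx._source.THROUGHPUT_ROWS_PER_SEC % 1 != 0) { ctx.op = 'noop' } if (ctx._source.RUN_TIME_IN_MINS != null) { ctx._source.RUN_TIME_IN_MINS = (float) ctx._source.RUN_TIME_IN_MINS }",
"lang": "painless"
}
}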

You obviously want to keep your existing ES 5.x mapping with a long, so all you need to do is add a script to your reindex call that skips (noops) any document whose THROUGHPUT_ROWS_PER_SEC value is not a whole number. Something like this should do:
POST _reindex
{
"source": {
"remote": {
"host": "http://es1host:9200",
},
"index": "task_history"
},
"dest": {
"index": "task_history"
},
"script": {
"inline": "if (ctx._source.THROUGHPUT_ROWS_PER_SEC % 1 != 0) { ctx.op = 'noop' }" },
"lang": "painless"
}
}
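One more thing worth checking for a remote reindex: the destination 5.x cluster must whitelist the source host in elasticsearch.yml, otherwise the call is rejected outright:
reindex.remote.whitelist: "es1host:9200"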

Related

Elastic search array of objects nested range aggregation

I'm trying to make a range aggregation on the following data set:
{
"ProductType": 1,
"ProductDefinition": "fc588f8e-14f2-4871-891f-c73a4e3d17ca",
"ParentProduct": null,
"Sku": "074617",
"VariantSku": null,
"Name": "Paraboot Avoriaz/Jannu Marron Brut Marron Brown Hiking Boot Shoes",
"AllowOrdering": true,
"Rating": null,
"ThumbnailImageUrl": "/media/1106/074617.jpg",
"PrimaryImageUrl": "/media/1106/074617.jpg",
"Categories": [
"399d7b20-18cc-46c0-b63e-79eadb9390c7"
],
"RelatedProducts": [],
"Variants": [
"84a7ff9f-edf0-4aab-87f9-ba4efd44db74",
"e2eb2c50-6abc-4fbe-8fc8-89e6644b23ef",
"a7e16ccc-c14f-42f5-afb2-9b7d9aefbc5c"
],
"PriceGroups": [
"86182755-519f-4e05-96ef-5f93a59bbaec"
],
"DisplayName": "Paraboot Avoriaz/Jannu Marron Brut Marron Brown Hiking Boot Shoes",
"ShortDescription": "",
"LongDescription": "<ul><li>Paraboot Avoriaz Mountaineering Boots</li><li>Marron Brut Marron (Brown)</li><li>Full leather inners and uppers</li><li>Norwegien Welted Commando Sole</li><li>Hand made in France</li><li>Style number : 074617</li></ul><p>As featured on Pritchards.co.uk</p>",
"UnitPrices": {
"EUR 15 pct": 343.85
},
"Taxes": {
"EUR 15 pct": 51.5775
},
"PricesInclTax": {
"EUR 15 pct": 395.4275
},
"Slug": "paraboot-avoriazjannu-marron-brut-marron-brown-hiking-boot-shoes",
"VariantsProperties": [
{
"Key": "ShoeSize",
"Value": "8"
},
{
"Key": "ShoeSize",
"Value": "10"
},
{
"Key": "ShoeSize",
"Value": "6"
}
],
"Guid": "0d4f6899-c66a-4416-8f5d-26822c3b57ae",
"Id": 178,
"ShowOnHomepage": true
}
I'm aggregating on VariantsProperties which have the following mapping
"VariantsProperties": {
"type": "nested",
"properties": {
"Key": {
"type": "keyword"
},
"Value": {
"type": "keyword"
}
}
}
Terms aggregations are working fine with following code:
{
"aggs": {
"Nest": {
"nested": {
"path": "VariantsProperties"
},
"aggs": {
"fieldIds": {
"terms": {
"field": "VariantsProperties.Key"
},
"aggs": {
"values": {
"terms": {
"field": "VariantsProperties.Value"
}
}
}
}
}
}
}
}
However, when I try to do a range aggregation to get shoes in sizes between 8 and 12, such as:
{
"aggs": {
"Nest": {
"nested": {
"path": "VariantsProperties"
},
"aggs": {
"fieldIds": {
"range": {
"field": "VariantsProperties.Value",
"ranges": [ { "from": 8, "to": 12 }]
}
}
}
}
}
}
I get the following error:
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Field [VariantsProperties.Value] of type [keyword] is not supported for aggregation [range]"
}
],
"type": "search_phase_execution_exception",
"reason": "all shards failed",
"phase": "query",
"grouped": true,
"failed_shards": [
{
"shard": 0,
"index": "product-avenueproductindexdefinition-24476f82-en-us",
"node": "ejgN4XecT1SUfgrhzP8uZg",
"reason": {
"type": "illegal_argument_exception",
"reason": "Field [VariantsProperties.Value] of type [keyword] is not supported for aggregation [range]"
}
}
],
"caused_by": {
"type": "illegal_argument_exception",
"reason": "Field [VariantsProperties.Value] of type [keyword] is not supported for aggregation [range]",
"caused_by": {
"type": "illegal_argument_exception",
"reason": "Field [VariantsProperties.Value] of type [keyword] is not supported for aggregation [range]"
}
}
},
"status": 400
}
Is there a way to "transform" the terms aggregation into a range aggregation without changing the schema? I know I could build the ranges myself by extracting the data from the terms aggregation, but I would prefer a solution within Elasticsearch itself.
There are two ways to solve this:
Option A: Use a script instead of a field. This option will work without having to reindex your data, but depending on your volume of data, the performance might suffer.
POST test/_search
{
"aggs": {
"Nest": {
"nested": {
"path": "VariantsProperties"
},
"aggs": {
"fieldIds": {
"range": {
"script": "Integer.parseInt(doc['VariantsProperties.Value'].value)",
"ranges": [
{
"from": 8,
"to": 12
}
]
}
}
}
}
}
}
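One caveat with this approach: Value is a generic keyword field, so if any entry is not numeric, Integer.parseInt will throw a NumberFormatException at query time; you would need to guard the script against non-numeric values, or prefer Option B below.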
Option B: Add an integer sub-field in your mapping.
PUT my-index/_mapping
{
"properties": {
"VariantsProperties": {
"type": "nested",
"properties": {
"Key": {
"type": "keyword"
},
"Value": {
"type": "keyword",
"fields": {
"numeric": {
"type": "integer",
"ignore_malformed": true
}
}
}
}
}
}
}
Once your mapping is modified, you can run _update_by_query on your index in order to reindex the VariantsProperties.Value data:
POST my-index/_update_by_query
Finally, when this last command is done, you can run the range aggregation on the VariantsProperties.Value.numeric field.
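For instance, the final aggregation would look like this (same ranges as above, just targeting the numeric sub-field):
POST my-index/_search
{
"size": 0,
"aggs": {
"Nest": {
"nested": {
"path": "VariantsProperties"
},
"aggs": {
"fieldIds": {
"range": {
"field": "VariantsProperties.Value.numeric",
"ranges": [ { "from": 8, "to": 12 } ]
}
}
}
}
}
}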
Also note that this second option will be more performant in the long term.

Term aggregation on ElasticSearch join

I would like to perform an aggregation on a join relation using Elasticsearch 7.7.
I need to know how many children I have for each parent.
The only way I found to solve my issue is to use a script inside a terms aggregation, but my concern is about performance.
/my_index/_search
{
"size": 0,
"aggs": {
"total": {
"terms": {
"script": {
"lang": "painless",
"source": "params['_source']['my_join']['parent']"
}
}
},
"max_total": {
"max_bucket": {
"buckets_path": "total>_count"
}
}
}
}
Does anyone know a faster way to execute this aggregation, avoiding the script? (One script-free possibility is sketched after the sample documents below.)
If the join field weren't a parent/child relation, I could replace the terms aggregation with:
"terms": { "field": "my_field" }
To give more context, here is the mapping with some sample documents (I'm using Elastic 7.7):
{
"mappings": {
"properties": {
"my_join": {
"relations": {
"other": "doc"
},
"type": "join"
},
"reader": {
"type": "keyword"
},
"name": {
"type": "text"
},
"content": {
"type": "text"
}
}
}
}
PUT example/_doc/1
{
"reader": [
"A",
"B"
],
"my_join": {
"name": "other"
}
}
PUT example/_doc/2
{
"reader": [
"A",
"B"
],
"my_join": {
"name": "other"
}
}
PUT example/_doc/3
{
"content": "abc",
"my_join": {
"name": "doc",
"parent": 1
}
}
PUT example/_doc/4
{
"content": "def",
"my_join": {
"name": "doc"
"parent": 2
}
}
PUT example/_doc/5
{
"content": "def",
"acl_join": {
"name": "doc"
"parent": 1
}
}
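One script-free possibility (a sketch, not a verified answer): the parent-join field also indexes the parent id of each child document under an internal field named <join field>#<parent relation>, which given the relations above would be my_join#other. Filtering to child documents and running a plain terms aggregation on that field should count children per parent without any script:
POST my_index/_search
{
"size": 0,
"query": {
"term": {
"my_join": "doc"
}
},
"aggs": {
"total": {
"terms": {
"field": "my_join#other"
}
},
"max_total": {
"max_bucket": {
"buckets_path": "total>_count"
}
}
}
}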

Elasticsearch remove a field from an object of an array in a dynamically generated index

I'm trying to delete fields from an object of an array in Elasticsearch. The index has been dynamically generated.
This is the mapping:
{
"mapping": {
"_doc": {
"properties": {
"age": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"result": {
"properties": {
"resultid": {
"type": "long"
},
"resultname": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"timestamp": {
"type": "date"
}
}
}
}
}
this is a document:
{
"result": [
{
"resultid": 69,
"resultname": "SFO"
},
{
"resultid": 151,
"resultname": "NYC"
}
],
"age": 54,
"name": "Jorge",
"timestamp": "2020-04-02T16:07:47.292000"
}
My goal is to remove all resultid fields from result in all documents of the index. After the update, the document should look like this:
{
"result": [
{
"resultname": "SFO"
},
{
"resultname": "NYC"
}
],
"age": 54,
"name": "Jorge",
"timestamp": "2020-04-02T16:07:47.292000"
}
I tried using the following articles on stackoverflow but with no luck:
Remove elements/objects From Array in ElasticSearch Followed by Matching Query
remove objects from array that satisfying the condition in elastic search with javascript api
Delete nested array in elasticsearch
Removing objects from nested fields in ElasticSearch
Hopefully someone can help me find a solution.
You should reindex your index into a new one with the _reindex API and use a script to remove your fields:
POST _reindex
{
"source": {
"index": "my-index"
},
"dest": {
"index": "my-index-reindex"
},
"script": {
"source": """
for (int i=0;i<ctx._source.result.length;i++) {
ctx._source.result[i].remove("resultid")
}
"""
}
}
After that, you can delete your first index:
DELETE my-index
And reindex it back:
POST _reindex
{
"source": {
"index": "my-index-reindex"
},
"dest": {
"index": "my-index"
}
}
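Keep in mind that DELETE my-index also drops the index's mapping and settings, so recreate them (or have an index template in place) before reindexing back; otherwise dynamic mapping will apply to the restored data.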
I combined the answer from Luc E with some of my own knowledge in order to reach a solution without reindexing.
POST INDEXNAME/TYPE/_update_by_query?wait_for_completion=false&conflicts=proceed
{
"script": {
"source": "for (int i=0;i<ctx._source.result.length;i++) { ctx._source.result[i].remove(\"resultid\")}"
},
"query": {
"bool": {
"must": [
{
"exists": {
"field": "result.id"
}
}
]
}
}
}
Thanks again Luc!
If your array has more than one copy of the element you want to remove, use this:
ctx._source.some_array.removeIf(tag -> tag == params['c'])
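For example, wrapped in an _update_by_query call (the index name, array field, and params value here are hypothetical):
POST my-index/_update_by_query
{
"script": {
"lang": "painless",
"source": "ctx._source.some_array.removeIf(tag -> tag == params['c'])",
"params": {
"c": "value-to-remove"
}
}
}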

Elasticsearch update date in nested

I want to remove 15 minutes from all dates in the history that are less than 15 minutes old.
So I have to compare the date now - 15 minutes to the record date.
However, when I retrieve the date it cannot be compared, because it behaves like a String, and adding ".value" reports that the attribute does not exist.
Error response:
"if(ctx._source.histories[i].creation_date.value"
dynamic getter [java.lang.String, value] not found
Other solutions I tried, with other errors:
"if(ctx._source.histories[i].creation_date.date"
"if(ctx._source.histories[i].creation_date.getMillis()"
"if(ctx._source.histories[i].creation_date.value.getMillis()"
Update request (elasticsearch.js):
{
"query": { "term": { "user_id": "USER_ID" } },
"script":
{
"lang": "painless",
"source": "for(int i = ctx._source.histories.length-1; i > 0; --i){ if(ctx._source.histories[i].creation_date.value > params.date) { ctx._source.histories[i].creation_date -= 1000 * 60 * 15; } }",
"params": { "date": new Date() - 1000 * 60 * 15 }
}
}
Mapping:
{
"mappings":
{
"_doc":
{
"properties":
{
"histories":
{
"type": "nested",
"properties":
{
"type": { "type": "text" },
"key": { "type": "text" },
"value": { "type": "text" },
"ip": { "type": "ip" },
"useragent": { "type": "text" },
"creation_date": { "type": "date" }
}
}
}
}
}
}
Elasticsearch info:
{
"name" : "ZZZ",
"cluster_name" : "YYY",
"cluster_uuid" : "XXX",
"version" : {
"number" : "6.5.2",
"build_flavor" : "default",
"build_type" : "tar",
"build_hash" : "WWW",
"build_date" : "2018-11-29T23:58:20.891072Z",
"build_snapshot" : false,
"lucene_version" : "7.5.0",
"minimum_wire_compatibility_version" : "5.6.0",
"minimum_index_compatibility_version" : "5.0.0"
},
"tagline" : "You Know, for Search"
}
Sample data:
{
"hits":
{
"total": 1,
"max_score": 4.13468,
"hits":
[
{
"_index": "myindex",
"_type": "_doc",
"_id": "H1dQ4WgBypYasGfnnXXI",
"_score": 4.13468,
"_source":
{
"infos":
{
"firsname": "John",
"lastname": "Doe",
"mail": "john.doe#stackoverflow.com"
},
"histories":
[
{
"type": "auth",
"key": "try",
"value": "fail",
"ip": "127.0.0.1",
"useragent": "iPhoneX",
"creation_date": "2019-02-19T16:49:00.396Z"
},
{
"type": "auth",
"key": "try",
"value": "fail",
"ip": "127.0.0.1",
"useragent": "iPhoneX",
"creation_date": "2019-02-19T16:50:00.396Z"
}
]
}
}
]
}
}
I think I have something that might help you (tested on ES 6.6.0).
{
"query": {
"match_all": {}
},
"script": {
"lang": "painless",
"source": """
// parse params.date to an Instant
def paramDate = Instant.parse(params.date);
for (int i = ctx._source.histories.length - 1; i >= 0; --i) { // >= 0 so the first element is included
// parse the creation date to Instant
def creationDate = Instant.parse(ctx._source.histories[i].creation_date);
// check time difference between both
if (ChronoUnit.MINUTES.between(creationDate, paramDate) <= 15) {
// remove 15 minutes if condition satisfied
ctx._source.histories[i].creation_date = creationDate.minusSeconds(900).toString();
}
}
""",
"params": {
"date": "2019-02-19T16:45:00.000Z"
}
}
}
Note: I'm using triple quotes to make the query more readable, but feel free to inline it again as you see fit and remove the comments.
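This body would be sent as something like POST my-index/_update_by_query (index name hypothetical); the match_all query can then be narrowed back to the user_id term from the original request.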

elasticsearch reindex nested object's element to keyword

I have an index structured like below:
"my_index": {
"mappings": {
"my_index": {
"properties": {
"adId": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"title": {
"type": "keyword"
},
"creativeStatistics": {
"type": "nested",
"properties": {
"clicks": {
"type": "long"
},
"creativeId": {
"type": "keyword"
}
}
}
}
}
}
}
I need to remove the nested object in a new index and just keep the creativeId as a new keyword field (to make it clear: I know I will lose the clicks data, and it is not important). This means the final new index schema would be:
"my_new_index": {
"mappings": {
"my_new_index": {
"properties": {
"adId": {
"type": "keyword"
},
"name": {
"type": "keyword"
},
"title": {
"type": "keyword"
},
"creativeId": {
"type": "keyword"
}
}
}
}
}
Right now each row has exactly one creativeStatistics entry, and therefore there is no complexity in selecting one of the creativeIds.
I know it is possible to reindex using painless scripts, but I don't know how I can do that. Any help will be appreciated.
You can do it like this:
POST _reindex
{
"source": {
"index": "my_old_index"
},
"dest": {
"index": "my_new_index"
},
"script": {
"source": "if (ctx._source.creativeStatistics != null && ctx._source.creativeStatistics.size() > 0) {ctx._source.creativeId = ctx._source.creativeStatistics[0].creativeId; ctx._source.remove('creativeStatistics')}",
"lang": "painless"
}
}
You can also create an ingest pipeline with a script processor, as follows:
PUT _ingest/pipeline/my_pipeline
{
"description" : "My pipeline",
"processors" : [
{ "script" : {
"source": "for (item in ctx.creativeStatistics) { if(item.creativeId!=null) {ctx.creativeId = item.creativeId;} }"
}
},
{
"remove": {
"field": "creativeStatistics"
}
}
]
}
Note that if you have multiple nested objects, it will keep the last object's creativeId. And it only adds creativeId if a source document has one in its creativeStatistics.
Below is how you can then use the reindex query:
POST _reindex
{
"source": {
"index": "creativeindex_src"
},
"dest": {
"index": "creativeindex_dest",
"pipeline": "my_pipeline"
}
}
