Default location for a single primary shard in Elasticsearch?

I have an ES 7.8 cluster that stores log data, with one index per tenant.
As you can see, the default index.number_of_shards is one. (Please ignore the fact that I don't have any replicas; the data are just being imported.)
This looks problematic, as all the primary shards are located on the same node. How can I assign them evenly across different nodes when creating the indices?
Update 1:
$ curl -sk 'myhost:19081/_cluster/settings?pretty'
{
"persistent" : { },
"transient" : { }
}
$ curl -sk 'myhost:19081/_cluster/allocation/explain?pretty&include_disk_info=true&include_yes_decisions=true'
{
"error" : {
"root_cause" : [
{
"type" : "illegal_argument_exception",
"reason" : "unable to find any unassigned shards to explain [ClusterAllocationExplainRequest[useAnyUnassignedShard=true,includeYesDecisions?=true]"
}
],
"type" : "illegal_argument_exception",
"reason" : "unable to find any unassigned shards to explain [ClusterAllocationExplainRequest[useAnyUnassignedShard=true,includeYesDecisions?=true]"
},
"status" : 400
}
$ curl -sk 'myhost:19081/_cat/nodeattrs?v'
node host ip attr value
node-001 10.96.110.92 10.96.110.92 ml.machine_memory 99750834176
node-001 10.96.110.92 10.96.110.92 ml.max_open_jobs 20
node-001 10.96.110.92 10.96.110.92 xpack.installed true
node-001 10.96.110.92 10.96.110.92 transform.node true
node-004 10.96.108.179 10.96.108.179 ml.machine_memory 99531649024
node-004 10.96.108.179 10.96.108.179 ml.max_open_jobs 20
node-004 10.96.108.179 10.96.108.179 xpack.installed true
node-004 10.96.108.179 10.96.108.179 transform.node true
node-003 10.96.113.19 10.96.113.19 ml.machine_memory 99531649024
node-003 10.96.113.19 10.96.113.19 ml.max_open_jobs 20
node-003 10.96.113.19 10.96.113.19 xpack.installed true
node-003 10.96.113.19 10.96.113.19 transform.node true
node-002 10.96.112.213 10.96.112.213 ml.machine_memory 99531649024
node-002 10.96.112.213 10.96.112.213 ml.max_open_jobs 20
node-002 10.96.112.213 10.96.112.213 xpack.installed true
node-002 10.96.112.213 10.96.112.213 transform.node true
node-005 10.96.101.214 10.96.101.214 ml.machine_memory 99516563456
node-005 10.96.101.214 10.96.101.214 ml.max_open_jobs 20
node-005 10.96.101.214 10.96.101.214 xpack.installed true
node-005 10.96.101.214 10.96.101.214 transform.node true
$ curl -sk 'myhost:19081/_all/_settings?include_defaults&filter_path=**.allocation&pretty'
{
// several hundred other indices with identical settings omitted
"my_index_1" : {
"defaults" : {
"index" : {
"routing" : {
"allocation" : {
"enable" : "all",
"total_shards_per_node" : "-1"
}
},
"allocation" : {
"max_retries" : "5",
"existing_shards_allocator" : "gateway_allocator"
}
}
}
}
}
Update 2:
curl -sk -HContent-Type:application/json -d ' {"index": "my_index_1", "shard": 0, "primary": true }' 'myhost:19081/_cluster/allocation/explain?pretty&include_disk_info=true&include_yes_decisions=true'
{
"index" : "my_index_1",
"shard" : 0,
"primary" : true,
"current_state" : "started",
"current_node" : {
"id" : "CNyCF4_eTmCQYXh_Bhb0KQ",
"name" : "node004",
"transport_address" : "10.96.108.179:9300",
"attributes" : {
"ml.machine_memory" : "99531649024",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"weight_ranking" : 1
},
"cluster_info" : {
"nodes" : {
"CNyCF4_eTmCQYXh_Bhb0KQ" : {
"node_name" : "node004",
"least_available" : {
"path" : "/data3/nodes/0",
"total_bytes" : 15999772393472,
"used_bytes" : 23527976960,
"free_bytes" : 15976244416512,
"free_disk_percent" : 99.9,
"used_disk_percent" : 0.1
},
"most_available" : {
"path" : "/data2/nodes/0",
"total_bytes" : 15999772393472,
"used_bytes" : 19824119808,
"free_bytes" : 15979948273664,
"free_disk_percent" : 99.9,
"used_disk_percent" : 0.1
}
},
"xiR8clLRSVirvkmlyDpgXg" : {
"node_name" : "node001",
"least_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999896125440,
"used_bytes" : 2815332352,
"free_bytes" : 15997080793088,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
},
"most_available" : {
"path" : "/data3/nodes/0",
"total_bytes" : 15999896125440,
"used_bytes" : 278740992,
"free_bytes" : 15999617384448,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
}
},
"afbAZaznQwaRtryF7yI4dA" : {
"node_name" : "node003",
"least_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999836385280,
"used_bytes" : 34533376,
"free_bytes" : 15999801851904,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
},
"most_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999836385280,
"used_bytes" : 34533376,
"free_bytes" : 15999801851904,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
}
},
"vhFAg67YSgquqP8tR-s98w" : {
"node_name" : "node002",
"least_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999836385280,
"used_bytes" : 34537472,
"free_bytes" : 15999801847808,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
},
"most_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999836385280,
"used_bytes" : 34537472,
"free_bytes" : 15999801847808,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
}
},
"KL8hcVTJTBmN9MTa3fX8eQ" : {
"node_name" : "node005",
"least_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999772393472,
"used_bytes" : 34983936,
"free_bytes" : 15999737409536,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
},
"most_available" : {
"path" : "/data1/nodes/0",
"total_bytes" : 15999772393472,
"used_bytes" : 34983936,
"free_bytes" : 15999737409536,
"free_disk_percent" : 100.0,
"used_disk_percent" : 0.0
}
}
},
"shard_sizes" : {
"[my_index_1][0][p]_bytes" : 2120083,
// several hundred others redacted
},
"shard_paths" : {
"[my_index_1][0], node[CNyCF4_eTmCQYXh_Bhb0KQ], [P], s[STARTED], a[id=dqceFOaFT0ugDALnFEJWvg]" : "/data2/nodes/0",
// several hundred others redacted
}
},
"can_remain_on_current_node" : "yes",
"can_rebalance_cluster" : "yes",
"can_rebalance_to_other_node" : "no",
"rebalance_explanation" : "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
"node_allocation_decisions" : [
{
"node_id" : "KL8hcVTJTBmN9MTa3fX8eQ",
"node_name" : "node005",
"transport_address" : "10.96.101.214:9300",
"node_attributes" : {
"ml.machine_memory" : "99516563456",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "worse_balance",
"weight_ranking" : 1,
"deciders" : [
{
"decider" : "max_retry",
"decision" : "YES",
"explanation" : "shard has no previous failures"
},
{
"decider" : "replica_after_primary_active",
"decision" : "YES",
"explanation" : "shard is primary and can be allocated"
},
{
"decider" : "enable",
"decision" : "YES",
"explanation" : "all allocations are allowed"
},
{
"decider" : "node_version",
"decision" : "YES",
"explanation" : "can relocate primary shard from a node with version [7.8.0] to a node with equal-or-newer version [7.8.0]"
},
{
"decider" : "snapshot_in_progress",
"decision" : "YES",
"explanation" : "no snapshots are currently running"
},
{
"decider" : "restore_in_progress",
"decision" : "YES",
"explanation" : "ignored as shard is not being recovered from a snapshot"
},
{
"decider" : "filter",
"decision" : "YES",
"explanation" : "node passes include/exclude/require filters"
},
{
"decider" : "same_shard",
"decision" : "YES",
"explanation" : "this node does not hold a copy of this shard"
},
{
"decider" : "disk_threshold",
"decision" : "YES",
"explanation" : "enough disk for shard on node, free: [14.5tb], shard size: [2mb], free after allocating shard: [14.5tb]"
},
{
"decider" : "throttling",
"decision" : "YES",
"explanation" : "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]"
},
{
"decider" : "shards_limit",
"decision" : "YES",
"explanation" : "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
},
{
"decider" : "awareness",
"decision" : "YES",
"explanation" : "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
}
]
},
{
"node_id" : "afbAZaznQwaRtryF7yI4dA",
"node_name" : "node003",
"transport_address" : "10.96.113.19:9300",
"node_attributes" : {
"ml.machine_memory" : "99531649024",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "worse_balance",
"weight_ranking" : 1,
"deciders" : [
{
"decider" : "max_retry",
"decision" : "YES",
"explanation" : "shard has no previous failures"
},
{
"decider" : "replica_after_primary_active",
"decision" : "YES",
"explanation" : "shard is primary and can be allocated"
},
{
"decider" : "enable",
"decision" : "YES",
"explanation" : "all allocations are allowed"
},
{
"decider" : "node_version",
"decision" : "YES",
"explanation" : "can relocate primary shard from a node with version [7.8.0] to a node with equal-or-newer version [7.8.0]"
},
{
"decider" : "snapshot_in_progress",
"decision" : "YES",
"explanation" : "no snapshots are currently running"
},
{
"decider" : "restore_in_progress",
"decision" : "YES",
"explanation" : "ignored as shard is not being recovered from a snapshot"
},
{
"decider" : "filter",
"decision" : "YES",
"explanation" : "node passes include/exclude/require filters"
},
{
"decider" : "same_shard",
"decision" : "YES",
"explanation" : "this node does not hold a copy of this shard"
},
{
"decider" : "disk_threshold",
"decision" : "YES",
"explanation" : "enough disk for shard on node, free: [14.5tb], shard size: [2mb], free after allocating shard: [14.5tb]"
},
{
"decider" : "throttling",
"decision" : "YES",
"explanation" : "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]"
},
{
"decider" : "shards_limit",
"decision" : "YES",
"explanation" : "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
},
{
"decider" : "awareness",
"decision" : "YES",
"explanation" : "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
}
]
},
{
"node_id" : "vhFAg67YSgquqP8tR-s98w",
"node_name" : "node002",
"transport_address" : "10.96.112.213:9300",
"node_attributes" : {
"ml.machine_memory" : "99531649024",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "worse_balance",
"weight_ranking" : 1,
"deciders" : [
{
"decider" : "max_retry",
"decision" : "YES",
"explanation" : "shard has no previous failures"
},
{
"decider" : "replica_after_primary_active",
"decision" : "YES",
"explanation" : "shard is primary and can be allocated"
},
{
"decider" : "enable",
"decision" : "YES",
"explanation" : "all allocations are allowed"
},
{
"decider" : "node_version",
"decision" : "YES",
"explanation" : "can relocate primary shard from a node with version [7.8.0] to a node with equal-or-newer version [7.8.0]"
},
{
"decider" : "snapshot_in_progress",
"decision" : "YES",
"explanation" : "no snapshots are currently running"
},
{
"decider" : "restore_in_progress",
"decision" : "YES",
"explanation" : "ignored as shard is not being recovered from a snapshot"
},
{
"decider" : "filter",
"decision" : "YES",
"explanation" : "node passes include/exclude/require filters"
},
{
"decider" : "same_shard",
"decision" : "YES",
"explanation" : "this node does not hold a copy of this shard"
},
{
"decider" : "disk_threshold",
"decision" : "YES",
"explanation" : "enough disk for shard on node, free: [14.5tb], shard size: [2mb], free after allocating shard: [14.5tb]"
},
{
"decider" : "throttling",
"decision" : "YES",
"explanation" : "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]"
},
{
"decider" : "shards_limit",
"decision" : "YES",
"explanation" : "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
},
{
"decider" : "awareness",
"decision" : "YES",
"explanation" : "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
}
]
},
{
"node_id" : "xiR8clLRSVirvkmlyDpgXg",
"node_name" : "node001",
"transport_address" : "10.96.110.92:9300",
"node_attributes" : {
"ml.machine_memory" : "99750834176",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true",
"transform.node" : "true"
},
"node_decision" : "worse_balance",
"weight_ranking" : 1,
"deciders" : [
{
"decider" : "max_retry",
"decision" : "YES",
"explanation" : "shard has no previous failures"
},
{
"decider" : "replica_after_primary_active",
"decision" : "YES",
"explanation" : "shard is primary and can be allocated"
},
{
"decider" : "enable",
"decision" : "YES",
"explanation" : "all allocations are allowed"
},
{
"decider" : "node_version",
"decision" : "YES",
"explanation" : "can relocate primary shard from a node with version [7.8.0] to a node with equal-or-newer version [7.8.0]"
},
{
"decider" : "snapshot_in_progress",
"decision" : "YES",
"explanation" : "no snapshots are currently running"
},
{
"decider" : "restore_in_progress",
"decision" : "YES",
"explanation" : "ignored as shard is not being recovered from a snapshot"
},
{
"decider" : "filter",
"decision" : "YES",
"explanation" : "node passes include/exclude/require filters"
},
{
"decider" : "same_shard",
"decision" : "YES",
"explanation" : "this node does not hold a copy of this shard"
},
{
"decider" : "disk_threshold",
"decision" : "YES",
"explanation" : "enough disk for shard on node, free: [14.5tb], shard size: [2mb], free after allocating shard: [14.5tb]"
},
{
"decider" : "throttling",
"decision" : "YES",
"explanation" : "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]"
},
{
"decider" : "shards_limit",
"decision" : "YES",
"explanation" : "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
},
{
"decider" : "awareness",
"decision" : "YES",
"explanation" : "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
}
]
}
]
}

Elasticsearch by default tries to spread the shards evenly across all the data nodes, so in your case it's really strange that all the shards are on the same data node.
You should debug the cause of it. Hopefully you don't have a single data node in your cluster; please provide your cluster settings so we can get more information on your setup.
Also provide the output of the shard allocation explain API.
For the time being, you can manually move these shards to other data nodes using the cluster reroute API, as sketched below.
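A move command looks roughly like this; the host, index, shard number and node names are placeholders taken from the output above, so substitute the shard you actually want to relocate:
curl -sk -H 'Content-Type: application/json' 'myhost:19081/_cluster/reroute?pretty' -d '
{
  "commands": [
    {
      "move": {
        "index": "my_index_1",
        "shard": 0,
        "from_node": "node004",
        "to_node": "node002"
      }
    }
  ]
}'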

Elasticsearch automatically takes care of allocating shards to different nodes.
Try rebalancing the cluster; that may fix the problem:
https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html#:~:text=Elasticsearch%20runs%20an%20automatic%20process,from%20completely%20balancing%20the%20cluster.
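If rebalancing was disabled at the cluster level, re-enabling it is a one-line settings change. This is only a sketch of the relevant dynamic setting from the linked page, not a confirmed fix for this particular cluster:
PUT /_cluster/settings
{
  "transient": {
    "cluster.routing.rebalance.enable": "all"
  }
}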

As it turns out, my cluster had cluster.routing.allocation.balance.shard set to zero. I solved this with:
PUT /_cluster/settings
{
"persistent" : {
"cluster.routing.allocation.balance.shard" : "0.45"
}
}
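For reference, the effective balance values (including defaults) can be checked with something like the following; this assumes the same host as the commands above and just greps the flattened settings:
curl -sk 'myhost:19081/_cluster/settings?include_defaults=true&flat_settings=true&pretty' | grep balance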

Related

Elasticsearch ILM not rolling

I have configured my ILM policy to roll over when the index size reaches 20GB or after 30 days in the hot node,
but my index passed 20GB and still didn't move to the cold node,
and when I run GET _cat/indices?v I get:
green open packetbeat-7.9.2-2020.10.22-000001 RRAnRZrrRZiihscJ3bymig 10 1 63833049 0 44.1gb 22gb
Could you tell me how to solve this, please?
Note that in my Packetbeat configuration file, I have only changed the number of shards:
setup.template.settings:
index.number_of_shards: 10
index.number_of_replicas: 1
When I run the command GET packetbeat-7.9.2-2020.10.22-000001/_settings I get this output:
{
"packetbeat-7.9.2-2020.10.22-000001" : {
"settings" : {
"index" : {
"lifecycle" : {
"name" : "packetbeat",
"rollover_alias" : "packetbeat-7.9.2"
},
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"mapping" : {
"total_fields" : {
"limit" : "10000"
}
},
"refresh_interval" : "5s",
"number_of_shards" : "10",
"provided_name" : "<packetbeat-7.9.2-{now/d}-000001>",
"max_docvalue_fields_search" : "200",
"query" : {
"default_field" : [
"message",
"tags",
"agent.ephemeral_id",
"agent.id",
"agent.name",
"agent.type",
"agent.version",
"as.organization.name",
"client.address",
"client.as.organization.name",
and the output of the command GET /packetbeat-7.9.2-2020.10.22-000001/_ilm/explain is :
{
"indices" : {
"packetbeat-7.9.2-2020.10.22-000001" : {
"index" : "packetbeat-7.9.2-2020.10.22-000001",
"managed" : true,
"policy" : "packetbeat",
"lifecycle_date_millis" : 1603359683835,
"age" : "15.04d",
"phase" : "hot",
"phase_time_millis" : 1603359684332,
"action" : "rollover",
"action_time_millis" : 1603360173138,
"step" : "check-rollover-ready",
"step_time_millis" : 1603360173138,
"phase_execution" : {
"policy" : "packetbeat",
"phase_definition" : {
"min_age" : "0ms",
"actions" : {
"rollover" : {
"max_size" : "50gb",
"max_age" : "30d"
}
}
},
"version" : 1,
"modified_date_in_millis" : 1603359683339
}
}
}
}
It's weird that it shows 50GB!
Thanks for your help.
So I found the solution to this problem.
After updating the policy, I removed the policy from the index using it, and then added it back to that index.
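In other words, something along these lines, using the index and policy names from the output above (verify the rollover alias matches your setup); the first call detaches ILM from the index, and setting index.lifecycle.name again re-attaches the updated policy:
POST packetbeat-7.9.2-2020.10.22-000001/_ilm/remove
PUT packetbeat-7.9.2-2020.10.22-000001/_settings
{
  "index.lifecycle.name": "packetbeat",
  "index.lifecycle.rollover_alias": "packetbeat-7.9.2"
}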

Elasticsearch cluster stats API returns different values on different nodes

on master _cluster/stats:
"fs" : {
"total_in_bytes" : 15874832596992,
"free_in_bytes" : 12578652061696,
"available_in_bytes" : 12578652061696,
"spins" : "true"
},
node analyzer1(hot) _cluster/stats:
"fs" : {
"total_in_bytes" : 16274067427328,
"free_in_bytes" : 12992671711232,
"available_in_bytes" : 12992671711232,
"spins" : "true"
},
on analyzer2(hot) _cluster/stats:
"fs" : {
"total_in_bytes" : 16274067427328,
"free_in_bytes" : 12989881016320,
"available_in_bytes" : 12989881016320,
"spins" : "true"
},
on analyzer3(warm) _cluster/stats:
"fs" : {
"total_in_bytes" : 14753986846720,
"free_in_bytes" : 11355335659520,
"available_in_bytes" : 11355335659520,
"spins" : "true"
},
on analyzer4(warm) _cluster/stats:
"fs" : {
"total_in_bytes" : 17874236866560,
"free_in_bytes" : 14598999666688,
"available_in_bytes" : 14598999666688,
"spins" : "true"
},
The five nodes are in the same cluster, and the cluster status is green.
Why am I getting different values for fs.total_in_bytes?
Note:
Elasticsearch version: 5.2.2
JDK version: OpenJDK 8

Elasticsearch - there is no copy of the shard available?

I have a few indices in red after a system failure caused by a full disk.
But I cannot reallocate the lost shard; it says "there is no copy of the shard available".
curl -XGET 'localhost:9200/_cluster/allocation/explain?pretty'
{
"shard" : {
"index" : "my_index",
"index_uuid" : "iNY9t81wQf6wJc-KqufUrg",
"id" : 0,
"primary" : true
},
"assigned" : false,
"shard_state_fetch_pending" : false,
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2017-05-30T07:33:04.192Z",
"failed_attempts" : 5,
"delayed" : false,
"details" : "failed to create shard, failure FileSystemException[/data/es/storage/nodes/0/indices/iNY9t81wQf6wJc-KqufUrg/0/_state/state-13.st.tmp: Read-only file system]",
"allocation_status" : "deciders_no"
},
"allocation_delay_in_millis" : 60000,
"remaining_delay_in_millis" : 0,
"nodes" : {
"KvOd2vSQTOSgjgqyEnOKpA" : {
"node_name" : "node1",
"node_attributes" : { },
"store" : {
"shard_copy" : "NONE"
},
"final_decision" : "NO",
"final_explanation" : "there is no copy of the shard available",
"weight" : -3.683333,
"decisions" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has already failed allocating [5] times vs. [5] retries allowed unassigned_info[[reason=ALLOCATION_FAILED], at[2017-05-30T07:33:04.192Z], failed_attempts[5], delayed=false, details[failed to create shard, failure FileSystemException[/data/es/storage/nodes/0/indices/iNY9t81wQf6wJc-KqufUrg/0/_state/state-13.st.tmp: Read-only file system]], allocation_status[deciders_no]] - manually call [/_cluster/reroute?retry_failed=true] to retry"
}
]
},
"pC9fL41xRgeZDAEYvNR1eQ" : {
"node_name" : "node2",
"node_attributes" : { },
"store" : {
"shard_copy" : "AVAILABLE"
},
"final_decision" : "NO",
"final_explanation" : "the shard cannot be assigned because one or more allocation decider returns a 'NO' decision",
"weight" : -2.333333,
"decisions" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has already failed allocating [5] times vs. [5] retries allowed unassigned_info[[reason=ALLOCATION_FAILED], at[2017-05-30T07:33:04.192Z], failed_attempts[5], delayed=false, details[failed to create shard, failure FileSystemException[/data/es/storage/nodes/0/indices/iNY9t81wQf6wJc-KqufUrg/0/_state/state-13.st.tmp: Read-only file system]], allocation_status[deciders_no]] - manually call [/_cluster/reroute?retry_failed=true] to retry"
}
]
},
"1g7eCfEQS9u868lFSoo7FQ" : {
"node_name" : "node3",
"node_attributes" : { },
"store" : {
"shard_copy" : "AVAILABLE"
},
"final_decision" : "NO",
"final_explanation" : "the shard cannot be assigned because one or more allocation decider returns a 'NO' decision",
"weight" : 40.866665,
"decisions" : [
{
"decider" : "max_retry",
"decision" : "NO",
"explanation" : "shard has already failed allocating [5] times vs. [5] retries allowed unassigned_info[[reason=ALLOCATION_FAILED], at[2017-05-30T07:33:04.192Z], failed_attempts[5], delayed=false, details[failed to create shard, failure FileSystemException[/data/es/storage/nodes/0/indices/iNY9t81wQf6wJc-KqufUrg/0/_state/state-13.st.tmp: Read-only file system]], allocation_status[deciders_no]] - manually call [/_cluster/reroute?retry_failed=true] to retry"
}
]
}
}
}
I tried basically every option of the reroute command (documentation here), but it gives me a 400 error, like this:
curl -XPOST 'localhost:9200/_cluster/reroute?pretty' -H 'Content-Type: application/json' -d'
{
"commands" : [
{
"allocate_replica" : {
"index" : "myindex", "shard" : 0,
"node" : "node2"
}
}
]
}'
response:
{
"error" : {
"root_cause" : [
{
"type" : "illegal_argument_exception",
"reason" : "[allocate_replica] trying to allocate a replica shard [myindex][0], while corresponding primary shard is still unassigned"
}
],
"type" : "illegal_argument_exception",
"reason" : "[allocate_replica] trying to allocate a replica shard [myindex][0], while corresponding primary shard is still unassigned"
},
"status" : 400
}
Try this:
curl -XPOST 'xx.xxx.xx:9200/_cluster/reroute' -d '{"commands" : [{"allocate_stale_primary":{"index" : "myindex", "shard" : 0, "node" : "node2","accept_data_loss" : true}}]}'
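Note that the explain output above also shows the max_retry decider blocking after 5 failed attempts; as its own message suggests, the retry counter may need to be reset as well, e.g. (assuming the same localhost:9200 endpoint as in the question):
curl -XPOST 'localhost:9200/_cluster/reroute?retry_failed=true&pretty'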

Elasticsearch 2.3.4 stops allocating shards for no obvious reason

I am attempting to upgrade our Elastic Search cluster from 1.6 to 2.3.4. The upgrade seems to work, and I can see shard allocation starting to happen within Kopf - but at some point the shard allocation appears to stop with many shards left unallocated, and no errors being reported in the logs. Typically I'm left with 1200 / 3800 shards unallocated.
We have a typical 3 node cluster and I am trialing this standalone on my local machine with all 3 nodes running on my local machine.
I have seen similar symptoms reported - see https://t37.net/how-to-fix-your-elasticsearch-cluster-stuck-in-initializing-shards-mode.html. The solution there seemed to be to manually allocate the shards, which I've tried (and it works), but I'm at a loss to explain the behaviour of Elasticsearch here. I'd prefer not to go down this route, as I want my cluster to spin up automatically without intervention.
There is also https://github.com/elastic/elasticsearch/pull/14494 which seems to be resolved with the latest ES version, so shouldn't be a problem.
There are no errors in log files - I have upped the root level logging to 'DEBUG' in order to see what I can. What I can see is lines like the below for each unallocated shard (this from the master node logs):
[2016-07-26 09:18:04,859][DEBUG][gateway ] [germany] [index][4] found 0 allocations of [index][4], node[null], [P], v[0], s[UNASSIGNED], unassigned_info[[reason=CLUSTER_RECOVERED], at[2016-07-26T08:05:04.447Z]], highest version: [-1]
[2016-07-26 09:18:04,859][DEBUG][gateway ] [germany] [index][4]: not allocating, number_of_allocated_shards_found [0]
Config file (with comments removed):
cluster.name: elasticsearch-jm-2.3.4
node.name: germany
script.inline: true
script.indexed: true
If I query the cluster health after reallocation has stopped - I get the response below:
http://localhost:9200/_cluster/health?pretty
cluster_name : elasticsearch-jm-2.3.4
status : red
timed_out : False
number_of_nodes : 3
number_of_data_nodes : 3
active_primary_shards : 1289
active_shards : 2578
relocating_shards : 0
initializing_shards : 0
unassigned_shards : 1264
delayed_unassigned_shards : 0
number_of_pending_tasks : 0
number_of_in_flight_fetch : 0
task_max_waiting_in_queue_millis : 0
active_shards_percent_as_number : 67.10046850598647
Further querying for shards, filtered to one index with unallocated shards. As can be seen, shards 0 and 4 are unallocated, whereas shards 1, 2 and 3 have been allocated:
http://localhost:9200/_cat/shards
cs-payment-warn-2016.07.20 3 p STARTED 106 92.4kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 3 r STARTED 106 92.4kb 127.0.0.1 switzerland
cs-payment-warn-2016.07.20 4 p UNASSIGNED
cs-payment-warn-2016.07.20 4 r UNASSIGNED
cs-payment-warn-2016.07.20 2 r STARTED 120 74.5kb 127.0.0.1 cyprus
cs-payment-warn-2016.07.20 2 p STARTED 120 74.5kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 1 r STARTED 120 73.8kb 127.0.0.1 cyprus
cs-payment-warn-2016.07.20 1 p STARTED 120 73.8kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 0 p UNASSIGNED
cs-payment-warn-2016.07.20 0 r UNASSIGNED
Manually rerouting an unassigned shard appears to work (stripped-back result set):
http://localhost:9200/_cluster/reroute
POST:
{
"dry_run": true,
"commands": [
{
"allocate": {
"index": "cs-payment-warn-2016.07.20",
"shard": 4,
"node": "switzerland" ,
"allow_primary": true
}
}
]
}
Response:
{
"acknowledged" : true,
"state" : {
"version" : 722,
"state_uuid" : "Vw2vPoCMQk2ZosjzviD4TQ",
"master_node" : "yhL7XXy-SKu_WAM-C33dzA",
"blocks" : {},
"nodes" : {},
"routing_table" : {
"indices" : {
"cs-payment-warn-2016.07.20" : {
"shards" : {
"3" : [{
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 3,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "x_Iq88hmTqiasrjW09hVuw"
}
}, {
"state" : "STARTED",
"primary" : false,
"node" : "1a8dgBscTUS3c7Pv4mN9CQ",
"relocating_node" : null,
"shard" : 3,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "DF-EUEy_SpeUElnZI6cgsQ"
}
}
],
"4" : [{
"state" : "INITIALIZING",
"primary" : true,
"node" : "1a8dgBscTUS3c7Pv4mN9CQ",
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"allocation_id" : {
"id" : "1tw7C7YPQsWwm_O-8mYHRg"
},
"unassigned_info" : {
"reason" : "INDEX_CREATED",
"at" : "2016-07-26T14:20:15.395Z",
"details" : "force allocation from previous reason CLUSTER_RECOVERED, null"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
],
"2" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "eQ-_vWNbRp27So0iGSitmA"
}
}, {
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "O1PU1_NVS8-uB2yBrG76MA"
}
}
],
"1" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZmxtOvorRVmndR15OJMkMA"
}
}, {
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZNgzePThQxS-iqhRSXzZCw"
}
}
],
"0" : [{
"state" : "UNASSIGNED",
"primary" : true,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
]
}
}
},
"routing_nodes" : {
"unassigned" : [{
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : true,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
]
},
"nodes" : {
"rlRQ2u0XQRqxWld-wSrOug" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "eQ-_vWNbRp27So0iGSitmA"
}
}, {
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZmxtOvorRVmndR15OJMkMA"
}
}
]
}
}
}
}

Elasticsearch shard relocation not working

I added 12 new data nodes to an existing cluster of 8 data nodes. I am trying to shut down the previous 8 nodes using the "exclude allocation" approach as recommended:
curl -XPUT localhost:9200/_cluster/settings -d '{
"transient" : {
"cluster.routing.allocation.exclude._ip" : "10.0.0.1"
} }'
It wasn't relocating any shards, so I ran the reroute command with the 'explain' option. Can someone explain what the following text is saying?
> "explanations" : [ {
> "command" : "move",
> "parameters" : {
> "index" : "2015-09-20",
> "shard" : 0,
> "from_node" : "_dDn1SmqSquhMGgjti6vGg",
> "to_node" : "OQBFMt17RaWboOzMnUy2jA"
> },
> "decisions" : [ {
> "decider" : "same_shard",
> "decision" : "YES",
> "explanation" : "shard is not allocated to same node or host"
> }, {
> "decider" : "filter",
> "decision" : "YES",
> "explanation" : "node passes include/exclude/require filters"
> }, {
> "decider" : "replica_after_primary_active",
> "decision" : "YES",
> "explanation" : "shard is primary"
> }, {
> "decider" : "throttling",
> "decision" : "YES",
> "explanation" : "below shard recovery limit of [16]"
> }, {
> "decider" : "enable",
> "decision" : "YES",
> "explanation" : "allocation disabling is ignored"
> }, {
> "decider" : "disable",
> "decision" : "YES",
> "explanation" : "allocation disabling is ignored"
> }, {
> "decider" : "awareness",
> "decision" : "NO",
> "explanation" : "too many shards on nodes for attribute: [dc]" }, {
> "decider" : "shards_limit",
> "decision" : "YES",
> "explanation" : "total shard limit disabled: [-1] <= 0"
> }, {
> "decider" : "node_version",
> "decision" : "YES",
> "explanation" : "target node version [1.4.5] is same or newer than source node version [1.4.5]"
> }, {
> "decider" : "disk_threshold",
> "decision" : "YES",
> "explanation" : "enough disk for shard on node, free: [1.4tb]"
> }, {
> "decider" : "snapshot_in_progress",
> "decision" : "YES", "explanation" : "no snapshots are currently running"
>
If you have replicas, you can simply switch off your nodes one by one and wait each time for the cluster to become green again.
You don't need to explicitly reroute in that case.
That said, from your output it sounds like you are using allocation awareness (the "dc" attribute) in your elasticsearch.yml file. You should check your settings, for example as sketched below.
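On a 1.x cluster, the awareness configuration to look for in elasticsearch.yml looks roughly like this; the attribute name dc comes from the [dc] in your explain output, and the value dc1 is just a placeholder:
cluster.routing.allocation.awareness.attributes: dc
node.dc: dc1
You can also check the dynamic cluster settings with: curl -XGET 'localhost:9200/_cluster/settings?pretty'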
You can install the kopf plugin; it will help you manage Elasticsearch nodes and simplify the task.
With this plugin, what you want is easier.
You can download it here: https://github.com/lmenezes/elasticsearch-kopf .
Other plugins are listed at: https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-plugins.html .
