How to fix unassigned shards on ELK - sharding

I have an ELK cluster with 3 nodes:
node1: *hims
node2: mw
node3: cm
My cluster is in yellow status and I have 10 unassigned shards:
{
"cluster_name" : "elastic",
"status" : "yellow",
"timed_out" : false,
"number_of_nodes" : 3,
"number_of_data_nodes" : 3,
"active_primary_shards" : 829,
"active_shards" : 829,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 10,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 98.80810488676997
}
When I check one of the unassigned shards, I get the explanation below.
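The output matches the shape of the cluster allocation explain API response, so a request of roughly this form would have produced it (the index, shard and primary values are taken from the response itself):

GET _cluster/allocation/explain
{
  "index": ".siem-signals-default-000002",
  "shard": 0,
  "primary": true
}

The explanation returned: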
{
"index" : ".siem-signals-default-000002",
"shard" : 0,
"primary" : true,
"current_state" : "started",
"current_node" : {
"id" : "ID000000node1",
"name" : "Node1",
"transport_address" : "IP____Node1:9300",
"attributes" : {
"xpack.installed" : "true",
"transform.node" : "false"
},
"weight_ranking" : 3
},
"can_remain_on_current_node" : "yes",
"can_rebalance_cluster" : "no",
"can_rebalance_cluster_decisions" : [
{
"decider" : "rebalance_only_when_active",
"decision" : "NO",
"explanation" : "rebalancing is not allowed until all replicas in the cluster are active"
},
{
"decider" : "cluster_rebalance",
"decision" : "NO",
"explanation" : "the cluster has unassigned shards and cluster setting [cluster.routing.allocation.allow_rebalance] is set to [indices_all_active]"
}
],
"can_rebalance_to_other_node" : "no",
"rebalance_explanation" : "rebalancing is not allowed",
"node_allocation_decisions" : [
{
"node_id" : "ID000000node2",
"node_name" : "node2",
"transport_address" : "IP_____node2:9300",
"node_attributes" : {
"xpack.installed" : "true",
"transform.node" : "false"
},
"node_decision" : "no",
"weight_ranking" : 1,
"deciders" : [
{
"decider" : "data_tier",
"decision" : "NO",
"explanation" : "index has a preference for tiers [data_content] and node does not meet the required [data_content] tier"
}
]
},
{
"node_id" : "ID000000Node3",
"node_name" : "node3",
"transport_address" : "IP_____node3:9300",
"node_attributes" : {
"xpack.installed" : "true",
"transform.node" : "false"
},
"node_decision" : "no",
"weight_ranking" : 2,
"deciders" : [
{
"decider" : "disk_threshold",
"decision" : "NO",
"explanation" : "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=85%], using more disk space than the maximum allowed [85.0%], actual free: [10.484763703454583%]"
},
{
"decider" : "data_tier",
"decision" : "NO",
"explanation" : "index has a preference for tiers [data_content] and node does not meet the required [data_content] tier"
}
]
}
]
}
All 10 unassigned shards report the same issue and error.
I tried running the reroute command, but nothing happened.
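The two deciders above point at two separate problems: node3 is over the low disk watermark, and neither candidate node carries the data_content tier the index prefers. A minimal sketch of how one might act on both, assuming a 7.10-style cluster like the one shown (values and the index name are copied from the output above; adjust to your environment before running anything):

# 1. Buy breathing room while freeing disk space on node3: temporarily raise the low watermark
PUT _cluster/settings
{
  "transient": {
    "cluster.routing.allocation.disk.watermark.low": "90%"
  }
}

# 2. Make sure each data node carries the content tier (elasticsearch.yml, 7.9+ role syntax;
#    the plain "data" role also satisfies every tier):
#    node.roles: [ master, data_content, data_hot, ingest ]

# 3. Or relax the index's tier preference instead of changing node roles
PUT .siem-signals-default-000002/_settings
{
  "index.routing.allocation.include._tier_preference": "data_content,data_hot"
}

# 4. Then ask Elasticsearch to retry allocations that previously hit the retry limit
POST _cluster/reroute?retry_failed=true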

Related

Ensure minimum functionality under heavy search load (ELK)

I have a 3-node ELK cluster with the following node characteristics:
96GB RAM
25GB heap
10x CPU
6TB total index size for the past 30 days, distributed at roughly 1.5TB per node
the Filebeat index is 50GB per index, split across 3 shards
~100GB/day in total; including the 1 replica, ~200GB/day
When I view my log metrics for the past 1-7 days everything is fine, but if I open a dashboard covering the past 30 days I cannot do anything else: Kibana opened in another browser tab throws error 408, and all CPUs are at 100%.
Please advise whether there is any option to ensure that a single request does not take 100% of all CPUs on all nodes, so that I could at least see cluster monitoring from a second tab. (A possible mitigation is sketched after the stats output below.)
Thanks,
R2D2
Output from GET _cluster/stats?pretty&human
{
"_nodes" : {
"total" : 6,
"successful" : 6,
"failed" : 0
},
"cluster_name" : "elk",
"cluster_uuid" : "CztzsocFTgmKHWI_GnqDvQ",
"timestamp" : 1634106014829,
"status" : "green",
"indices" : {
"count" : 116,
"shards" : {
"total" : 304,
"primaries" : 152,
"replication" : 1.0,
"index" : {
"shards" : {
"min" : 2,
"max" : 6,
"avg" : 2.6206896551724137
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.3103448275862069
},
"replication" : {
"min" : 1.0,
"max" : 1.0,
"avg" : 1.0
}
}
},
"docs" : {
"count" : 4672727260,
"deleted" : 1262101
},
"store" : {
"size" : "4tb",
"size_in_bytes" : 4490413194045,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "58.3mb",
"memory_size_in_bytes" : 61151040,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "4.1gb",
"memory_size_in_bytes" : 4406840440,
"total_count" : 2390354,
"hit_count" : 14385,
"miss_count" : 2375969,
"cache_size" : 4760,
"cache_count" : 5724,
"evictions" : 964
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 3404,
"memory" : "60.1mb",
"memory_in_bytes" : 63090152,
"terms_memory" : "29.9mb",
"terms_memory_in_bytes" : 31446928,
"stored_fields_memory" : "3.9mb",
"stored_fields_memory_in_bytes" : 4122896,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "13.8kb",
"norms_memory_in_bytes" : 14208,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "26.2mb",
"doc_values_memory_in_bytes" : 27506120,
"index_writer_memory" : "59.6mb",
"index_writer_memory_in_bytes" : 62543712,
"version_map_memory" : "830b",
"version_map_memory_in_bytes" : 830,
"fixed_bit_set" : "1gb",
"fixed_bit_set_memory_in_bytes" : 1168657528,
"max_unsafe_auto_id_timestamp" : 1634083207166,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 2686,
"index_count" : 79
},
{
"name" : "binary",
"count" : 14,
"index_count" : 3
},
{
"name" : "boolean",
"count" : 8714,
"index_count" : 103
},
{
"name" : "byte",
"count" : 1,
"index_count" : 1
},
{
"name" : "date",
"count" : 11809,
"index_count" : 115
},
{
"name" : "double",
"count" : 2607,
"index_count" : 79
},
{
"name" : "flattened",
"count" : 641,
"index_count" : 80
},
{
"name" : "float",
"count" : 2611,
"index_count" : 88
},
{
"name" : "geo_point",
"count" : 639,
"index_count" : 80
},
{
"name" : "half_float",
"count" : 56,
"index_count" : 14
},
{
"name" : "integer",
"count" : 186,
"index_count" : 11
},
{
"name" : "ip",
"count" : 9888,
"index_count" : 80
},
{
"name" : "keyword",
"count" : 285169,
"index_count" : 113
},
{
"name" : "long",
"count" : 79464,
"index_count" : 109
},
{
"name" : "nested",
"count" : 272,
"index_count" : 95
},
{
"name" : "object",
"count" : 57866,
"index_count" : 114
},
{
"name" : "short",
"count" : 7981,
"index_count" : 80
},
{
"name" : "text",
"count" : 7837,
"index_count" : 104
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "pattern_capture",
"count" : 1,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "uax_url_email",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "unique",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 6,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 6,
"remote_cluster_client" : 3,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.10.2"
],
"os" : {
"available_processors" : 36,
"allocated_processors" : 36,
"names" : [
{
"name" : "Linux",
"count" : 6
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 8",
"count" : 6
}
],
"mem" : {
"total" : "294gb",
"total_in_bytes" : 315680096256,
"free" : "36.5gb",
"free_in_bytes" : 39295123456,
"used" : "257.4gb",
"used_in_bytes" : 276384972800,
"free_percent" : 12,
"used_percent" : 88
}
},
"process" : {
"cpu" : {
"percent" : 3
},
"open_file_descriptors" : {
"min" : 389,
"max" : 1318,
"avg" : 849
}
},
"jvm" : {
"max_uptime" : "4.6d",
"max_uptime_in_millis" : 399502750,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 6
}
],
"mem" : {
"heap_used" : "30.9gb",
"heap_used_in_bytes" : 33225823136,
"heap_max" : "78gb",
"heap_max_in_bytes" : 83751862272
},
"threads" : 428
},
"fs" : {
"total" : "5.9tb",
"total_in_bytes" : 6521808826368,
"free" : "1.8tb",
"free_in_bytes" : 2028853403648,
"available" : "1.5tb",
"available_in_bytes" : 1697288646656
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 6
},
"http_types" : {
"security4" : 6
}
},
"discovery_types" : {
"zen" : 6
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 6
}
],
"ingest" : {
"number_of_pipelines" : 4,
"processor_stats" : {
"conditional" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "18.5s",
"time_in_millis" : 18569
},
"geoip" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "11.3s",
"time_in_millis" : 11315
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"rename" : {
"count" : 615290336,
"failed" : 0,
"current" : 0,
"time" : "7.1s",
"time_in_millis" : 7100
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Output from: GET _cat/nodes?v&h=v,n,m,r,rm,du,dt,cpu,ram.percent,ram.current,heap.percent,search.query_current,heap.current,heap.total,segments.count&s=n:asc
v n m r rm du dt cpu ram.percent ram.current heap.percent search.query_current heap.current segments.count
7.10.2 elk-ingest-data-0 - cdhilrstw 96gb 1.3tb 1.9tb 100 78 74.5gb 46 0 11.7gb 1162
7.10.2 elk-ingest-data-1 - cdhilrstw 96gb 1.5tb 1.9tb 71 77 73.7gb 34 0 8.6gb 1155
7.10.2 elk-ingest-data-2 - cdhilrstw 96gb 1.5tb 1.9tb 68 100 95.9gb 52 0 13gb 1091
7.10.2 elk-master-0 * lm 2gb 568.3mb 9.7gb 53 100 1.9gb 37 0 381.9mb 0
7.10.2 elk-master-1 - lm 2gb 568.3mb 9.7gb 2 68 1.3gb 61 0 628.9mb 0
7.10.2 elk-master-2 - lm 2gb 568.3mb 9.7gb 3 68 1.3gb 55 0 572.6mb 0
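If the 30-day dashboards are what pin the CPUs, two things that are often suggested (neither is a guaranteed fix, and Kibana-generated queries cannot always be edited directly): first see what the search threads are actually doing with the hot threads API, and, for queries you do control, cap their cost with a best-effort timeout and a per-shard terminate_after. The index pattern and values below are only illustrative assumptions:

GET _nodes/hot_threads

# Where you control the request, cap its runtime and per-shard hit count
POST filebeat-*/_search
{
  "timeout": "10s",
  "terminate_after": 100000,
  "size": 0,
  "query": {
    "range": { "@timestamp": { "gte": "now-30d" } }
  }
}

Neither setting makes a heavy aggregation cheap; reducing the time range or the number of visualizations on the 30-day dashboard is usually what actually helps.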

Elasticsearch cluster NodeStatsCollector times out when collecting data

A few minutes after the Elasticsearch cluster starts, all nodes give this error:
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
All nodes give this error, yet my indexer (a Java program that indexes data with the bulk API) works fine, and there are no errors in its logs; only the Elasticsearch nodes report this.
I use 4 Elasticsearch nodes and Kibana on one machine with Windows Server 2019 Datacenter.
Elasticsearch and Kibana are version 7.10.0, and the cluster status is:
{
"_nodes" : {
"total" : 4,
"successful" : 4,
"failed" : 0
},
"cluster_name" : "es-cluster",
"cluster_uuid" : "RRhGhaElfh5lUxGfsKg",
"timestamp" : 1245375859907,
"status" : "green",
"indices" : {
"count" : 1,
"shards" : {
"total" : 9,
"primaries" : 3,
"replication" : 2.0,
"index" : {
"shards" : {
"min" : 9,
"max" : 9,
"avg" : 9.0
},
"primaries" : {
"min" : 3,
"max" : 3,
"avg" : 3.0
},
"replication" : {
"min" : 2.0,
"max" : 2.0,
"avg" : 2.0
}
}
},
"docs" : {
"count" : 0,
"deleted" : 0
},
"store" : {
"size" : "36.1mb",
"size_in_bytes" : 37936718,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"total_count" : 0,
"hit_count" : 0,
"miss_count" : 0,
"cache_size" : 0,
"cache_count" : 0,
"evictions" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 0,
"memory" : "0b",
"memory_in_bytes" : 0,
"terms_memory" : "0b",
"terms_memory_in_bytes" : 0,
"stored_fields_memory" : "0b",
"stored_fields_memory_in_bytes" : 0,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "0b",
"norms_memory_in_bytes" : 0,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "0b",
"doc_values_memory_in_bytes" : 0,
"index_writer_memory" : "233.1mb",
"index_writer_memory_in_bytes" : 244509188,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0,
"fixed_bit_set" : "0b",
"fixed_bit_set_memory_in_bytes" : 0,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "boolean",
"count" : 3,
"index_count" : 1
},
{
"name" : "date",
"count" : 4,
"index_count" : 1
},
{
"name" : "geo_point",
"count" : 1,
"index_count" : 1
},
{
"name" : "integer",
"count" : 8,
"index_count" : 1
},
{
"name" : "ip",
"count" : 2,
"index_count" : 1
},
{
"name" : "keyword",
"count" : 12,
"index_count" : 1
},
{
"name" : "object",
"count" : 1,
"index_count" : 1
},
{
"name" : "text",
"count" : 17,
"index_count" : 1
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "stop",
"count" : 3,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "standard",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "arabic_normalization",
"count" : 1,
"index_count" : 1
},
{
"name" : "decimal_digit",
"count" : 1,
"index_count" : 1
},
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "persian_normalization",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 4,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 0,
"remote_cluster_client" : 4,
"transform" : 3,
"voting_only" : 1
},
"versions" : [
"7.10.0"
],
"os" : {
"available_processors" : 48,
"allocated_processors" : 24,
"names" : [
{
"name" : "Windows Server 2019",
"count" : 4
}
],
"pretty_names" : [
{
"pretty_name" : "Windows Server 2019",
"count" : 4
}
],
"mem" : {
"total" : "383.4gb",
"total_in_bytes" : 411772076032,
"free" : "127.2gb",
"free_in_bytes" : 136611741696,
"used" : "256.2gb",
"used_in_bytes" : 275160334336,
"free_percent" : 33,
"used_percent" : 67
}
},
"process" : {
"cpu" : {
"percent" : 0
},
"open_file_descriptors" : {
"min" : -1,
"max" : -1,
"avg" : 0
}
},
"jvm" : {
"max_uptime" : "38.2m",
"max_uptime_in_millis" : 2297261,
"versions" : [
{
"version" : "14.0.2",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "14.0.2+12-46",
"vm_vendor" : "Oracle Corporation",
"bundled_jdk" : true,
"using_bundled_jdk" : false,
"count" : 4
}
],
"mem" : {
"heap_used" : "2.2gb",
"heap_used_in_bytes" : 2433056080,
"heap_max" : "50gb",
"heap_max_in_bytes" : 53687091200
},
"threads" : 153
},
"fs" : {
"total" : "6.5tb",
"total_in_bytes" : 7196607758336,
"free" : "6.2tb",
"free_in_bytes" : 6888031485952,
"available" : "6.2tb",
"available_in_bytes" : 6888031469568
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 4
},
"http_types" : {
"security4" : 4
}
},
"discovery_types" : {
"zen" : 4
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "zip",
"count" : 4
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Any idea how to solve this error?
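If the nodes are otherwise healthy and only the monitoring collector is timing out, one low-risk experiment is to raise the node-stats collection timeout (default 10s in 7.x); this only hides the symptom, so treat it as a way to buy time while finding out why the stats calls are slow. A minimal sketch, assuming the setting can be applied dynamically on your cluster (otherwise put it in elasticsearch.yml):

PUT _cluster/settings
{
  "persistent": {
    "xpack.monitoring.collection.node.stats.timeout": "30s"
  }
}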

Where does ES store data

Similar questions asked before suggested hitting:
http://127.0.0.1:9200/_nodes/stats/fs?pretty
This shows:
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "elasticsearch",
"nodes" : {
"-MS6L0okT7SwuQQzp64F0Q" : {
"timestamp" : 1568228571281,
"name" : "Bladiblah",
"transport_address" : "127.0.0.1:9300",
"host" : "127.0.0.1",
"ip" : "127.0.0.1:9300",
"roles" : [
"ingest",
"master",
"data"
],
"attributes" : {
"ml.machine_memory" : "274841600000",
"xpack.installed" : "true",
"ml.max_open_jobs" : "20"
},
"fs" : {
"timestamp" : 1568228571281,
"total" : {
"total_in_bytes" : 400052711424,
"free_in_bytes" : 317666811904,
"available_in_bytes" : 317666811904
},
"least_usage_estimate" : {
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"total_in_bytes" : 400052711424,
"available_in_bytes" : 317666963456,
"used_disk_percent" : 20.593723180814195
},
"most_usage_estimate" : {
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"total_in_bytes" : 400052711424,
"available_in_bytes" : 317666963456,
"used_disk_percent" : 20.593723180814195
},
"data" : [
{
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"mount" : "(C:)",
"type" : "NTFS",
"total_in_bytes" : 400052711424,
"free_in_bytes" : 317666811904,
"available_in_bytes" : 317666811904
}
]
}
}
}
}
This shows that quite a bit of space is used. However, when I go to:
C:\elasticsearch-7.3.1\data\nodes\0
and open its properties, the used space is only a few kilobytes. Am I missing something?
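Part of the answer is that _nodes/stats/fs reports totals for the whole volume that hosts path.data (here the entire C: drive), not just the Elasticsearch data directory, so the ~20% used figure includes everything else on that disk. To see what the indices themselves occupy, the cat APIs are a quick cross-check:

# disk.indices = space taken by shards; disk.used/disk.avail refer to the whole data-path volume
GET _cat/allocation?v

# store.size per index
GET _cat/indices?v&bytes=mb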

Why is ascending geo distance sorting faster than descending geo distance sorting

I'm using Elasticsearch 6.6 and have an index (1 shard, 1 replica) with the GeoNames (https://www.geonames.org/) dataset indexed (index size = 1.3 GB, 11.8 million geo points).
I was playing around with the geo distance sorting query, sorting the whole index relative to some origin points. After some testing I saw that ascending sorting is always faster than descending sorting. Here is an example query (I also tested with a bigger "size" parameter):
POST /geonames/_search?request_cache=false
{
"size":1,
"sort" : [
{
"_geo_distance" : {
"location" : [8, 49],
"order" : "asc",
"unit" : "m",
"mode" : "min",
"distance_type" : "arc",
"ignore_unmapped": true
}
}
]
}
Here is the response for ascending sorting (with explain and profile set to true):
{
"took" : 1374,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 11858060,
"max_score" : null,
"hits" : [
{
"_shard" : "[geonames][0]",
"_node" : "qXTymyB9QLmxhPtGEtA_mA",
"_index" : "geonames",
"_type" : "doc",
"_id" : "L781LmkBrQo0YN4qP48D",
"_score" : null,
"_source" : {
"id" : "3034701",
"name" : "ForĂȘt de Wissembourg",
"location" : {
"lat" : "49.00924",
"lon" : "8.01542"
}
},
"sort" : [
1523.4121312414704
],
"_explanation" : {
"value" : 1.0,
"description" : "*:*",
"details" : [ ]
}
}
]
},
"profile" : {
"shards" : [
{
"id" : "[qXTymyB9QLmxhPtGEtA_mA][geonames][0]",
"searches" : [
{
"query" : [
{
"type" : "MatchAllDocsQuery",
"description" : "*:*",
"time_in_nanos" : 265223567,
"breakdown" : {
"score" : 0,
"build_scorer_count" : 54,
"match_count" : 0,
"create_weight" : 10209,
"next_doc" : 253091268,
"match" : 0,
"create_weight_count" : 1,
"next_doc_count" : 11858087,
"score_count" : 0,
"build_scorer" : 263948,
"advance" : 0,
"advance_count" : 0
}
}
],
"rewrite_time" : 1097,
"collector" : [
{
"name" : "CancellableCollector",
"reason" : "search_cancelled",
"time_in_nanos" : 1044167746,
"children" : [
{
"name" : "SimpleFieldCollector",
"reason" : "search_top_hits",
"time_in_nanos" : 508296683
}
]
}
]
}
],
"aggregations" : [ ]
}
]
}
}
And here is the result for descending; I just switched the order parameter from asc to desc (also with profile and explain).
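For reference, the descending request body, reconstructed from that description (identical to the ascending query above except for the order value):

POST /geonames/_search?request_cache=false
{
  "size": 1,
  "sort": [
    {
      "_geo_distance": {
        "location": [8, 49],
        "order": "desc",
        "unit": "m",
        "mode": "min",
        "distance_type": "arc",
        "ignore_unmapped": true
      }
    }
  ]
}

The response: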
{
"took" : 2226,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 11858060,
"max_score" : null,
"hits" : [
{
"_shard" : "[geonames][0]",
"_node" : "qXTymyB9QLmxhPtGEtA_mA",
"_index" : "geonames",
"_type" : "doc",
"_id" : "Mq80LmkBrQo0YN4q11bA",
"_score" : null,
"_source" : {
"id" : "4036351",
"name" : "Bollons Seamount",
"location" : {
"lat" : "-49.66667",
"lon" : "-176.16667"
}
},
"sort" : [
1.970427111052182E7
],
"_explanation" : {
"value" : 1.0,
"description" : "*:*",
"details" : [ ]
}
}
]
},
"profile" : {
"shards" : [
{
"id" : "[qXTymyB9QLmxhPtGEtA_mA][geonames][0]",
"searches" : [
{
"query" : [
{
"type" : "MatchAllDocsQuery",
"description" : "*:*",
"time_in_nanos" : 268521404,
"breakdown" : {
"score" : 0,
"build_scorer_count" : 54,
"match_count" : 0,
"create_weight" : 9333,
"next_doc" : 256458664,
"match" : 0,
"create_weight_count" : 1,
"next_doc_count" : 11858087,
"score_count" : 0,
"build_scorer" : 195265,
"advance" : 0,
"advance_count" : 0
}
}
],
"rewrite_time" : 1142,
"collector" : [
{
"name" : "CancellableCollector",
"reason" : "search_cancelled",
"time_in_nanos" : 1898324618,
"children" : [
{
"name" : "SimpleFieldCollector",
"reason" : "search_top_hits",
"time_in_nanos" : 1368306442
}
]
}
]
}
],
"aggregations" : [ ]
}
]
}
}
So my question is: why is this the case? As I understood it, ES calculates the distance from the origin point to every other point and then sorts them. So why is descending sorting so much slower?
I asked the same question on the Elasticsearch board and got an answer.
Apparently Elasticsearch uses different search strategies/algorithms for ascending and descending distance sorting.
For descending sorting it calculates the distance from the origin to every point and then sorts.
For ascending sorting it uses bounding boxes to filter points near the origin and only calculates the distances for points inside those bounding boxes.

ElasticSearch Freezes and Crashes

I have set up a cluster with 3 Elasticsearch instances, and they are being fed with documents from Logstash (~165K docs per minute). The 3 machines have 16GB RAM each, and each instance is started with an 8GB heap.
Indexing works quite well, and I can perform all the search operations I was expecting. The problem is that I now want to make the cluster generally available, but depending on the queries that are executed (e.g. range facets over all indexes), it freezes the entire cluster and ends up in a split-brain state.
I already limited some things like:
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
My entire config file looks like:
index.number_of_shards: 10
index.number_of_replicas: 0
bootstrap.mlockall: true
# Indices settings
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Cache Sizes
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
Is there anything else I could improve here to avoid such freezes and the split-brain state?
Output of my nodes info:
{
"cluster_name" : "elasticsearch",
"nodes" : {
"7i5sZj_jT_qe6HNESfzO3A" : {
"name" : "Captain Fate",
"transport_address" : "inet[/192.168.0.83:9300]",
"host" : "esserver02",
"ip" : "192.168.0.83",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.83:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Captain Fate",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 8482,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 8482,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1411976625093,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9300]",
"publish_address" : "inet[/192.168.0.83:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9200]",
"publish_address" : "inet[/192.168.0.83:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"0OaMqY6IR1SYeL6rd6P61Q" : {
"name" : "Blonde Phantom",
"transport_address" : "inet[/192.168.0.100:9300]",
"host" : "esserver03",
"ip" : "192.168.0.100",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.100:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Blonde Phantom",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 98772,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 98772,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657551806,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.100:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.100:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"H2h01oNGSuCL0uu8J3SF6w" : {
"name" : "Dakimh the Enchanter",
"transport_address" : "inet[/192.168.0.101:9300]",
"host" : "esserver04",
"ip" : "192.168.0.101",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.101:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Dakimh the Enchanter",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 88019,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 88019,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657560829,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.101:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.101:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
}
}
}
My old configuration:
index.number_of_shards: 40
index.number_of_replicas: 0
bootstrap.mlockall: true
## Threadpool Settings ##
# Indices settings
indices.memory.index_buffer_size: 50%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
Your indices.fielddata.cache.size is set to only 15%. Why?
These data are used for aggregations/facets, so it could be related. You should remove indices.fielddata.cache.expire ASAP: this setting is really not recommended at all, as evictions are very costly, and it schedules evictions of fielddata values even if they are still in use. Could you give us the result of a node stats API call?
Update 1:
I see minimum_master_nodes is set to 1, but you say you have 3 nodes. This should be set to 2 according to the usual formula (number of nodes / 2 + 1, see the documentation).
Update 2:
Do you still experience split-brains with the updated configuration?
Depending on your cluster's ES version (> 1.0), you might want to tune the fielddata circuit breaker to prevent the most costly requests from running: take a look here.
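A minimal elasticsearch.yml sketch of the changes discussed above, assuming an ES 1.x cluster like the one shown (the circuit-breaker setting was renamed during the 1.x series, so check the exact name for your version; the 60% value is only an example):

# Require a majority of the 3 master-eligible nodes before electing a master
discovery.zen.minimum_master_nodes: 2

# Drop the time-based fielddata expiry entirely (evictions are costly)
# indices.fielddata.cache.expire: 6h   <-- remove this line from your config

# Fielddata circuit breaker (indices.fielddata.breaker.limit in early 1.x,
# renamed to indices.breaker.fielddata.limit in 1.4)
indices.fielddata.breaker.limit: 60%

The node stats requested in the answer can be pulled with GET /_nodes/stats.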
