ElasticSearch Freezes and Crashes - elasticsearch

I have set up a cluster with 3 Elasticsearch instances, and they are being fed documents by Logstash (~165K docs per minute). Each of the 3 machines has 16 GB of RAM, and each instance is started with 8 GB.
The indexing works quite well, and I'm able to perform all the search operations I was expecting. I now want to make it generally available, but unfortunately, depending on the queries that are executed (e.g. range facets over all indexes), the entire cluster freezes and ends up in a split-brain state.
I already limited some things like:
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
My entire config file looks like:
index.number_of_shards: 10
index.number_of_replicas: 0
bootstrap.mlockall: true
# Indices settings
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Cache Sizes
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
Is there something else I could improve here to avoid such freeze and split brain state?
Output of my nodes info:
{
"cluster_name" : "elasticsearch",
"nodes" : {
"7i5sZj_jT_qe6HNESfzO3A" : {
"name" : "Captain Fate",
"transport_address" : "inet[/192.168.0.83:9300]",
"host" : "esserver02",
"ip" : "192.168.0.83",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.83:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Captain Fate",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 8482,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 8482,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1411976625093,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9300]",
"publish_address" : "inet[/192.168.0.83:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9200]",
"publish_address" : "inet[/192.168.0.83:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"0OaMqY6IR1SYeL6rd6P61Q" : {
"name" : "Blonde Phantom",
"transport_address" : "inet[/192.168.0.100:9300]",
"host" : "esserver03",
"ip" : "192.168.0.100",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.100:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Blonde Phantom",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 98772,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 98772,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657551806,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.100:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.100:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"H2h01oNGSuCL0uu8J3SF6w" : {
"name" : "Dakimh the Enchanter",
"transport_address" : "inet[/192.168.0.101:9300]",
"host" : "esserver04",
"ip" : "192.168.0.101",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.101:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Dakimh the Enchanter",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 88019,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 88019,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657560829,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.101:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.101:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
}
}
}
My old configuration:
index.number_of_shards: 40
index.number_of_replicas: 0
bootstrap.mlockall: true
## Threadpool Settings ##
# Indices settings
indices.memory.index_buffer_size: 50%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000

Your indices.fielddata.cache.size is set to 15% only. Why?
This data is used for aggregations/facets, so it could be related. You should remove indices.fielddata.cache.expire ASAP: this setting is really not recommended, because evictions are very costly and it schedules evictions of fielddata values even while they are in use. Could you give us the result of the node stats API?
Update 1 :
I see minimum_master_nodes is set to 1, but you say you have 3 nodes. It should be set to 2, according to the commonly used formula (number of master-eligible nodes / 2 + 1, rounded down; see the documentation).
Update 2 :
Do you still experience split-brains with the updated configuration?
Depending on your cluster's ES version (> 1.0), you might want to tune the fielddata circuit breaker to prevent the most costly requests from being executed: take a look here.

Related

ensure minimum functionality under heavy search load elk

I have a 3-node ELK cluster with the following node characteristics:
96GB RAM
25GB heap
10x CPU
6TB total indexes size for past 30 days distributed in 1.5TB per node
the filebeat index 50GB per index split across 3x shards
~ total 100GB/day , including the 1xreplica ~ 200GB/day
The thing is that when viewing my log metrics for the past 1-7 days everything is fine, but if I open a dashboard to see details for the past 30 days I cannot do anything else, and if I open Kibana in another browser tab it throws error 408; all CPUs are at 100%.
Please advise whether there is any option to ensure that a single request will not take 100% of all CPUs on all nodes, so that I can at least see the cluster monitoring from a second tab.
Thanks,
R2D2
Output from GET _cluster/stats?pretty&human
{
"_nodes" : {
"total" : 6,
"successful" : 6,
"failed" : 0
},
"cluster_name" : "elk",
"cluster_uuid" : "CztzsocFTgmKHWI_GnqDvQ",
"timestamp" : 1634106014829,
"status" : "green",
"indices" : {
"count" : 116,
"shards" : {
"total" : 304,
"primaries" : 152,
"replication" : 1.0,
"index" : {
"shards" : {
"min" : 2,
"max" : 6,
"avg" : 2.6206896551724137
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.3103448275862069
},
"replication" : {
"min" : 1.0,
"max" : 1.0,
"avg" : 1.0
}
}
},
"docs" : {
"count" : 4672727260,
"deleted" : 1262101
},
"store" : {
"size" : "4tb",
"size_in_bytes" : 4490413194045,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "58.3mb",
"memory_size_in_bytes" : 61151040,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "4.1gb",
"memory_size_in_bytes" : 4406840440,
"total_count" : 2390354,
"hit_count" : 14385,
"miss_count" : 2375969,
"cache_size" : 4760,
"cache_count" : 5724,
"evictions" : 964
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 3404,
"memory" : "60.1mb",
"memory_in_bytes" : 63090152,
"terms_memory" : "29.9mb",
"terms_memory_in_bytes" : 31446928,
"stored_fields_memory" : "3.9mb",
"stored_fields_memory_in_bytes" : 4122896,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "13.8kb",
"norms_memory_in_bytes" : 14208,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "26.2mb",
"doc_values_memory_in_bytes" : 27506120,
"index_writer_memory" : "59.6mb",
"index_writer_memory_in_bytes" : 62543712,
"version_map_memory" : "830b",
"version_map_memory_in_bytes" : 830,
"fixed_bit_set" : "1gb",
"fixed_bit_set_memory_in_bytes" : 1168657528,
"max_unsafe_auto_id_timestamp" : 1634083207166,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 2686,
"index_count" : 79
},
{
"name" : "binary",
"count" : 14,
"index_count" : 3
},
{
"name" : "boolean",
"count" : 8714,
"index_count" : 103
},
{
"name" : "byte",
"count" : 1,
"index_count" : 1
},
{
"name" : "date",
"count" : 11809,
"index_count" : 115
},
{
"name" : "double",
"count" : 2607,
"index_count" : 79
},
{
"name" : "flattened",
"count" : 641,
"index_count" : 80
},
{
"name" : "float",
"count" : 2611,
"index_count" : 88
},
{
"name" : "geo_point",
"count" : 639,
"index_count" : 80
},
{
"name" : "half_float",
"count" : 56,
"index_count" : 14
},
{
"name" : "integer",
"count" : 186,
"index_count" : 11
},
{
"name" : "ip",
"count" : 9888,
"index_count" : 80
},
{
"name" : "keyword",
"count" : 285169,
"index_count" : 113
},
{
"name" : "long",
"count" : 79464,
"index_count" : 109
},
{
"name" : "nested",
"count" : 272,
"index_count" : 95
},
{
"name" : "object",
"count" : 57866,
"index_count" : 114
},
{
"name" : "short",
"count" : 7981,
"index_count" : 80
},
{
"name" : "text",
"count" : 7837,
"index_count" : 104
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "pattern_capture",
"count" : 1,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "uax_url_email",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "unique",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 6,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 6,
"remote_cluster_client" : 3,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.10.2"
],
"os" : {
"available_processors" : 36,
"allocated_processors" : 36,
"names" : [
{
"name" : "Linux",
"count" : 6
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 8",
"count" : 6
}
],
"mem" : {
"total" : "294gb",
"total_in_bytes" : 315680096256,
"free" : "36.5gb",
"free_in_bytes" : 39295123456,
"used" : "257.4gb",
"used_in_bytes" : 276384972800,
"free_percent" : 12,
"used_percent" : 88
}
},
"process" : {
"cpu" : {
"percent" : 3
},
"open_file_descriptors" : {
"min" : 389,
"max" : 1318,
"avg" : 849
}
},
"jvm" : {
"max_uptime" : "4.6d",
"max_uptime_in_millis" : 399502750,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 6
}
],
"mem" : {
"heap_used" : "30.9gb",
"heap_used_in_bytes" : 33225823136,
"heap_max" : "78gb",
"heap_max_in_bytes" : 83751862272
},
"threads" : 428
},
"fs" : {
"total" : "5.9tb",
"total_in_bytes" : 6521808826368,
"free" : "1.8tb",
"free_in_bytes" : 2028853403648,
"available" : "1.5tb",
"available_in_bytes" : 1697288646656
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 6
},
"http_types" : {
"security4" : 6
}
},
"discovery_types" : {
"zen" : 6
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 6
}
],
"ingest" : {
"number_of_pipelines" : 4,
"processor_stats" : {
"conditional" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "18.5s",
"time_in_millis" : 18569
},
"geoip" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "11.3s",
"time_in_millis" : 11315
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"rename" : {
"count" : 615290336,
"failed" : 0,
"current" : 0,
"time" : "7.1s",
"time_in_millis" : 7100
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Output from : GET _cat/nodes?v&h=v,n,m,r,rm,du,dt,cpu,ram.percent,ram.current,heap.percent,search.query_current,heap.current,heap.total,segments.count&s=n:asc
v n m r rm du dt cpu ram.percent ram.current heap.percent search.query_current heap.current segments.count
7.10.2 elk-ingest-data-0 - cdhilrstw 96gb 1.3tb 1.9tb 100 78 74.5gb 46 0 11.7gb 1162
7.10.2 elk-ingest-data-1 - cdhilrstw 96gb 1.5tb 1.9tb 71 77 73.7gb 34 0 8.6gb 1155
7.10.2 elk-ingest-data-2 - cdhilrstw 96gb 1.5tb 1.9tb 68 100 95.9gb 52 0 13gb 1091
7.10.2 elk-master-0 * lm 2gb 568.3mb 9.7gb 53 100 1.9gb 37 0 381.9mb 0
7.10.2 elk-master-1 - lm 2gb 568.3mb 9.7gb 2 68 1.3gb 61 0 628.9mb 0
7.10.2 elk-master-2 - lm 2gb 568.3mb 9.7gb 3 68 1.3gb 55 0 572.6mb 0

Elasticsearch cluster NodeStatsCollector time out when collecting data

A few minutes after the Elasticsearch cluster starts running, all nodes give the error:
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
[ERROR][o.e.x.m.c.n.NodeStatsCollector] [node-1] collector [node_stats] timed out when collecting data
All nodes give this error, while my indexer (a Java program) indexes data with the bulk API without problems. There are no errors in the logs of my indexer program, but the Elasticsearch nodes report this error.
I run 4 Elasticsearch nodes and Kibana on one machine with Windows Server 2019 Datacenter.
the version of elasticsearch and kibana is 7.10.0 and Cluster Status is :
{
"_nodes" : {
"total" : 4,
"successful" : 4,
"failed" : 0
},
"cluster_name" : "es-cluster",
"cluster_uuid" : "RRhGhaElfh5lUxGfsKg",
"timestamp" : 1245375859907,
"status" : "green",
"indices" : {
"count" : 1,
"shards" : {
"total" : 9,
"primaries" : 3,
"replication" : 2.0,
"index" : {
"shards" : {
"min" : 9,
"max" : 9,
"avg" : 9.0
},
"primaries" : {
"min" : 3,
"max" : 3,
"avg" : 3.0
},
"replication" : {
"min" : 2.0,
"max" : 2.0,
"avg" : 2.0
}
}
},
"docs" : {
"count" : 0,
"deleted" : 0
},
"store" : {
"size" : "36.1mb",
"size_in_bytes" : 37936718,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"total_count" : 0,
"hit_count" : 0,
"miss_count" : 0,
"cache_size" : 0,
"cache_count" : 0,
"evictions" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 0,
"memory" : "0b",
"memory_in_bytes" : 0,
"terms_memory" : "0b",
"terms_memory_in_bytes" : 0,
"stored_fields_memory" : "0b",
"stored_fields_memory_in_bytes" : 0,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "0b",
"norms_memory_in_bytes" : 0,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "0b",
"doc_values_memory_in_bytes" : 0,
"index_writer_memory" : "233.1mb",
"index_writer_memory_in_bytes" : 244509188,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0,
"fixed_bit_set" : "0b",
"fixed_bit_set_memory_in_bytes" : 0,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "boolean",
"count" : 3,
"index_count" : 1
},
{
"name" : "date",
"count" : 4,
"index_count" : 1
},
{
"name" : "geo_point",
"count" : 1,
"index_count" : 1
},
{
"name" : "integer",
"count" : 8,
"index_count" : 1
},
{
"name" : "ip",
"count" : 2,
"index_count" : 1
},
{
"name" : "keyword",
"count" : 12,
"index_count" : 1
},
{
"name" : "object",
"count" : 1,
"index_count" : 1
},
{
"name" : "text",
"count" : 17,
"index_count" : 1
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "stop",
"count" : 3,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "standard",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "arabic_normalization",
"count" : 1,
"index_count" : 1
},
{
"name" : "decimal_digit",
"count" : 1,
"index_count" : 1
},
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "persian_normalization",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 4,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 0,
"remote_cluster_client" : 4,
"transform" : 3,
"voting_only" : 1
},
"versions" : [
"7.10.0"
],
"os" : {
"available_processors" : 48,
"allocated_processors" : 24,
"names" : [
{
"name" : "Windows Server 2019",
"count" : 4
}
],
"pretty_names" : [
{
"pretty_name" : "Windows Server 2019",
"count" : 4
}
],
"mem" : {
"total" : "383.4gb",
"total_in_bytes" : 411772076032,
"free" : "127.2gb",
"free_in_bytes" : 136611741696,
"used" : "256.2gb",
"used_in_bytes" : 275160334336,
"free_percent" : 33,
"used_percent" : 67
}
},
"process" : {
"cpu" : {
"percent" : 0
},
"open_file_descriptors" : {
"min" : -1,
"max" : -1,
"avg" : 0
}
},
"jvm" : {
"max_uptime" : "38.2m",
"max_uptime_in_millis" : 2297261,
"versions" : [
{
"version" : "14.0.2",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "14.0.2+12-46",
"vm_vendor" : "Oracle Corporation",
"bundled_jdk" : true,
"using_bundled_jdk" : false,
"count" : 4
}
],
"mem" : {
"heap_used" : "2.2gb",
"heap_used_in_bytes" : 2433056080,
"heap_max" : "50gb",
"heap_max_in_bytes" : 53687091200
},
"threads" : 153
},
"fs" : {
"total" : "6.5tb",
"total_in_bytes" : 7196607758336,
"free" : "6.2tb",
"free_in_bytes" : 6888031485952,
"available" : "6.2tb",
"available_in_bytes" : 6888031469568
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 4
},
"http_types" : {
"security4" : 4
}
},
"discovery_types" : {
"zen" : 4
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "zip",
"count" : 4
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Any idea how to solve this error?

Elasticsearch ngram tokenizer returns all results regardless of query input

I am trying to build a query to search for records in the following format: TR000002_1_2020.
Users should be able to search for results the following ways:
TR000002 or 2_1_2020 or TR000002_1_2020 or 2020. I figured an ngram tokenization query would be best suited for my needs. I am using Elasticsearch 6.8 so I cannot use the built in Search-As-You-Type introduced in E7.
Here's my implementation I followed from docs here. The only thing I modified was EdgeNGram -> NGram as the user can search from any point of the text.
My Analysis block looks like this:
.Analysis(a => a
.Analyzers(aa => aa
.Custom("autocomplete", ca => ca
.Tokenizer("autocomplete")
.Filters(new string[] {
"lowercase"
})
)
.Custom("autocomplete_search", ca => ca
.Tokenizer("lowercase")
)
)
.Tokenizers(t => t
.NGram("autocomplete", e => e
.MinGram(2)
.MaxGram(16)
.TokenChars(new TokenChar[] {
TokenChar.Letter,
TokenChar.Digit,
TokenChar.Punctuation,
TokenChar.Symbol
})
)
)
)
Then in my mapping I define:
.Text(t => t
.Name(tr => tr.TestRecordId)
.Analyzer("autocomplete")
.SearchAnalyzer("autocomplete_search")
)
When I search for TR000002, my query returns all results instead of just the records that contain those specific characters. What am I doing wrong? Is there a better tokenizer for this specific use case? Thanks!
EDIT: Here's a sample of what is returned:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 27,
"max_score" : 0.105360515,
"hits" : [
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "3",
"_score" : 0.105360515,
"_source" : {
"id" : 3,
"testRecordId" : "TR000002_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 12,
"testStatus" : {
"testStatusId" : 12,
"name" : "Complete: Postponed Until Further Notice"
},
"discriminator" : "SingleEventEffectsRecord",
"testRecordServiceOrders" : [
{
"testRecordId" : 3,
"serviceOrderId" : 9,
"serviceOrder" : {
"serviceOrderId" : 9,
"serviceOrderNumber" : "105702"
}
}
],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
},
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"sEETestRates" : [ ]
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "11",
"_score" : 0.105360515,
"_source" : {
"id" : 11,
"testRecordId" : "TR000011_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"partLDC" : "12",
"waferLot" : "1",
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"testStartDate" : "2020-07-30T00:00:00",
"actualCompletionDate" : "2020-07-31T00:00:00"
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "17",
"_score" : 0.105360515,
"_source" : {
"id" : 17,
"testRecordId" : "TR000017_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "lewallen",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false
}
},
Also here's what shows for mapping:
"testRecordId" : {
"type" : "text",
"analyzer" : "autocomplete",
"search_analyzer" : "autocomplete_search"
},
I guess I should also mention, I've been testing this query in the console like so:
GET test-records-development/_search
{
"query": {
"match": {
"testRecordId": {
"query": "TR000002_1_2020"
}
}
}
}
EDIT 2: Added API response from index _settings endpoint:
{
"test-records-development-09-09-2020-02-00-00" : {
"settings" : {
"index" : {
"number_of_shards" : "5",
"provided_name" : "test-records-development-09-09-2020-02-00-00",
"creation_date" : "1599617013874",
"analysis" : {
"analyzer" : {
"autocomplete" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "autocomplete"
},
"autocomplete_search" : {
"type" : "custom",
"tokenizer" : "lowercase"
}
},
"tokenizer" : {
"autocomplete" : {
"token_chars" : [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram" : "2",
"type" : "ngram",
"max_gram" : "16"
}
}
},
"number_of_replicas" : "0",
"uuid" : "FSeCa0YwRCOJVbjfxYGkig",
"version" : {
"created" : "6080199"
}
}
}
}
}
As I don't have access to the analyzer settings in JSON format, I can't confirm it, but most probably the issue is with your search analyzer autocomplete_search, which creates search-time tokens that match the index-time tokens.
For example: you are searching for TR000002_1_2020; if the search analyzer creates 2020 as a token, and the document containing TR000011_1_2020 also has a 2020 token in the index, then your query will match it.
You can use the analyze API to check the tokens generated by an analyzer; as mentioned earlier, most likely some tokens match, as shown above.

where does ES store data

Similar questions were asked before, suggesting to hit:
http://127.0.0.1:9200/_nodes/stats/fs?pretty
This shows:
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "elasticsearch",
"nodes" : {
"-MS6L0okT7SwuQQzp64F0Q" : {
"timestamp" : 1568228571281,
"name" : "Bladiblah",
"transport_address" : "127.0.0.1:9300",
"host" : "127.0.0.1",
"ip" : "127.0.0.1:9300",
"roles" : [
"ingest",
"master",
"data"
],
"attributes" : {
"ml.machine_memory" : "274841600000",
"xpack.installed" : "true",
"ml.max_open_jobs" : "20"
},
"fs" : {
"timestamp" : 1568228571281,
"total" : {
"total_in_bytes" : 400052711424,
"free_in_bytes" : 317666811904,
"available_in_bytes" : 317666811904
},
"least_usage_estimate" : {
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"total_in_bytes" : 400052711424,
"available_in_bytes" : 317666963456,
"used_disk_percent" : 20.593723180814195
},
"most_usage_estimate" : {
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"total_in_bytes" : 400052711424,
"available_in_bytes" : 317666963456,
"used_disk_percent" : 20.593723180814195
},
"data" : [
{
"path" : "C:\\elasticsearch-7.3.1\\data\\nodes\\0",
"mount" : "(C:)",
"type" : "NTFS",
"total_in_bytes" : 400052711424,
"free_in_bytes" : 317666811904,
"available_in_bytes" : 317666811904
}
]
}
}
}
}
This shows that quite a bit of disk space is used. However, when I go to:
C:\elasticsearch-7.3.1\data\nodes\0
and open its properties, the used space is only a few kilobytes. Am I missing something?

Elasticsearch: sum of total term frequency in ONE document

I need the sumttf of a field for ONE document. However, I can only get the sumttf over all documents...
I need to be able to access this value in a script, like _index['field'].sumttf(), but for that particular document. This is what I've got so far.
Mapping:
{"document2" : {
"mappings" : {
"document2" : {
"_all" : {
"enabled" : false
},
"properties" : {
"content" : {
"type" : "string",
"term_vector" : "yes",
"fields" : {
"with_shingles" : {
"type" : "string",
"analyzer" : "my_shingle_analyzer"
}
}
},
...
Term vector:
"_index" : "document2",
"_type" : "document2",
"_id" : "709718",
"_version" : 1,
"term_vectors" : {
"content" : {
"field_statistics" : {
"sum_doc_freq" : 60676474,
"doc_count" : 198373,
"sum_ttf" : 224960172
},
"terms" : {
"0" : {
"term_freq" : 8
},
"0.5" : {
"term_freq" : 1
},
"003a0e45ea07a" : {
"term_freq" : 1
},
"005" : {
"term_freq" : 1
},
"0081989" : {
"term_freq" : 1
},
"01" : {
"term_freq" : 1
},
"01.08.2002" : {
"term_freq" : 1
},
...

Resources