Related
I have a 3-node ELK cluster with the following node characteristics:
96GB RAM
25GB heap
10x CPU
6TB total indexes size for past 30 days distributed in 1.5TB per node
the filebeat index 50GB per index split across 3x shards
~ total 100GB/day , including the 1xreplica ~ 200GB/day
The thing is that when viewing my log metrics for the past 1-7 days everything is fine, but if I open a dashboard to see details for the past 30 days I cannot do anything else, and if I open Kibana in another browser tab it throws error 408; all CPUs are at 100%.
Please advise whether there is any option to ensure that a single request will not take 100% of all CPUs on all nodes, so that I would at least be able to see the cluster monitoring from a second tab.
Thanks,
R2D2
Output from GET _cluster/stats?pretty&human
{
"_nodes" : {
"total" : 6,
"successful" : 6,
"failed" : 0
},
"cluster_name" : "elk",
"cluster_uuid" : "CztzsocFTgmKHWI_GnqDvQ",
"timestamp" : 1634106014829,
"status" : "green",
"indices" : {
"count" : 116,
"shards" : {
"total" : 304,
"primaries" : 152,
"replication" : 1.0,
"index" : {
"shards" : {
"min" : 2,
"max" : 6,
"avg" : 2.6206896551724137
},
"primaries" : {
"min" : 1,
"max" : 3,
"avg" : 1.3103448275862069
},
"replication" : {
"min" : 1.0,
"max" : 1.0,
"avg" : 1.0
}
}
},
"docs" : {
"count" : 4672727260,
"deleted" : 1262101
},
"store" : {
"size" : "4tb",
"size_in_bytes" : 4490413194045,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "58.3mb",
"memory_size_in_bytes" : 61151040,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "4.1gb",
"memory_size_in_bytes" : 4406840440,
"total_count" : 2390354,
"hit_count" : 14385,
"miss_count" : 2375969,
"cache_size" : 4760,
"cache_count" : 5724,
"evictions" : 964
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 3404,
"memory" : "60.1mb",
"memory_in_bytes" : 63090152,
"terms_memory" : "29.9mb",
"terms_memory_in_bytes" : 31446928,
"stored_fields_memory" : "3.9mb",
"stored_fields_memory_in_bytes" : 4122896,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "13.8kb",
"norms_memory_in_bytes" : 14208,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "26.2mb",
"doc_values_memory_in_bytes" : 27506120,
"index_writer_memory" : "59.6mb",
"index_writer_memory_in_bytes" : 62543712,
"version_map_memory" : "830b",
"version_map_memory_in_bytes" : 830,
"fixed_bit_set" : "1gb",
"fixed_bit_set_memory_in_bytes" : 1168657528,
"max_unsafe_auto_id_timestamp" : 1634083207166,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 2686,
"index_count" : 79
},
{
"name" : "binary",
"count" : 14,
"index_count" : 3
},
{
"name" : "boolean",
"count" : 8714,
"index_count" : 103
},
{
"name" : "byte",
"count" : 1,
"index_count" : 1
},
{
"name" : "date",
"count" : 11809,
"index_count" : 115
},
{
"name" : "double",
"count" : 2607,
"index_count" : 79
},
{
"name" : "flattened",
"count" : 641,
"index_count" : 80
},
{
"name" : "float",
"count" : 2611,
"index_count" : 88
},
{
"name" : "geo_point",
"count" : 639,
"index_count" : 80
},
{
"name" : "half_float",
"count" : 56,
"index_count" : 14
},
{
"name" : "integer",
"count" : 186,
"index_count" : 11
},
{
"name" : "ip",
"count" : 9888,
"index_count" : 80
},
{
"name" : "keyword",
"count" : 285169,
"index_count" : 113
},
{
"name" : "long",
"count" : 79464,
"index_count" : 109
},
{
"name" : "nested",
"count" : 272,
"index_count" : 95
},
{
"name" : "object",
"count" : 57866,
"index_count" : 114
},
{
"name" : "short",
"count" : 7981,
"index_count" : 80
},
{
"name" : "text",
"count" : 7837,
"index_count" : 104
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [
{
"name" : "pattern_capture",
"count" : 1,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [
{
"name" : "uax_url_email",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "unique",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" : [ ]
}
},
"nodes" : {
"count" : {
"total" : 6,
"coordinating_only" : 0,
"data" : 3,
"data_cold" : 3,
"data_content" : 3,
"data_hot" : 3,
"data_warm" : 3,
"ingest" : 3,
"master" : 3,
"ml" : 6,
"remote_cluster_client" : 3,
"transform" : 3,
"voting_only" : 0
},
"versions" : [
"7.10.2"
],
"os" : {
"available_processors" : 36,
"allocated_processors" : 36,
"names" : [
{
"name" : "Linux",
"count" : 6
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 8",
"count" : 6
}
],
"mem" : {
"total" : "294gb",
"total_in_bytes" : 315680096256,
"free" : "36.5gb",
"free_in_bytes" : 39295123456,
"used" : "257.4gb",
"used_in_bytes" : 276384972800,
"free_percent" : 12,
"used_percent" : 88
}
},
"process" : {
"cpu" : {
"percent" : 3
},
"open_file_descriptors" : {
"min" : 389,
"max" : 1318,
"avg" : 849
}
},
"jvm" : {
"max_uptime" : "4.6d",
"max_uptime_in_millis" : 399502750,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 6
}
],
"mem" : {
"heap_used" : "30.9gb",
"heap_used_in_bytes" : 33225823136,
"heap_max" : "78gb",
"heap_max_in_bytes" : 83751862272
},
"threads" : 428
},
"fs" : {
"total" : "5.9tb",
"total_in_bytes" : 6521808826368,
"free" : "1.8tb",
"free_in_bytes" : 2028853403648,
"available" : "1.5tb",
"available_in_bytes" : 1697288646656
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 6
},
"http_types" : {
"security4" : 6
}
},
"discovery_types" : {
"zen" : 6
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "docker",
"count" : 6
}
],
"ingest" : {
"number_of_pipelines" : 4,
"processor_stats" : {
"conditional" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "18.5s",
"time_in_millis" : 18569
},
"geoip" : {
"count" : 307645168,
"failed" : 0,
"current" : 0,
"time" : "11.3s",
"time_in_millis" : 11315
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"rename" : {
"count" : 615290336,
"failed" : 0,
"current" : 0,
"time" : "7.1s",
"time_in_millis" : 7100
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
Output from : GET _cat/nodes?v&h=v,n,m,r,rm,du,dt,cpu,ram.percent,ram.current,heap.percent,search.query_current,heap.current,heap.total,segments.count&s=n:asc
v n m r rm du dt cpu ram.percent ram.current heap.percent search.query_current heap.current segments.count
7.10.2 elk-ingest-data-0 - cdhilrstw 96gb 1.3tb 1.9tb 100 78 74.5gb 46 0 11.7gb 1162
7.10.2 elk-ingest-data-1 - cdhilrstw 96gb 1.5tb 1.9tb 71 77 73.7gb 34 0 8.6gb 1155
7.10.2 elk-ingest-data-2 - cdhilrstw 96gb 1.5tb 1.9tb 68 100 95.9gb 52 0 13gb 1091
7.10.2 elk-master-0 * lm 2gb 568.3mb 9.7gb 53 100 1.9gb 37 0 381.9mb 0
7.10.2 elk-master-1 - lm 2gb 568.3mb 9.7gb 2 68 1.3gb 61 0 628.9mb 0
7.10.2 elk-master-2 - lm 2gb 568.3mb 9.7gb 3 68 1.3gb 55 0 572.6mb 0
I am trying to build a query to search for records in the following format: TR000002_1_2020.
Users should be able to search for results the following ways:
TR000002 or 2_1_2020 or TR000002_1_2020 or 2020. I figured an ngram tokenization query would be best suited for my needs. I am using Elasticsearch 6.8, so I cannot use the built-in Search-As-You-Type feature introduced in Elasticsearch 7.
Here's my implementation I followed from docs here. The only thing I modified was EdgeNGram -> NGram as the user can search from any point of the text.
My Analysis block looks like this:
.Analysis(a => a
.Analyzers(aa => aa
.Custom("autocomplete", ca => ca
.Tokenizer("autocomplete")
.Filters(new string[] {
"lowercase"
})
)
.Custom("autocomplete_search", ca => ca
.Tokenizer("lowercase")
)
)
.Tokenizers(t => t
.NGram("autocomplete", e => e
.MinGram(2)
.MaxGram(16)
.TokenChars(new TokenChar[] {
TokenChar.Letter,
TokenChar.Digit,
TokenChar.Punctuation,
TokenChar.Symbol
})
)
)
)
Then in my mapping I define:
.Text(t => t
.Name(tr => tr.TestRecordId)
.Analyzer("autocomplete")
.SearchAnalyzer("autocomplete_search")
)
When I search for TR000002, my query returns all results instead of just the records that contain those specific characters. What am I doing wrong? Is there a better tokenizer for this specific use case? Thanks!
EDIT: Here's a sample of what is returned:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 27,
"max_score" : 0.105360515,
"hits" : [
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "3",
"_score" : 0.105360515,
"_source" : {
"id" : 3,
"testRecordId" : "TR000002_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 12,
"testStatus" : {
"testStatusId" : 12,
"name" : "Complete: Postponed Until Further Notice"
},
"discriminator" : "SingleEventEffectsRecord",
"testRecordServiceOrders" : [
{
"testRecordId" : 3,
"serviceOrderId" : 9,
"serviceOrder" : {
"serviceOrderId" : 9,
"serviceOrderNumber" : "105702"
}
}
],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
},
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"sEETestRates" : [ ]
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "11",
"_score" : 0.105360515,
"_source" : {
"id" : 11,
"testRecordId" : "TR000011_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"partLDC" : "12",
"waferLot" : "1",
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false,
"testStartDate" : "2020-07-30T00:00:00",
"actualCompletionDate" : "2020-07-31T00:00:00"
}
},
{
"_index" : "test-records-development-09-09-2020-02-00-00",
"_type" : "testrecorddto",
"_id" : "17",
"_score" : 0.105360515,
"_source" : {
"id" : 17,
"testRecordId" : "TR000017_1_2020",
"type" : 0,
"typeName" : "TIDCo60",
"missionId" : 1,
"mission" : {
"missionId" : 1,
"name" : "[REDACTED]",
"mRPLUsername" : "[REDACTED]",
"missionRadiationPartsLead" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"missionInstruments" : [
{
"missionId" : 1,
"instrumentId" : 1,
"cognizantEngineerUsername" : "[REDACTED]",
"instrument" : {
"intstrumentId" : 1,
"name" : "Instrument"
},
"cognizantEngineer" : {
"username" : "lewallen",
"displayName" : "[REDACTED]"
}
},
{
"missionId" : 1,
"instrumentId" : 2,
"instrument" : {
"intstrumentId" : 2,
"name" : "Instrument 2"
}
}
]
},
"procurementPartId" : 2,
"procurementPart" : {
"procurementPartId" : 2,
"partNumber" : "procurement part",
"part" : {
"partId" : 1,
"manufacturer" : "Texas Instruments",
"genericPartNumber" : "123",
"description" : "description",
"partTechnology" : "Part Tech"
}
},
"testStatusId" : 1,
"testStatus" : {
"testStatusId" : 1,
"name" : "Active"
},
"discriminator" : "TotalIonizingDoseRecord",
"creatorUsername" : "[REDACTED]",
"creator" : {
"username" : "[REDACTED]",
"displayName" : "[REDACTED]"
},
"testRecordServiceOrders" : [ ],
"rtdbFiles" : [ ],
"personnelGroups" : [
{
"personnelGroupUsers" : [ ]
}
],
"testRecordTestSubTypes" : [ ],
"testRecordTestFacilityConditions" : [ ],
"testRecordFollowers" : [ ],
"isDeleted" : false
}
},
Also here's what shows for mapping:
"testRecordId" : {
"type" : "text",
"analyzer" : "autocomplete",
"search_analyzer" : "autocomplete_search"
},
I guess I should also mention, I've been testing this query in the console like so:
GET test-records-development/_search
{
"query": {
"match": {
"testRecordId": {
"query": "TR000002_1_2020"
}
}
}
}
EDIT 2: Added API response from index _settings endpoint:
{
"test-records-development-09-09-2020-02-00-00" : {
"settings" : {
"index" : {
"number_of_shards" : "5",
"provided_name" : "test-records-development-09-09-2020-02-00-00",
"creation_date" : "1599617013874",
"analysis" : {
"analyzer" : {
"autocomplete" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "autocomplete"
},
"autocomplete_search" : {
"type" : "custom",
"tokenizer" : "lowercase"
}
},
"tokenizer" : {
"autocomplete" : {
"token_chars" : [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram" : "2",
"type" : "ngram",
"max_gram" : "16"
}
}
},
"number_of_replicas" : "0",
"uuid" : "FSeCa0YwRCOJVbjfxYGkig",
"version" : {
"created" : "6080199"
}
}
}
}
}
As I don't have access to the analyzer settings in JSON format, I can't confirm it, but most probably the issue is with your search analyzer autocomplete_search, which is creating search-time tokens that match the index-time tokens.
For example: you are searching for TR000002_1_2020, and if that creates 2020 as a token, and the document containing TR000011_1_2020 also produces a 2020 token, then your query will match it.
You can use the analyze API to check the tokens generated by an analyzer; as mentioned earlier, most likely there are some tokens that match, as shown above.
I have a result query like:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 6,
"successful" : 6,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 15.0735855,
"hits" : [
{
"_index" : "khoso",
"_type" : "sim",
"_id" : "0964693123",
"_score" : 15.0735855,
"_source" : {
"id" : "0964693123",
"i" : "0964693123",
"ut" : 10,
"utP" : 1,
"utT" : 1,
"utC" : 1,
"f" : "09646.93.123",
"s" : [
{
"id" : 268,
"p" : 800000,
"pb" : 800000,
"pg" : 560000,
"l" : {
"sec" : 0,
"usec" : 0
},
"da" : {
"sec" : 0,
"usec" : 0
},
"d" : true,
"d3" : true,
"d2" : true
},
{
"id" : 2067,
"p" : 750000,
"pb" : 699000,
"pg" : 524250,
"l" : {
"sec" : 0,
"usec" : 0
},
"da" : {
"sec" : 0,
"usec" : 0
},
"d" : true,
"d3" : true,
"d2" : true
}
],
"s2" : [
268,
2067
],
"pt" : 4.5,
"m" : 10,
"p" : 0,
"pb" : 800000,
"pg" : 560000,
"c" : [
81,
111
],
"c2" : 81,
"t" : 1,
"d" : true,
"d2" : true,
"l" : {
"sec" : 0,
"usec" : 0
},
"d3" : true,
"h" : true,
"hg" : true,
"e" : "693123",
"pn" : 960000,
"s3" : [ ]
}
}
]
}
}
Now I want to remove an object from the "s" array. For example, I want to remove the object with id == 268, like this:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 6,
"successful" : 6,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 15.0735855,
"hits" : [
{
"_index" : "khoso",
"_type" : "sim",
"_id" : "0964693123",
"_score" : 15.0735855,
"_source" : {
"id" : "0964693123",
"i" : "0964693123",
"ut" : 10,
"utP" : 1,
"utT" : 1,
"utC" : 1,
"f" : "09646.93.123",
"s" : [
{
"id" : 2067,
"p" : 750000,
"pb" : 699000,
"pg" : 524250,
"l" : {
"sec" : 0,
"usec" : 0
},
"da" : {
"sec" : 0,
"usec" : 0
},
"d" : true,
"d3" : true,
"d2" : true
}
],
"s2" : [
268,
2067
],
"pt" : 4.5,
"m" : 10,
"p" : 0,
"pb" : 800000,
"pg" : 560000,
"c" : [
81,
111
],
"c2" : 81,
"t" : 1,
"d" : true,
"d2" : true,
"l" : {
"sec" : 0,
"usec" : 0
},
"d3" : true,
"h" : true,
"hg" : true,
"e" : "693123",
"pn" : 960000,
"s3" : [ ]
}
}
]
}
}
How can I do it? Please give me a query that does this. Thank you!
I am observing high disk read I/O in Elasticsearch nodes.
Environment
Elasticsearch 2.3.1
Disk SSD
Cores - 16
RAM - 64 GB
Segments and merging could be one of the issues, but, as mentioned in this link, I don't see any INFO log stating "now throttling indexing".
Can someone let me know what could be the problem and how can I debug this issue?
The node stats look like this:
"indices" : {
"docs" : {
"count" : 42096315,
"deleted" : 16809358
},
"store" : {
"size_in_bytes" : 188061514556,
"throttle_time_in_millis" : 0
},
"indexing" : {
"index_total" : 4971184,
"index_time_in_millis" : 24161898,
"index_current" : 0,
"index_failed" : 0,
"delete_total" : 10317957,
"delete_time_in_millis" : 3323977,
"delete_current" : 0,
"noop_update_total" : 165455,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 3078536,
"time_in_millis" : 4166218,
"exists_total" : 3076266,
"exists_time_in_millis" : 4165295,
"missing_total" : 2270,
"missing_time_in_millis" : 923,
"current" : 0
},
"search" : {
"open_contexts" : 0,
"query_total" : 811510,
"query_time_in_millis" : 49063191,
"query_current" : 0,
"fetch_total" : 55590,
"fetch_time_in_millis" : 2561106,
"fetch_current" : 0,
"scroll_total" : 205,
"scroll_time_in_millis" : 68748,
"scroll_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 37047,
"total_time_in_millis" : 29575123,
"total_docs" : 48646732,
"total_size_in_bytes" : 189196722890,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 7243267,
"total_auto_throttle_in_bytes" : 186360552
},
"refresh" : {
"total" : 253329,
"total_time_in_millis" : 14111583
},
"flush" : {
"total" : 824,
"total_time_in_millis" : 197608
},
"warmer" : {
"current" : 0,
"total" : 479781,
"total_time_in_millis" : 440805
},
"query_cache" : {
"memory_size_in_bytes" : 523777408,
"total_count" : 144964341,
"hit_count" : 5901881,
"miss_count" : 139062460,
"cache_size" : 7773,
"cache_count" : 442280,
"evictions" : 434507
},
"fielddata" : {
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"segments" : {
"count" : 263,
"memory_in_bytes" : 273031904,
"terms_memory_in_bytes" : 203860300,
"stored_fields_memory_in_bytes" : 34899016,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 5229120,
"doc_values_memory_in_bytes" : 29043468,
"index_writer_memory_in_bytes" : 0,
"index_writer_max_memory_in_bytes" : 527069180,
"version_map_memory_in_bytes" : 14761,
"fixed_bit_set_memory_in_bytes" : 7048640
},
"translog" : {
"operations" : 137655,
"size_in_bytes" : 122949018
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 0,
"miss_count" : 0
}
},
"os" : {
"timestamp" : 1508238172920,
"cpu_percent" : 91,
"load_average" : 22.31,
"mem" : {
"total_in_bytes" : 67543134208,
"free_in_bytes" : 912490496,
"used_in_bytes" : 66630643712,
"free_percent" : 1,
"used_percent" : 99
},
"swap" : {
"total_in_bytes" : 4093636608,
"free_in_bytes" : 1753239552,
"used_in_bytes" : 2340397056
}
},
"process" : {
"timestamp" : 1508238172920,
"open_file_descriptors" : 915,
"max_file_descriptors" : 65535,
"cpu" : {
"percent" : 2,
"total_in_millis" : 99746040
},
"mem" : {
"total_virtual_in_bytes" : 87529877504
}
},
"jvm" : {
"timestamp" : 1508238172921,
"uptime_in_millis" : 292500150,
"mem" : {
"heap_used_in_bytes" : 1868708912,
"heap_used_percent" : 35,
"heap_committed_in_bytes" : 5255331840,
"heap_max_in_bytes" : 5255331840,
"non_heap_used_in_bytes" : 103936064,
"non_heap_committed_in_bytes" : 106307584,
"pools" : {
"young" : {
"used_in_bytes" : 153647352,
"max_in_bytes" : 907345920,
"peak_used_in_bytes" : 907345920,
"peak_max_in_bytes" : 907345920
},
"survivor" : {
"used_in_bytes" : 35321888,
"max_in_bytes" : 113377280,
"peak_used_in_bytes" : 113377280,
"peak_max_in_bytes" : 113377280
},
"old" : {
"used_in_bytes" : 1679739672,
"max_in_bytes" : 4234608640,
"peak_used_in_bytes" : 3660407304,
"peak_max_in_bytes" : 4234608640
}
}
},
"threads" : {
"count" : 199,
"peak_count" : 204
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 32655,
"collection_time_in_millis" : 1844356
},
"old" : {
"collection_count" : 611,
"collection_time_in_millis" : 176197
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 258,
"used_in_bytes" : 269582255,
"total_capacity_in_bytes" : 269582255
},
"mapped" : {
"count" : 146,
"used_in_bytes" : 71612833894,
"total_capacity_in_bytes" : 71612833894
}
},
"classes" : {
"current_loaded_count" : 8483,
"total_loaded_count" : 8499,
"total_unloaded_count" : 16
}
},
"thread_pool" : {
"bulk" : {
"threads" : 16,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 16,
"completed" : 44913
},
"fetch_shard_started" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 9,
"completed" : 13
},
"fetch_shard_store" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"flush" : {
"threads" : 5,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 5,
"completed" : 9011
},
"force_merge" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"generic" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 6,
"completed" : 29268
},
"get" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"index" : {
"threads" : 16,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 16,
"completed" : 9460079
},
"listener" : {
"threads" : 8,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 8,
"completed" : 1237173
},
"management" : {
"threads" : 4,
"queue" : 0,
"active" : 1,
"rejected" : 0,
"largest" : 4,
"completed" : 44128
},
"percolate" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"refresh" : {
"threads" : 8,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 8,
"completed" : 253330
},
"search" : {
"threads" : 25,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 25,
"completed" : 992032
},
"snapshot" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"warmer" : {
"threads" : 5,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 5,
"completed" : 518307
}
},
"fs" : {
"timestamp" : 1508238172922,
"total" : {
"total_in_bytes" : 1847237029888,
"free_in_bytes" : 921102319616,
"available_in_bytes" : 839458172928
},
"data" : [ {
"path" : "/var/lib/elasticsearch/elasticsearch/nodes/0",
"mount" : "/var (/dev/mapper/vag-var)",
"type" : "ext4",
"total_in_bytes" : 1847237029888,
"free_in_bytes" : 921102319616,
"available_in_bytes" : 839458172928,
"spins" : "false"
} ]
},
"transport" : {
"server_open" : 140,
"rx_count" : 7926335,
"rx_size_in_bytes" : 15511144109,
"tx_count" : 7667433,
"tx_size_in_bytes" : 47171921335
},
"http" : {
"current_open" : 1,
"total_opened" : 63123
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 2102132736,
"limit_size" : "1.9gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 3153199104,
"limit_size" : "2.9gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.03,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 3678732288,
"limit_size" : "3.4gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
}
}
}
}
}
refresh_interval & translog.flush_threshold_size are both set to their default values.
The hot-threads report (_nodes/hot_threads?pretty) is as follows:
Hot threads at 2017-10-17T12:45:39.517Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
71.6% (357.8ms out of 500ms) cpu usage by thread 'elasticsearch[Axum][[denorm][1]: Lucene Merge Thread #6011]'
3/10 snapshots sharing following 13 elements
org.apache.lucene.index.MultiTermsEnum.pushTop(MultiTermsEnum.java:275)
org.apache.lucene.index.MultiTermsEnum.next(MultiTermsEnum.java:301)
org.apache.lucene.index.FilterLeafReader$FilterTermsEnum.next(FilterLeafReader.java:195)
org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter.write(BlockTreeTermsWriter.java:438)
org.apache.lucene.codecs.perfield.PerFieldPostingsFormat$FieldsWriter.write(PerFieldPostingsFormat.java:198)
org.apache.lucene.codecs.FieldsConsumer.merge(FieldsConsumer.java:105)
org.apache.lucene.index.SegmentMerger.mergeTerms(SegmentMerger.java:193)
org.apache.lucene.index.SegmentMerger.merge(SegmentMerger.java:95)
org.apache.lucene.index.IndexWriter.mergeMiddle(IndexWriter.java:4075)
org.apache.lucene.index.IndexWriter.merge(IndexWriter.java:3655)
org.apache.lucene.index.ConcurrentMergeScheduler.doMerge(ConcurrentMergeScheduler.java:588)
org.elasticsearch.index.engine.ElasticsearchConcurrentMergeScheduler.doMerge(ElasticsearchConcurrentMergeScheduler.java:94)
org.apache.lucene.index.ConcurrentMergeScheduler$MergeThread.run(ConcurrentMergeScheduler.java:626)
I have set up a cluster with 3 Elasticsearch instances, and they are being fed with documents provided by Logstash (~165K docs per minute). Those 3 machines have 16 GB of RAM each, and each instance is started up with 8 GB.
The indexing works quite well, and I'm able to perform all the search operations I was expecting. The thing now is that I want to make it generally available, but unfortunately, depending on the queries that are executed (e.g. range facets over all indexes), it freezes the entire cluster and ends up in a split-brain state.
I already limited some things like:
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
My entire config file looks like:
index.number_of_shards: 10
index.number_of_replicas: 0
bootstrap.mlockall: true
# Indices settings
indices.memory.index_buffer_size: 30%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Cache Sizes
indices.fielddata.cache.size: 15%
indices.fielddata.cache.expire: 6h
indices.cache.filter.size: 15%
indices.cache.filter.expire: 6h
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
Is there something else I could improve here to avoid such freezes and the split-brain state?
Output of my nodes info:
{
"cluster_name" : "elasticsearch",
"nodes" : {
"7i5sZj_jT_qe6HNESfzO3A" : {
"name" : "Captain Fate",
"transport_address" : "inet[/192.168.0.83:9300]",
"host" : "esserver02",
"ip" : "192.168.0.83",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.83:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Captain Fate",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 8482,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 8482,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1411976625093,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9300]",
"publish_address" : "inet[/192.168.0.83:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0:9200]",
"publish_address" : "inet[/192.168.0.83:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"0OaMqY6IR1SYeL6rd6P61Q" : {
"name" : "Blonde Phantom",
"transport_address" : "inet[/192.168.0.100:9300]",
"host" : "esserver03",
"ip" : "192.168.0.100",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.100:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Blonde Phantom",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 98772,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 98772,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657551806,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.100:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.100:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
},
"H2h01oNGSuCL0uu8J3SF6w" : {
"name" : "Dakimh the Enchanter",
"transport_address" : "inet[/192.168.0.101:9300]",
"host" : "esserver04",
"ip" : "192.168.0.101",
"version" : "1.1.1",
"build" : "f1585f0",
"http_address" : "inet[/192.168.0.101:9200]",
"settings" : {
"index" : {
"number_of_replicas" : "0",
"translog" : {
"flush_threshold_ops" : "50000"
},
"number_of_shards" : "40",
"refresh_interval" : "30s"
},
"bootstrap" : {
"mlockall" : "true"
},
"transport" : {
"tcp" : {
"port" : "9300"
}
},
"http" : {
"port" : "9200"
},
"name" : "Dakimh the Enchanter",
"path" : {
"logs" : "/opt/as/es/logs",
"home" : "/opt/as/es"
},
"cluster" : {
"name" : "elasticsearch"
},
"indices" : {
"memory" : {
"index_buffer_size" : "50%",
"min_shard_index_buffer_size" : "12mb",
"min_index_buffer_size" : "96mb"
}
},
"discovery" : {
"zen" : {
"minimum_master_nodes" : "1",
"ping" : {
"unicast" : {
"hosts" : [ "esserver02", "esserver03", "esserver04" ]
},
"multicast" : {
"enabled" : "false"
},
"timeout" : "30s"
}
}
}
},
"os" : {
"refresh_interval" : 1000,
"available_processors" : 16
},
"process" : {
"refresh_interval" : 1000,
"id" : 88019,
"max_file_descriptors" : 128000,
"mlockall" : false
},
"jvm" : {
"pid" : 88019,
"version" : "1.7.0_45",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.45-b08",
"vm_vendor" : "Oracle Corporation",
"start_time" : 1414657560829,
"mem" : {
"heap_init_in_bytes" : 2147483648,
"heap_max_in_bytes" : 12771524608,
"non_heap_init_in_bytes" : 24313856,
"non_heap_max_in_bytes" : 136314880,
"direct_max_in_bytes" : 12771524608
},
"gc_collectors" : [ "ParNew", "ConcurrentMarkSweep" ],
"memory_pools" : [ "Code Cache", "Par Eden Space", "Par Survivor Space", "CMS Old Gen", "CMS Perm Gen" ]
},
"thread_pool" : {
"generic" : {
"type" : "cached",
"keep_alive" : "30s"
},
"index" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "200"
},
"get" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"snapshot" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"merge" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"suggest" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"bulk" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "50"
},
"optimize" : {
"type" : "fixed",
"min" : 1,
"max" : 1
},
"warmer" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"flush" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"search" : {
"type" : "fixed",
"min" : 48,
"max" : 48,
"queue_size" : "1k"
},
"percolate" : {
"type" : "fixed",
"min" : 16,
"max" : 16,
"queue_size" : "1k"
},
"management" : {
"type" : "scaling",
"min" : 1,
"max" : 5,
"keep_alive" : "5m"
},
"refresh" : {
"type" : "scaling",
"min" : 1,
"max" : 8,
"keep_alive" : "5m"
}
},
"network" : {
"refresh_interval" : 5000,
"primary_interface" : {
"address" : "",
"name" : "",
"mac_address" : ""
}
},
"transport" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9300]",
"publish_address" : "inet[/192.168.0.101:9300]"
},
"http" : {
"bound_address" : "inet[/0:0:0:0:0:0:0:0%0:9200]",
"publish_address" : "inet[/192.168.0.101:9200]",
"max_content_length_in_bytes" : 104857600
},
"plugins" : [ {
"name" : "head",
"version" : "NA",
"description" : "No description found.",
"url" : "/_plugin/head/",
"jvm" : false,
"site" : true
} ]
}
}
}
My old configuration:
index.number_of_shards: 40
index.number_of_replicas: 0
bootstrap.mlockall: true
## Threadpool Settings ##
# Indices settings
indices.memory.index_buffer_size: 50%
indices.memory.min_shard_index_buffer_size: 12mb
indices.memory.min_index_buffer_size: 96mb
# Indexing Settings for Writes
index.refresh_interval: 30s
index.translog.flush_threshold_ops: 50000
Your indices.fielddata.cache.size is set to only 15%. Why?
This data is used for aggregations/facets, so it could be related. You should remove indices.fielddata.cache.expire ASAP: this setting is really not recommended at all, because evictions are very costly and it schedules evictions of fielddata values even if they are still in use. Could you give us the output of the node stats API?
Update 1 :
I see discovery.zen.minimum_master_nodes is set to 1, but you say you have 3 nodes. It should be set to 2, according to the generally used formula (number of master-eligible nodes / 2 + 1, using integer division; see the documentation).
Update 2 :
Do you still experience split-brains with the updated configuration?
Depending on your cluster's ES version (> 1.0), you might want to tune the fielddata circuit breaker to prevent the most costly requests from being executed: take a look here.