Overview
I have a Ruby application that uses MongoDB as its database. While running tests for this application I create collections and indexes for every test case using Minitest.
The test environment is created with Docker Compose: one container runs the tests and another runs MongoDB.
Problem
When the tests run for the first time, MongoDB gets stuck after a while: any request to query the collections stops responding.
I was able to connect to it before the tests started running using the command-line client. When I checked the state of the server using db.serverStatus(), I saw that some operations had acquired locks. Looking at the globalLock field, I understand that one operation holds the write lock and two operations are waiting to acquire a read lock.
I am unable to understand why these operations would hang and not yield their locks, and I have no idea how to debug this problem further.
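For reference, in-progress operations (including the ones holding or waiting on locks) can be listed from the same mongo shell session; this is a generic sketch, not specific to this setup:
// operations currently waiting to acquire a lock
db.currentOp({ "waitingForLock" : true })
// or dump everything, including idle connections
db.currentOp(true)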
MongoDB Version: 3.6.13
Ruby Driver version: 2.8.0
I've also tried other 3.6.x versions as well as 4.0.
Any help or direction is highly appreciated.
db.serverStatus output
{
"host" : "c658c885eb90",
"version" : "3.6.14",
"process" : "mongod",
"pid" : NumberLong(1),
"uptime" : 98,
"uptimeMillis" : NumberLong(97909),
"uptimeEstimate" : NumberLong(97),
"localTime" : ISODate("2019-11-03T16:09:14.289Z"),
"asserts" : {
"regular" : 0,
"warning" : 0,
"msg" : 0,
"user" : 0,
"rollovers" : 0
},
"connections" : {
"current" : 6,
"available" : 838854,
"totalCreated" : 11
},
"extra_info" : {
"note" : "fields vary by platform",
"page_faults" : 0
},
"globalLock" : {
"totalTime" : NumberLong(97908000),
"currentQueue" : {
"total" : 2,
"readers" : 2,
"writers" : 0
},
"activeClients" : {
"total" : 13,
"readers" : 0,
"writers" : 1
}
},
"locks" : {
"Global" : {
"acquireCount" : {
"r" : NumberLong(14528),
"w" : NumberLong(12477),
"W" : NumberLong(5)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(1020),
"w" : NumberLong(14459),
"R" : NumberLong(3),
"W" : NumberLong(6599)
},
"acquireWaitCount" : {
"r" : NumberLong(2)
},
"timeAcquiringMicros" : {
"r" : NumberLong(76077321)
}
},
"Collection" : {
"acquireCount" : {
"R" : NumberLong(1018),
"W" : NumberLong(8805)
}
},
"Metadata" : {
"acquireCount" : {
"W" : NumberLong(37)
}
}
},
"logicalSessionRecordCache" : {
"activeSessionsCount" : 3,
"sessionsCollectionJobCount" : 1,
"lastSessionsCollectionJobDurationMillis" : 0,
"lastSessionsCollectionJobTimestamp" : ISODate("2019-11-03T16:07:36.407Z"),
"lastSessionsCollectionJobEntriesRefreshed" : 0,
"lastSessionsCollectionJobEntriesEnded" : 0,
"lastSessionsCollectionJobCursorsClosed" : 0,
"transactionReaperJobCount" : 0,
"lastTransactionReaperJobDurationMillis" : 0,
"lastTransactionReaperJobTimestamp" : ISODate("2019-11-03T16:07:36.407Z"),
"lastTransactionReaperJobEntriesCleanedUp" : 0
},
"network" : {
"bytesIn" : NumberLong(1682811),
"bytesOut" : NumberLong(1019834),
"physicalBytesIn" : NumberLong(1682811),
"physicalBytesOut" : NumberLong(1019834),
"numRequests" : NumberLong(7822),
"compression" : {
"snappy" : {
"compressor" : {
"bytesIn" : NumberLong(0),
"bytesOut" : NumberLong(0)
},
"decompressor" : {
"bytesIn" : NumberLong(0),
"bytesOut" : NumberLong(0)
}
}
},
"serviceExecutorTaskStats" : {
"executor" : "passthrough",
"threadsRunning" : 6
}
},
"opLatencies" : {
"reads" : {
"latency" : NumberLong(61374),
"ops" : NumberLong(963)
},
"writes" : {
"latency" : NumberLong(13074),
"ops" : NumberLong(286)
},
"commands" : {
"latency" : NumberLong(988232),
"ops" : NumberLong(6570)
}
},
"opReadConcernCounters" : {
"available" : NumberLong(0),
"linearizable" : NumberLong(0),
"local" : NumberLong(0),
"majority" : NumberLong(0),
"none" : NumberLong(944)
},
"opcounters" : {
"insert" : 246,
"query" : 944,
"update" : 40,
"delete" : 0,
"getmore" : 0,
"command" : 6595
},
"opcountersRepl" : {
"insert" : 0,
"query" : 0,
"update" : 0,
"delete" : 0,
"getmore" : 0,
"command" : 0
},
"storageEngine" : {
"name" : "ephemeralForTest",
"supportsCommittedReads" : false,
"readOnly" : false,
"persistent" : false
},
"tcmalloc" : {
"generic" : {
"current_allocated_bytes" : 8203504,
"heap_size" : 12496896
},
"tcmalloc" : {
"pageheap_free_bytes" : 2760704,
"pageheap_unmapped_bytes" : 0,
"max_total_thread_cache_bytes" : 516947968,
"current_total_thread_cache_bytes" : 1007120,
"total_free_bytes" : 1532688,
"central_cache_free_bytes" : 231040,
"transfer_cache_free_bytes" : 294528,
"thread_cache_free_bytes" : 1007120,
"aggressive_memory_decommit" : 0,
"pageheap_committed_bytes" : 12496896,
"pageheap_scavenge_count" : 0,
"pageheap_commit_count" : 9,
"pageheap_total_commit_bytes" : 12496896,
"pageheap_decommit_count" : 0,
"pageheap_total_decommit_bytes" : 0,
"pageheap_reserve_count" : 9,
"pageheap_total_reserve_bytes" : 12496896,
"spinlock_total_delay_ns" : 0,
"formattedString" : "------------------------------------------------\nMALLOC: 8204080 ( 7.8 MiB) Bytes in use by application\nMALLOC: + 2760704 ( 2.6 MiB) Bytes in page heap freelist\nMALLOC: + 231040 ( 0.2 MiB) Bytes in central cache freelist\nMALLOC: + 294528 ( 0.3 MiB) Bytes in transfer cache freelist\nMALLOC: + 1006544 ( 1.0 MiB) Bytes in thread cache freelists\nMALLOC: + 1204480 ( 1.1 MiB) Bytes in malloc metadata\nMALLOC: ------------\nMALLOC: = 13701376 ( 13.1 MiB) Actual memory used (physical + swap)\nMALLOC: + 0 ( 0.0 MiB) Bytes released to OS (aka unmapped)\nMALLOC: ------------\nMALLOC: = 13701376 ( 13.1 MiB) Virtual address space used\nMALLOC:\nMALLOC: 415 Spans in use\nMALLOC: 18 Thread heaps in use\nMALLOC: 4096 Tcmalloc page size\n------------------------------------------------\nCall ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).\nBytes released to the OS take up virtual address space but no physical memory.\n"
}
},
"transactions" : {
"retriedCommandsCount" : NumberLong(0),
"retriedStatementsCount" : NumberLong(0),
"transactionsCollectionWriteCount" : NumberLong(0)
},
"transportSecurity" : {
"1.0" : NumberLong(0),
"1.1" : NumberLong(0),
"1.2" : NumberLong(0),
"1.3" : NumberLong(0),
"unknown" : NumberLong(0)
},
"mem" : {
"bits" : 64,
"resident" : 41,
"virtual" : 836,
"supported" : true,
"mapped" : 0
},
"metrics" : {
"commands" : {
"buildInfo" : {
"failed" : NumberLong(0),
"total" : NumberLong(2)
},
"count" : {
"failed" : NumberLong(0),
"total" : NumberLong(21)
},
"createIndexes" : {
"failed" : NumberLong(0),
"total" : NumberLong(5656)
},
"drop" : {
"failed" : NumberLong(0),
"total" : NumberLong(784)
},
"dropIndexes" : {
"failed" : NumberLong(87),
"total" : NumberLong(87)
},
"find" : {
"failed" : NumberLong(0),
"total" : NumberLong(944)
},
"getLog" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
},
"insert" : {
"failed" : NumberLong(0),
"total" : NumberLong(246)
},
"isMaster" : {
"failed" : NumberLong(0),
"total" : NumberLong(38)
},
"listCollections" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
},
"listIndexes" : {
"failed" : NumberLong(1),
"total" : NumberLong(1)
},
"replSetGetStatus" : {
"failed" : NumberLong(1),
"total" : NumberLong(1)
},
"serverStatus" : {
"failed" : NumberLong(0),
"total" : NumberLong(2)
},
"update" : {
"failed" : NumberLong(0),
"total" : NumberLong(40)
},
"whatsmyuri" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
}
},
"cursor" : {
"timedOut" : NumberLong(0),
"open" : {
"noTimeout" : NumberLong(0),
"pinned" : NumberLong(0),
"total" : NumberLong(0)
}
},
"document" : {
"deleted" : NumberLong(0),
"inserted" : NumberLong(246),
"returned" : NumberLong(398),
"updated" : NumberLong(40)
},
"getLastError" : {
"wtime" : {
"num" : 0,
"totalMillis" : 0
},
"wtimeouts" : NumberLong(0)
},
"operation" : {
"scanAndOrder" : NumberLong(0),
"writeConflicts" : NumberLong(0)
},
"query" : {
"updateOneOpStyleBroadcastWithExactIDCount" : NumberLong(0),
"upsertReplacementCannotTargetByQueryCount" : NumberLong(0)
},
"queryExecutor" : {
"scanned" : NumberLong(435),
"scannedObjects" : NumberLong(438)
},
"record" : {
"moves" : NumberLong(0)
},
"repl" : {
"executor" : {
"pool" : {
"inProgressCount" : 0
},
"queues" : {
"networkInProgress" : 0,
"sleepers" : 0
},
"unsignaledEvents" : 0,
"shuttingDown" : false,
"networkInterface" : "\nNetworkInterfaceASIO Operations' Diagnostic:\nOperation: Count: \nConnecting 0 \nIn Progress 0 \nSucceeded 0 \nCanceled 0 \nFailed 0 \nTimed Out 0 \n\n"
},
"apply" : {
"attemptsToBecomeSecondary" : NumberLong(0),
"batchSize" : NumberLong(0),
"batches" : {
"num" : 0,
"totalMillis" : 0
},
"ops" : NumberLong(0)
},
"buffer" : {
"count" : NumberLong(0),
"maxSizeBytes" : NumberLong(0),
"sizeBytes" : NumberLong(0)
},
"initialSync" : {
"completed" : NumberLong(0),
"failedAttempts" : NumberLong(0),
"failures" : NumberLong(0)
},
"network" : {
"bytes" : NumberLong(0),
"getmores" : {
"num" : 0,
"totalMillis" : 0
},
"ops" : NumberLong(0),
"readersCreated" : NumberLong(0)
},
"preload" : {
"docs" : {
"num" : 0,
"totalMillis" : 0
},
"indexes" : {
"num" : 0,
"totalMillis" : 0
}
}
},
"storage" : {
"freelist" : {
"search" : {
"bucketExhausted" : NumberLong(0),
"requests" : NumberLong(0),
"scanned" : NumberLong(0)
}
}
},
"ttl" : {
"deletedDocuments" : NumberLong(0),
"passes" : NumberLong(1)
}
},
"ok" : 1
}
Related
We are using OpenSearch (based on Elasticsearch v7.10.2) and have a large index where we index data regularly and also regularly delete data once it is a month old. Over time we are now experiencing a degradation in search performance, mostly in queries that use has_child. I want to know whether the deleted docs are still residing in my index and still consuming resources and, if so, how I can get them removed. Below I have attached a few stats.
GET _cat/segments?index=segment_index_570de84a4f0d925f98343571&s=docs.deleted
A few of my segments have reached 5GB as well; I'm attaching only a few segments as an example for better understanding:
segment_index_570de84a4f0d925f98343571 83 r x.x.x.x _36u6 148830 4566709 5083016 4.2gb 292024 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 83 p x.x.x.x _36u6 148830 4566709 5083016 4.2gb 292024 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 84 p x.x.x.x _37hm 149674 4569844 5127644 4.4gb 290112 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 84 r x.x.x.x _37hm 149674 4569838 5127650 4.4gb 290112 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 40 p x.x.x.x _30oh 140849 3765550 5460181 4.7gb 292616 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 40 r x.x.x.x _30oh 140849 3765544 5460187 4.7gb 292616 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 29 r x.x.x.x _2ygs 137980 3359481 5519471 4.9gb 287504 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 29 p x.x.x.x _2ygs 137980 3359481 5519471 4.9gb 287504 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 13 p x.x.x.x _2u1h 132245 2731374 5565823 4gb 289480 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 13 r x.x.x.x _2u1h 132245 2731372 5565825 4gb 289480 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 57 p x.x.x.x _3bx3 155415 4144634 5615761 4.5gb 304728 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 57 r x.x.x.x _3bx3 155415 4144620 5615775 4.5gb 304728 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 10 r x.x.x.x _2yau 137766 3535782 5821903 4.6gb 290072 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 10 p x.x.x.x _2yau 137766 3535778 5821907 4.6gb 290072 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 5 r x.x.x.x _2vsz 134531 3052529 5940943 5gb 287288 true true 8.7.0 false
segment_index_570de84a4f0d925f98343571 5 p x.x.x.x _2vsz 134531 3052529 5940943 5gb 287288 true true 8.7.0 false
GET segment_index_570de84a4f0d925f98343571/_stats
{
"_shards" : {
"total" : 200,
"successful" : 200,
"failed" : 0
},
"_all" : {
"primaries" : {
"docs" : {
"count" : 2764210965,
"deleted" : 768121801
},
"store" : {
"size_in_bytes" : 1882036902899,
"reserved_in_bytes" : 0
},
"indexing" : {
"index_total" : 35049143,
"index_time_in_millis" : 93342006,
"index_current" : 1,
"index_failed" : 7,
"delete_total" : 28158400,
"delete_time_in_millis" : 2241164,
"delete_current" : 0,
"noop_update_total" : 108,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 8272612,
"time_in_millis" : 9407739,
"exists_total" : 7714730,
"exists_time_in_millis" : 6864869,
"missing_total" : 557882,
"missing_time_in_millis" : 2542870,
"current" : 0
},
"search" : {
"open_contexts" : 99,
"query_total" : 781661,
"query_time_in_millis" : 46180985,
"query_current" : 0,
"fetch_total" : 25828,
"fetch_time_in_millis" : 31922549,
"fetch_current" : 0,
"scroll_total" : 150005,
"scroll_time_in_millis" : 3934488045,
"scroll_current" : 99,
"suggest_total" : 0,
"suggest_time_in_millis" : 0,
"suggest_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 5069,
"total_time_in_millis" : 15670663,
"total_docs" : 232287891,
"total_size_in_bytes" : 144734357228,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 3012035,
"total_auto_throttle_in_bytes" : 1635281344
},
"refresh" : {
"total" : 26258,
"total_time_in_millis" : 24294337,
"external_total" : 23116,
"external_total_time_in_millis" : 51934585,
"listeners" : 0
},
"flush" : {
"total" : 129,
"periodic" : 29,
"total_time_in_millis" : 83553
},
"warmer" : {
"current" : 0,
"total" : 23099,
"total_time_in_millis" : 28150896
},
"query_cache" : {
"memory_size_in_bytes" : 1971367836,
"total_count" : 5941141,
"hit_count" : 1281540,
"miss_count" : 4659601,
"cache_size" : 34136,
"cache_count" : 34655,
"evictions" : 519
},
"fielddata" : {
"memory_size_in_bytes" : 2270860360,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 3062,
"memory_in_bytes" : 686053834,
"terms_memory_in_bytes" : 583830952,
"stored_fields_memory_in_bytes" : 2159936,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 84022720,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 16040226,
"index_writer_memory_in_bytes" : 3083169874,
"version_map_memory_in_bytes" : 14212574,
"fixed_bit_set_memory_in_bytes" : 441678080,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"translog" : {
"operations" : 1145991,
"size_in_bytes" : 1862599220,
"uncommitted_operations" : 1145991,
"uncommitted_size_in_bytes" : 1862599220,
"earliest_last_modified_age" : 10
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 692,
"miss_count" : 7824
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 146589584
}
},
"total" : {
"docs" : {
"count" : 5528419715,
"deleted" : 1568758887
},
"store" : {
"size_in_bytes" : 3779599075512,
"reserved_in_bytes" : 0
},
"indexing" : {
"index_total" : 65246167,
"index_time_in_millis" : 116379853,
"index_current" : 2,
"index_failed" : 7,
"delete_total" : 56316800,
"delete_time_in_millis" : 4569453,
"delete_current" : 0,
"noop_update_total" : 108,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 8279717,
"time_in_millis" : 9461541,
"exists_total" : 7721681,
"exists_time_in_millis" : 6917878,
"missing_total" : 558036,
"missing_time_in_millis" : 2543663,
"current" : 0
},
"search" : {
"open_contexts" : 200,
"query_total" : 1421264,
"query_time_in_millis" : 84711977,
"query_current" : 0,
"fetch_total" : 47121,
"fetch_time_in_millis" : 55494456,
"fetch_current" : 2,
"scroll_total" : 282690,
"scroll_time_in_millis" : 6909135621,
"scroll_current" : 200,
"suggest_total" : 0,
"suggest_time_in_millis" : 0,
"suggest_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 8563,
"total_time_in_millis" : 30676821,
"total_docs" : 452795172,
"total_size_in_bytes" : 273814327525,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 6337362,
"total_auto_throttle_in_bytes" : 3305840977
},
"refresh" : {
"total" : 47329,
"total_time_in_millis" : 46367778,
"external_total" : 43783,
"external_total_time_in_millis" : 98641382,
"listeners" : 0
},
"flush" : {
"total" : 298,
"periodic" : 98,
"total_time_in_millis" : 210368
},
"warmer" : {
"current" : 0,
"total" : 43760,
"total_time_in_millis" : 52941301
},
"query_cache" : {
"memory_size_in_bytes" : 3882183058,
"total_count" : 10826442,
"hit_count" : 2195511,
"miss_count" : 8630931,
"cache_size" : 66063,
"cache_count" : 66884,
"evictions" : 821
},
"fielddata" : {
"memory_size_in_bytes" : 4524309840,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 6121,
"memory_in_bytes" : 1359222728,
"terms_memory_in_bytes" : 1155693088,
"stored_fields_memory_in_bytes" : 4324024,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 166294144,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 32911472,
"index_writer_memory_in_bytes" : 5666776518,
"version_map_memory_in_bytes" : 26231773,
"fixed_bit_set_memory_in_bytes" : 887417576,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"translog" : {
"operations" : 31206542,
"size_in_bytes" : 28262050766,
"uncommitted_operations" : 31206542,
"uncommitted_size_in_bytes" : 28262050766,
"earliest_last_modified_age" : 10
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 1296,
"miss_count" : 13655
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 229545608
}
}
},
"indices" : {
"segment_index_570de84a4f0d925f98343571" : {
"uuid" : "fhZUqTwfSeum3hHlyFaILw",
"primaries" : {
"docs" : {
"count" : 2764210965,
"deleted" : 768121801
},
"store" : {
"size_in_bytes" : 1882036902899,
"reserved_in_bytes" : 0
},
"indexing" : {
"index_total" : 35049143,
"index_time_in_millis" : 93342006,
"index_current" : 1,
"index_failed" : 7,
"delete_total" : 28158400,
"delete_time_in_millis" : 2241164,
"delete_current" : 0,
"noop_update_total" : 108,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 8272612,
"time_in_millis" : 9407739,
"exists_total" : 7714730,
"exists_time_in_millis" : 6864869,
"missing_total" : 557882,
"missing_time_in_millis" : 2542870,
"current" : 0
},
"search" : {
"open_contexts" : 99,
"query_total" : 781661,
"query_time_in_millis" : 46180985,
"query_current" : 0,
"fetch_total" : 25828,
"fetch_time_in_millis" : 31922549,
"fetch_current" : 0,
"scroll_total" : 150005,
"scroll_time_in_millis" : 3934488045,
"scroll_current" : 99,
"suggest_total" : 0,
"suggest_time_in_millis" : 0,
"suggest_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 5069,
"total_time_in_millis" : 15670663,
"total_docs" : 232287891,
"total_size_in_bytes" : 144734357228,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 3012035,
"total_auto_throttle_in_bytes" : 1635281344
},
"refresh" : {
"total" : 26258,
"total_time_in_millis" : 24294337,
"external_total" : 23116,
"external_total_time_in_millis" : 51934585,
"listeners" : 0
},
"flush" : {
"total" : 129,
"periodic" : 29,
"total_time_in_millis" : 83553
},
"warmer" : {
"current" : 0,
"total" : 23099,
"total_time_in_millis" : 28150896
},
"query_cache" : {
"memory_size_in_bytes" : 1971367836,
"total_count" : 5941141,
"hit_count" : 1281540,
"miss_count" : 4659601,
"cache_size" : 34136,
"cache_count" : 34655,
"evictions" : 519
},
"fielddata" : {
"memory_size_in_bytes" : 2270860360,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 3062,
"memory_in_bytes" : 686053834,
"terms_memory_in_bytes" : 583830952,
"stored_fields_memory_in_bytes" : 2159936,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 84022720,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 16040226,
"index_writer_memory_in_bytes" : 3083169874,
"version_map_memory_in_bytes" : 14212574,
"fixed_bit_set_memory_in_bytes" : 441678080,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"translog" : {
"operations" : 1145991,
"size_in_bytes" : 1862599220,
"uncommitted_operations" : 1145991,
"uncommitted_size_in_bytes" : 1862599220,
"earliest_last_modified_age" : 10
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 692,
"miss_count" : 7824
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 146589584
}
},
"total" : {
"docs" : {
"count" : 5528419715,
"deleted" : 1568758887
},
"store" : {
"size_in_bytes" : 3779599075512,
"reserved_in_bytes" : 0
},
"indexing" : {
"index_total" : 65246167,
"index_time_in_millis" : 116379853,
"index_current" : 2,
"index_failed" : 7,
"delete_total" : 56316800,
"delete_time_in_millis" : 4569453,
"delete_current" : 0,
"noop_update_total" : 108,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 8279717,
"time_in_millis" : 9461541,
"exists_total" : 7721681,
"exists_time_in_millis" : 6917878,
"missing_total" : 558036,
"missing_time_in_millis" : 2543663,
"current" : 0
},
"search" : {
"open_contexts" : 200,
"query_total" : 1421264,
"query_time_in_millis" : 84711977,
"query_current" : 0,
"fetch_total" : 47121,
"fetch_time_in_millis" : 55494456,
"fetch_current" : 2,
"scroll_total" : 282690,
"scroll_time_in_millis" : 6909135621,
"scroll_current" : 200,
"suggest_total" : 0,
"suggest_time_in_millis" : 0,
"suggest_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 8563,
"total_time_in_millis" : 30676821,
"total_docs" : 452795172,
"total_size_in_bytes" : 273814327525,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 6337362,
"total_auto_throttle_in_bytes" : 3305840977
},
"refresh" : {
"total" : 47329,
"total_time_in_millis" : 46367778,
"external_total" : 43783,
"external_total_time_in_millis" : 98641382,
"listeners" : 0
},
"flush" : {
"total" : 298,
"periodic" : 98,
"total_time_in_millis" : 210368
},
"warmer" : {
"current" : 0,
"total" : 43760,
"total_time_in_millis" : 52941301
},
"query_cache" : {
"memory_size_in_bytes" : 3882183058,
"total_count" : 10826442,
"hit_count" : 2195511,
"miss_count" : 8630931,
"cache_size" : 66063,
"cache_count" : 66884,
"evictions" : 821
},
"fielddata" : {
"memory_size_in_bytes" : 4524309840,
"evictions" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 6121,
"memory_in_bytes" : 1359222728,
"terms_memory_in_bytes" : 1155693088,
"stored_fields_memory_in_bytes" : 4324024,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 166294144,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 32911472,
"index_writer_memory_in_bytes" : 5666776518,
"version_map_memory_in_bytes" : 26231773,
"fixed_bit_set_memory_in_bytes" : 887417576,
"max_unsafe_auto_id_timestamp" : -1,
"file_sizes" : { }
},
"translog" : {
"operations" : 31206542,
"size_in_bytes" : 28262050766,
"uncommitted_operations" : 31206542,
"uncommitted_size_in_bytes" : 28262050766,
"earliest_last_modified_age" : 10
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 1296,
"miss_count" : 13655
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 229545608
}
}
}
}
}
I would appreciate guidance on an appropriate approach: what are the best ways to deal with this and optimise search performance?
What I usually do in such cases is run a force merge that only expunges deleted docs:
POST _forcemerge?only_expunge_deletes=true
Since the ratio of deleted to total docs is ~30%, that should allow you to regain some space...
Try it out on a single index first. Record the size before and after and you should see some space gained.
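As a concrete sketch for the index above (host and port are placeholders), you could record the deleted-doc count and store size, run the expunge on that single index, and compare afterwards:
curl -X GET "localhost:9200/_cat/indices/segment_index_570de84a4f0d925f98343571?v&h=index,docs.count,docs.deleted,store.size"
curl -X POST "localhost:9200/segment_index_570de84a4f0d925f98343571/_forcemerge?only_expunge_deletes=true"
Run the _cat request again once the merge has finished to see the space gained.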
I have a 2.4.4 cluster with a single server/node (esnode1) containing only one 220GB index with 1 shard and zero replicas.
The index was responding fine, but whenever I cleanly restart the server (an EC2 instance with 2 CPUs, 4GB RAM and a 500GB SSD), the cluster state gets stuck on red with "initializing_shards" = 1 for quite a while, with no CPU or disk usage (the system is idle and not swapping).
I've already raised indices.recovery.max_bytes_per_sec to 50mb, and tried the instructions at https://www.elastic.co/guide/en/elasticsearch/guide/current/_rolling_restarts.html, without success.
This only occurs if I set a 2GB heap for ES. However, with a 3GB heap the cluster status changes to green seconds after restart.
I'm at a loss as to how to debug or understand this, as the logs (below) seem pretty much normal, any hints?
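For reference, shard recovery progress can be watched with the cat recovery API (a generic sketch, assuming the default HTTP port):
curl "localhost:9200/_cat/recovery?v"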
/_cluster/health is:
{
"cluster_name" : "escluster1",
"status" : "red",
"timed_out" : false,
"number_of_nodes" : 1,
"number_of_data_nodes" : 1,
"active_primary_shards" : 0,
"active_shards" : 0,
"relocating_shards" : 0,
"initializing_shards" : 1,
"unassigned_shards" : 0,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 0.0
}
This is the log following a restart:
[2017-05-04 15:00:37,975][INFO ][node ] [esnode1] version[2.4.4], pid[2761], build[fcbb46d/2017-01-03T11:33:16Z]
[2017-05-04 15:00:37,976][INFO ][node ] [esnode1] initializing ...
[2017-05-04 15:00:38,534][INFO ][plugins ] [esnode1] modules [reindex, lang-expression, lang-groovy], plugins [], sites []
[2017-05-04 15:00:38,563][INFO ][env ] [esnode1] using [1] data paths, mounts [[/mnt/esdata2 (/dev/xvdh1)]], net usable_space [226.3gb], net total_space [492gb], spins? [no], types [ext4]
[2017-05-04 15:00:38,563][INFO ][env ] [esnode1] heap size [1.9gb], compressed ordinary object pointers [true]
[2017-05-04 15:00:40,379][INFO ][node ] [esnode1] initialized
[2017-05-04 15:00:40,380][INFO ][node ] [esnode1] starting ...
[2017-05-04 15:00:40,501][INFO ][transport ] [esnode1] publish_address {127.0.0.1:9300}, bound_addresses {[::1]:9300}, {127.0.0.1:9300}
[2017-05-04 15:00:40,506][INFO ][discovery ] [esnode1] escluster1/sv3aHhUjSyueq5N4_w14mQ
[2017-05-04 15:00:43,565][INFO ][cluster.service ] [esnode1] new_master {esnode1}{sv3aHhUjSyueq5N4_w14mQ}{127.0.0.1}{127.0.0.1:9300}, reason: zen-disco-join(elected_as_master, [0] joins received)
[2017-05-04 15:00:43,595][INFO ][indices.recovery ] [esnode1] updating [indices.recovery.max_bytes_per_sec] from [40mb] to [50mb]
[2017-05-04 15:00:43,631][INFO ][http ] [esnode1] publish_address {127.0.0.1:9200}, bound_addresses {[::1]:9200}, {127.0.0.1:9200}
[2017-05-04 15:00:43,632][INFO ][node ] [esnode1] started
[2017-05-04 15:00:43,651][INFO ][gateway ] [esnode1] recovered
Edit 1: after switching the log level to DEBUG, with a 2GB heap, the cluster state remains "red" and I can see the following messages being logged repeatedly every 30 seconds:
[2017-05-10 15:58:45,985][DEBUG][index.shard ] [esnode1] [myIndex][0] updateBufferSize: engine is closed; skipping
[2017-05-10 15:59:15,985][DEBUG][indices.memory ] [esnode1] recalculating shard indexing buffer, total is [203.1mb] with [1] active shards, each shard set to indexing=[203.1mb], translog=[64kb]
[2017-05-10 15:59:15,990][DEBUG][index.shard ] [esnode1] [myIndex][0] updateBufferSize: engine is closed; skipping
[2017-05-10 15:59:45,990][DEBUG][indices.memory ] [esnode1] recalculating shard indexing buffer, total is [203.1mb] with [1] active shards, each shard set to indexing=[203.1mb], translog=[64kb]
[2017-05-10 15:59:45,997][DEBUG][index.shard ] [esnode1] [myIndex][0] updateBufferSize: engine is closed; skipping
[2017-05-10 16:00:15,997][DEBUG][indices.memory ] [esnode1] recalculating shard indexing buffer, total is [203.1mb] with [1] active shards, each shard set to indexing=[203.1mb], translog=[64kb]
Edit 2: outputs produced with a 3GB heap and "green" status:
_nodes/stats?filter_path=**.indices.segments :
{
"nodes" : {
"TeXgE1QKSMOE1xYS-miJug" : {
"indices" : {
"segments" : {
"count" : 73,
"memory_in_bytes" : 2272548617,
"terms_memory_in_bytes" : 2269433701,
"stored_fields_memory_in_bytes" : 3103096,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 4672,
"doc_values_memory_in_bytes" : 7148,
"index_writer_memory_in_bytes" : 0,
"index_writer_max_memory_in_bytes" : 320379289,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 0
}
}
}
}
/_nodes/stats/jvm?filter_path=**.heap_used_in_bytes
{
"cluster_name" : "escluster1",
"nodes" : {
"TeXgE1QKSMOE1xYS-miJug" : {
"timestamp" : 1494501231058,
"name" : "esnode1",
"transport_address" : "127.0.0.1:9300",
"host" : "127.0.0.1",
"ip" : [ "127.0.0.1:9300", "NONE" ],
"indices" : {
"docs" : {
"count" : 5352169,
"deleted" : 0
},
"store" : {
"size_in_bytes" : 234847391460,
"throttle_time_in_millis" : 0
},
"indexing" : {
"index_total" : 0,
"index_time_in_millis" : 0,
"index_current" : 0,
"index_failed" : 0,
"delete_total" : 0,
"delete_time_in_millis" : 0,
"delete_current" : 0,
"noop_update_total" : 0,
"is_throttled" : false,
"throttle_time_in_millis" : 0
},
"get" : {
"total" : 0,
"time_in_millis" : 0,
"exists_total" : 0,
"exists_time_in_millis" : 0,
"missing_total" : 0,
"missing_time_in_millis" : 0,
"current" : 0
},
"search" : {
"open_contexts" : 0,
"query_total" : 0,
"query_time_in_millis" : 0,
"query_current" : 0,
"fetch_total" : 0,
"fetch_time_in_millis" : 0,
"fetch_current" : 0,
"scroll_total" : 0,
"scroll_time_in_millis" : 0,
"scroll_current" : 0
},
"merges" : {
"current" : 0,
"current_docs" : 0,
"current_size_in_bytes" : 0,
"total" : 0,
"total_time_in_millis" : 0,
"total_docs" : 0,
"total_size_in_bytes" : 0,
"total_stopped_time_in_millis" : 0,
"total_throttled_time_in_millis" : 0,
"total_auto_throttle_in_bytes" : 20971520
},
"refresh" : {
"total" : 1,
"total_time_in_millis" : 14
},
"flush" : {
"total" : 1,
"total_time_in_millis" : 10
},
"warmer" : {
"current" : 0,
"total" : 3,
"total_time_in_millis" : 6
},
"query_cache" : {
"memory_size_in_bytes" : 0,
"total_count" : 0,
"hit_count" : 0,
"miss_count" : 0,
"cache_size" : 0,
"cache_count" : 0,
"evictions" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"percolate" : {
"total" : 0,
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 73,
"memory_in_bytes" : 2272548617,
"terms_memory_in_bytes" : 2269433701,
"stored_fields_memory_in_bytes" : 3103096,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 4672,
"doc_values_memory_in_bytes" : 7148,
"index_writer_memory_in_bytes" : 0,
"index_writer_max_memory_in_bytes" : 512000,
"version_map_memory_in_bytes" : 0,
"fixed_bit_set_memory_in_bytes" : 0
},
"translog" : {
"operations" : 0,
"size_in_bytes" : 43
},
"suggest" : {
"total" : 0,
"time_in_millis" : 0,
"current" : 0
},
"request_cache" : {
"memory_size_in_bytes" : 0,
"evictions" : 0,
"hit_count" : 0,
"miss_count" : 0
},
"recovery" : {
"current_as_source" : 0,
"current_as_target" : 0,
"throttle_time_in_millis" : 0
}
},
"os" : {
"timestamp" : 1494501231060,
"cpu_percent" : 0,
"load_average" : 0.0,
"mem" : {
"total_in_bytes" : 4142092288,
"free_in_bytes" : 117051392,
"used_in_bytes" : 4025040896,
"free_percent" : 3,
"used_percent" : 97
},
"swap" : {
"total_in_bytes" : 0,
"free_in_bytes" : 0,
"used_in_bytes" : 0
}
},
"process" : {
"timestamp" : 1494501231060,
"open_file_descriptors" : 203,
"max_file_descriptors" : 65536,
"cpu" : {
"percent" : 0,
"total_in_millis" : 14890
},
"mem" : {
"total_virtual_in_bytes" : 23821713408
}
},
"jvm" : {
"timestamp" : 1494501231060,
"uptime_in_millis" : 369041,
"mem" : {
"heap_used_in_bytes" : 2323777096,
"heap_used_percent" : 72,
"heap_committed_in_bytes" : 3203792896,
"heap_max_in_bytes" : 3203792896,
"non_heap_used_in_bytes" : 52525744,
"non_heap_committed_in_bytes" : 53305344,
"pools" : {
"young" : {
"used_in_bytes" : 121416432,
"max_in_bytes" : 139591680,
"peak_used_in_bytes" : 139591680,
"peak_max_in_bytes" : 139591680
},
"survivor" : {
"used_in_bytes" : 4653304,
"max_in_bytes" : 17432576,
"peak_used_in_bytes" : 17432576,
"peak_max_in_bytes" : 17432576
},
"old" : {
"used_in_bytes" : 2197707360,
"max_in_bytes" : 3046768640,
"peak_used_in_bytes" : 2197707360,
"peak_max_in_bytes" : 3046768640
}
}
},
"threads" : {
"count" : 34,
"peak_count" : 42
},
"gc" : {
"collectors" : {
"young" : {
"collection_count" : 23,
"collection_time_in_millis" : 1027
},
"old" : {
"collection_count" : 1,
"collection_time_in_millis" : 26
}
}
},
"buffer_pools" : {
"direct" : {
"count" : 24,
"used_in_bytes" : 3964472,
"total_capacity_in_bytes" : 3964472
},
"mapped" : {
"count" : 33,
"used_in_bytes" : 18005744733,
"total_capacity_in_bytes" : 18005744733
}
},
"classes" : {
"current_loaded_count" : 7490,
"total_loaded_count" : 7490,
"total_unloaded_count" : 0
}
},
"thread_pool" : {
"bulk" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"fetch_shard_started" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 1,
"completed" : 1
},
"fetch_shard_store" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"flush" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 1,
"completed" : 2
},
"force_merge" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"generic" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 5,
"completed" : 69
},
"get" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"index" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"listener" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 1,
"completed" : 2
},
"management" : {
"threads" : 3,
"queue" : 0,
"active" : 1,
"rejected" : 0,
"largest" : 3,
"completed" : 41
},
"percolate" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"refresh" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 1,
"completed" : 1
},
"search" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"snapshot" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"suggest" : {
"threads" : 0,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 0,
"completed" : 0
},
"warmer" : {
"threads" : 1,
"queue" : 0,
"active" : 0,
"rejected" : 0,
"largest" : 1,
"completed" : 1
}
},
"fs" : {
"timestamp" : 1494501231060,
"total" : {
"total_in_bytes" : 528311836672,
"free_in_bytes" : 249557147648,
"available_in_bytes" : 222696878080
},
"data" : [ {
"path" : "/mnt/esdata2/data/escluster1/nodes/0",
"mount" : "/mnt/esdata2 (/dev/xvdh1)",
"type" : "ext4",
"total_in_bytes" : 528311836672,
"free_in_bytes" : 249557147648,
"available_in_bytes" : 222696878080,
"spins" : "false"
} ]
},
"transport" : {
"server_open" : 0,
"rx_count" : 6,
"rx_size_in_bytes" : 2352,
"tx_count" : 6,
"tx_size_in_bytes" : 2352
},
"http" : {
"current_open" : 1,
"total_opened" : 6
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 1281517158,
"limit_size" : "1.1gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 1922275737,
"limit_size" : "1.7gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 3203792896,
"limit_size" : "2.9gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 2242655027,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
}
},
"script" : {
"compilations" : 0,
"cache_evictions" : 0
}
}
}
}
Your segments' static data (the terms, inverted index, etc.) that is associated with your data alone is pretty big: "memory_in_bytes" : 2272548617, which is about 2.11 GB.
That is the reason your ES node is not able to do anything when you give it a 2GB heap.
On top of that 2.11 GB of static data, indexing and searching of course require more memory. So your node needs at least a 3GB heap and at least 6GB of RAM.
I'm looking for a way to get the storage size of a specific Elasticsearch snapshot. The snapshots are located on a shared filesystem.
It seems there is no API for this?
To get the size or status of an Elasticsearch snapshot, run the snapshot status API:
curl -X GET "localhost:9200/_snapshot/my_repository/my_snapshot/_status?pretty"
Note: substitute the appropriate repository and snapshot names in the above curl.
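If the snapshot name isn't known up front, the snapshots in a repository can be listed first (the repository name here is the same placeholder as above):
curl -X GET "localhost:9200/_snapshot/my_repository/_all?pretty"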
Sample Output:
"snapshots" : [
{
"snapshot" : "index-01",
"repository" : "my_repository",
"uuid" : "OKHNDHSKENGHLEWNALWEERTJNS",
"state" : "SUCCESS",
"include_global_state" : true,
"shards_stats" : {
"initializing" : 0,
"started" : 0,
"finalizing" : 0,
"done" : 2,
"failed" : 0,
"total" : 2
},
"stats" : {
"incremental" : {
"file_count" : 149,
"size_in_bytes" : 8229187919
},
"total" : {
"file_count" : 463,
"size_in_bytes" : 169401330819
},
"start_time_in_millis" : 1631622333285,
"time_in_millis" : 208851,
"number_of_files" : 149,
"processed_files" : 149,
"total_size_in_bytes" : 8229187919,
"processed_size_in_bytes" : 8229187919
},
"indices" : {
"graylog_130" : {
"shards_stats" : {
"initializing" : 0,
"started" : 0,
"finalizing" : 0,
"done" : 2,
"failed" : 0,
"total" : 2
},
"stats" : {
"incremental" : {
"file_count" : 149,
"size_in_bytes" : 8229187919
},
"total" : {
"file_count" : 463,
"size_in_bytes" : 169401330819
},
"start_time_in_millis" : 1631622333285,
"time_in_millis" : 208851,
"number_of_files" : 149,
"processed_files" : 149,
"total_size_in_bytes" : 8229187919,
"processed_size_in_bytes" : 8229187919
},
"shards" : {
"0" : {
"stage" : "DONE",
"stats" : {
"incremental" : {
"file_count" : 97,
"size_in_bytes" : 1807163337
},
"total" : {
"file_count" : 271,
"size_in_bytes" : 84885391182
},
"start_time_in_millis" : 1631622334048,
"time_in_millis" : 49607,
"number_of_files" : 97,
"processed_files" : 97,
"total_size_in_bytes" : 1807163337,
"processed_size_in_bytes" : 1807163337
}
},
"1" : {
"stage" : "DONE",
"stats" : {
"incremental" : {
"file_count" : 52,
"size_in_bytes" : 6422024582
},
"total" : {
"file_count" : 192,
"size_in_bytes" : 84515939637
},
"start_time_in_millis" : 1631622333285,
"time_in_millis" : 208851,
"number_of_files" : 52,
"processed_files" : 52,
"total_size_in_bytes" : 6422024582,
"processed_size_in_bytes" : 6422024582
}
}
}
}
In the above output, look for
"total" : {
"file_count" : 463,
"size_in_bytes" : 169401330819
}
Now convert size_in_bytes to GB and you will get the exact size of the snapshot in GB.
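For example, with the total above: 169401330819 bytes / 1024^3 ≈ 157.8 GB.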
You could get the storage used by an index using the _cat API (primary store size). The first snapshot should be around the index size.
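For example (the index name is a placeholder):
curl -X GET "localhost:9200/_cat/indices/my_index?v&h=index,pri.store.size,store.size"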
For incremental snapshots, it depends. Snapshots are taken at the segment level ( index-.. ), so an incremental snapshot may be much smaller depending on your indexing; merges can cause new segments to form, etc.
https://www.elastic.co/blog/found-elasticsearch-snapshot-and-restore Gives a nice overview
I need an exact figure for the size used on storage.
For now I use the following approach: separate directories at the index/snapshot level, so that I can get the used storage size at the system level (du command) for a specific index or snapshot.
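A sketch of that approach (the repository path is a placeholder):
du -sh /path/to/snapshot_repository      # total size of the repository
du -sh /path/to/snapshot_repository/*    # per-directory breakdown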
I am attempting to upgrade our Elasticsearch cluster from 1.6 to 2.3.4. The upgrade seems to work, and I can see shard allocation starting to happen within Kopf, but at some point the shard allocation appears to stop with many shards left unallocated and no errors being reported in the logs. Typically I'm left with 1200 / 3800 shards unallocated.
We have a typical 3-node cluster, and I am trialling this standalone with all 3 nodes running on my local machine.
I have seen similar symptoms reported - see https://t37.net/how-to-fix-your-elasticsearch-cluster-stuck-in-initializing-shards-mode.html. The solution there seemed to be to manually allocate the shards, which I've tried (and it works), but I'm at a loss to explain Elasticsearch's behaviour here. I'd prefer not to go down this route, as I want my cluster to spin up automatically without intervention.
There is also https://github.com/elastic/elasticsearch/pull/14494 which seems to be resolved with the latest ES version, so shouldn't be a problem.
There are no errors in the log files. I have raised the root-level logging to 'DEBUG' to see what I can, and what I see are lines like the below for each unallocated shard (this is from the master node's logs):
[2016-07-26 09:18:04,859][DEBUG][gateway ] [germany] [index][4] found 0 allocations of [index][4], node[null], [P], v[0], s[UNASSIGNED], unassigned_info[[reason=CLUSTER_RECOVERED], at[2016-07-26T08:05:04.447Z]], highest version: [-1]
[2016-07-26 09:18:04,859][DEBUG][gateway ] [germany] [index][4]: not allocating, number_of_allocated_shards_found [0]
Config file (with comments removed):
cluster.name: elasticsearch-jm-2.3.4
node.name: germany
script.inline: true
script.indexed: true
If I query the cluster health after allocation has stopped, I get the response below:
http://localhost:9200/_cluster/health?pretty
cluster_name : elasticsearch-jm-2.3.4
status : red
timed_out : False
number_of_nodes : 3
number_of_data_nodes : 3
active_primary_shards : 1289
active_shards : 2578
relocating_shards : 0
initializing_shards : 0
unassigned_shards : 1264
delayed_unassigned_shards : 0
number_of_pending_tasks : 0
number_of_in_flight_fetch : 0
task_max_waiting_in_queue_millis : 0
active_shards_percent_as_number : 67.10046850598647
Further querying for shards, filtered to one index with unallocated shards: as can be seen, shards 0 and 4 are unassigned, whereas shards 1, 2 and 3 have been allocated:
http://localhost:9200/_cat/shards
cs-payment-warn-2016.07.20 3 p STARTED 106 92.4kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 3 r STARTED 106 92.4kb 127.0.0.1 switzerland
cs-payment-warn-2016.07.20 4 p UNASSIGNED
cs-payment-warn-2016.07.20 4 r UNASSIGNED
cs-payment-warn-2016.07.20 2 r STARTED 120 74.5kb 127.0.0.1 cyprus
cs-payment-warn-2016.07.20 2 p STARTED 120 74.5kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 1 r STARTED 120 73.8kb 127.0.0.1 cyprus
cs-payment-warn-2016.07.20 1 p STARTED 120 73.8kb 127.0.0.1 germany
cs-payment-warn-2016.07.20 0 p UNASSIGNED
cs-payment-warn-2016.07.20 0 r UNASSIGNED
Manually rerouting an unassigned shard appears to work (stripped-back result set):
http://localhost:9200/_cluster/reroute
POST:
{
"dry_run": true,
"commands": [
{
"allocate": {
"index": "cs-payment-warn-2016.07.20",
"shard": 4,
"node": "switzerland" ,
"allow_primary": true
}
}
]
}
Response:
{
"acknowledged" : true,
"state" : {
"version" : 722,
"state_uuid" : "Vw2vPoCMQk2ZosjzviD4TQ",
"master_node" : "yhL7XXy-SKu_WAM-C33dzA",
"blocks" : {},
"nodes" : {},
"routing_table" : {
"indices" : {
"cs-payment-warn-2016.07.20" : {
"shards" : {
"3" : [{
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 3,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "x_Iq88hmTqiasrjW09hVuw"
}
}, {
"state" : "STARTED",
"primary" : false,
"node" : "1a8dgBscTUS3c7Pv4mN9CQ",
"relocating_node" : null,
"shard" : 3,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "DF-EUEy_SpeUElnZI6cgsQ"
}
}
],
"4" : [{
"state" : "INITIALIZING",
"primary" : true,
"node" : "1a8dgBscTUS3c7Pv4mN9CQ",
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"allocation_id" : {
"id" : "1tw7C7YPQsWwm_O-8mYHRg"
},
"unassigned_info" : {
"reason" : "INDEX_CREATED",
"at" : "2016-07-26T14:20:15.395Z",
"details" : "force allocation from previous reason CLUSTER_RECOVERED, null"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
],
"2" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "eQ-_vWNbRp27So0iGSitmA"
}
}, {
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "O1PU1_NVS8-uB2yBrG76MA"
}
}
],
"1" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZmxtOvorRVmndR15OJMkMA"
}
}, {
"state" : "STARTED",
"primary" : true,
"node" : "yhL7XXy-SKu_WAM-C33dzA",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZNgzePThQxS-iqhRSXzZCw"
}
}
],
"0" : [{
"state" : "UNASSIGNED",
"primary" : true,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
]
}
}
},
"routing_nodes" : {
"unassigned" : [{
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 4,
"index" : "cs-payment-warn-2016.07.20",
"version" : 1,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : true,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}, {
"state" : "UNASSIGNED",
"primary" : false,
"node" : null,
"relocating_node" : null,
"shard" : 0,
"index" : "cs-payment-warn-2016.07.20",
"version" : 0,
"unassigned_info" : {
"reason" : "CLUSTER_RECOVERED",
"at" : "2016-07-26T11:24:11.868Z"
}
}
]
},
"nodes" : {
"rlRQ2u0XQRqxWld-wSrOug" : [{
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 2,
"index" : "cs-payment-warn-2016.07.20",
"version" : 22,
"allocation_id" : {
"id" : "eQ-_vWNbRp27So0iGSitmA"
}
}, {
"state" : "STARTED",
"primary" : false,
"node" : "rlRQ2u0XQRqxWld-wSrOug",
"relocating_node" : null,
"shard" : 1,
"index" : "cs-payment-warn-2016.07.20",
"version" : 24,
"allocation_id" : {
"id" : "ZmxtOvorRVmndR15OJMkMA"
}
}
]
}
}
}
}
I am using findAndModify in MongoDB from several concurrent processes. The collection holds about 3 million entries and everything works fine as long as I don't pass a sort option (on an indexed field). Once I try to do so, the following warning appears in the logs:
warning: ClientCursor::yield can't unlock b/c of recursive lock ns: test_db.wengine_queue top:
{
opid: 424210,
active: true,
lockType: "write",
waitingForLock: false,
secs_running: 0,
op: "query",
ns: "test_db",
query: {
findAndModify: "wengine_queue",
query: {
locked: { $ne: 1 },
rule_completed: { $in: [ "", "0", null ] },
execute_at: { $lt: 1324381363 },
company_id: 23,
debug: 0,
system_id: "AK/AK1201"
},
update: {
$set: { locked: 1 }
},
sort: {
execute_at: -1
}
},
client: "127.0.0.1:60873",
desc: "conn",
threadId: "0x1541bb000",
connectionId: 1147,
numYields: 0
}
I do have all the keys from the query indexed; here they are:
PRIMARY> db.wengine_queue.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "_id_"
},
{
"v" : 1,
"key" : {
"system_id" : 1,
"company_id" : 1,
"locked" : 1,
"rule_completed" : 1,
"execute_at" : -1,
"debug" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "system_id_1_company_id_1_locked_1_rule_completed_1_execute_at_-1_debug_1"
},
{
"v" : 1,
"key" : {
"debug" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "debug_1"
},
{
"v" : 1,
"key" : {
"system_id" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "system_id_1"
},
{
"v" : 1,
"key" : {
"company_id" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "company_id_1"
},
{
"v" : 1,
"key" : {
"locked" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "locked_1"
},
{
"v" : 1,
"key" : {
"rule_completed" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "rule_completed_1"
},
{
"v" : 1,
"key" : {
"execute_at" : -1
},
"ns" : "test_db.wengine_queue",
"name" : "execute_at_-1"
},
{
"v" : 1,
"key" : {
"thread_id" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "thread_id_1"
},
{
"v" : 1,
"key" : {
"rule_id" : 1
},
"ns" : "test_db.wengine_queue",
"name" : "rule_id_1"
}
]
Is there any way around this?
For those interested -- I had to create a separate index ending with the key that the result set is to be sorted by.
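A sketch of what such an index could look like for the query above (everything except putting execute_at last is an assumption about field order):
db.wengine_queue.ensureIndex({
    system_id: 1,
    company_id: 1,
    locked: 1,
    rule_completed: 1,
    debug: 1,
    execute_at: -1
})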
That warning is thrown when an operation that wants to yield (such as long updates, removes, etc.) cannot do so because it cannot release the lock it's holding for whatever reason.
Do you have the field you're sorting on indexed? If not, adding an index for it will probably remove the warnings.