diff options
author | Hemanth <hyelloji@redhat.com> | 2024-10-23 14:27:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-23 14:27:50 +0200 |
commit | fca07e95eae15cbb89849ed5c6f6289a9954794e (patch) | |
tree | 6edaf15ce29b60f80ba104844ec809d1b3821c7c | |
parent | Merge pull request #60316 from VallariAg/wip-nvmeof-teuthology-mtls-test (diff) | |
parent | PendingReleaseNote: add release note that mgr/volumes plugin can be... (diff) | |
download | ceph-fca07e95eae15cbb89849ed5c6f6289a9954794e.tar.xz ceph-fca07e95eae15cbb89849ed5c6f6289a9954794e.zip |
Merge pull request #58647 from rishabh-d-dave/mgr-vol-mod-disable
mgr: allow disabling always-on modules
-rw-r--r-- | PendingReleaseNotes | 3 | ||||
-rw-r--r-- | doc/cephfs/fs-volumes.rst | 23 | ||||
-rw-r--r-- | doc/cephfs/troubleshooting.rst | 5 | ||||
-rw-r--r-- | qa/cephfs/overrides/ignorelist_health.yaml | 1 | ||||
-rw-r--r-- | qa/tasks/cephfs/test_admin.py | 181 | ||||
-rw-r--r-- | src/mgr/PyModuleRegistry.cc | 15 | ||||
-rw-r--r-- | src/mon/MgrMap.h | 20 | ||||
-rw-r--r-- | src/mon/MgrMonitor.cc | 81 | ||||
-rw-r--r-- | src/mon/MonCommands.h | 4 |
9 files changed, 322 insertions, 11 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 8af2a262dff..d82ed125d92 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -25,6 +25,9 @@ - osd_op_num_shards_hdd = 1 (was 5) - osd_op_num_threads_per_shard_hdd = 5 (was 1) For more details see https://tracker.ceph.com/issues/66289. +* MGR: MGR's always-on modulues/plugins can now be force-disabled. This can be + necessary in cases where MGR(s) needs to be prevented from being flooded by + the module commands when coresponding Ceph service is down/degraded. * CephFS: Modifying the FS setting variable "max_mds" when a cluster is unhealthy now requires users to pass the confirmation flag diff --git a/doc/cephfs/fs-volumes.rst b/doc/cephfs/fs-volumes.rst index 86a8071a2d2..3e7dd7815ad 100644 --- a/doc/cephfs/fs-volumes.rst +++ b/doc/cephfs/fs-volumes.rst @@ -1419,5 +1419,28 @@ set with this id was present in the database $ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0 + +.. _disabling-volumes-plugin: + +Disabling Volumes Plugin +------------------------ +By default the volumes plugin is enabled and set to ``always on``. However, in +certain cases it might be appropriate to disable it. For example, when a CephFS +is in a degraded state, the volumes plugin commands may accumulate in MGR +instead of getting served. Which eventually causes policy throttles to kick in +and the MGR becomes unresponsive. + +In this event, volumes plugin can be disabled even though it is an +``always on`` module in MGR. To do so, run ``ceph mgr module disable volumes +--yes-i-really-mean-it``. Do note that this command will disable operations +and remove commands of volumes plugin since it will disable all CephFS +services on the Ceph cluster accessed through this plugin. + +Before resorting to a measure as drastic as this, it is a good idea to try less +drastic measures and then assess if the file system experience has improved due +to it. One example of such less drastic measure is to disable asynchronous +threads launched by volumes plugins for cloning and purging trash. + + .. _manila: https://github.com/openstack/manila .. _CSI: https://github.com/ceph/ceph-csi diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst index 78d0a8f54d3..582daa10338 100644 --- a/doc/cephfs/troubleshooting.rst +++ b/doc/cephfs/troubleshooting.rst @@ -412,6 +412,11 @@ its associated key. A less drastic but half-fix is to change the osd cap for your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs data=....`` +Disabling Volumes Plugin +======================== +In certain scenarios, volumes plugin might be needed to disabled to prevent compromise +for rest of the Ceph cluster. For details see: :ref:`disabling-volumes-plugin` + Reporting Issues ================ diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index 94b42579777..5ac25a8f790 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -24,3 +24,4 @@ overrides: - BLUESTORE_SLOW_OP_ALERT - slow operation indications in BlueStore - experiencing slow operations in BlueStore + - MGR_MODULE_ERROR diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index 00a68dd0183..beb41019e6d 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -2740,3 +2740,184 @@ class TestFSSetMaxMDS(TestAdminCommands): ''' self.fs.set_max_mds(2, confirm=True) self.assertEqual(self.fs.get_var('max_mds'), 2) + + +class TestToggleVolumes(CephFSTestCase): + ''' + Contains code for enabling/disabling mgr/volumes plugin. + ''' + + VOL_MOD_NAME = 'volumes' + CONFIRM = '--yes-i-really-mean-it' + + def tearDown(self): + ''' + Ensure that the volumes plugin is enabled after the test has finished + running since not doing so might affect tearDown() of CephFSTestCase or + other superclasses. + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + if 'volumes' in json_output['force_disabled_modules']: + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + + super(TestToggleVolumes, self).tearDown() + + def test_force_disable_with_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes + --yes-i-really-mean-it" successfully disables volumes plugin. + + Also test "ceph mgr module ls" output after this. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + def test_force_disable_fails_without_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes" fails with + EPERM when confirmation flag is not passed along. + + Also test that output of this command suggests user to pass + --yes-i-really-mean-it. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + # ensure that the confirmation flag was recommended + self.assertIn(self.CONFIRM, proc_stderr) + + def test_force_disable_idempotency(self): + ''' + Test that running "ceph mgr module force disable volumes" passes when + volumes plugin was already force disabled. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + sleep(5) + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format ' + 'json-pretty') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + # XXX: this this test, running this command 2nd time should pass. + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}') + + def test_force_disable_nonexistent_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable abcd {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_force_disable_non_alwayson_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + json_output = self.get_ceph_cmd_stdout( + 'mgr module ls --format json-pretty', check_status=False, + stderr=StringIO()) + output_dict = json.loads(json_output) + some_non_alwayson_mod = output_dict['enabled_modules'][0] + + proc = self.run_ceph_cmd( + f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_enabled_by_default(self): + ''' + Test that volumes plugin is enabled by default and is also reported as + "always on". + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + def test_disable_fails(self): + ''' + Test that running "ceph mgr module disable volumes" fails with EPERM. + + This is expected since volumes is an always-on module and therefore + it can only be disabled using command "ceph mgr module force disable + volumes". + ''' + proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + + def test_enable_idempotency(self): + ''' + Test that enabling volumes plugin when it is already enabled doesn't + exit with non-zero return value. + + Also test that it reports plugin as already enabled. + ''' + proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}', + stderr=StringIO()) + self.assertEqual(proc.returncode, 0) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('already enabled', proc_stderr) + self.assertIn('always-on', proc_stderr) + + def test_enable_post_disabling(self): + ''' + Test that enabling volumes plugin after (force-)disabling it works + successfully. + + Alo test "ceph mgr module ls" output for volumes plugin afterwards. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + # give bit of time for plugin to be disabled. + sleep(5) + + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + # give bit of time for plugin to be functional again + sleep(5) + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + # plugin is reported properly by "ceph mgr module ls" command, check if + # it is also working fine. + self.run_ceph_cmd('fs volume ls') diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc index 0eb304e7353..08501568a2c 100644 --- a/src/mgr/PyModuleRegistry.cc +++ b/src/mgr/PyModuleRegistry.cc @@ -151,7 +151,8 @@ bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_) return false; } else { bool modules_changed = mgr_map_.modules != mgr_map.modules || - mgr_map_.always_on_modules != mgr_map.always_on_modules; + mgr_map_.always_on_modules != mgr_map.always_on_modules || + mgr_map_.force_disabled_modules != mgr_map.force_disabled_modules; mgr_map = mgr_map_; if (standby_modules != nullptr) { @@ -240,10 +241,20 @@ void PyModuleRegistry::active_start( // Anything we're skipping because of !can_run will be flagged // to the user separately via get_health_checks if (!(i.second->is_enabled() && i.second->is_loaded())) { + dout(8) << __func__ << " Not starting module '" << i.first << "', it is " + << "not enabled and loaded" << dendl; continue; } - dout(4) << "Starting " << i.first << dendl; + // These are always-on modules but user force-disabled them. + if (mgr_map.force_disabled_modules.find(i.first) != + mgr_map.force_disabled_modules.end()) { + dout(8) << __func__ << " Not starting module '" << i.first << "', it is " + << "force-disabled" << dendl; + continue; + } + + dout(4) << "Starting module '" << i.first << "'" << dendl; active_modules->start_one(i.second); } } diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h index 82f6ea88046..1ab542a871f 100644 --- a/src/mon/MgrMap.h +++ b/src/mon/MgrMap.h @@ -297,6 +297,9 @@ public: // active version. std::map<uint32_t, std::set<std::string>> always_on_modules; + // Modules which are always-on but have been force-disabled by user. + std::set<std::string> force_disabled_modules; + // Modules which are reported to exist std::vector<ModuleInfo> available_modules; @@ -448,7 +451,7 @@ public: ENCODE_FINISH(bl); return; } - ENCODE_START(13, 6, bl); + ENCODE_START(14, 6, bl); encode(epoch, bl); encode(active_addrs, bl, features); encode(active_gid, bl); @@ -473,13 +476,14 @@ public: encode(clients_addrs, bl, features); encode(clients_names, bl, features); encode(flags, bl); + encode(force_disabled_modules, bl); ENCODE_FINISH(bl); return; } void decode(ceph::buffer::list::const_iterator& p) { - DECODE_START(13, p); + DECODE_START(14, p); decode(epoch, p); decode(active_addrs, p); decode(active_gid, p); @@ -549,6 +553,11 @@ public: if (struct_v >= 13) { decode(flags, p); } + + if (struct_v >= 14) { + decode(force_disabled_modules, p); + } + DECODE_FINISH(p); } @@ -603,6 +612,13 @@ public: f->close_section(); } f->close_section(); // always_on_modules + + f->open_object_section("force_disabled_modules"); + for (auto& m : force_disabled_modules) { + f->dump_string("module", m); + } + f->close_section(); + f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); f->open_array_section("active_clients"); for (const auto& i : clients) { diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 30382b97052..b89878dddb7 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -146,10 +146,12 @@ void MgrMonitor::create_initial() } pending_map.always_on_modules = always_on_modules(); pending_command_descs = mgr_commands; - dout(10) << __func__ << " initial modules " << pending_map.modules - << ", always on modules " << pending_map.get_always_on_modules() - << ", " << pending_command_descs.size() << " commands" + dout(10) << __func__ << " initial enabled modules: " << pending_map.modules << dendl; + dout(10) << __func__ << "always on modules: " << + pending_map.get_always_on_modules() << dendl; + dout(10) << __func__ << "total " << pending_command_descs.size() << + " commands" << dendl; } void MgrMonitor::get_store_prefixes(std::set<string>& s) const @@ -1019,6 +1021,13 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op) f->dump_string("module", p); } f->close_section(); + + f->open_array_section("force_disabled_modules"); + for (auto& p : map.force_disabled_modules) { + f->dump_string("module", p); + } + f->close_section(); + f->open_array_section("enabled_modules"); for (auto& p : map.modules) { if (map.get_always_on_modules().count(p) > 0) @@ -1048,7 +1057,11 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op) for (auto& p : map.get_always_on_modules()) { tbl << p; - tbl << "on (always on)"; + if (map.force_disabled_modules.find(p) == map.force_disabled_modules.end()) { + tbl << "on (always on)"; + } else { + tbl << "off (always on but force-disabled)"; + } tbl << TextTable::endrow; } for (auto& p : map.modules) { @@ -1269,10 +1282,13 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) r = -EINVAL; goto out; } - if (pending_map.get_always_on_modules().count(module) > 0) { + + if (pending_map.get_always_on_modules().count(module) > 0 && + !pending_map.force_disabled_modules.contains(module)) { ss << "module '" << module << "' is already enabled (always-on)"; goto out; } + bool force = false; cmd_getval_compat_cephbool(cmdmap, "force", force); if (!pending_map.all_support_module(module) && @@ -1296,7 +1312,12 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) ss << "module '" << module << "' is already enabled"; r = 0; goto out; + } else if (pending_map.force_disabled_modules.contains(module)) { + pending_map.force_disabled_modules.erase(module); + r = 0; + goto out; } + pending_map.modules.insert(module); } else if (prefix == "mgr module disable") { string module; @@ -1306,8 +1327,9 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) goto out; } if (pending_map.get_always_on_modules().count(module) > 0) { - ss << "module '" << module << "' cannot be disabled (always-on)"; - r = -EINVAL; + ss << "module '" << module << "' cannot be disabled (always-on), use " << + "'ceph mgr module force disable' command to disable an always-on module"; + r = -EPERM; goto out; } if (!pending_map.module_enabled(module)) { @@ -1318,7 +1340,52 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) if (!pending_map.modules.count(module)) { ss << "module '" << module << "' is not enabled"; } + dout(8) << __func__ << " disabling module " << module << " from new " << dendl; pending_map.modules.erase(module); + } else if (prefix == "mgr module force disable") { + string mod; + cmd_getval(cmdmap, "module", mod); + + bool confirmation_flag = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirmation_flag); + + if (mod.empty()) { + ss << "Module name wasn't passed!"; + r = -EINVAL; + goto out; + } + + if (!pending_map.get_always_on_modules().contains(mod)) { + ss << "Always-on module named \"" << mod << "\" does not exist"; + r = -EINVAL; + goto out; + } else if (pending_map.modules.contains(mod)) { + ss << "Module '" << mod << "' is not an always-on module, only always-on " << + "modules can be disabled through this command."; + r = -EINVAL; + goto out; + } + + if (pending_map.force_disabled_modules.contains(mod)) { + ss << "Module \"" << mod << "\"is already disabled"; + r = 0; + goto out; + } + + if (!confirmation_flag) { + ss << "This command will disable operations and remove commands that " + << "other Ceph utilities expect to be available. Do not continue " + << "unless your cluster is already experiencing an event due to " + << "which it is advised to disable this module as part of " + << "troubleshooting. If you are sure that you wish to continue, " + << "run again with --yes-i-really-mean-it"; + r = -EPERM; + goto out; + } + + dout(8) << __func__ << " force-disabling module '" << mod << "'" << dendl; + pending_map.force_disabled_modules.insert(mod); + pending_map.modules.erase(mod); } else { ss << "Command '" << prefix << "' not implemented!"; r = -ENOSYS; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index b5de8837cb7..3cc3c8abd1e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -1357,6 +1357,10 @@ COMMAND("mgr module enable " COMMAND("mgr module disable " "name=module,type=CephString", "disable mgr module", "mgr", "rw") +COMMAND("mgr module force disable " + "name=module,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "force disable a always-on mgr module", "mgr", "rw") COMMAND("mgr metadata name=who,type=CephString,req=false", "dump metadata for all daemons or a specific daemon", "mgr", "r") |