summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHemanth <hyelloji@redhat.com>2024-10-23 14:27:50 +0200
committerGitHub <noreply@github.com>2024-10-23 14:27:50 +0200
commitfca07e95eae15cbb89849ed5c6f6289a9954794e (patch)
tree6edaf15ce29b60f80ba104844ec809d1b3821c7c
parentMerge pull request #60316 from VallariAg/wip-nvmeof-teuthology-mtls-test (diff)
parentPendingReleaseNote: add release note that mgr/volumes plugin can be... (diff)
downloadceph-fca07e95eae15cbb89849ed5c6f6289a9954794e.tar.xz
ceph-fca07e95eae15cbb89849ed5c6f6289a9954794e.zip
Merge pull request #58647 from rishabh-d-dave/mgr-vol-mod-disable
mgr: allow disabling always-on modules
-rw-r--r--PendingReleaseNotes3
-rw-r--r--doc/cephfs/fs-volumes.rst23
-rw-r--r--doc/cephfs/troubleshooting.rst5
-rw-r--r--qa/cephfs/overrides/ignorelist_health.yaml1
-rw-r--r--qa/tasks/cephfs/test_admin.py181
-rw-r--r--src/mgr/PyModuleRegistry.cc15
-rw-r--r--src/mon/MgrMap.h20
-rw-r--r--src/mon/MgrMonitor.cc81
-rw-r--r--src/mon/MonCommands.h4
9 files changed, 322 insertions, 11 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 8af2a262dff..d82ed125d92 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -25,6 +25,9 @@
- osd_op_num_shards_hdd = 1 (was 5)
- osd_op_num_threads_per_shard_hdd = 5 (was 1)
For more details see https://tracker.ceph.com/issues/66289.
+* MGR: MGR's always-on modulues/plugins can now be force-disabled. This can be
+ necessary in cases where MGR(s) needs to be prevented from being flooded by
+ the module commands when coresponding Ceph service is down/degraded.
* CephFS: Modifying the FS setting variable "max_mds" when a cluster is
unhealthy now requires users to pass the confirmation flag
diff --git a/doc/cephfs/fs-volumes.rst b/doc/cephfs/fs-volumes.rst
index 86a8071a2d2..3e7dd7815ad 100644
--- a/doc/cephfs/fs-volumes.rst
+++ b/doc/cephfs/fs-volumes.rst
@@ -1419,5 +1419,28 @@ set with this id was present in the database
$ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0
+
+.. _disabling-volumes-plugin:
+
+Disabling Volumes Plugin
+------------------------
+By default the volumes plugin is enabled and set to ``always on``. However, in
+certain cases it might be appropriate to disable it. For example, when a CephFS
+is in a degraded state, the volumes plugin commands may accumulate in MGR
+instead of getting served. Which eventually causes policy throttles to kick in
+and the MGR becomes unresponsive.
+
+In this event, volumes plugin can be disabled even though it is an
+``always on`` module in MGR. To do so, run ``ceph mgr module disable volumes
+--yes-i-really-mean-it``. Do note that this command will disable operations
+and remove commands of volumes plugin since it will disable all CephFS
+services on the Ceph cluster accessed through this plugin.
+
+Before resorting to a measure as drastic as this, it is a good idea to try less
+drastic measures and then assess if the file system experience has improved due
+to it. One example of such less drastic measure is to disable asynchronous
+threads launched by volumes plugins for cloning and purging trash.
+
+
.. _manila: https://github.com/openstack/manila
.. _CSI: https://github.com/ceph/ceph-csi
diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst
index 78d0a8f54d3..582daa10338 100644
--- a/doc/cephfs/troubleshooting.rst
+++ b/doc/cephfs/troubleshooting.rst
@@ -412,6 +412,11 @@ its associated key. A less drastic but half-fix is to change the osd cap for
your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs
data=....``
+Disabling Volumes Plugin
+========================
+In certain scenarios, volumes plugin might be needed to disabled to prevent compromise
+for rest of the Ceph cluster. For details see: :ref:`disabling-volumes-plugin`
+
Reporting Issues
================
diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml
index 94b42579777..5ac25a8f790 100644
--- a/qa/cephfs/overrides/ignorelist_health.yaml
+++ b/qa/cephfs/overrides/ignorelist_health.yaml
@@ -24,3 +24,4 @@ overrides:
- BLUESTORE_SLOW_OP_ALERT
- slow operation indications in BlueStore
- experiencing slow operations in BlueStore
+ - MGR_MODULE_ERROR
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index 00a68dd0183..beb41019e6d 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -2740,3 +2740,184 @@ class TestFSSetMaxMDS(TestAdminCommands):
'''
self.fs.set_max_mds(2, confirm=True)
self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+
+class TestToggleVolumes(CephFSTestCase):
+ '''
+ Contains code for enabling/disabling mgr/volumes plugin.
+ '''
+
+ VOL_MOD_NAME = 'volumes'
+ CONFIRM = '--yes-i-really-mean-it'
+
+ def tearDown(self):
+ '''
+ Ensure that the volumes plugin is enabled after the test has finished
+ running since not doing so might affect tearDown() of CephFSTestCase or
+ other superclasses.
+ '''
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ if 'volumes' in json_output['force_disabled_modules']:
+ self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
+
+ super(TestToggleVolumes, self).tearDown()
+
+ def test_force_disable_with_confirmation(self):
+ '''
+ Test that running "ceph mgr module force disable volumes
+ --yes-i-really-mean-it" successfully disables volumes plugin.
+
+ Also test "ceph mgr module ls" output after this.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+
+ def test_force_disable_fails_without_confirmation(self):
+ '''
+ Test that running "ceph mgr module force disable volumes" fails with
+ EPERM when confirmation flag is not passed along.
+
+ Also test that output of this command suggests user to pass
+ --yes-i-really-mean-it.
+ '''
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable {self.VOL_MOD_NAME}',
+ stderr=StringIO(), check_status=False)
+
+ self.assertEqual(proc.returncode, errno.EPERM)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('EPERM', proc_stderr)
+ # ensure that the confirmation flag was recommended
+ self.assertIn(self.CONFIRM, proc_stderr)
+
+ def test_force_disable_idempotency(self):
+ '''
+ Test that running "ceph mgr module force disable volumes" passes when
+ volumes plugin was already force disabled.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+ sleep(5)
+
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format '
+ 'json-pretty')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+
+ # XXX: this this test, running this command 2nd time should pass.
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}')
+
+ def test_force_disable_nonexistent_mod(self):
+ '''
+ Test that passing non-existent name to "ceph mgr module force disable"
+ command leads to an error.
+ '''
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable abcd {self.CONFIRM}',
+ check_status=False, stderr=StringIO())
+ self.assertEqual(proc.returncode, errno.EINVAL)
+ self.assertIn('EINVAL', proc.stderr.getvalue())
+
+ def test_force_disable_non_alwayson_mod(self):
+ '''
+ Test that passing non-existent name to "ceph mgr module force disable"
+ command leads to an error.
+ '''
+ json_output = self.get_ceph_cmd_stdout(
+ 'mgr module ls --format json-pretty', check_status=False,
+ stderr=StringIO())
+ output_dict = json.loads(json_output)
+ some_non_alwayson_mod = output_dict['enabled_modules'][0]
+
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}',
+ check_status=False, stderr=StringIO())
+ self.assertEqual(proc.returncode, errno.EINVAL)
+ self.assertIn('EINVAL', proc.stderr.getvalue())
+
+ def test_enabled_by_default(self):
+ '''
+ Test that volumes plugin is enabled by default and is also reported as
+ "always on".
+ '''
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ def test_disable_fails(self):
+ '''
+ Test that running "ceph mgr module disable volumes" fails with EPERM.
+
+ This is expected since volumes is an always-on module and therefore
+ it can only be disabled using command "ceph mgr module force disable
+ volumes".
+ '''
+ proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}',
+ stderr=StringIO(), check_status=False)
+ self.assertEqual(proc.returncode, errno.EPERM)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('EPERM', proc_stderr)
+
+ def test_enable_idempotency(self):
+ '''
+ Test that enabling volumes plugin when it is already enabled doesn't
+ exit with non-zero return value.
+
+ Also test that it reports plugin as already enabled.
+ '''
+ proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}',
+ stderr=StringIO())
+ self.assertEqual(proc.returncode, 0)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('already enabled', proc_stderr)
+ self.assertIn('always-on', proc_stderr)
+
+ def test_enable_post_disabling(self):
+ '''
+ Test that enabling volumes plugin after (force-)disabling it works
+ successfully.
+
+ Alo test "ceph mgr module ls" output for volumes plugin afterwards.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+ # give bit of time for plugin to be disabled.
+ sleep(5)
+
+ self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
+ # give bit of time for plugin to be functional again
+ sleep(5)
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ # plugin is reported properly by "ceph mgr module ls" command, check if
+ # it is also working fine.
+ self.run_ceph_cmd('fs volume ls')
diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc
index 0eb304e7353..08501568a2c 100644
--- a/src/mgr/PyModuleRegistry.cc
+++ b/src/mgr/PyModuleRegistry.cc
@@ -151,7 +151,8 @@ bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_)
return false;
} else {
bool modules_changed = mgr_map_.modules != mgr_map.modules ||
- mgr_map_.always_on_modules != mgr_map.always_on_modules;
+ mgr_map_.always_on_modules != mgr_map.always_on_modules ||
+ mgr_map_.force_disabled_modules != mgr_map.force_disabled_modules;
mgr_map = mgr_map_;
if (standby_modules != nullptr) {
@@ -240,10 +241,20 @@ void PyModuleRegistry::active_start(
// Anything we're skipping because of !can_run will be flagged
// to the user separately via get_health_checks
if (!(i.second->is_enabled() && i.second->is_loaded())) {
+ dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
+ << "not enabled and loaded" << dendl;
continue;
}
- dout(4) << "Starting " << i.first << dendl;
+ // These are always-on modules but user force-disabled them.
+ if (mgr_map.force_disabled_modules.find(i.first) !=
+ mgr_map.force_disabled_modules.end()) {
+ dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
+ << "force-disabled" << dendl;
+ continue;
+ }
+
+ dout(4) << "Starting module '" << i.first << "'" << dendl;
active_modules->start_one(i.second);
}
}
diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h
index 82f6ea88046..1ab542a871f 100644
--- a/src/mon/MgrMap.h
+++ b/src/mon/MgrMap.h
@@ -297,6 +297,9 @@ public:
// active version.
std::map<uint32_t, std::set<std::string>> always_on_modules;
+ // Modules which are always-on but have been force-disabled by user.
+ std::set<std::string> force_disabled_modules;
+
// Modules which are reported to exist
std::vector<ModuleInfo> available_modules;
@@ -448,7 +451,7 @@ public:
ENCODE_FINISH(bl);
return;
}
- ENCODE_START(13, 6, bl);
+ ENCODE_START(14, 6, bl);
encode(epoch, bl);
encode(active_addrs, bl, features);
encode(active_gid, bl);
@@ -473,13 +476,14 @@ public:
encode(clients_addrs, bl, features);
encode(clients_names, bl, features);
encode(flags, bl);
+ encode(force_disabled_modules, bl);
ENCODE_FINISH(bl);
return;
}
void decode(ceph::buffer::list::const_iterator& p)
{
- DECODE_START(13, p);
+ DECODE_START(14, p);
decode(epoch, p);
decode(active_addrs, p);
decode(active_gid, p);
@@ -549,6 +553,11 @@ public:
if (struct_v >= 13) {
decode(flags, p);
}
+
+ if (struct_v >= 14) {
+ decode(force_disabled_modules, p);
+ }
+
DECODE_FINISH(p);
}
@@ -603,6 +612,13 @@ public:
f->close_section();
}
f->close_section(); // always_on_modules
+
+ f->open_object_section("force_disabled_modules");
+ for (auto& m : force_disabled_modules) {
+ f->dump_string("module", m);
+ }
+ f->close_section();
+
f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
f->open_array_section("active_clients");
for (const auto& i : clients) {
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
index 30382b97052..b89878dddb7 100644
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -146,10 +146,12 @@ void MgrMonitor::create_initial()
}
pending_map.always_on_modules = always_on_modules();
pending_command_descs = mgr_commands;
- dout(10) << __func__ << " initial modules " << pending_map.modules
- << ", always on modules " << pending_map.get_always_on_modules()
- << ", " << pending_command_descs.size() << " commands"
+ dout(10) << __func__ << " initial enabled modules: " << pending_map.modules
<< dendl;
+ dout(10) << __func__ << "always on modules: " <<
+ pending_map.get_always_on_modules() << dendl;
+ dout(10) << __func__ << "total " << pending_command_descs.size() <<
+ " commands" << dendl;
}
void MgrMonitor::get_store_prefixes(std::set<string>& s) const
@@ -1019,6 +1021,13 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
f->dump_string("module", p);
}
f->close_section();
+
+ f->open_array_section("force_disabled_modules");
+ for (auto& p : map.force_disabled_modules) {
+ f->dump_string("module", p);
+ }
+ f->close_section();
+
f->open_array_section("enabled_modules");
for (auto& p : map.modules) {
if (map.get_always_on_modules().count(p) > 0)
@@ -1048,7 +1057,11 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
for (auto& p : map.get_always_on_modules()) {
tbl << p;
- tbl << "on (always on)";
+ if (map.force_disabled_modules.find(p) == map.force_disabled_modules.end()) {
+ tbl << "on (always on)";
+ } else {
+ tbl << "off (always on but force-disabled)";
+ }
tbl << TextTable::endrow;
}
for (auto& p : map.modules) {
@@ -1269,10 +1282,13 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
r = -EINVAL;
goto out;
}
- if (pending_map.get_always_on_modules().count(module) > 0) {
+
+ if (pending_map.get_always_on_modules().count(module) > 0 &&
+ !pending_map.force_disabled_modules.contains(module)) {
ss << "module '" << module << "' is already enabled (always-on)";
goto out;
}
+
bool force = false;
cmd_getval_compat_cephbool(cmdmap, "force", force);
if (!pending_map.all_support_module(module) &&
@@ -1296,7 +1312,12 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
ss << "module '" << module << "' is already enabled";
r = 0;
goto out;
+ } else if (pending_map.force_disabled_modules.contains(module)) {
+ pending_map.force_disabled_modules.erase(module);
+ r = 0;
+ goto out;
}
+
pending_map.modules.insert(module);
} else if (prefix == "mgr module disable") {
string module;
@@ -1306,8 +1327,9 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
goto out;
}
if (pending_map.get_always_on_modules().count(module) > 0) {
- ss << "module '" << module << "' cannot be disabled (always-on)";
- r = -EINVAL;
+ ss << "module '" << module << "' cannot be disabled (always-on), use " <<
+ "'ceph mgr module force disable' command to disable an always-on module";
+ r = -EPERM;
goto out;
}
if (!pending_map.module_enabled(module)) {
@@ -1318,7 +1340,52 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
if (!pending_map.modules.count(module)) {
ss << "module '" << module << "' is not enabled";
}
+ dout(8) << __func__ << " disabling module " << module << " from new " << dendl;
pending_map.modules.erase(module);
+ } else if (prefix == "mgr module force disable") {
+ string mod;
+ cmd_getval(cmdmap, "module", mod);
+
+ bool confirmation_flag = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirmation_flag);
+
+ if (mod.empty()) {
+ ss << "Module name wasn't passed!";
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (!pending_map.get_always_on_modules().contains(mod)) {
+ ss << "Always-on module named \"" << mod << "\" does not exist";
+ r = -EINVAL;
+ goto out;
+ } else if (pending_map.modules.contains(mod)) {
+ ss << "Module '" << mod << "' is not an always-on module, only always-on " <<
+ "modules can be disabled through this command.";
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (pending_map.force_disabled_modules.contains(mod)) {
+ ss << "Module \"" << mod << "\"is already disabled";
+ r = 0;
+ goto out;
+ }
+
+ if (!confirmation_flag) {
+ ss << "This command will disable operations and remove commands that "
+ << "other Ceph utilities expect to be available. Do not continue "
+ << "unless your cluster is already experiencing an event due to "
+ << "which it is advised to disable this module as part of "
+ << "troubleshooting. If you are sure that you wish to continue, "
+ << "run again with --yes-i-really-mean-it";
+ r = -EPERM;
+ goto out;
+ }
+
+ dout(8) << __func__ << " force-disabling module '" << mod << "'" << dendl;
+ pending_map.force_disabled_modules.insert(mod);
+ pending_map.modules.erase(mod);
} else {
ss << "Command '" << prefix << "' not implemented!";
r = -ENOSYS;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index b5de8837cb7..3cc3c8abd1e 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -1357,6 +1357,10 @@ COMMAND("mgr module enable "
COMMAND("mgr module disable "
"name=module,type=CephString",
"disable mgr module", "mgr", "rw")
+COMMAND("mgr module force disable "
+ "name=module,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "force disable a always-on mgr module", "mgr", "rw")
COMMAND("mgr metadata name=who,type=CephString,req=false",
"dump metadata for all daemons or a specific daemon",
"mgr", "r")