diff options
author | Venky Shankar <vshankar@redhat.com> | 2023-06-28 06:53:54 +0200 |
---|---|---|
committer | Venky Shankar <vshankar@redhat.com> | 2023-06-28 06:53:54 +0200 |
commit | 809d4758148a4468b0513ea891e4d804fab7f3cc (patch) | |
tree | 629c589bfe87854967420cd3d99eaddcf288bb08 /qa | |
parent | Merge pull request #52090 from xxhdx1985126/wip-fixedkv-btree-merge-fix (diff) | |
parent | doc/cephfs: document MDS_CLIENTS_LAGGY health warning (diff) | |
download | ceph-809d4758148a4468b0513ea891e4d804fab7f3cc.tar.xz ceph-809d4758148a4468b0513ea891e4d804fab7f3cc.zip |
Merge PR #49971 into main
* refs/pull/49971/head:
doc/cephfs: document MDS_CLIENTS_LAGGY health warning
qa: ignore warnings
qa: add test cases to check client eviction if an OSD is laggy
mds,messages: enable beacon to report clients lagginess
mds: do not evict client on laggy osds
common: add new config option to defer client eviction
osd: add method to check for laggy osds
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Diffstat (limited to 'qa')
-rw-r--r-- | qa/suites/fs/functional/tasks/client-recovery.yaml | 3 | ||||
-rw-r--r-- | qa/tasks/cephfs/test_client_recovery.py | 108 |
2 files changed, 111 insertions, 0 deletions
```diff
diff --git a/qa/suites/fs/functional/tasks/client-recovery.yaml b/qa/suites/fs/functional/tasks/client-recovery.yaml
index e67acc3aba3..7ea93a3677b 100644
--- a/qa/suites/fs/functional/tasks/client-recovery.yaml
+++ b/qa/suites/fs/functional/tasks/client-recovery.yaml
@@ -9,6 +9,9 @@ overrides:
       - MDS_CLIENT_LATE_RELEASE
       - t responding to mclientcaps
       - file system flag refuse_client_session is set
+      - Degraded data redundancy
+      - MDS_CLIENTS_LAGGY
+      - Reduced data availability
 tasks:
   - cephfs_test_runner:
       fail_on_skip: false
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index f8a50ad37a9..aea508ab785 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -4,6 +4,8 @@ Teuthology task for exercising CephFS client recovery
 """
 
 import logging
+import random
+import signal
 from textwrap import dedent
 import time
 import distutils.version as version
@@ -755,3 +757,109 @@ class TestClientRecovery(CephFSTestCase):
             self.assertEqual(len(self.fs.mds_tell(["session", "ls"])), 0)
 
         self.mount_a.umount_wait(force=True)
+
+class TestClientOnLaggyOSD(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    def make_osd_laggy(self, osd, sleep=120):
+        self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGSTOP)
+        time.sleep(sleep)
+        self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGCONT)
+
+    def clear_laggy_params(self, osd):
+        default_laggy_weight = self.config_get('mon', 'mon_osd_laggy_weight')
+        self.config_set('mon', 'mon_osd_laggy_weight', 1)
+        self.mds_cluster.mon_manager.revive_osd(osd)
+        self.config_set('mon', 'mon_osd_laggy_weight', default_laggy_weight)
+
+    def get_a_random_osd(self):
+        osds = self.mds_cluster.mon_manager.get_osd_status()
+        return random.choice(osds['live'])
+
+    def test_client_eviction_if_config_is_set(self):
+        """
+        If any client gets unresponsive/it's session get idle due to lagginess
+        with any OSD and if config option defer_client_eviction_on_laggy_osds
+        is set true(default true) then make sure clients are not evicted until
+        OSD(s) return to normal.
+        """
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        self.config_set('mds', 'defer_client_eviction_on_laggy_osds', 'true')
+        self.assertEqual(self.config_get(
+            'mds', 'defer_client_eviction_on_laggy_osds'), 'true')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            mount_a_gid = self.mount_a.get_global_id()
+
+            self.mount_a.kill()
+
+            # client session should be open, it gets stale
+            # only after session_timeout time.
+            self.assert_session_state(mount_a_gid, "open")
+
+            # makes session stale
+            time.sleep(self.fs.get_var("session_timeout") * 1.5)
+            self.assert_session_state(mount_a_gid, "stale")
+
+            # it takes time to have laggy clients entries in cluster log,
+            # wait for 6 minutes to see if it is visible, finally restart
+            # the client
+            tries = 6
+            while True:
+                try:
+                    with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
+                                                 timeout=55):
+                        # make sure clients weren't evicted
+                        self.assert_session_count(2)
+                        break
+                except AssertionError:
+                    tries -= 1
+                    if tries:
+                        continue
+                    raise
+        finally:
+            self.mount_a.kill_cleanup()
+            self.mount_a.mount_wait()
+            self.mount_a.create_destroy()
+            self.clear_laggy_params(osd)
+
+    def test_client_eviction_if_config_is_unset(self):
+        """
+        If an OSD is laggy but config option defer_client_eviction_on_laggy_osds
+        is unset then an unresponsive client does get evicted.
+        """
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        self.config_set('mds', 'defer_client_eviction_on_laggy_osds', 'false')
+        self.assertEqual(self.config_get(
+            'mds', 'defer_client_eviction_on_laggy_osds'), 'false')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            session_timeout = self.fs.get_var("session_timeout")
+            mount_a_gid = self.mount_a.get_global_id()
+
+            self.fs.mds_asok(['session', 'config', '%s' % mount_a_gid, 'timeout', '%s' % (session_timeout * 2)])
+
+            self.mount_a.kill()
+
+            self.assert_session_count(2)
+
+            time.sleep(session_timeout * 1.5)
+            self.assert_session_state(mount_a_gid, "open")
+
+            time.sleep(session_timeout)
+            self.assert_session_count(1)
+        finally:
+            self.mount_a.kill_cleanup()
+            self.mount_a.mount_wait()
+            self.mount_a.create_destroy()
+            self.clear_laggy_params(osd)
```