summary refs log tree commit diff stats
path: root/qa
diff options
context:
space:
mode:
author Venky Shankar <vshankar@redhat.com> 2023-06-28 06:53:54 +0200
committer Venky Shankar <vshankar@redhat.com> 2023-06-28 06:53:54 +0200
commit 809d4758148a4468b0513ea891e4d804fab7f3cc (patch)
tree 629c589bfe87854967420cd3d99eaddcf288bb08 /qa
parent Merge pull request #52090 from xxhdx1985126/wip-fixedkv-btree-merge-fix (diff)
parent doc/cephfs: document MDS_CLIENTS_LAGGY health warning (diff)
download ceph-809d4758148a4468b0513ea891e4d804fab7f3cc.tar.xz
ceph-809d4758148a4468b0513ea891e4d804fab7f3cc.zip
Merge PR #49971 into main
* refs/pull/49971/head:
  doc/cephfs: document MDS_CLIENTS_LAGGY health warning
  qa: ignore warnings
  qa: add test cases to check client eviction if an OSD is laggy
  mds,messages: enable beacon to report clients lagginess
  mds: do not evict client on laggy osds
  common: add new config option to defer client eviction
  osd: add method to check for laggy osds

Reviewed-by: Venky Shankar <vshankar@redhat.com>
Diffstat (limited to 'qa')
-rw-r--r-- qa/suites/fs/functional/tasks/client-recovery.yaml 3
-rw-r--r-- qa/tasks/cephfs/test_client_recovery.py 108
2 files changed, 111 insertions, 0 deletions
diff --git a/qa/suites/fs/functional/tasks/client-recovery.yaml b/qa/suites/fs/functional/tasks/client-recovery.yaml
index e67acc3aba3..7ea93a3677b 100644
--- a/qa/suites/fs/functional/tasks/client-recovery.yaml
+++ b/qa/suites/fs/functional/tasks/client-recovery.yaml
@@ -9,6 +9,9 @@ overrides:
- MDS_CLIENT_LATE_RELEASE
- t responding to mclientcaps
- file system flag refuse_client_session is set
+ - Degraded data redundancy
+ - MDS_CLIENTS_LAGGY
+ - Reduced data availability
tasks:
- cephfs_test_runner:
fail_on_skip: false
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index f8a50ad37a9..aea508ab785 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -4,6 +4,8 @@ Teuthology task for exercising CephFS client recovery
"""
import logging
+import random
+import signal
from textwrap import dedent
import time
import distutils.version as version
@@ -755,3 +757,109 @@ class TestClientRecovery(CephFSTestCase):
self.assertEqual(len(self.fs.mds_tell(["session", "ls"])), 0)
self.mount_a.umount_wait(force=True)
+
+class TestClientOnLaggyOSD(CephFSTestCase):
+    """
+    Check how the MDS treats an unresponsive client while an OSD is laggy,
+    with defer_client_eviction_on_laggy_osds both set and unset.
+    """
+    CLIENTS_REQUIRED = 2
+
+    def make_osd_laggy(self, osd, sleep=120):
+        """
+        Stop the given OSD with SIGSTOP, wait `sleep` seconds, then resume
+        it with SIGCONT.
+        """
+        self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGSTOP)
+        try:
+            time.sleep(sleep)
+        finally:
+            # always resume the daemon, even if the sleep is interrupted,
+            # so that a stopped OSD is never left behind
+            self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGCONT)
+
+    def clear_laggy_params(self, osd):
+        """
+        Revive the given OSD while mon_osd_laggy_weight is temporarily set
+        to 1, then put the previously configured weight back.
+        """
+        default_laggy_weight = self.config_get('mon', 'mon_osd_laggy_weight')
+        self.config_set('mon', 'mon_osd_laggy_weight', 1)
+        try:
+            self.mds_cluster.mon_manager.revive_osd(osd)
+        finally:
+            # restore the original weight even if the revive fails
+            self.config_set('mon', 'mon_osd_laggy_weight',
+                            default_laggy_weight)
+
+    def get_a_random_osd(self):
+        """
+        Return a randomly chosen entry from the 'live' list of the OSD
+        status reported by the mon manager.
+        """
+        osds = self.mds_cluster.mon_manager.get_osd_status()
+        return random.choice(osds['live'])
+
+    def _set_defer_client_eviction(self, value):
+        """
+        Turn mds_defer_session_stale off, set
+        defer_client_eviction_on_laggy_osds to `value` ('true' or 'false')
+        and verify that the new value is what the config reports.
+        """
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        self.config_set('mds', 'defer_client_eviction_on_laggy_osds', value)
+        self.assertEqual(self.config_get(
+            'mds', 'defer_client_eviction_on_laggy_osds'), value)
+
+    def _remount_client_and_reset_osd(self, osd):
+        """
+        Clean up after the killed mount_a, remount it and check it is usable,
+        then clear the laggy parameters of `osd`.
+        """
+        try:
+            self.mount_a.kill_cleanup()
+            self.mount_a.mount_wait()
+            self.mount_a.create_destroy()
+        finally:
+            # reset the OSD even when the remount fails
+            self.clear_laggy_params(osd)
+
+    def test_client_eviction_if_config_is_set(self):
+        """
+        If a client becomes unresponsive (its session goes idle) while an OSD
+        is laggy, and the config option defer_client_eviction_on_laggy_osds
+        is set to true (the default), then make sure clients are not evicted
+        until the OSD(s) return to normal.
+        """
+
+        self._set_defer_client_eviction('true')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            mount_a_gid = self.mount_a.get_global_id()
+
+            self.mount_a.kill()
+
+            # client session should be open, it gets stale
+            # only after session_timeout time.
+            self.assert_session_state(mount_a_gid, "open")
+
+            # makes session stale
+            time.sleep(self.fs.get_var("session_timeout") * 1.5)
+            self.assert_session_state(mount_a_gid, "stale")
+
+            # it takes time to have laggy clients entries in cluster log,
+            # so look for the message in up to 6 windows of 55 seconds
+            # each (roughly 6 minutes) before giving up; the client is
+            # restarted in the cleanup below
+            tries = 6
+            for attempt in range(1, tries + 1):
+                try:
+                    with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
+                                                 timeout=55):
+                        # make sure clients weren't evicted
+                        self.assert_session_count(2)
+                    break
+                except AssertionError:
+                    # re-raise only once the last attempt has failed
+                    if attempt == tries:
+                        raise
+        finally:
+            self._remount_client_and_reset_osd(osd)
+
+    def test_client_eviction_if_config_is_unset(self):
+        """
+        If an OSD is laggy but config option defer_client_eviction_on_laggy_osds
+        is unset then an unresponsive client does get evicted.
+        """
+
+        self._set_defer_client_eviction('false')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            session_timeout = self.fs.get_var("session_timeout")
+            mount_a_gid = self.mount_a.get_global_id()
+
+            # give mount_a's session twice the file system's session_timeout
+            self.fs.mds_asok(['session', 'config', str(mount_a_gid),
+                              'timeout', str(session_timeout * 2)])
+
+            self.mount_a.kill()
+
+            self.assert_session_count(2)
+
+            # 1.5x session_timeout is still inside the doubled per-session
+            # timeout, so the session is expected to be open
+            time.sleep(session_timeout * 1.5)
+            self.assert_session_state(mount_a_gid, "open")
+
+            # after another session_timeout (2.5x in total) only one
+            # session is expected to remain
+            time.sleep(session_timeout)
+            self.assert_session_count(1)
+        finally:
+            self._remount_client_and_reset_osd(osd)