Merge PR #55616 into main

* refs/pull/55616/head:
    PendingReleaseNotes: add note for replay completion warning
    qa: test to verify `MDS_ESTIMATED_REPLAY_TIME` warning
    doc: add a note for `MDS_ESTIMATED_REPLAY_TIME` MDS warning
    mds: emit warning for estimated replay time

Reviewed-by: Patrick Donnelly <pdonnell@ibm.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
author: Venky Shankar <vshankar@redhat.com> 2024-12-27 12:06:10 +0100
committer: Venky Shankar <vshankar@redhat.com> 2024-12-27 12:06:10 +0100
commit: 2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch)
tree: 85216d9ee5584fc0ea849fc3fce2e86b44324f8e /qa
parent: Merge PR #60640 into main (diff)
parent: PendingReleaseNotes: add note for replay completion warning (diff)
download: ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz
ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip
2 files changed, 56 insertions, 0 deletions
diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml
index 55dde639c23..b7a0338566c 100644
--- a/qa/suites/fs/multifs/tasks/failover.yaml
+++ b/qa/suites/fs/multifs/tasks/failover.yaml
@@ -8,6 +8,7 @@ overrides:
       - \(MDS_DAMAGE\)
       - \(FS_DEGRADED\)
       - \(MDS_CACHE_OVERSIZED\)
+      - \(MDS_ESTIMATED_REPLAY_TIME\)
   ceph-fuse:
     disabled: true
 tasks:
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 29af1e76a4f..46139163ddd 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -1,3 +1,4 @@
+import re
 import time
 import signal
 import logging
@@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase):
 
         self.fs.wait_for_daemons(timeout=90)
 
+class TestFailoverBeaconHealth(CephFSTestCase):
+    """Tests for the `MDS_ESTIMATED_REPLAY_TIME` health warning raised during journal replay."""
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def initiate_journal_replay(self, num_files=100):
+        """ Initiate journal replay by creating files and restarting mds server."""
+
+        self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000")
+        self.mounts[0].test_files = [str(x) for x in range(num_files)]
+        self.mounts[0].create_files()
+        self.fs.fail()
+        self.fs.set_joinable()
+
+    def test_replay_beacon_estimated_time(self):
+        """
+        That beacon emits warning message with estimated time to complete replay
+        """
+        self.initiate_journal_replay()
+        self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60)
+        # remove the config so that replay finishes and the cluster
+        # is HEALTH_OK
+        self.config_rm("mds", "mds_delay_journal_replay_for_testing")
+        self.wait_for_health_clear(timeout=60)
+
+    def test_replay_estimated_time_accuracy(self):
+        """That replay completes within 125% of the remaining time reported by the warning."""
+        self.initiate_journal_replay(250)
+        def replay_complete():
+            """Return True once the warning is no longer among the mon health checks."""
+            health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+            return 'MDS_ESTIMATED_REPLAY_TIME' not in health['checks']
+
+        def get_estimated_time():
+            """Poll mon health until the warning reports >= 50%; return (percent, elapsed s, remaining s)."""
+            with safe_while(sleep=5, tries=360) as proceed:
+                while proceed():
+                    health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+                    if 'MDS_ESTIMATED_REPLAY_TIME' in health['checks']:
+                        message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message']
+                        ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s"
+                        m = re.match(r".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message)
+                        if not m:
+                            continue
+                        completion_percentage = float(m.group(1))
+                        time_duration = int(m.group(3))
+                        pending_duration = int(m.group(4))
+                        log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+                        if completion_percentage >= 50:
+                            return (completion_percentage, time_duration, pending_duration)
+        _, _, pending_duration = get_estimated_time()
+        # wait for 25% more time to avoid false negative failures
+        self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
 class TestFailover(CephFSTestCase):
     CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 2
author	Venky Shankar <vshankar@redhat.com>	2024-12-27 12:06:10 +0100
committer	Venky Shankar <vshankar@redhat.com>	2024-12-27 12:06:10 +0100
commit	2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch)
tree	85216d9ee5584fc0ea849fc3fce2e86b44324f8e /qa
parent	Merge PR #60640 into main (diff)
parent	PendingReleaseNotes: add note for replay completion warning (diff)
download	ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip