Merge PR #55616 into main

* refs/pull/55616/head: PendingReleaseNotes: add note for replay completion warning qa: test to verify `MDS_ESTIMATED_REPLAY_TIME` warning doc: add a note for `MDS_ESTIMATED_REPLAY_TIME` MDS warning mds: emit warning for estinated replay time Reviewed-by: Patrick Donnelly <pdonnell@ibm.com> Reviewed-by: Milind Changire <mchangir@redhat.com>
author: Venky Shankar <vshankar@redhat.com> 2024-12-27 12:06:10 +0100
committer: Venky Shankar <vshankar@redhat.com> 2024-12-27 12:06:10 +0100
commit: 2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch)
tree: 85216d9ee5584fc0ea849fc3fce2e86b44324f8e
parent: Merge PR #60640 into main (diff)
parent: PendingReleaseNotes: add note for replay completion warning (diff)
download: ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz
ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip
11 files changed, 154 insertions, 1 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 873ac6d21dd..6ea402c9991 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -331,6 +331,8 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
 * NFS: The export create/apply of CephFS based exports will now have a additional parameter `cmount_path` under the FSAL block,
   which specifies the path within the CephFS to mount this export on. If this and the other
   `EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`.
+* CephFS: MDS emits a warning with estimated replay completion time  when replay
+  runs for more than 30 seconds.
 
 >=18.0.0
 
diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst
index 0f171c6ccc9..7aa1f2e44ee 100644
--- a/doc/cephfs/health-messages.rst
+++ b/doc/cephfs/health-messages.rst
@@ -269,3 +269,11 @@ other daemons, please see :ref:`health-checks`.
 
     To evict and permanently block broken clients from connecting to the
     cluster, set the ``required_client_feature`` bit ``client_mds_auth_caps``.
+
+``MDS_ESTIMATED_REPLAY_TIME``
+-----------------------------
+  Message
+    "HEALTH_WARN Replay: x% complete. Estimated time remaining *x* seconds
+
+  Description
+    When an MDS journal replay takes more than 30 seconds, this message indicates the estimated time to completion.
diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml
index 55dde639c23..b7a0338566c 100644
--- a/qa/suites/fs/multifs/tasks/failover.yaml
+++ b/qa/suites/fs/multifs/tasks/failover.yaml
@@ -8,6 +8,7 @@ overrides:
       - \(MDS_DAMAGE\)
       - \(FS_DEGRADED\)
       - \(MDS_CACHE_OVERSIZED\)
+      - \(MDS_ESTIMATED_REPLAY_TIME\)
   ceph-fuse:
     disabled: true
 tasks:
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 29af1e76a4f..46139163ddd 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -1,3 +1,4 @@
+import re
 import time
 import signal
 import logging
@@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase):
 
         self.fs.wait_for_daemons(timeout=90)
 
+class TestFailoverBeaconHealth(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def initiate_journal_replay(self, num_files=100):
+        """ Initiate journal replay by creating files and restarting mds server."""
+
+        self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000")
+        self.mounts[0].test_files = [str(x) for x in range(num_files)]
+        self.mounts[0].create_files()
+        self.fs.fail()
+        self.fs.set_joinable()
+
+    def test_replay_beacon_estimated_time(self):
+        """
+        That beacon emits warning message with estimated time to complete replay
+        """
+        self.initiate_journal_replay()
+        self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60)
+        # remove the config so that replay finishes and the cluster
+        # is HEALTH_OK
+        self.config_rm("mds", "mds_delay_journal_replay_for_testing")
+        self.wait_for_health_clear(timeout=60)
+
+    def test_replay_estimated_time_accuracy(self):
+        self.initiate_journal_replay(250)
+        def replay_complete():
+            health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+            codes = [s for s in health['checks']]
+            return 'MDS_ESTIMATED_REPLAY_TIME' not in codes
+
+        def get_estimated_time():
+            completion_percentage = 0.0
+            time_duration = pending_duration = 0
+            with safe_while(sleep=5, tries=360) as proceed:
+                while proceed():
+                    health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+                    codes = [s for s in health['checks']]
+                    if 'MDS_ESTIMATED_REPLAY_TIME' in codes:
+                        message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message']
+                        ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s"
+                        m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message)
+                        if not m:
+                            continue
+                        completion_percentage = float(m.group(1))
+                        time_duration = int(m.group(3))
+                        pending_duration = int(m.group(4))
+                        log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+                        if completion_percentage >= 50:
+                            return (completion_percentage, time_duration, pending_duration)
+        _, _, pending_duration = get_estimated_time()
+        # wait for 25% more time to avoid false negative failures
+        self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
 class TestFailover(CephFSTestCase):
     CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 2
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index 94824faef6b..4cbc079a215 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -1713,6 +1713,12 @@ options:
   default: 1000
   services:
   - mds
+- name: mds_delay_journal_replay_for_testing
+  type: millisecs
+  level: dev
+  desc: Delay the journal replay to verify the replay time estimate
+  long_desc: Jorunal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests.
+  default: 0
   flags:
   - runtime
 - name: mds_server_dispatch_killpoint_random
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 6fbfc79d416..1c1eeb4ecf8 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -26,6 +26,7 @@
 #include "mds/MDSRank.h"
 #include "mds/MDSMap.h"
 #include "mds/Locker.h"
+#include "mds/mdstypes.h"
 
 #include "Beacon.h"
 
@@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds)
       }
     }
   }
+  if (mds->is_replay()) {
+    CachedStackStringStream css;
+    auto estimate = mds->mdlog->get_estimated_replay_finish_time();
+    // this probably should be configurable, however, its fine to report
+    // if replay is running for more than 30 seconds.
+    if (estimate.elapsed_time > std::chrono::seconds(30)) {
+      *css << "replay: " << estimate.percent_complete << "% complete - elapsed time: "
+	   << estimate.elapsed_time << ", estimated time remaining: "
+	   << estimate.estimated_time;
+      MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv());
+      health.metrics.push_back(m);
+    }
+  }
 }
 
 MDSMap::DaemonState Beacon::get_want_state() const
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 4bbf2a1a141..d041e3b2fc8 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const
   return journaler->get_write_safe_pos(); 
 }
 
+// estimate the replay completion time based on mdlog journal pointers
+EstimatedReplayTime MDLog::get_estimated_replay_finish_time() {
+  ceph_assert(mds->is_replay());
 
+  EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()};
+  if (!journaler) {
+    return estimated_time;
+  }
+
+  auto read_pos = journaler->get_read_pos();
+  auto write_pos = journaler->get_write_pos();
+  auto trimmed_pos = journaler->get_trimmed_pos();
+
+  dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos="
+	   << write_pos << ", trimmed_pos=" << trimmed_pos << dendl;
+
+  if (read_pos == trimmed_pos || write_pos == trimmed_pos) {
+    return estimated_time;
+  }
+
+  auto total_bytes = write_pos - trimmed_pos;
+  double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes;
+  auto elapsed_time = std::chrono::duration_cast<std::chrono::seconds>
+    (ceph::coarse_mono_clock::now() - replay_start_time);
+  auto time = ((1 - percent_complete) / percent_complete) * elapsed_time;
+
+  dout(20) << __func__ << "percent_complete=" << percent_complete
+	   << ", elapsed_time=" << elapsed_time
+	   << ", estimated_time=" << std::chrono::round<std::chrono::seconds>(time)
+	   << dendl;
+
+  estimated_time.percent_complete = percent_complete * 100;
+  estimated_time.elapsed_time = elapsed_time;
+  estimated_time.estimated_time = std::chrono::round<std::chrono::seconds>(time);
+  dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete
+	   << ", estimated_time.elapsed_time=" << estimated_time.elapsed_time
+	   << ", estimated_time.estimated_time=" << estimated_time.estimated_time
+	   << dendl;
+  return estimated_time;
+}
 
 void MDLog::create(MDSContext *c)
 {
@@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion)
   {
     std::lock_guard l(mds->mds_lock);
     journaler = front_journal;
+    replay_start_time = ceph::coarse_mono_clock::now();
   }
 
   C_SaferCond recover_wait;
@@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
 // i am a separate thread
 void MDLog::_replay_thread()
 {
-  dout(10) << "_replay_thread start" << dendl;
+  dout(10) << __func__ << ": start time: " << replay_start_time << ", now: "
+           << ceph::coarse_mono_clock::now() << dendl;
 
   // loop
   int r = 0;
   while (1) {
+    auto sleep_time = g_conf().get_val<std::chrono::milliseconds>("mds_delay_journal_replay_for_testing");
+    if (unlikely(sleep_time > 0ms)) {
+      dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl;
+      std::this_thread::sleep_for(sleep_time);
+    }
     // wait for read?
     journaler->check_isreadable(); 
     if (journaler->get_error()) {
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
index a858b40fa03..180a34c9d82 100644
--- a/src/mds/MDLog.h
+++ b/src/mds/MDLog.h
@@ -53,6 +53,7 @@ enum {
 #include "LogSegment.h"
 #include "MDSMap.h"
 #include "SegmentBoundary.h"
+#include "mdstypes.h"
 
 #include <list>
 #include <map>
@@ -162,6 +163,7 @@ public:
   void reopen(MDSContext *onopen);
   void append();
   void replay(MDSContext *onfinish);
+  EstimatedReplayTime get_estimated_replay_finish_time();
 
   void standby_trim_segments();
 
@@ -328,5 +330,7 @@ private:
   std::atomic<bool> upkeep_log_trim_shutdown{false};
 
   std::map<uint64_t, std::vector<Context*>> waiting_for_expire; // protected by mds_lock
+
+  ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero();
 };
 #endif
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index 680218e62e3..f9424eed6dc 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec
   ls.back()->realm.seq = 2;
   ls.back()->realm.parent = 1;
 }
+
+void EstimatedReplayTime::print(std::ostream& out) {
+  out << "replay: " << percent_complete << "% complete - elapsed time: "
+      << elapsed_time << ", estimated time remaining: " << estimated_time;
+}
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 3b8269006cb..742d7b23432 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r)
 }
 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
 
+struct EstimatedReplayTime {
+  double percent_complete;
+  std::chrono::seconds estimated_time;
+  std::chrono::seconds elapsed_time;
+
+  void print(std::ostream& out);
+};
+
 #endif
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index c157c33e758..526285aae8c 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -48,6 +48,7 @@ enum mds_metric_t {
   MDS_HEALTH_CLIENTS_LAGGY,
   MDS_HEALTH_CLIENTS_LAGGY_MANY,
   MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH,
+  MDS_HEALTH_ESTIMATED_REPLAY_TIME,
   MDS_HEALTH_DUMMY, // not a real health warning, for testing
 };
 
@@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m)
   case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY";
   case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY";
   case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH";
+  case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME";
   case MDS_HEALTH_DUMMY: return "MDS_DUMMY";
   default:
     return "???";
@@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m)
     return "%num% client(s) laggy due to laggy OSDs";  
   case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH:
     return "%num% MDS report clients with broken root_squash implementation";
+  case MDS_HEALTH_ESTIMATED_REPLAY_TIME:
+    return "%num% estimated journal replay time";
   default:
     return "???";
   }
author	Venky Shankar <vshankar@redhat.com>	2024-12-27 12:06:10 +0100
committer	Venky Shankar <vshankar@redhat.com>	2024-12-27 12:06:10 +0100
commit	2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch)
tree	85216d9ee5584fc0ea849fc3fce2e86b44324f8e
parent	Merge PR #60640 into main (diff)
parent	PendingReleaseNotes: add note for replay completion warning (diff)
download	ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip