summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenky Shankar <vshankar@redhat.com>2024-12-27 12:06:10 +0100
committerVenky Shankar <vshankar@redhat.com>2024-12-27 12:06:10 +0100
commit2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch)
tree85216d9ee5584fc0ea849fc3fce2e86b44324f8e
parentMerge PR #60640 into main (diff)
parentPendingReleaseNotes: add note for replay completion warning (diff)
downloadceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz
ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip
Merge PR #55616 into main
* refs/pull/55616/head: PendingReleaseNotes: add note for replay completion warning qa: test to verify `MDS_ESTIMATED_REPLAY_TIME` warning doc: add a note for `MDS_ESTIMATED_REPLAY_TIME` MDS warning mds: emit warning for estinated replay time Reviewed-by: Patrick Donnelly <pdonnell@ibm.com> Reviewed-by: Milind Changire <mchangir@redhat.com>
-rw-r--r--PendingReleaseNotes2
-rw-r--r--doc/cephfs/health-messages.rst8
-rw-r--r--qa/suites/fs/multifs/tasks/failover.yaml1
-rw-r--r--qa/tasks/cephfs/test_failover.py55
-rw-r--r--src/common/options/mds.yaml.in6
-rw-r--r--src/mds/Beacon.cc14
-rw-r--r--src/mds/MDLog.cc48
-rw-r--r--src/mds/MDLog.h4
-rw-r--r--src/mds/mdstypes.cc5
-rw-r--r--src/mds/mdstypes.h8
-rw-r--r--src/messages/MMDSBeacon.h4
11 files changed, 154 insertions, 1 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 873ac6d21dd..6ea402c9991 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -331,6 +331,8 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
* NFS: The export create/apply of CephFS based exports will now have a additional parameter `cmount_path` under the FSAL block,
which specifies the path within the CephFS to mount this export on. If this and the other
`EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`.
+* CephFS: MDS emits a warning with estimated replay completion time when replay
+ runs for more than 30 seconds.
>=18.0.0
diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst
index 0f171c6ccc9..7aa1f2e44ee 100644
--- a/doc/cephfs/health-messages.rst
+++ b/doc/cephfs/health-messages.rst
@@ -269,3 +269,11 @@ other daemons, please see :ref:`health-checks`.
To evict and permanently block broken clients from connecting to the
cluster, set the ``required_client_feature`` bit ``client_mds_auth_caps``.
+
+``MDS_ESTIMATED_REPLAY_TIME``
+-----------------------------
+ Message
+ "HEALTH_WARN Replay: x% complete. Estimated time remaining *x* seconds
+
+ Description
+ When an MDS journal replay takes more than 30 seconds, this message indicates the estimated time to completion.
diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml
index 55dde639c23..b7a0338566c 100644
--- a/qa/suites/fs/multifs/tasks/failover.yaml
+++ b/qa/suites/fs/multifs/tasks/failover.yaml
@@ -8,6 +8,7 @@ overrides:
- \(MDS_DAMAGE\)
- \(FS_DEGRADED\)
- \(MDS_CACHE_OVERSIZED\)
+ - \(MDS_ESTIMATED_REPLAY_TIME\)
ceph-fuse:
disabled: true
tasks:
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 29af1e76a4f..46139163ddd 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -1,3 +1,4 @@
+import re
import time
import signal
import logging
@@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase):
self.fs.wait_for_daemons(timeout=90)
+class TestFailoverBeaconHealth(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 1
+
+ def initiate_journal_replay(self, num_files=100):
+ """ Initiate journal replay by creating files and restarting mds server."""
+
+ self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000")
+ self.mounts[0].test_files = [str(x) for x in range(num_files)]
+ self.mounts[0].create_files()
+ self.fs.fail()
+ self.fs.set_joinable()
+
+ def test_replay_beacon_estimated_time(self):
+ """
+ That beacon emits warning message with estimated time to complete replay
+ """
+ self.initiate_journal_replay()
+ self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60)
+ # remove the config so that replay finishes and the cluster
+ # is HEALTH_OK
+ self.config_rm("mds", "mds_delay_journal_replay_for_testing")
+ self.wait_for_health_clear(timeout=60)
+
+ def test_replay_estimated_time_accuracy(self):
+ self.initiate_journal_replay(250)
+ def replay_complete():
+ health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+ codes = [s for s in health['checks']]
+ return 'MDS_ESTIMATED_REPLAY_TIME' not in codes
+
+ def get_estimated_time():
+ completion_percentage = 0.0
+ time_duration = pending_duration = 0
+ with safe_while(sleep=5, tries=360) as proceed:
+ while proceed():
+ health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+ codes = [s for s in health['checks']]
+ if 'MDS_ESTIMATED_REPLAY_TIME' in codes:
+ message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message']
+ ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s"
+ m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message)
+ if not m:
+ continue
+ completion_percentage = float(m.group(1))
+ time_duration = int(m.group(3))
+ pending_duration = int(m.group(4))
+ log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+ if completion_percentage >= 50:
+ return (completion_percentage, time_duration, pending_duration)
+ _, _, pending_duration = get_estimated_time()
+ # wait for 25% more time to avoid false negative failures
+ self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
class TestFailover(CephFSTestCase):
CLIENTS_REQUIRED = 1
MDSS_REQUIRED = 2
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index 94824faef6b..4cbc079a215 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -1713,6 +1713,12 @@ options:
default: 1000
services:
- mds
+- name: mds_delay_journal_replay_for_testing
+ type: millisecs
+ level: dev
+ desc: Delay the journal replay to verify the replay time estimate
+ long_desc: Jorunal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests.
+ default: 0
flags:
- runtime
- name: mds_server_dispatch_killpoint_random
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 6fbfc79d416..1c1eeb4ecf8 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -26,6 +26,7 @@
#include "mds/MDSRank.h"
#include "mds/MDSMap.h"
#include "mds/Locker.h"
+#include "mds/mdstypes.h"
#include "Beacon.h"
@@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds)
}
}
}
+ if (mds->is_replay()) {
+ CachedStackStringStream css;
+ auto estimate = mds->mdlog->get_estimated_replay_finish_time();
+ // this probably should be configurable, however, its fine to report
+ // if replay is running for more than 30 seconds.
+ if (estimate.elapsed_time > std::chrono::seconds(30)) {
+ *css << "replay: " << estimate.percent_complete << "% complete - elapsed time: "
+ << estimate.elapsed_time << ", estimated time remaining: "
+ << estimate.estimated_time;
+ MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv());
+ health.metrics.push_back(m);
+ }
+ }
}
MDSMap::DaemonState Beacon::get_want_state() const
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 4bbf2a1a141..d041e3b2fc8 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const
return journaler->get_write_safe_pos();
}
+// estimate the replay completion time based on mdlog journal pointers
+EstimatedReplayTime MDLog::get_estimated_replay_finish_time() {
+ ceph_assert(mds->is_replay());
+ EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()};
+ if (!journaler) {
+ return estimated_time;
+ }
+
+ auto read_pos = journaler->get_read_pos();
+ auto write_pos = journaler->get_write_pos();
+ auto trimmed_pos = journaler->get_trimmed_pos();
+
+ dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos="
+ << write_pos << ", trimmed_pos=" << trimmed_pos << dendl;
+
+ if (read_pos == trimmed_pos || write_pos == trimmed_pos) {
+ return estimated_time;
+ }
+
+ auto total_bytes = write_pos - trimmed_pos;
+ double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes;
+ auto elapsed_time = std::chrono::duration_cast<std::chrono::seconds>
+ (ceph::coarse_mono_clock::now() - replay_start_time);
+ auto time = ((1 - percent_complete) / percent_complete) * elapsed_time;
+
+ dout(20) << __func__ << "percent_complete=" << percent_complete
+ << ", elapsed_time=" << elapsed_time
+ << ", estimated_time=" << std::chrono::round<std::chrono::seconds>(time)
+ << dendl;
+
+ estimated_time.percent_complete = percent_complete * 100;
+ estimated_time.elapsed_time = elapsed_time;
+ estimated_time.estimated_time = std::chrono::round<std::chrono::seconds>(time);
+ dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete
+ << ", estimated_time.elapsed_time=" << estimated_time.elapsed_time
+ << ", estimated_time.estimated_time=" << estimated_time.estimated_time
+ << dendl;
+ return estimated_time;
+}
void MDLog::create(MDSContext *c)
{
@@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion)
{
std::lock_guard l(mds->mds_lock);
journaler = front_journal;
+ replay_start_time = ceph::coarse_mono_clock::now();
}
C_SaferCond recover_wait;
@@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa
// i am a separate thread
void MDLog::_replay_thread()
{
- dout(10) << "_replay_thread start" << dendl;
+ dout(10) << __func__ << ": start time: " << replay_start_time << ", now: "
+ << ceph::coarse_mono_clock::now() << dendl;
// loop
int r = 0;
while (1) {
+ auto sleep_time = g_conf().get_val<std::chrono::milliseconds>("mds_delay_journal_replay_for_testing");
+ if (unlikely(sleep_time > 0ms)) {
+ dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl;
+ std::this_thread::sleep_for(sleep_time);
+ }
// wait for read?
journaler->check_isreadable();
if (journaler->get_error()) {
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
index a858b40fa03..180a34c9d82 100644
--- a/src/mds/MDLog.h
+++ b/src/mds/MDLog.h
@@ -53,6 +53,7 @@ enum {
#include "LogSegment.h"
#include "MDSMap.h"
#include "SegmentBoundary.h"
+#include "mdstypes.h"
#include <list>
#include <map>
@@ -162,6 +163,7 @@ public:
void reopen(MDSContext *onopen);
void append();
void replay(MDSContext *onfinish);
+ EstimatedReplayTime get_estimated_replay_finish_time();
void standby_trim_segments();
@@ -328,5 +330,7 @@ private:
std::atomic<bool> upkeep_log_trim_shutdown{false};
std::map<uint64_t, std::vector<Context*>> waiting_for_expire; // protected by mds_lock
+
+ ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero();
};
#endif
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index 680218e62e3..f9424eed6dc 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec
ls.back()->realm.seq = 2;
ls.back()->realm.parent = 1;
}
+
+void EstimatedReplayTime::print(std::ostream& out) {
+ out << "replay: " << percent_complete << "% complete - elapsed time: "
+ << elapsed_time << ", estimated time remaining: " << estimated_time;
+}
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 3b8269006cb..742d7b23432 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r)
}
WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
+struct EstimatedReplayTime {
+ double percent_complete;
+ std::chrono::seconds estimated_time;
+ std::chrono::seconds elapsed_time;
+
+ void print(std::ostream& out);
+};
+
#endif
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index c157c33e758..526285aae8c 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -48,6 +48,7 @@ enum mds_metric_t {
MDS_HEALTH_CLIENTS_LAGGY,
MDS_HEALTH_CLIENTS_LAGGY_MANY,
MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH,
+ MDS_HEALTH_ESTIMATED_REPLAY_TIME,
MDS_HEALTH_DUMMY, // not a real health warning, for testing
};
@@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m)
case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY";
case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY";
case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH";
+ case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME";
case MDS_HEALTH_DUMMY: return "MDS_DUMMY";
default:
return "???";
@@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m)
return "%num% client(s) laggy due to laggy OSDs";
case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH:
return "%num% MDS report clients with broken root_squash implementation";
+ case MDS_HEALTH_ESTIMATED_REPLAY_TIME:
+ return "%num% estimated journal replay time";
default:
return "???";
}