diff options
author | Venky Shankar <vshankar@redhat.com> | 2024-12-27 12:06:10 +0100 |
---|---|---|
committer | Venky Shankar <vshankar@redhat.com> | 2024-12-27 12:06:10 +0100 |
commit | 2215d554bea0d47fc131e90f2c5a0e6494f471c0 (patch) | |
tree | 85216d9ee5584fc0ea849fc3fce2e86b44324f8e | |
parent | Merge PR #60640 into main (diff) | |
parent | PendingReleaseNotes: add note for replay completion warning (diff) | |
download | ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.tar.xz ceph-2215d554bea0d47fc131e90f2c5a0e6494f471c0.zip |
Merge PR #55616 into main
* refs/pull/55616/head:
PendingReleaseNotes: add note for replay completion warning
qa: test to verify `MDS_ESTIMATED_REPLAY_TIME` warning
doc: add a note for `MDS_ESTIMATED_REPLAY_TIME` MDS warning
mds: emit warning for estinated replay time
Reviewed-by: Patrick Donnelly <pdonnell@ibm.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
-rw-r--r-- | PendingReleaseNotes | 2 | ||||
-rw-r--r-- | doc/cephfs/health-messages.rst | 8 | ||||
-rw-r--r-- | qa/suites/fs/multifs/tasks/failover.yaml | 1 | ||||
-rw-r--r-- | qa/tasks/cephfs/test_failover.py | 55 | ||||
-rw-r--r-- | src/common/options/mds.yaml.in | 6 | ||||
-rw-r--r-- | src/mds/Beacon.cc | 14 | ||||
-rw-r--r-- | src/mds/MDLog.cc | 48 | ||||
-rw-r--r-- | src/mds/MDLog.h | 4 | ||||
-rw-r--r-- | src/mds/mdstypes.cc | 5 | ||||
-rw-r--r-- | src/mds/mdstypes.h | 8 | ||||
-rw-r--r-- | src/messages/MMDSBeacon.h | 4 |
11 files changed, 154 insertions, 1 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 873ac6d21dd..6ea402c9991 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -331,6 +331,8 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config * NFS: The export create/apply of CephFS based exports will now have a additional parameter `cmount_path` under the FSAL block, which specifies the path within the CephFS to mount this export on. If this and the other `EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`. +* CephFS: MDS emits a warning with estimated replay completion time when replay + runs for more than 30 seconds. >=18.0.0 diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst index 0f171c6ccc9..7aa1f2e44ee 100644 --- a/doc/cephfs/health-messages.rst +++ b/doc/cephfs/health-messages.rst @@ -269,3 +269,11 @@ other daemons, please see :ref:`health-checks`. To evict and permanently block broken clients from connecting to the cluster, set the ``required_client_feature`` bit ``client_mds_auth_caps``. + +``MDS_ESTIMATED_REPLAY_TIME`` +----------------------------- + Message + "HEALTH_WARN Replay: x% complete. Estimated time remaining *x* seconds + + Description + When an MDS journal replay takes more than 30 seconds, this message indicates the estimated time to completion. diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = float(m.group(1)) + time_duration = int(m.group(3)) + pending_duration = int(m.group(4)) + log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}") + if completion_percentage >= 50: + return (completion_percentage, time_duration, pending_duration) + _, _, pending_duration = get_estimated_time() + # wait for 25% more time to avoid false negative failures + self.wait_until_true(replay_complete, timeout=pending_duration * 1.25) + class TestFailover(CephFSTestCase): CLIENTS_REQUIRED = 1 MDSS_REQUIRED = 2 diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 94824faef6b..4cbc079a215 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1713,6 +1713,12 @@ options: default: 1000 services: - mds +- name: mds_delay_journal_replay_for_testing + type: millisecs + level: dev + desc: Delay the journal replay to verify the replay time estimate + long_desc: Jorunal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests. + default: 0 flags: - runtime - name: mds_server_dispatch_killpoint_random diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 6fbfc79d416..1c1eeb4ecf8 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -26,6 +26,7 @@ #include "mds/MDSRank.h" #include "mds/MDSMap.h" #include "mds/Locker.h" +#include "mds/mdstypes.h" #include "Beacon.h" @@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds) } } } + if (mds->is_replay()) { + CachedStackStringStream css; + auto estimate = mds->mdlog->get_estimated_replay_finish_time(); + // this probably should be configurable, however, its fine to report + // if replay is running for more than 30 seconds. + if (estimate.elapsed_time > std::chrono::seconds(30)) { + *css << "replay: " << estimate.percent_complete << "% complete - elapsed time: " + << estimate.elapsed_time << ", estimated time remaining: " + << estimate.estimated_time; + MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv()); + health.metrics.push_back(m); + } + } } MDSMap::DaemonState Beacon::get_want_state() const diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 4bbf2a1a141..d041e3b2fc8 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const return journaler->get_write_safe_pos(); } +// estimate the replay completion time based on mdlog journal pointers +EstimatedReplayTime MDLog::get_estimated_replay_finish_time() { + ceph_assert(mds->is_replay()); + EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()}; + if (!journaler) { + return estimated_time; + } + + auto read_pos = journaler->get_read_pos(); + auto write_pos = journaler->get_write_pos(); + auto trimmed_pos = journaler->get_trimmed_pos(); + + dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos=" + << write_pos << ", trimmed_pos=" << trimmed_pos << dendl; + + if (read_pos == trimmed_pos || write_pos == trimmed_pos) { + return estimated_time; + } + + auto total_bytes = write_pos - trimmed_pos; + double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes; + auto elapsed_time = std::chrono::duration_cast<std::chrono::seconds> + (ceph::coarse_mono_clock::now() - replay_start_time); + auto time = ((1 - percent_complete) / percent_complete) * elapsed_time; + + dout(20) << __func__ << "percent_complete=" << percent_complete + << ", elapsed_time=" << elapsed_time + << ", estimated_time=" << std::chrono::round<std::chrono::seconds>(time) + << dendl; + + estimated_time.percent_complete = percent_complete * 100; + estimated_time.elapsed_time = elapsed_time; + estimated_time.estimated_time = std::chrono::round<std::chrono::seconds>(time); + dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete + << ", estimated_time.elapsed_time=" << estimated_time.elapsed_time + << ", estimated_time.estimated_time=" << estimated_time.estimated_time + << dendl; + return estimated_time; +} void MDLog::create(MDSContext *c) { @@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion) { std::lock_guard l(mds->mds_lock); journaler = front_journal; + replay_start_time = ceph::coarse_mono_clock::now(); } C_SaferCond recover_wait; @@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa // i am a separate thread void MDLog::_replay_thread() { - dout(10) << "_replay_thread start" << dendl; + dout(10) << __func__ << ": start time: " << replay_start_time << ", now: " + << ceph::coarse_mono_clock::now() << dendl; // loop int r = 0; while (1) { + auto sleep_time = g_conf().get_val<std::chrono::milliseconds>("mds_delay_journal_replay_for_testing"); + if (unlikely(sleep_time > 0ms)) { + dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl; + std::this_thread::sleep_for(sleep_time); + } // wait for read? journaler->check_isreadable(); if (journaler->get_error()) { diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index a858b40fa03..180a34c9d82 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -53,6 +53,7 @@ enum { #include "LogSegment.h" #include "MDSMap.h" #include "SegmentBoundary.h" +#include "mdstypes.h" #include <list> #include <map> @@ -162,6 +163,7 @@ public: void reopen(MDSContext *onopen); void append(); void replay(MDSContext *onfinish); + EstimatedReplayTime get_estimated_replay_finish_time(); void standby_trim_segments(); @@ -328,5 +330,7 @@ private: std::atomic<bool> upkeep_log_trim_shutdown{false}; std::map<uint64_t, std::vector<Context*>> waiting_for_expire; // protected by mds_lock + + ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero(); }; #endif diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 680218e62e3..f9424eed6dc 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec ls.back()->realm.seq = 2; ls.back()->realm.parent = 1; } + +void EstimatedReplayTime::print(std::ostream& out) { + out << "replay: " << percent_complete << "% complete - elapsed time: " + << elapsed_time << ", estimated time remaining: " << estimated_time; +} diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 3b8269006cb..742d7b23432 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) } WRITE_CLASS_ENCODER(MDSCacheObjectInfo) +struct EstimatedReplayTime { + double percent_complete; + std::chrono::seconds estimated_time; + std::chrono::seconds elapsed_time; + + void print(std::ostream& out); +}; + #endif diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index c157c33e758..526285aae8c 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -48,6 +48,7 @@ enum mds_metric_t { MDS_HEALTH_CLIENTS_LAGGY, MDS_HEALTH_CLIENTS_LAGGY_MANY, MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH, + MDS_HEALTH_ESTIMATED_REPLAY_TIME, MDS_HEALTH_DUMMY, // not a real health warning, for testing }; @@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m) case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY"; case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME"; case MDS_HEALTH_DUMMY: return "MDS_DUMMY"; default: return "???"; @@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m) return "%num% client(s) laggy due to laggy OSDs"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "%num% MDS report clients with broken root_squash implementation"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: + return "%num% estimated journal replay time"; default: return "???"; } |