diff options
author | Yuri Weinstein <yweinste@redhat.com> | 2024-10-30 19:10:42 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-30 19:10:42 +0100 |
commit | 408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95 (patch) | |
tree | 22b117691c41ca68646c86be55038c3e70da15c4 | |
parent | Merge pull request #55693 from adk3798/cephadm-nfsv3 (diff) | |
parent | osd: remove unnecessary return statements (diff) | |
download | ceph-408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95.tar.xz ceph-408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95.zip |
Merge pull request #54954 from diffs/main
osd: add clear_shards_repaired command
Reviewed-by: Ronen Friedman <rfriedma@redhat.com>
-rw-r--r-- | doc/rados/operations/health-checks.rst | 7 | ||||
-rwxr-xr-x | qa/standalone/osd/osd-rep-recov-eio.sh | 14 | ||||
-rw-r--r-- | src/osd/OSD.cc | 18 | ||||
-rw-r--r-- | src/osd/OSD.h | 1 |
4 files changed, 38 insertions, 2 deletions
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index 1d5bb342d74..d627dfea01e 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1100,6 +1100,13 @@ the object data, there might exist failing disks that are not registering any scrub errors. This repair count is maintained as a way of identifying any such failing disks. +In order to allow clearing of the warning, a new command +``ceph tell osd.# clear_shards_repaired [count]`` has been added. +By default it will set the repair count to 0. A ``count`` value can be passed +to the command. Thus, the administrator has the option to re-enable the warning +by passing the value of ``mon_osd_warn_num_repaired`` (or above) to the command. +An alternative to using ``clear_shards_repaired`` is to mute the +``OSD_TOO_MANY_REPAIRS`` alert with ``ceph health mute``. LARGE_OMAP_OBJECTS __________________ diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh index 6fea441b3a9..a34f4a47189 100755 --- a/qa/standalone/osd/osd-rep-recov-eio.sh +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -219,6 +219,18 @@ function TEST_rados_repair_warning() { ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1 set +o pipefail + ceph health unmute OSD_TOO_MANY_REPAIRS + ceph tell osd.$primary clear_shards_repaired + sleep 10 + + set -o pipefail + # Should clear this + ceph health | $(! 
grep -q "Too many repaired reads on 1 OSDs") || return 1 + set +o pipefail + + ceph tell osd.$primary clear_shards_repaired $OBJS + sleep 10 + for i in $(seq 1 $OBJS) do inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1 @@ -235,7 +247,7 @@ function TEST_rados_repair_warning() { COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") test "$COUNT" = "$(expr $OBJS \* 3)" || return 1 - # Give mon a chance to notice additional OSD and unmute + # Give mon a chance to notice additional OSD and reset num_shards_repaired # The default tick time is 5 seconds CHECKTIME=10 LOOPS=0 diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index ce46bb245ea..be69745765b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1064,7 +1064,12 @@ void OSDService::inc_osd_stat_repaired() { std::lock_guard l(stat_lock); osd_stat.num_shards_repaired++; - return; +} + +void OSDService::set_osd_stat_repaired(int64_t count) +{ + std::lock_guard l(stat_lock); + osd_stat.num_shards_repaired = count; } float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, @@ -3219,6 +3224,11 @@ will start to track new ops received afterwards."; scrub_purged_snaps(); } + else if (prefix == "clear_shards_repaired") { + int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 0); + service.set_osd_stat_repaired(count); + } + else if (prefix == "reset_purged_snaps_last") { lock_guard l(osd_lock); superblock.purged_snaps_last = 0; @@ -4440,6 +4450,12 @@ void OSD::final_init() asok_hook, "debug the scrubber"); ceph_assert(r == 0); + r = admin_socket->register_command( + "clear_shards_repaired " + "name=count,type=CephInt,req=false,range=0", + asok_hook, + "clear num_shards_repaired to clear health warning"); + ceph_assert(r == 0); // -- pg commands -- // old form: ceph pg <pgid> command ... 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 7c9aed7c6ba..c825c19b1ff 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -731,6 +731,7 @@ public: osd_alert_list_t& alerts); osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs); void inc_osd_stat_repaired(void); + void set_osd_stat_repaired(int64_t count); float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0); osd_stat_t get_osd_stat() { std::lock_guard l(stat_lock); |