summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuri Weinstein <yweinste@redhat.com>2024-10-30 19:10:42 +0100
committerGitHub <noreply@github.com>2024-10-30 19:10:42 +0100
commit408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95 (patch)
tree22b117691c41ca68646c86be55038c3e70da15c4
parentMerge pull request #55693 from adk3798/cephadm-nfsv3 (diff)
parentosd: remove unnecessary return statements (diff)
downloadceph-408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95.tar.xz
ceph-408c10613e8fd88ed29fbf9daa4cc04e7bf5cb95.zip
Merge pull request #54954 from diffs/main
osd: add clear_shards_repaired command Reviewed-by: Ronen Friedman <rfriedma@redhat.com>
-rw-r--r--doc/rados/operations/health-checks.rst7
-rwxr-xr-xqa/standalone/osd/osd-rep-recov-eio.sh14
-rw-r--r--src/osd/OSD.cc18
-rw-r--r--src/osd/OSD.h1
4 files changed, 38 insertions, 2 deletions
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index 1d5bb342d74..d627dfea01e 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -1100,6 +1100,13 @@ the object data, there might exist failing disks that are not registering any
scrub errors. This repair count is maintained as a way of identifying any such
failing disks.
+To allow this warning to be cleared, the command
+``ceph tell osd.# clear_shards_repaired [count]`` is provided.
+By default, it sets the repair count to 0. An optional ``count`` value can be
+passed to the command instead. Thus, the administrator can re-enable the warning
+by passing the value of ``mon_osd_warn_num_repaired`` (or above) to the command.
+An alternative to using ``clear_shards_repaired`` is to mute the
+``OSD_TOO_MANY_REPAIRS`` alert with ``ceph health mute``.
LARGE_OMAP_OBJECTS
__________________
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh
index 6fea441b3a9..a34f4a47189 100755
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -219,6 +219,18 @@ function TEST_rados_repair_warning() {
ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
set +o pipefail
+ ceph health unmute OSD_TOO_MANY_REPAIRS
+ ceph tell osd.$primary clear_shards_repaired
+ sleep 10
+
+ set -o pipefail
+ # The "Too many repaired reads" warning should now be cleared
+ ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+ set +o pipefail
+
+ ceph tell osd.$primary clear_shards_repaired $OBJS
+ sleep 10
+
for i in $(seq 1 $OBJS)
do
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -235,7 +247,7 @@ function TEST_rados_repair_warning() {
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
- # Give mon a chance to notice additional OSD and unmute
+ # Give mon a chance to notice additional OSD and reset num_shards_repaired
# The default tick time is 5 seconds
CHECKTIME=10
LOOPS=0
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index ce46bb245ea..be69745765b 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1064,7 +1064,12 @@ void OSDService::inc_osd_stat_repaired()
{
std::lock_guard l(stat_lock);
osd_stat.num_shards_repaired++;
- return;
+}
+
+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+ std::lock_guard l(stat_lock);
+ osd_stat.num_shards_repaired = count;
}
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
@@ -3219,6 +3224,11 @@ will start to track new ops received afterwards.";
scrub_purged_snaps();
}
+ else if (prefix == "clear_shards_repaired") {
+ int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 0);
+ service.set_osd_stat_repaired(count);
+ }
+
else if (prefix == "reset_purged_snaps_last") {
lock_guard l(osd_lock);
superblock.purged_snaps_last = 0;
@@ -4440,6 +4450,12 @@ void OSD::final_init()
asok_hook,
"debug the scrubber");
ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "clear_shards_repaired "
+ "name=count,type=CephInt,req=false,range=0",
+ asok_hook,
+ "clear num_shards_repaired to clear health warning");
+ ceph_assert(r == 0);
// -- pg commands --
// old form: ceph pg <pgid> command ...
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 7c9aed7c6ba..c825c19b1ff 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -731,6 +731,7 @@ public:
osd_alert_list_t& alerts);
osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs);
void inc_osd_stat_repaired(void);
+ void set_osd_stat_repaired(int64_t count);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);