diff options
author | DanWritesCode <github@dann.me> | 2023-12-18 22:09:07 +0100 |
---|---|---|
committer | DanWritesCode <github@dann.me> | 2024-03-04 22:08:48 +0100 |
commit | 78d6bfe54c3b9b60fab36a640b1ce77c8f022fa9 (patch) | |
tree | 895c45ce18afa98fbd2baf923a5deea50d802006 /qa/standalone | |
parent | Merge pull request #52495 from adamemerson/wip-neorados-learning-from-experience (diff) | |
download | ceph-78d6bfe54c3b9b60fab36a640b1ce77c8f022fa9.tar.xz ceph-78d6bfe54c3b9b60fab36a640b1ce77c8f022fa9.zip |
osd: add clear_shards_repaired command
This command will allow us to clear the OSD_TOO_MANY_REPAIRS alert
by setting the shard repair count to 0. This will help in cases where
the alert was a false positive, or was caused by a condition that has since
cleared at the disk level. Often, zeroing out the repair count is
better than muting the alert or restarting the OSD.
Fixes: https://tracker.ceph.com/issues/54182
Co-authored-by: David Zafman <dzafman@redhat.com>
Signed-off-by: Daniel Radjenovic <dradjenovic@digitalocean.com>
Diffstat (limited to 'qa/standalone')
-rwxr-xr-x | qa/standalone/osd/osd-rep-recov-eio.sh | 14 |
1 file changed, 13 insertions, 1 deletion
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh
index 6fea441b3a9..a34f4a47189 100755
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -219,6 +219,18 @@ function TEST_rados_repair_warning() {
     ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
     set +o pipefail
 
+    ceph health unmute OSD_TOO_MANY_REPAIRS
+    ceph tell osd.$primary clear_shards_repaired
+    sleep 10
+
+    set -o pipefail
+    # Should clear this
+    ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+    set +o pipefail
+
+    ceph tell osd.$primary clear_shards_repaired $OBJS
+    sleep 10
+
     for i in $(seq 1 $OBJS)
     do
       inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -235,7 +247,7 @@ function TEST_rados_repair_warning() {
     COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
     test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
 
-    # Give mon a chance to notice additional OSD and unmute
+    # Give mon a chance to notice additional OSD and reset num_shards_repaired
     # The default tick time is 5 seconds
     CHECKTIME=10
     LOOPS=0