Diffstat (limited to 'qa/standalone/scrub')
 -rwxr-xr-x  qa/standalone/scrub/osd-recovery-scrub.sh |   4
 -rwxr-xr-x  qa/standalone/scrub/osd-scrub-repair.sh   |   2
 -rwxr-xr-x  qa/standalone/scrub/osd-scrub-test.sh     | 238
 -rw-r--r--  qa/standalone/scrub/scrub-helpers.sh      | 108
 4 files changed, 343 insertions(+), 9 deletions(-)
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 843e9b9901b..7b77a60f35b 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -163,7 +163,7 @@ function wait_for_scrub_mod() {
fi
sleep 1
# are we still the primary?
- local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
if [ $orig_primary != $current_primary ]; then
echo $orig_primary no longer primary for $pgid
return 0
@@ -194,7 +194,7 @@ function pg_scrub_mod() {
local last_scrub=$(get_last_scrub_stamp $pgid)
# locate the primary
- local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
local recovery=false
ceph pg scrub $pgid
#ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 491e46603f7..6dd5b10ae8f 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -5833,7 +5833,7 @@ function TEST_periodic_scrub_replicated() {
flush_pg_stats
# Request a regular scrub and it will be done
- pg_schedule_scrub $pg
+ pg_scrub $pg
grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
# deep-scrub error is no longer present
diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index 8015e023bdd..385479258f2 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() {
--osd_op_queue=wpq \
--osd_stats_update_period_not_scrubbing=1 \
--osd_stats_update_period_scrubbing=1 \
+ --osd_scrub_retry_after_noscrub=1 \
+ --osd_scrub_retry_pg_state=2 \
+ --osd_scrub_retry_delay=2 \
--osd_scrub_sleep=0.2"
for osd in $(seq 0 $(expr $OSDS - 1))
@@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() {
declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" )
wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1
- sleep 2
-
#
# step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub
# scheduled for the future' value
#
- ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
- ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
ceph osd set noscrub || return 1
sleep 2
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
+ ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
+ sleep 8
saved_last_stamp=${sched_data['query_last_stamp']}
ceph tell $pgid schedule-scrub
@@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() {
teardown $dir || return 1
}
+function wait_initial_scrubs() {
+ local -n pg_to_prim_dict=$1
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ # set a long schedule for the periodic scrubs. Wait for the
+ # initial 'no previous scrub is known' scrubs to finish for all PGs.
+ ceph tell osd.* config set osd_scrub_min_interval 7200
+ ceph tell osd.* config set osd_deep_scrub_interval 14400
+ ceph tell osd.* config set osd_max_scrubs 32
+ ceph tell osd.* config set osd_scrub_sleep 0
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max 10
+ ceph tell osd.* config set osd_scrub_chunk_max 10
+
+ for pg in "${!pg_to_prim_dict[@]}"; do
+ (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg"
+ ceph tell $pg scrub || return 1
+ done
+
+ sleep 1
+ (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+
+ tout=20
+ while [ $tout -gt 0 ] ; do
+ sleep 0.5
+ (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+ not_done=$(ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l )
+      # note: an empty result is printed by jq as a single '[]' line
+ if [ "$not_done" -le 1 ]; then
+ break
+ fi
+    not_done=$(( (not_done - 2) / 4 ))  # 4 output lines per PG entry, plus the 2 bracket lines
+ echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)"
+ tout=$((tout - 1))
+ done
+ (( tout == 0 )) && return 1
+ return 0
+}
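+
+# A minimal sketch (a hypothetical helper, not used by the tests below) of a
+# simpler way to count the PGs still waiting for their first scrub: let jq
+# compute the count directly, instead of the line-count arithmetic used in
+# wait_initial_scrubs() above.
+function count_unscrubbed_pgs() {
+  ceph pg dump pgs --format=json 2>/dev/null | \
+    jq '[.pg_stats[] | select(.last_scrub_duration == 0)] | length'
+}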
+
+
+# Whenever a PG is being scrubbed at regular (periodic) urgency, or is queued
+# waiting for its replica reservations:
+# if the operator requests a scrub of the same PG, the operator's request
+# should trigger an abort of the ongoing scrub.
+#
+# The test process:
+# - a periodic scrub of a PG is initiated. That scrub is configured to be a very slow one.
+# - a second PG, which shares some of its replicas, is instructed to be scrubbed. That one
+#   should be stuck waiting for replica reservations. We will verify that.
+# - now - the operator requests that second PG to be scrubbed. The original (pending)
+#   scrub should be aborted. We check that:
+#   - the new, operator-requested scrub is scheduled, and
+#   - the replica reservations are released.
+function TEST_abort_periodic_for_operator() {
+ local dir=$1
+ local -A cluster_conf=(
+ ['osds_num']="5"
+ ['pgs_in_pool']="16"
+ ['pool_name']="test"
+ )
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1
+ local poolid=${cluster_conf['pool_id']}
+ local poolname=${cluster_conf['pool_name']}
+ echo "Pool: $poolname : $poolid"
+
+ #turn off '-x' (but remember previous state)
+ local saved_echo_flag=${-//[^x]/}
+ set +x
+
+ # fill the pool with some data
+ TESTDATA="testdata.$$"
+ dd if=/dev/urandom of=$TESTDATA bs=320 count=1
+ for i in $( seq 1 256 )
+ do
+ rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null
+ done
+ rm -f $TESTDATA
+ if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+
+ # create the dictionary of the PGs in the pool
+ declare -A pg_pr
+ declare -A pg_ac
+ declare -A pg_po
+ build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"
+ (( extr_dbg >= 2 )) && echo "PGs table:"
+ for pg in "${!pg_pr[@]}"; do
+ (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}"
+ done
+
+ wait_initial_scrubs pg_pr || return 1
+
+ # limit all OSDs to one scrub at a time
+ ceph tell osd.* config set osd_max_scrubs 1
+ ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1
+
+ # configure for slow scrubs
+ ceph tell osd.* config set osd_scrub_sleep 3
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max 2
+ ceph tell osd.* config set osd_scrub_chunk_max 2
+ (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty
+
+ # the first PG to work with:
+ local pg1="1.0"
+ # and another one, that shares its primary, and at least one more active set member
+ local pg2=""
+ for pg in "${!pg_pr[@]}"; do
+ if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then
+ local -i common=0
+ count_common_active $pg $pg1 pg_ac common
+ if [[ $common -gt 1 ]]; then
+ pg2=$pg
+ break
+ fi
+ fi
+ done
+ if [[ -z "$pg2" ]]; then
+        # \todo handle the case of no matching PG more gracefully than failing
+ echo "No PG found with the same primary as $pg1"
+ return 1
+ fi
+
+ # the common primary is allowed two concurrent scrubs
+ ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2
+ echo "The two PGs to manipulate are $pg1 and $pg2"
+
+ set_query_debug "$pg1"
+ # wait till the information published by pg1 is updated to show it as
+ # not being scrubbed
+ local is_act
+ for i in $( seq 1 3 )
+ do
+ is_act=$(ceph pg "$pg1" query | jq '.scrubber.active')
+ if [[ "$is_act" = "false" ]]; then
+ break
+ fi
+ echo "Still waiting for pg $pg1 to finish scrubbing"
+ sleep 0.7
+ done
+ ceph pg dump pgs
+ if [[ "$is_act" != "false" ]]; then
+ ceph pg "$pg1" query
+ echo "PG $pg1 appears to be still scrubbing"
+ return 1
+ fi
+ sleep 0.5
+
+ echo "Initiating a periodic scrub of $pg1"
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ ceph tell $pg1 schedule-deep-scrub || return 1
+ sleep 1
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+ for i in $( seq 1 14 )
+ do
+ sleep 0.5
+ stt=$(ceph pg "$pg1" query | jq '.scrubber')
+ is_active=$(echo $stt | jq '.active')
+ is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas')
+ if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then
+ break
+ fi
+ echo "Still waiting for pg $pg1 to start scrubbing: $stt"
+ done
+ if [[ "$is_active" != "true" || "$is_reserving_replicas" != "false" ]]; then
+ ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ echo "The scrub is not active or is reserving replicas"
+ return 1
+ fi
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+
+    # PG 1 is scrubbing, and has reserved the replicas - some of which are shared
+    # by PG 2. As osd_max_scrubs was set to 1 on those OSDs, that should prevent
+    # PG 2 from reserving its replicas.
+
+ (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty
+
+    # now - the second scrub - which should be blocked on reserving replicas
+ set_query_debug "$pg2"
+ ceph tell "$pg2" schedule-deep-scrub
+ sleep 0.5
+ (( extr_dbg >= 2 )) && echo "===================================================================================="
+ (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber'
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ sleep 1
+ (( extr_dbg >= 2 )) && echo "===================================================================================="
+ (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber'
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+ # make sure pg2 scrub is stuck in the reserving state
+ local stt2=$(ceph pg "$pg2" query | jq '.scrubber')
+ local pg2_is_reserving
+ pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas')
+ if [[ "$pg2_is_reserving" != "true" ]]; then
+ echo "The scheduled scrub for $pg2 should have been stuck"
+ ceph pg dump pgs
+ return 1
+ fi
+
+ # now - issue an operator-initiated scrub on pg2.
+ # The periodic scrub should be aborted, and the operator-initiated scrub should start.
+ echo "Instructing $pg2 to perform a high-priority scrub"
+ ceph tell "$pg2" scrub
+ for i in $( seq 1 10 )
+ do
+ sleep 0.5
+ stt2=$(ceph pg "$pg2" query | jq '.scrubber')
+ pg2_is_active=$(echo $stt2 | jq '.active')
+ pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas')
+ if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then
+ break
+ fi
+ echo "Still waiting: $stt2"
+ done
+
+ if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then
+ echo "The high-priority scrub for $pg2 is not active or is reserving replicas"
+ return 1
+ fi
+ echo "Done"
+}
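+
+# Note: like the other TEST_* functions in this file, the test above is picked
+# up automatically by main() below. A typical invocation (assuming the
+# conventional ceph build-tree layout) would be, from the build directory:
+#   ../qa/run-standalone.sh osd-scrub-test.sh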
+
+
+
main osd-scrub-test "$@"
# Local Variables:
diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh
index 49b8346b8d2..dd37b643e08 100644
--- a/qa/standalone/scrub/scrub-helpers.sh
+++ b/qa/standalone/scrub/scrub-helpers.sh
@@ -240,8 +240,8 @@ function standard_scrub_cluster() {
local saved_echo_flag=${-//[^x]/}
set +x
- run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
+ run_mon $dir a --osd_pool_default_size=3 || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
--osd_scrub_interval_randomize_ratio=0 \
@@ -249,9 +249,12 @@ function standard_scrub_cluster() {
--osd_pool_default_pg_autoscale_mode=off \
--osd_pg_stat_report_interval_max_seconds=1 \
--osd_pg_stat_report_interval_max_epochs=1 \
+ --osd_stats_update_period_not_scrubbing=3 \
+ --osd_stats_update_period_scrubbing=1 \
--osd_scrub_retry_after_noscrub=5 \
--osd_scrub_retry_pg_state=5 \
--osd_scrub_retry_delay=3 \
+ --osd_pool_default_size=3 \
$extra_pars"
for osd in $(seq 0 $(expr $OSDS - 1))
@@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() {
}
+# Parse the output of a 'pg dump pgs_brief' command and build a set of dictionaries:
+# - pg_primary_dict: a dictionary of pgid -> acting_primary
+# - pg_acting_dict: a dictionary of pgid -> acting set
+# - pg_pool_dict: a dictionary of pgid -> pool
+# If the input file is '-', the function will fetch the dump directly from the ceph cluster.
+function build_pg_dicts {
+ local dir=$1
+ local -n pg_primary_dict=$2
+ local -n pg_acting_dict=$3
+ local -n pg_pool_dict=$4
+ local infile=$5
+
+ local extr_dbg=0 # note: 3 and above leave some temp files around
+
+ #turn off '-x' (but remember previous state)
+ local saved_echo_flag=${-//[^x]/}
+ set +x
+
+ # if the infile name is '-', fetch the dump directly from the ceph cluster
+ if [[ $infile == "-" ]]; then
+ local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty"
+        # note: assign separately from the 'local' declaration, so that $?
+        # reflects the command's exit status rather than that of 'local'
+        local ceph_cmd_out
+        ceph_cmd_out=$(eval $ceph_cmd)
+        local -r ceph_cmd_rc=$?
+ if [[ $ceph_cmd_rc -ne 0 ]]; then
+ echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc"
+ fi
+ (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2
+ l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' `
+ else
+ l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile `
+ fi
+ (( extr_dbg >= 2 )) && echo "L0: $l0"
+
+ mapfile -t l1 < <(echo "$l0" | jq -c '.[]')
+ (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}"
+
+ for item in "${l1[@]}"; do
+ pgid=$(echo "$item" | jq -r '.pgid')
+ acting=$(echo "$item" | jq -r '.acting | @sh')
+ pg_acting_dict["$pgid"]=$acting
+ acting_primary=$(echo "$item" | jq -r '.acting_primary')
+ pg_primary_dict["$pgid"]=$acting_primary
+ pool=$(echo "$item" | jq -r '.pool')
+ pg_pool_dict["$pgid"]=$pool
+ done
+
+ if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+}
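+
+# A minimal usage sketch for build_pg_dicts() (the function and variable names
+# below are illustrative only, not used by the tests): build the three
+# dictionaries straight from a live cluster and print one line per PG.
+function demo_build_pg_dicts() {
+    local dir=$1
+    # note: the caller's arrays must not reuse the nameref names declared
+    # inside build_pg_dicts (e.g. 'pg_acting_dict'), as a bash nameref
+    # cannot refer to a variable of its own name
+    declare -A prim_dict acting_dict pool_dict
+    build_pg_dicts "$dir" prim_dict acting_dict pool_dict "-"
+    local pg
+    for pg in "${!prim_dict[@]}"; do
+        echo "$pg: primary=${prim_dict[$pg]} acting=( ${acting_dict[$pg]} ) pool=${pool_dict[$pg]}"
+    done
+}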
+
+
+# a function that counts the number of common active-set elements between two PGs
+# 1 - the first PG
+# 2 - the second PG
+# 3 - the dictionary of active sets
+function count_common_active {
+ local pg1=$1
+ local pg2=$2
+ local -n pg_acting_dict=$3
+ local -n res=$4
+
+ local -a a1=(${pg_acting_dict[$pg1]})
+ local -a a2=(${pg_acting_dict[$pg2]})
+
+ local -i cnt=0
+ for i in "${a1[@]}"; do
+ for j in "${a2[@]}"; do
+ if [[ $i -eq $j ]]; then
+ cnt=$((cnt+1))
+ fi
+ done
+ done
+
+ res=$cnt
+}
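+
+# Usage sketch (the PG ids are examples; 'pg_ac' is assumed to have been
+# populated by build_pg_dicts() first):
+#   local -i shared=0
+#   count_common_active "1.0" "1.3" pg_ac shared
+#   echo "PGs 1.0 and 1.3 share $shared acting-set OSDs"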
+
+
+# given a PG, find another one with a disjoint active set
+# - but allow a possible common Primary
+# 1 - the PG
+# 2 - the dictionary of active sets
+# 3 - the dictionary of primaries
+# 4 - [out] - the PG with a disjoint active set
+function find_disjoint_but_primary {
+ local pg=$1
+ local -n ac_dict=$2
+ local -n p_dict=$3
+ local -n res=$4
+
+ for cand in "${!ac_dict[@]}"; do
+ if [[ "$cand" != "$pg" ]]; then
+ local -i common=0
+ count_common_active "$pg" "$cand" ac_dict common
+            if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" ) ]]; then
+ res=$cand
+ return
+ fi
+ fi
+ done
+}
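+
+# Usage sketch (the PG id is an example; 'pg_ac' and 'pg_pr' are assumed to
+# have been populated by build_pg_dicts() first). The [out] parameter is left
+# untouched when no disjoint PG exists, so initialize it to a sentinel:
+#   local other=""
+#   find_disjoint_but_primary "1.0" pg_ac pg_pr other
+#   [[ -n "$other" ]] && echo "PG $other overlaps 1.0 at most at the primary"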
+
+
+
# A debug flag is set for the PG specified, causing the 'pg query' command to display
# an additional 'scrub sessions counter' field.
#