Diffstat (limited to 'qa/standalone/scrub')
-rwxr-xr-x  qa/standalone/scrub/osd-recovery-scrub.sh |   4
-rwxr-xr-x  qa/standalone/scrub/osd-scrub-repair.sh   |   2
-rwxr-xr-x  qa/standalone/scrub/osd-scrub-test.sh     | 238
-rw-r--r--  qa/standalone/scrub/scrub-helpers.sh      | 108
4 files changed, 343 insertions, 9 deletions
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 843e9b9901b..7b77a60f35b 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -163,7 +163,7 @@ function wait_for_scrub_mod() {
         fi
         sleep 1
         # are we still the primary?
-        local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+        local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
         if [ $orig_primary != $current_primary ]; then
             echo $orig_primary no longer primary for $pgid
             return 0
@@ -194,7 +194,7 @@ function pg_scrub_mod() {
     local last_scrub=$(get_last_scrub_stamp $pgid)
 
     # locate the primary
-    local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+    local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
     local recovery=false
     ceph pg scrub $pgid
     #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 491e46603f7..6dd5b10ae8f 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -5833,7 +5833,7 @@ function TEST_periodic_scrub_replicated() {
     flush_pg_stats
 
     # Request a regular scrub and it will be done
-    pg_schedule_scrub $pg
+    pg_scrub $pg
     grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
 
     # deep-scrub error is no longer present
diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index 8015e023bdd..385479258f2 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() {
                     --osd_op_queue=wpq \
                     --osd_stats_update_period_not_scrubbing=1 \
                     --osd_stats_update_period_scrubbing=1 \
+                    --osd_scrub_retry_after_noscrub=1 \
+                    --osd_scrub_retry_pg_state=2 \
+                    --osd_scrub_retry_delay=2 \
                     --osd_scrub_sleep=0.2"
 
     for osd in $(seq 0 $(expr $OSDS - 1))
@@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() {
     declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" )
     wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1
 
-    sleep 2
-
     #
     # step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub
     #         scheduled for the future' value
     #
-    ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
-    ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
     ceph osd set noscrub || return 1
     sleep 2
+    ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
+    ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
+    sleep 8
     saved_last_stamp=${sched_data['query_last_stamp']}
 
     ceph tell $pgid schedule-scrub
@@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() {
     teardown $dir || return 1
 }
 
+function wait_initial_scrubs() {
+    local -n pg_to_prim_dict=$1
+    local extr_dbg=1 # note: 3 and above leave some temp files around
+
+    # set a long schedule for the periodic scrubs. Wait for the
+    # initial 'no previous scrub is known' scrubs to finish for all PGs.
+    ceph tell osd.* config set osd_scrub_min_interval 7200
+    ceph tell osd.* config set osd_deep_scrub_interval 14400
+    ceph tell osd.* config set osd_max_scrubs 32
+    ceph tell osd.* config set osd_scrub_sleep 0
+    ceph tell osd.* config set osd_shallow_scrub_chunk_max 10
+    ceph tell osd.* config set osd_scrub_chunk_max 10
+
+    for pg in "${!pg_to_prim_dict[@]}"; do
+        (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg"
+        ceph tell $pg scrub || return 1
+    done
+
+    sleep 1
+    (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \
+        jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+
+    tout=20
+    while [ $tout -gt 0 ] ; do
+        sleep 0.5
+        (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \
+            jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+        not_done=$(ceph pg dump pgs --format=json-pretty | \
+            jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l )
+        # note that we should ignore a header line
+        if [ "$not_done" -le 1 ]; then
+            break
+        fi
+        not_done=$(( (not_done - 2) / 4 ))
+        echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)"
+        tout=$((tout - 1))
+    done
+    (( tout == 0 )) && return 1
+    return 0
+}
+
+
+# Whenever a PG is being scrubbed at regular (periodic) urgency, and is queued
+# waiting for its replicas:
+# if the operator requests a scrub of the same PG, the operator's request
+# should trigger an abort of the ongoing scrub.
+#
+# The test process:
+# - a periodic scrub of a PG is initiated. That scrub is configured to be very slow.
+# - a second PG, which shares some of its replicas, is instructed to be scrubbed. That one
+#   should be stuck in replica reservation. We will verify that.
+# - now the operator requests that second PG to be scrubbed. The original (pending)
+#   scrub should be aborted.
+#   We would check for:
+# - the new, operator-initiated scrub to be scheduled
+# - the replicas' reservations to be released
+function TEST_abort_periodic_for_operator() {
+    local dir=$1
+    local -A cluster_conf=(
+        ['osds_num']="5"
+        ['pgs_in_pool']="16"
+        ['pool_name']="test"
+    )
+    local extr_dbg=1 # note: 3 and above leave some temp files around
+
+    standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1
+    local poolid=${cluster_conf['pool_id']}
+    local poolname=${cluster_conf['pool_name']}
+    echo "Pool: $poolname : $poolid"
+
+    # turn off '-x' (but remember previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    # fill the pool with some data
+    TESTDATA="testdata.$$"
+    dd if=/dev/urandom of=$TESTDATA bs=320 count=1
+    for i in $( seq 1 256 )
+    do
+        rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null
+    done
+    rm -f $TESTDATA
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+
+    # create the dictionary of the PGs in the pool
+    declare -A pg_pr
+    declare -A pg_ac
+    declare -A pg_po
+    build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"
+    (( extr_dbg >= 2 )) && echo "PGs table:"
+    for pg in "${!pg_pr[@]}"; do
+        (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}"
+    done
+
+    wait_initial_scrubs pg_pr || return 1
+
+    # limit all OSDs to one scrub at a time
+    ceph tell osd.* config set osd_max_scrubs 1
+    ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1
+
+    # configure for slow scrubs
+    ceph tell osd.* config set osd_scrub_sleep 3
+    ceph tell osd.* config set osd_shallow_scrub_chunk_max 2
+    ceph tell osd.* config set osd_scrub_chunk_max 2
+    (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty
+
+    # the first PG to work with:
+    local pg1="1.0"
+    # and another one that shares its primary and at least one more active-set member
+    local pg2=""
+    for pg in "${!pg_pr[@]}"; do
+        if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then
+            local -i common=0
+            count_common_active $pg $pg1 pg_ac common
+            if [[ $common -gt 1 ]]; then
+                pg2=$pg
+                break
+            fi
+        fi
+    done
+    if [[ -z "$pg2" ]]; then
+        # \todo handle the case when no such PG is found
+        echo "No PG found with the same primary as $pg1"
+        return 1
+    fi
+
+    # the common primary is allowed two concurrent scrubs
+    ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2
+    echo "The two PGs to manipulate are $pg1 and $pg2"
+
+    set_query_debug "$pg1"
+    # wait till the information published by pg1 is updated to show it as
+    # not being scrubbed
+    local is_act
+    for i in $( seq 1 3 )
+    do
+        is_act=$(ceph pg "$pg1" query | jq '.scrubber.active')
+        if [[ "$is_act" = "false" ]]; then
+            break
+        fi
+        echo "Still waiting for pg $pg1 to finish scrubbing"
+        sleep 0.7
+    done
+    ceph pg dump pgs
+    if [[ "$is_act" != "false" ]]; then
+        ceph pg "$pg1" query
+        echo "PG $pg1 appears to be still scrubbing"
+        return 1
+    fi
+    sleep 0.5
+
+    echo "Initiating a periodic scrub of $pg1"
+    (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+    ceph tell $pg1 schedule-deep-scrub || return 1
+    sleep 1
+    (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+    for i in $( seq 1 14 )
+    do
+        sleep 0.5
+        stt=$(ceph pg "$pg1" query | jq '.scrubber')
+        is_active=$(echo $stt | jq '.active')
+        is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas')
+        if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then
+            break
+        fi
+        echo "Still waiting for pg $pg1 to start scrubbing: $stt"
+    done
+    if [[ "$is_active" != "true" ||
"$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - soem of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the 2'nd scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index 49b8346b8d2..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,9 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ --osd_scrub_retry_after_noscrub=5 \ --osd_scrub_retry_pg_state=5 \ --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a set of 
+# dictionaries:
+# - pg_primary_dict: a dictionary of pgid -> acting_primary
+# - pg_acting_dict: a dictionary of pgid -> acting set
+# - pg_pool_dict: a dictionary of pgid -> pool
+# If the input file is '-', the function will fetch the dump directly from the ceph cluster.
+function build_pg_dicts {
+    local dir=$1
+    local -n pg_primary_dict=$2
+    local -n pg_acting_dict=$3
+    local -n pg_pool_dict=$4
+    local infile=$5
+
+    local extr_dbg=0 # note: 3 and above leave some temp files around
+
+    # turn off '-x' (but remember previous state)
+    local saved_echo_flag=${-//[^x]/}
+    set +x
+
+    # if the infile name is '-', fetch the dump directly from the ceph cluster
+    if [[ $infile == "-" ]]; then
+        local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty"
+        local -r ceph_cmd_out=$(eval $ceph_cmd)
+        local -r ceph_cmd_rc=$?
+        if [[ $ceph_cmd_rc -ne 0 ]]; then
+            echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc"
+        fi
+        (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2
+        l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' `
+    else
+        l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile `
+    fi
+    (( extr_dbg >= 2 )) && echo "L0: $l0"
+
+    mapfile -t l1 < <(echo "$l0" | jq -c '.[]')
+    (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}"
+
+    for item in "${l1[@]}"; do
+        pgid=$(echo "$item" | jq -r '.pgid')
+        acting=$(echo "$item" | jq -r '.acting | @sh')
+        pg_acting_dict["$pgid"]=$acting
+        acting_primary=$(echo "$item" | jq -r '.acting_primary')
+        pg_primary_dict["$pgid"]=$acting_primary
+        pool=$(echo "$item" | jq -r '.pool')
+        pg_pool_dict["$pgid"]=$pool
+    done
+
+    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+}
+
+
+# a function that counts the number of common active-set elements between two PGs
+# 1 - the first PG
+# 2 - the second PG
+# 3 - the dictionary of active sets
+function count_common_active {
+    local pg1=$1
+    local pg2=$2
+    local -n pg_acting_dict=$3
+    local -n res=$4
+
+    local -a a1=(${pg_acting_dict[$pg1]})
+    local -a a2=(${pg_acting_dict[$pg2]})
+
+    local -i cnt=0
+    for i in "${a1[@]}"; do
+        for j in "${a2[@]}"; do
+            if [[ $i -eq $j ]]; then
+                cnt=$((cnt+1))
+            fi
+        done
+    done
+
+    res=$cnt
+}
+
+
+# given a PG, find another one with a disjoint active set
+# - but allow a possible common Primary
+# 1 - the PG
+# 2 - the dictionary of active sets
+# 3 - [out] - the PG with a disjoint active set
+function find_disjoint_but_primary {
+    local pg=$1
+    local -n ac_dict=$2
+    local -n p_dict=$3
+    local -n res=$4
+
+    for cand in "${!ac_dict[@]}"; do
+        if [[ "$cand" != "$pg" ]]; then
+            local -i common=0
+            count_common_active "$pg" "$cand" ac_dict common
+            if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" ) ]]; then
+                res=$cand
+                return
+            fi
+        fi
+    done
+}
+
+
+
 # A debug flag is set for the PG specified, causing the 'pg query' command to display
 # an additional 'scrub sessions counter' field.
 #
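
For reviewers who want to see the new scrub-helpers.sh primitives in isolation, below is a minimal usage sketch. It is not part of the commit above: the function name demo_pick_overlapping_pg and the reference PG "1.0" are illustrative only, and a running standalone cluster with scrub-helpers.sh sourced is assumed. It combines build_pg_dicts and count_common_active the same way TEST_abort_periodic_for_operator does: build the pgid dictionaries once, then look for a PG that shares its primary and at least one further acting-set member with a reference PG.

    # minimal sketch (hypothetical helper; assumes scrub-helpers.sh has been sourced
    # and a standalone cluster is up)
    function demo_pick_overlapping_pg() {
        local dir=$1
        declare -A pg_pr   # pgid -> acting primary
        declare -A pg_ac   # pgid -> acting set
        declare -A pg_po   # pgid -> pool id

        # '-' asks build_pg_dicts to query the live cluster instead of a dump file
        build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"

        local ref_pg="1.0"
        local cand
        for cand in "${!pg_pr[@]}"; do
            [[ "$cand" == "$ref_pg" ]] && continue
            # only consider PGs led by the same primary OSD
            [[ "${pg_pr[$cand]}" == "${pg_pr[$ref_pg]}" ]] || continue
            local -i common=0
            count_common_active "$cand" "$ref_pg" pg_ac common
            if (( common > 1 )); then
                echo "$cand shares its primary and $((common - 1)) more OSD(s) with $ref_pg"
                return 0
            fi
        done
        echo "no PG overlapping $ref_pg was found"
        return 1
    }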