Diffstat (limited to 'qa')
104 files changed, 1231 insertions, 157 deletions
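The standalone scripts touched by this diff can be exercised locally before review — a minimal sketch, assuming a compiled Ceph build tree and the qa/run-standalone.sh helper shipped in the repository (the script names are taken from the diff below; exact paths may vary per checkout):

cd build
# covers the new TEST_abort_periodic_for_operator case added to the scrub suite
../qa/run-standalone.sh osd-scrub-test.sh
# re-checks bluefs-bdev-expand with the enlarged 11GB device
../qa/run-standalone.sh osd-bluefs-volume-ops.sh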
diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml new file mode 100644 index 00000000000..d5ba487b9bf --- /dev/null +++ b/qa/config/crimson_bluestore.yaml @@ -0,0 +1,25 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: bluestore + debug alienstore: 20 + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + bluestore compression algorithm: snappy + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bluestore rocksdb cf: false + log to stderr: true + err to stderr: true + log flush on exit: true + log to file: false diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml index 8cf98f38001..a10c59d77cc 100644 --- a/qa/config/crimson_qa_overrides.yaml +++ b/qa/config/crimson_qa_overrides.yaml @@ -9,7 +9,6 @@ overrides: osd pool default crimson: true osd: crimson osd obc lru size: 10 - debug alienstore: 20 debug ms: 20 flavor: crimson workunit: diff --git a/qa/config/seastore.yaml b/qa/config/crimson_seastore.yaml index 6158563eedf..d1919456ab1 100644 --- a/qa/config/seastore.yaml +++ b/qa/config/crimson_seastore.yaml @@ -1,13 +1,13 @@ overrides: ceph: - fs: xfs conf: osd: - osd objectstore: seastore + # crimson's osd objectstore option + crimson osd objectstore: seastore debug seastore: 20 debug seastore onode: 20 debug seastore odata: 20 - debug seastore ompap: 20 + debug seastore omap: 20 debug seastore tm: 20 debug seastore t: 20 debug seastore cleaner: 20 diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index c979e5b105f..c558a1382ef 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -52,7 +52,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority 08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority 16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority -24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority ## ********** windows tests on main branch - weekly # 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e $CEPH_QA_EMAIL @@ -122,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820 24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820 32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820 -40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820 ### upgrade runs for reef release ###### on smithi diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh index aedfbc9b5cb..f7424de8ce1 100755 --- a/qa/standalone/osd/osd-bluefs-volume-ops.sh +++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh @@ -72,7 +72,7 @@ function TEST_bluestore() { truncate $dir/0/block -s 4294967296 # 4GB ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1 - truncate $dir/1/block -s 4311744512 # 4GB + 16MB + truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 
10737418240 ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1 truncate $dir/2/block -s 4295099392 # 4GB + 129KB ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1 diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 843e9b9901b..7b77a60f35b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -163,7 +163,7 @@ function wait_for_scrub_mod() { fi sleep 1 # are we still the primary? - local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` if [ $orig_primary != $current_primary ]; then echo $orig_primary no longer primary for $pgid return 0 @@ -194,7 +194,7 @@ function pg_scrub_mod() { local last_scrub=$(get_last_scrub_stamp $pgid) # locate the primary - local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` local recovery=false ceph pg scrub $pgid #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index 8015e023bdd..385479258f2 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() { --osd_op_queue=wpq \ --osd_stats_update_period_not_scrubbing=1 \ --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=1 \ + --osd_scrub_retry_pg_state=2 \ + --osd_scrub_retry_delay=2 \ --osd_scrub_sleep=0.2" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() { declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" ) wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1 - sleep 2 - # # step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub # scheduled for the future' value # - ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 - ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 ceph osd set noscrub || return 1 sleep 2 + ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 + ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 + sleep 8 saved_last_stamp=${sched_data['query_last_stamp']} ceph tell $pgid schedule-scrub @@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() { teardown $dir || return 1 } +function wait_initial_scrubs() { + local -n pg_to_prim_dict=$1 + local extr_dbg=1 # note: 3 and above leave some temp files around + + # set a long schedule for the periodic scrubs. Wait for the + # initial 'no previous scrub is known' scrubs to finish for all PGs. 
+ ceph tell osd.* config set osd_scrub_min_interval 7200 + ceph tell osd.* config set osd_deep_scrub_interval 14400 + ceph tell osd.* config set osd_max_scrubs 32 + ceph tell osd.* config set osd_scrub_sleep 0 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 10 + ceph tell osd.* config set osd_scrub_chunk_max 10 + + for pg in "${!pg_to_prim_dict[@]}"; do + (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg" + ceph tell $pg scrub || return 1 + done + + sleep 1 + (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + + tout=20 + while [ $tout -gt 0 ] ; do + sleep 0.5 + (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + not_done=$(ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l ) + # note that we should ignore a header line + if [ "$not_done" -le 1 ]; then + break + fi + not_done=$(( (not_done - 2) / 4 )) + echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)" + tout=$((tout - 1)) + done + (( tout == 0 )) && return 1 + return 0 +} + + +# Whenever a PG is being scrubbed at a regular, periodic urgency, and is queued +# for its replicas: +# if the operator is requesting a scrub of the same PG, the operator's request +# should trigger an abort of the ongoing scrub. +# +# The test process: +# - a periodic scrub of a PG is initiated. That scrub is set to be a very slow one. +# - a second PG, which shares some of its replicas, is instructed to be scrubbed. That one +# should be stuck in replica reservation. We will verify that. +# - now - the operator requests that second PG be scrubbed. The original (pending) +# scrub should be aborted.
We would check for: +# - the new operator's scrub to be scheduled +# - the replicas' reservers to be released +function TEST_abort_periodic_for_operator() { + local dir=$1 + local -A cluster_conf=( + ['osds_num']="5" + ['pgs_in_pool']="16" + ['pool_name']="test" + ) + local extr_dbg=1 # note: 3 and above leave some temp files around + + standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1 + local poolid=${cluster_conf['pool_id']} + local poolname=${cluster_conf['pool_name']} + echo "Pool: $poolname : $poolid" + + # turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # fill the pool with some data + TESTDATA="testdata.$$" + dd if=/dev/urandom of=$TESTDATA bs=320 count=1 + for i in $( seq 1 256 ) + do + rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null + done + rm -f $TESTDATA + if [[ -n "$saved_echo_flag" ]]; then set -x; fi + + # create the dictionary of the PGs in the pool + declare -A pg_pr + declare -A pg_ac + declare -A pg_po + build_pg_dicts "$dir" pg_pr pg_ac pg_po "-" + (( extr_dbg >= 2 )) && echo "PGs table:" + for pg in "${!pg_pr[@]}"; do + (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}" + done + + wait_initial_scrubs pg_pr || return 1 + + # limit all OSDs to one scrub at a time + ceph tell osd.* config set osd_max_scrubs 1 + ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1 + + # configure for slow scrubs + ceph tell osd.* config set osd_scrub_sleep 3 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 2 + ceph tell osd.* config set osd_scrub_chunk_max 2 + (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty + + # the first PG to work with: + local pg1="1.0" + # and another one, that shares its primary, and at least one more active set member + local pg2="" + for pg in "${!pg_pr[@]}"; do + if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then + local -i common=0 + count_common_active $pg $pg1 pg_ac common + if [[ $common -gt 1 ]]; then + pg2=$pg + break + fi + fi + done + if [[ -z "$pg2" ]]; then + # \todo handle the case when no such PG is found + echo "No PG found with the same primary as $pg1" + return 1 + fi + + # the common primary is allowed two concurrent scrubs + ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2 + echo "The two PGs to manipulate are $pg1 and $pg2" + + set_query_debug "$pg1" + # wait till the information published by pg1 is updated to show it as + # not being scrubbed + local is_act + for i in $( seq 1 3 ) + do + is_act=$(ceph pg "$pg1" query | jq '.scrubber.active') + if [[ "$is_act" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to finish scrubbing" + sleep 0.7 + done + ceph pg dump pgs + if [[ "$is_act" != "false" ]]; then + ceph pg "$pg1" query + echo "PG $pg1 appears to be still scrubbing" + return 1 + fi + sleep 0.5 + + echo "Initiating a periodic scrub of $pg1" + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + ceph tell $pg1 schedule-deep-scrub || return 1 + sleep 1 + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + for i in $( seq 1 14 ) + do + sleep 0.5 + stt=$(ceph pg "$pg1" query | jq '.scrubber') + is_active=$(echo $stt | jq '.active') + is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas') + if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to start scrubbing: $stt" + done
+ if [[ "$is_active" != "true" || "$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - some of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the second scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index 49b8346b8d2..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,9 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ --osd_scrub_retry_after_noscrub=5 \ --osd_scrub_retry_pg_state=5 \ --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a set of
dictionaries: +# - pg_primary_dict: a dictionary of pgid -> acting_primary +# - pg_acting_dict: a dictionary of pgid -> acting set +# - pg_pool_dict: a dictionary of pgid -> pool +# If the input file is '-', the function will fetch the dump directly from the ceph cluster. +function build_pg_dicts { + local dir=$1 + local -n pg_primary_dict=$2 + local -n pg_acting_dict=$3 + local -n pg_pool_dict=$4 + local infile=$5 + + local extr_dbg=0 # note: 3 and above leave some temp files around + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # if the infile name is '-', fetch the dump directly from the ceph cluster + if [[ $infile == "-" ]]; then + local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty" + local -r ceph_cmd_out=$(eval $ceph_cmd) + local -r ceph_cmd_rc=$? + if [[ $ceph_cmd_rc -ne 0 ]]; then + echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc" + fi + (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2 + l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' ` + else + l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile ` + fi + (( extr_dbg >= 2 )) && echo "L0: $l0" + + mapfile -t l1 < <(echo "$l0" | jq -c '.[]') + (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}" + + for item in "${l1[@]}"; do + pgid=$(echo "$item" | jq -r '.pgid') + acting=$(echo "$item" | jq -r '.acting | @sh') + pg_acting_dict["$pgid"]=$acting + acting_primary=$(echo "$item" | jq -r '.acting_primary') + pg_primary_dict["$pgid"]=$acting_primary + pool=$(echo "$item" | jq -r '.pool') + pg_pool_dict["$pgid"]=$pool + done + + if [[ -n "$saved_echo_flag" ]]; then set -x; fi +} + + +# a function that counts the number of common active-set elements between two PGs +# 1 - the first PG +# 2 - the second PG +# 3 - the dictionary of active sets +function count_common_active { + local pg1=$1 + local pg2=$2 + local -n pg_acting_dict=$3 + local -n res=$4 + + local -a a1=(${pg_acting_dict[$pg1]}) + local -a a2=(${pg_acting_dict[$pg2]}) + + local -i cnt=0 + for i in "${a1[@]}"; do + for j in "${a2[@]}"; do + if [[ $i -eq $j ]]; then + cnt=$((cnt+1)) + fi + done + done + + res=$cnt +} + + +# given a PG, find another one with a disjoint active set +# - but allow a possible common Primary +# 1 - the PG +# 2 - the dictionary of active sets +# 3 - [out] - the PG with a disjoint active set +function find_disjoint_but_primary { + local pg=$1 + local -n ac_dict=$2 + local -n p_dict=$3 + local -n res=$4 + + for cand in "${!ac_dict[@]}"; do + if [[ "$cand" != "$pg" ]]; then + local -i common=0 + count_common_active "$pg" "$cand" ac_dict common + if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" )]]; then + res=$cand + return + fi + fi + done +} + + + # A debug flag is set for the PG specified, causing the 'pg query' command to display # an additional 'scrub sessions counter' field. # diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa index fea2489fdf6..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/.qa +++ b/qa/suites/crimson-rados-experimental/.qa @@ -1 +1 @@ -../.qa
\ No newline at end of file +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml deleted file mode 120000 index bd9854e7029..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/supported/centos_latest.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml deleted file mode 100644 index d8e5898b99f..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - osd pool default size: 2 - osd crush chooseleaf type: 0 - osd pool default pg num: 128 - osd pool default pgp num: 128 - ceph: - conf: - osd: - osd shutdown pgref assert: true -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml deleted file mode 100644 index c22f08eecf8..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - install: - ceph: - flavor: crimson -tasks: -- install: -- ceph: - conf: - osd: - debug monc: 20 - mon: - mon min osdmap epochs: 50 - paxos service trim min: 10 - # prune full osdmaps regularly - mon osdmap full prune min: 15 - mon osdmap full prune interval: 2 - mon osdmap full prune txsize: 2 - flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml deleted file mode 120000 index 6a70c381709..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/config/seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml deleted file mode 100644 index ad8c921425b..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml +++ /dev/null @@ -1,28 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - reached quota - - but it is still running - - overall HEALTH_ - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(CACHE_POOL_NO_HIT_SET\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(PG_DEGRADED\) - conf: - client: - debug ms: 1 - mon: - mon warn on pool no app: false - osd: - osd class load list: "*" - osd class default list: "*" - osd blocked scrub grace period: 3600 -tasks: -- workunit: - clients: - client.0: - - rados/test.sh - - rados/test_pool_quota.sh diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml deleted file mode 100644 index 25efcdac83d..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - ceph: - crush_tunables: optimal - conf: - mon: - mon osd initial require min compat client: luminous - osd: - osd_discard_disconnected_ops: false -tasks: -- rados: - clients: [client.0] - ops: 4000 - objects: 500 - max_attr_len: 8192 - op_weights: - read: 45 - write: 45 - delete: 10 diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/suites/crimson-rados-experimental/thrash/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/% +++ b/qa/suites/crimson-rados-experimental/thrash/% diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled new file mode 120000 index 00000000000..5393a75548a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/2-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml new file mode 120000 index 00000000000..5ff70eadf75 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml @@ -0,0 +1 @@ +.qa/overrides/3-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled new file mode 120000 index 00000000000..47afd70202d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/more-active-recovery.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..0bbc72db754 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled new file mode 100644 index 00000000000..4aed086bcc3 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..88f15f2f691 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml index 9774de6887b..79641f695ab 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml @@ -6,6 +6,15 @@ overrides: conf: osd: osd shutdown pgref assert: true + crimson alien thread cpu cores: 6-7 + osd.0: + crimson seastar cpu cores: 0-2 + osd.1: + crimson seastar cpu cores: 3-5 + osd.2: + crimson seastar cpu cores: 0-2 + osd.3: + crimson seastar cpu cores: 3-5 global: ms cluster mode: crc ms service mode: crc diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled new file mode 100644 index 00000000000..e559d9126e8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled @@ -0,0 +1,4 @@ +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro new file mode 120000 index 00000000000..a5b729b9efa --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro @@ -0,0 +1 @@ +.qa/distros/crimson-supported-all-distro/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml index 2bf67af1b18..2bf67af1b18 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml new file mode 100644 index 00000000000..ecad09cfe3a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml @@ -0,0 +1,11 @@ +overrides: + install: + ceph: + flavor: crimson +tasks: +- install: +- ceph: + conf: + osd: + debug monc: 20 + flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled new file mode 100644 index 00000000000..0c2062240ee --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled @@ -0,0 +1,16 @@ +# no need to verify os + flavor + sha1 +verify_ceph_hash: false +tasks: +- cephadm: + conf: + mgr: + debug ms: 1 + debug mgr: 20 + debug osd: 10 +- cephadm.shell: + mon.a: + - ceph orch status + - ceph orch ps + - ceph orch ls + - ceph orch host ls + - ceph orch device ls diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml new file mode 100644 index 00000000000..aa44b6101ff --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml @@ -0,0 +1,34 @@ +overrides: + ceph: + log-ignorelist: + - but it is still running + - objects unfound and apparently lost + conf: + osd: + osd debug reject backfill probability: .3 + osd scrub min interval: 60 + osd scrub max interval: 120 + osd max backfills: 3 + osd snap trim sleep: 2 + osd delete sleep: 1 + mon: + mon min osdmap epochs: 50 + paxos service trim min: 10 + # prune full osdmaps regularly + mon osdmap full prune min: 15 + mon osdmap full prune interval: 2 + mon osdmap full prune txsize: 2 +tasks: +- thrashosds: + timeout: 2400 + dump_ops_enable: false + sighup_delay: 0 + min_in: 3 + noscrub_toggle_delay: 0 + chance_thrash_pg_upmap: 0 + reweight_osd: 0 + thrash_primary_affinity: false + ceph_objectstore_tool: false + chance_inject_pause_short: 0 + chance_thrash_cluster_full: 0 + chance_reset_purged_snaps_last: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml new file mode 120000 index 00000000000..9124eb1aa29 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +.qa/tasks/thrashosds-health.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml new file mode 100644 index 00000000000..8c9764ade84 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + client.0: + admin socket: /var/run/ceph/ceph-$name.asok +tasks: +- radosbench: + clients: [client.0] + time: 150 +- admin_socket: + client.0: + objecter_requests: + test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}" diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml new file mode 100644 index 00000000000..d35e8421ab4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml @@ -0,0 +1,20 @@ +overrides: + conf: + osd: + osd deep scrub update digest min age: 0 +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + pool_snaps: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml new file mode 100644 index 00000000000..902c4b56a1e --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml @@ -0,0 +1,49 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml new file mode 100644 index 00000000000..071f55e3928 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml new file mode 100644 index 00000000000..afe04229898 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + 
crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + balance_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml new file mode 100644 index 00000000000..445b582ea42 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + localize_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml new file mode 100644 index 00000000000..e7e8070fd76 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml @@ -0,0 +1,23 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml new file mode 100644 index 00000000000..1161c3cc253 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + balance_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml new file mode 100644 index 00000000000..80af0def0e4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + localize_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml new file mode 100644 index 00000000000..0694ffcd0d6 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml @@ -0,0 +1,14 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git 
a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml new file mode 100644 index 00000000000..606dcae6922 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml @@ -0,0 +1,8 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_fadvise_dontneed: true + op_weights: + write: 100 diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore deleted file mode 120000 index dbccf5ad928..00000000000 --- a/qa/suites/crimson-rados/singleton/objectstore +++ /dev/null @@ -1 +0,0 @@ -../thrash/objectstore
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml new file mode 120000 index 00000000000..481e393be4a --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml new file mode 120000 index 00000000000..abd86d7d986 --- /dev/null +++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1 @@ +.qa/overrides/short_pg_log.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml index aa966bff214..2dd668c9f88 100644 --- a/qa/suites/fs/nfs/tasks/nfs.yaml +++ b/qa/suites/fs/nfs/tasks/nfs.yaml @@ -1,3 +1,10 @@ +overrides: + install: + extra_system_packages: + rpm: + - fio + deb: + - fio tasks: - cephfs_test_runner: modules: diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml index 602d3416263..aa327b0cdf5 100644 --- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml +++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml @@ -5,6 +5,7 @@ overrides: - "mds.dir_split" tasks: - workunit: + timeout: 5h clients: all: - kernel_untar_build.sh diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml index 372bf2561fa..8b3c4c11ac6 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml @@ -15,6 +15,7 @@ overrides: # causing tests to fail due to health warns, even if # the tests themselves are successful. - \(OSDMAP_FLAGS\) + - \(PG_DEGRADED\) tasks: - workunit: clients: diff --git a/qa/suites/rados/verify/clusters/fixed-4.yaml b/qa/suites/rados/verify/clusters/fixed-4.yaml new file mode 120000 index 00000000000..aa88300715a --- /dev/null +++ b/qa/suites/rados/verify/clusters/fixed-4.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-4.yaml
\ No newline at end of file diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index e2dc29b5f7e..17cf141b0cd 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -27,6 +27,7 @@ overrides: - \(SLOW_OPS\) - slow request - OSD bench result + - OSD_DOWN valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/qa/suites/rgw/lua/tasks/0-install.yaml b/qa/suites/rgw/lua/tasks/0-install.yaml index fa6e279145c..d85ebcc5998 100644 --- a/qa/suites/rgw/lua/tasks/0-install.yaml +++ b/qa/suites/rgw/lua/tasks/0-install.yaml @@ -3,7 +3,7 @@ tasks: - ceph: - openssl_keys: - rgw: [client.0] -- tox: [client.0] +- tox: [client.0] overrides: ceph: @@ -11,3 +11,11 @@ overrides: global: osd_min_pg_log_entries: 10 osd_max_pg_log_entries: 10 + install: + ceph: + extra_system_packages: + rpm: + - luarocks + deb: + - liblua5.3-dev + - luarocks diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml new file mode 100644 index 00000000000..5c83d5c0d23 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml @@ -0,0 +1,20 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: + client.0: + +overrides: + install: + ceph: + extra_system_packages: + rpm: + - java + deb: + - default-jre + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros new file mode 120000 index 00000000000..46280a42a96 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros @@ -0,0 +1 @@ +../../.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml new file mode 100644 index 00000000000..01d6fc637de --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml @@ -0,0 +1,8 @@ +tasks: +- kafka-failover: + client.0: + kafka_version: 3.8.1 +- notification-tests: + client.0: + extra_attr: ["kafka_failover"] + rgw_server: client.0 diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml index 40fbcefe728..62fb6427f72 100644 --- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml @@ -32,13 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml index e27c7c0f092..f7167975aa9 100644 --- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml @@ -1,11 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml index 005514292ce..5641471629e 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml @@ -1,17 +1,25 @@ overrides: ceph: log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS - - PG_AVAILABILITY + - OSD_UPGRADE_FINISHED tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 146bd57960d..62fb6427f72 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -32,4 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - PG_DEGRADED + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index ce4e0cc228b..b5160c2dd00 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ 
b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -1,12 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY - - PG_DEGRADED + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml +++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml index 992f9e1bc36..59ccfe2cd02 100644 --- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml @@ -1,11 +1,25 @@ overrides: ceph: log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 9b04e3dc675..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1206,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if 
first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index dab61c2c700..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -475,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py index 346f139874b..468378fce3d 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -153,6 +153,8 @@ class TestExportPin(CephFSTestCase): # vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap # to be written out every event. Yuck! self.config_set('mds', 'mds_debug_subtrees', False) + # make sure ESubtreeMap is written frequently enough: + self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4') self.config_rm('mds', 'mds bal split size') # don't split /top self.mount_a.run_shell_payload("rm -rf 1") diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = 
float(m.group(1))
+                        time_duration = int(m.group(3))
+                        pending_duration = int(m.group(4))
+                        log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+                    if completion_percentage >= 50:
+                        return (completion_percentage, time_duration, pending_duration)
+        _, _, pending_duration = get_estimated_time()
+        # wait for 25% more time to avoid false negative failures
+        self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
 class TestFailover(CephFSTestCase):
     CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 2
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
index faa35be6926..0a1c07dce04 100644
--- a/qa/tasks/cephfs/test_nfs.py
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -369,6 +369,45 @@ class TestNFS(MgrTestCase):
         except CommandFailedError as e:
             self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}")
 
+    def _mnt_nfs(self, pseudo_path, port, ip):
+        '''
+        Mount created export
+        :param pseudo_path: It is the pseudo root name
+        :param port: Port of deployed nfs cluster
+        :param ip: IP of deployed nfs cluster
+        '''
+        tries = 3
+        while True:
+            try:
+                self.ctx.cluster.run(
+                    args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}',
+                          f'{ip}:{pseudo_path}', '/mnt'])
+                break
+            except CommandFailedError:
+                if tries:
+                    tries -= 1
+                    time.sleep(2)
+                    continue
+                raise
+
+        self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
+
+    def _test_fio(self, pseudo_path, port, ip):
+        '''
+        Run fio with the libaio engine against the export mounted at /mnt/fio
+        :param pseudo_path: pseudo root name of the export to mount
+        '''
+        try:
+            self._mnt_nfs(pseudo_path, port, ip)
+            self.ctx.cluster.run(args=['mkdir', '/mnt/fio'])
+            fio_cmd = ['sudo', 'fio', '--ioengine=libaio', '--directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', '--time_based', '--runtime=300', '--percentage_random=0', '--rw=randrw', '--rwmixread=50']
+            self.ctx.cluster.run(args=fio_cmd)
+        except CommandFailedError as e:
+            self.fail(f"expected fio to be successful but failed with {e.exitstatus}")
+        finally:
+            self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio'])
+            self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+
     def _write_to_read_only_export(self, pseudo_path, port, ip):
         '''
         Check if write to read only export fails
@@ -627,6 +666,18 @@ class TestNFS(MgrTestCase):
         self._test_data_read_write(self.pseudo_path, port, ip)
         self._test_delete_cluster()
 
+    def test_async_io_fio(self):
+        '''
+        Test async io using fio.
+        Expect completion without hang or crash.
+        '''
+        self._test_create_cluster()
+        self._create_export(export_id='1', create_fs=True,
+                            extra_cmd=['--pseudo-path', self.pseudo_path])
+        port, ip = self._get_port_ip_info()
+        self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+        self._test_fio(self.pseudo_path, port, ip)
+        self._test_delete_cluster()
+
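A note on the arithmetic that TestFailoverBeaconHealth (above) relies on: the MDS_ESTIMATED_REPLAY_TIME detail message appears to be a linear extrapolation from replay progress so far, which is what makes `pending_duration * 1.25` a reasonable timeout bound. A minimal sketch of that estimate, assuming linear extrapolation (the helper below is illustrative only, not part of the QA suite):

def estimated_remaining_secs(completion_pct: float, elapsed_secs: float) -> float:
    """If X% of the journal replayed in T seconds, linear extrapolation
    puts the remaining (100 - X)% at roughly T * (100 - X) / X seconds."""
    if completion_pct <= 0:
        raise ValueError("no progress yet; estimate undefined")
    return elapsed_secs * (100.0 - completion_pct) / completion_pct

# The sample warning quoted in the test (50.0446% complete, 582s elapsed,
# 581s remaining) is consistent with this: 582 * 49.9554 / 50.0446 ~= 581.
assert round(estimated_remaining_secs(50.0446, 582)) == 581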
     def test_cluster_info(self):
         '''
         Test cluster info outputs correct ip and hostname
diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py
new file mode 100644
index 00000000000..3ca60ab84fc
--- /dev/null
+++ b/qa/tasks/kafka_failover.py
@@ -0,0 +1,244 @@
+"""
+Deploy and configure Kafka for Teuthology
+"""
+import contextlib
+import logging
+import time
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def get_kafka_version(config):
+    for client, client_config in config.items():
+        if 'kafka_version' in client_config:
+            kafka_version = client_config.get('kafka_version')
+            return kafka_version
+
+kafka_prefix = 'kafka_2.13-'
+
+def get_kafka_dir(ctx, config):
+    kafka_version = get_kafka_version(config)
+    current_version = kafka_prefix + kafka_version
+    return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx), ver=current_version)
+
+
+@contextlib.contextmanager
+def install_kafka(ctx, config):
+    """
+    Download and unpack the Kafka tarball, and prepare the config for a second broker.
+    """
+    assert isinstance(config, dict)
+    log.info('Installing Kafka...')
+
+    # programmatically find a nearby mirror so as not to hammer archive.apache.org
+    apache_mirror_cmd = "curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+        "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+    log.info("determining apache mirror by running: " + apache_mirror_cmd)
+    apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+    log.info("chosen apache mirror is " + apache_mirror_url_front)
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        test_dir = teuthology.get_testdir(ctx)
+        current_version = get_kafka_version(config)
+
+        kafka_file = kafka_prefix + current_version + '.tgz'
+
+        link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+            current_version + '/' + kafka_file
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file],
+        )
+
+        kafka_dir = get_kafka_dir(ctx, config)
+        # create config for second broker
+        second_broker_config_name = "server2.properties"
+        second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+        second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/")
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'),
+                  'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir)
+                  ],
+        )
+
+        # edit config
+        ctx.cluster.only(client).run(
+            args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name)
+                  ]
+        )
+
+    try:
+        yield
+    finally:
+        log.info('Removing packaged dependencies of Kafka...')
+        test_dir = get_kafka_dir(ctx, config)
+        current_version = get_kafka_version(config)
+        for (client, _) in config.items():
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)],
+            )
+
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', test_dir],
+            )
+
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx), doc=kafka_file)],
+            )
+
+
+@contextlib.contextmanager
+def run_kafka(ctx, config):
+    """
+    This includes two parts:
+    1. Starting the Zookeeper service
+    2. Starting two Kafka brokers
+    """
+    assert isinstance(config, dict)
+    log.info('Bringing up Zookeeper and Kafka services...')
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        kafka_dir = get_kafka_dir(ctx, config)
+
+        second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+        second_broker_java_log_dir = "{}/java_logs".format(second_broker_data)
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  './zookeeper-server-start.sh',
+                  '{tdir}/config/zookeeper.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  './kafka-server-start.sh',
+                  '{tdir}/config/server.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)),
+                  './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+    try:
+        yield
+    finally:
+        log.info('Stopping Zookeeper and Kafka services...')
+
+        for (client, _) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+
+            ctx.cluster.only(client).run(
+                args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                      './kafka-server-stop.sh',
+                      '{tdir}/config/server.properties'.format(tdir=get_kafka_dir(ctx, config)),
+                      ],
+            )
+
+            time.sleep(5)
+
+            ctx.cluster.only(client).run(
+                args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                      './zookeeper-server-stop.sh',
+                      '{tdir}/config/zookeeper.properties'.format(tdir=get_kafka_dir(ctx, config)),
+                      ],
+            )
+
+            time.sleep(5)
+
+            ctx.cluster.only(client).run(args=['killall', '-9', 'java'])
+
+
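run_kafka backgrounds Zookeeper and both brokers and returns immediately, so a failover run can race a not-yet-listening second broker. A small readiness probe along these lines could precede the notification tests; this is a sketch, not part of the task (the ports follow the configs above: 9092 for broker.id=0, 19092 for broker.id=1):

import socket

def brokers_listening(host='localhost', ports=(9092, 19092), timeout=5):
    """Return True only if every broker port accepts a TCP connection."""
    for port in ports:
        try:
            with socket.create_connection((host, port), timeout=timeout):
                pass  # connected fine; socket closes on exiting the with-block
        except OSError:
            return False
    return True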
+@contextlib.contextmanager
+def run_admin_cmds(ctx, config):
+    """
+    Run Kafka admin commands to check the working of
+    producer and consumer, and the creation of a topic.
+    """
+    assert isinstance(config, dict)
+    log.info('Checking kafka server through producer/consumer commands...')
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                './kafka-topics.sh', '--create', '--topic', 'quickstart-events',
+                '--bootstrap-server', 'localhost:9092'
+            ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                'echo', "First", run.Raw('|'),
+                './kafka-console-producer.sh', '--topic', 'quickstart-events',
+                '--bootstrap-server', 'localhost:9092'
+            ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                './kafka-console-consumer.sh', '--topic', 'quickstart-events',
+                '--from-beginning',
+                '--bootstrap-server', 'localhost:9092',
+                run.Raw('&'), 'exit'
+            ],
+        )
+
+    try:
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    The following is an example of how to run the kafka-failover task::
+        tasks:
+        - kafka-failover:
+            client.0:
+              kafka_version: 2.6.0
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task kafka-failover only supports a list or dictionary for configuration"
+
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    log.debug('Kafka config is %s', config)
+
+    with contextutil.nested(
+        lambda: install_kafka(ctx=ctx, config=config),
+        lambda: run_kafka(ctx=ctx, config=config),
+        lambda: run_admin_cmds(ctx=ctx, config=config),
+    ):
+        yield
+
diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py
index b4697a6f797..f1eae3c89c4 100644
--- a/qa/tasks/notification_tests.py
+++ b/qa/tasks/notification_tests.py
@@ -220,7 +220,7 @@ def run_tests(ctx, config):
     for client, client_config in config.items():
         (remote,) = ctx.cluster.only(client).remotes.keys()
 
-        attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
+        attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
 
         if 'extra_attr' in client_config:
             attr = client_config.get('extra_attr')
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index c58a7267b4e..691a6f7dd86 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -315,7 +315,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
     def _get_devices(self, remote):
         GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \
-            "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'"
+            "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'"
         devices = remote.sh(GET_DEVICE_CMD).split()
         return devices
diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py
index e83a54efc2b..f93ca017fa2 100644
--- a/qa/tasks/rgw_multisite.py
+++ b/qa/tasks/rgw_multisite.py
@@ -361,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config):
     if endpoints:
         # replace client names with their gateway endpoints
config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) + if not config.get('api_name'): # otherwise it will be set to an empty string + config['api_name'] = config['name'] zonegroup = multisite.ZoneGroup(config['name'], period) # `zonegroup set` needs --default on command line, and 'is_master' in json args = is_default_arg(config) diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index 794353348b4..9e7a1f5134e 100755 --- a/qa/workunits/nvmeof/basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -39,7 +39,7 @@ connect_all() { sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600 sleep 5 expected_devices_count=$1 - actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l) + actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l) if [ "$actual_devices" -ne "$expected_devices_count" ]; then sudo nvme list --output-format=json return 1 @@ -74,7 +74,7 @@ test_run connect test_run list_subsys 1 test_run disconnect_all test_run list_subsys 0 -devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT)) +devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT )) test_run connect_all $devices_count gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 03fb58693bd..f7f783afc67 100755 --- a/qa/workunits/nvmeof/fio_test.sh +++ b/qa/workunits/nvmeof/fio_test.sh @@ -34,7 +34,7 @@ done fio_file=$(mktemp -t nvmeof-fio-XXXX) all_drives_list=$(sudo nvme list --output-format=json | - jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath') + jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace') # When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), # then fio runs on namespaces only in the defined range (which is 1 to 3 here). 
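The jq rewrites in nvmeof.py, basic_tests.sh, and fio_test.sh above all adapt to the same nvme-cli change: newer releases nest namespaces under .Devices[].Subsystems[] instead of exposing flat .DevicePath entries. For reference, an equivalent traversal in Python (the document shape is inferred from the jq path, not from nvme-cli documentation):

import json

def ceph_namespaces(nvme_list_output: str, model='Ceph bdev Controller'):
    """Mirror the jq filter: keep subsystems whose controllers all report
    the given model, then list namespaces sorted by NSID."""
    doc = json.loads(nvme_list_output)
    names = []
    for device in doc.get('Devices', []):
        for subsys in device.get('Subsystems', []):
            controllers = subsys.get('Controllers', [])
            # like jq's all(), an empty controller list passes the filter
            if all(c.get('ModelNumber') == model for c in controllers):
                namespaces = sorted(subsys.get('Namespaces', []),
                                    key=lambda ns: ns['NSID'])
                names.extend(ns['NameSpace'] for ns in namespaces)
    return names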
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 2aa27d3d655..0ceb9ff54cf 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -914,6 +914,11 @@ test_namespace() { rbd group create rbd/test1/group1 rbd group image add rbd/test1/group1 rbd/test1/image1 + rbd group image add --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image2 + rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image1 + rbd group image rm rbd/test1/group1 rbd/test1/image2 rbd group rm rbd/test1/group1 rbd trash move rbd/test1/image1 |
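One pattern worth noting across the ceph.py, cephadm.py, and rook.py hunks above: each now captures the log-scan's stdout as bytes, decodes it, and falls back to stderr, so a broken grep pipeline surfaces its error message instead of being mistaken for "no ERR/WRN/SEC lines found". Condensed into a single sketch (a hypothetical helper; the hunks inline this logic rather than share it):

from io import BytesIO, StringIO

def first_match_or_error(remote, args):
    """Run the grep pipeline on `remote`; return the first matching log line,
    else the pipeline's stderr if any, else None for a clean no-match."""
    r = remote.run(stdout=BytesIO(), args=args, stderr=StringIO())
    stdout = r.stdout.getvalue().decode()
    if stdout:
        return stdout
    stderr = r.stderr.getvalue()
    if stderr:
        return stderr
    return None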