Diffstat (limited to 'qa')
104 files changed, 1231 insertions, 157 deletions
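The standalone scripts touched by this diff can be exercised locally before review — a minimal sketch, assuming a compiled Ceph build tree and the qa/run-standalone.sh helper shipped in the repository (the script names are taken from the diff below; exact paths may vary per checkout):

cd build
# covers the new TEST_abort_periodic_for_operator case added to the scrub suite
../qa/run-standalone.sh osd-scrub-test.sh
# re-checks bluefs-bdev-expand with the enlarged 11GB device
../qa/run-standalone.sh osd-bluefs-volume-ops.sh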
diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml new file mode 100644 index 00000000000..d5ba487b9bf --- /dev/null +++ b/qa/config/crimson_bluestore.yaml @@ -0,0 +1,25 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: bluestore + debug alienstore: 20 + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + bluestore compression algorithm: snappy + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bluestore rocksdb cf: false + log to stderr: true + err to stderr: true + log flush on exit: true + log to file: false diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml index 8cf98f38001..a10c59d77cc 100644 --- a/qa/config/crimson_qa_overrides.yaml +++ b/qa/config/crimson_qa_overrides.yaml @@ -9,7 +9,6 @@ overrides: osd pool default crimson: true osd: crimson osd obc lru size: 10 - debug alienstore: 20 debug ms: 20 flavor: crimson workunit: diff --git a/qa/config/seastore.yaml b/qa/config/crimson_seastore.yaml index 6158563eedf..d1919456ab1 100644 --- a/qa/config/seastore.yaml +++ b/qa/config/crimson_seastore.yaml @@ -1,13 +1,13 @@ overrides: ceph: - fs: xfs conf: osd: - osd objectstore: seastore + # crimson's osd objectstore option + crimson osd objectstore: seastore debug seastore: 20 debug seastore onode: 20 debug seastore odata: 20 - debug seastore ompap: 20 + debug seastore omap: 20 debug seastore tm: 20 debug seastore t: 20 debug seastore cleaner: 20 diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index c979e5b105f..c558a1382ef 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -52,7 +52,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority 08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority 16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority -24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority ## ********** windows tests on main branch - weekly # 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e $CEPH_QA_EMAIL @@ -122,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820 24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820 32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820 -40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820 ### upgrade runs for reef release ###### on smithi diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh index aedfbc9b5cb..f7424de8ce1 100755 --- a/qa/standalone/osd/osd-bluefs-volume-ops.sh +++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh @@ -72,7 +72,7 @@ function TEST_bluestore() { truncate $dir/0/block -s 4294967296 # 4GB ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1 - truncate $dir/1/block -s 4311744512 # 4GB + 16MB + truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 
10737418240 ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1 truncate $dir/2/block -s 4295099392 # 4GB + 129KB ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1 diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 843e9b9901b..7b77a60f35b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -163,7 +163,7 @@ function wait_for_scrub_mod() { fi sleep 1 # are we still the primary? - local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` if [ $orig_primary != $current_primary ]; then echo $orig_primary no longer primary for $pgid return 0 @@ -194,7 +194,7 @@ function pg_scrub_mod() { local last_scrub=$(get_last_scrub_stamp $pgid) # locate the primary - local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` local recovery=false ceph pg scrub $pgid #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index 8015e023bdd..385479258f2 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() { --osd_op_queue=wpq \ --osd_stats_update_period_not_scrubbing=1 \ --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=1 \ + --osd_scrub_retry_pg_state=2 \ + --osd_scrub_retry_delay=2 \ --osd_scrub_sleep=0.2" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() { declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" ) wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1 - sleep 2 - # # step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub # scheduled for the future' value # - ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 - ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 ceph osd set noscrub || return 1 sleep 2 + ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 + ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 + sleep 8 saved_last_stamp=${sched_data['query_last_stamp']} ceph tell $pgid schedule-scrub @@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() { teardown $dir || return 1 } +function wait_initial_scrubs() { + local -n pg_to_prim_dict=$1 + local extr_dbg=1 # note: 3 and above leave some temp files around + + # set a long schedule for the periodic scrubs. Wait for the + # initial 'no previous scrub is known' scrubs to finish for all PGs. 
+ ceph tell osd.* config set osd_scrub_min_interval 7200 + ceph tell osd.* config set osd_deep_scrub_interval 14400 + ceph tell osd.* config set osd_max_scrubs 32 + ceph tell osd.* config set osd_scrub_sleep 0 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 10 + ceph tell osd.* config set osd_scrub_chunk_max 10 + + for pg in "${!pg_to_prim_dict[@]}"; do + (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg" + ceph tell $pg scrub || return 1 + done + + sleep 1 + (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + + tout=20 + while [ $tout -gt 0 ] ; do + sleep 0.5 + (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + not_done=$(ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l ) + # note that we should ignore a header line + if [ "$not_done" -le 1 ]; then + break + fi + not_done=$(( (not_done - 2) / 4 )) + echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)" + tout=$((tout - 1)) + done + (( tout == 0 )) && return 1 + return 0 +} + + +# Whenever a PG is being scrubbed at a regular, periodic urgency, and is queued +# for its replicas: +# if the operator is requesting a scrub of the same PG, the operator's request +# should trigger an abort of the ongoing scrub. +# +# The test process: +# - a periodic scrub of a PG is initiated. That scrub is set to be a very slow one. +# - a second PG, which shares some of its replicas, is instructed to be scrubbed. That one +# should be stuck in replica reservation. We will verify that. +# - now - the operator requests that second PG be scrubbed. The original (pending) +# scrub should be aborted.
We would check for: +# - the new operator's scrub to be scheduled +# - the replicas' reservers to be released +function TEST_abort_periodic_for_operator() { + local dir=$1 + local -A cluster_conf=( + ['osds_num']="5" + ['pgs_in_pool']="16" + ['pool_name']="test" + ) + local extr_dbg=1 # note: 3 and above leave some temp files around + + standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1 + local poolid=${cluster_conf['pool_id']} + local poolname=${cluster_conf['pool_name']} + echo "Pool: $poolname : $poolid" + + # turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # fill the pool with some data + TESTDATA="testdata.$$" + dd if=/dev/urandom of=$TESTDATA bs=320 count=1 + for i in $( seq 1 256 ) + do + rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null + done + rm -f $TESTDATA + if [[ -n "$saved_echo_flag" ]]; then set -x; fi + + # create the dictionary of the PGs in the pool + declare -A pg_pr + declare -A pg_ac + declare -A pg_po + build_pg_dicts "$dir" pg_pr pg_ac pg_po "-" + (( extr_dbg >= 2 )) && echo "PGs table:" + for pg in "${!pg_pr[@]}"; do + (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}" + done + + wait_initial_scrubs pg_pr || return 1 + + # limit all OSDs to one scrub at a time + ceph tell osd.* config set osd_max_scrubs 1 + ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1 + + # configure for slow scrubs + ceph tell osd.* config set osd_scrub_sleep 3 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 2 + ceph tell osd.* config set osd_scrub_chunk_max 2 + (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty + + # the first PG to work with: + local pg1="1.0" + # and another one, that shares its primary, and at least one more active set member + local pg2="" + for pg in "${!pg_pr[@]}"; do + if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then + local -i common=0 + count_common_active $pg $pg1 pg_ac common + if [[ $common -gt 1 ]]; then + pg2=$pg + break + fi + fi + done + if [[ -z "$pg2" ]]; then + # \todo handle the case when no such PG is found + echo "No PG found with the same primary as $pg1" + return 1 + fi + + # the common primary is allowed two concurrent scrubs + ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2 + echo "The two PGs to manipulate are $pg1 and $pg2" + + set_query_debug "$pg1" + # wait till the information published by pg1 is updated to show it as + # not being scrubbed + local is_act + for i in $( seq 1 3 ) + do + is_act=$(ceph pg "$pg1" query | jq '.scrubber.active') + if [[ "$is_act" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to finish scrubbing" + sleep 0.7 + done + ceph pg dump pgs + if [[ "$is_act" != "false" ]]; then + ceph pg "$pg1" query + echo "PG $pg1 appears to be still scrubbing" + return 1 + fi + sleep 0.5 + + echo "Initiating a periodic scrub of $pg1" + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + ceph tell $pg1 schedule-deep-scrub || return 1 + sleep 1 + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + for i in $( seq 1 14 ) + do + sleep 0.5 + stt=$(ceph pg "$pg1" query | jq '.scrubber') + is_active=$(echo $stt | jq '.active') + is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas') + if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to start scrubbing: $stt" + done
+ if [[ "$is_active" != "true" || "$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - some of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the second scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index 49b8346b8d2..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,9 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ --osd_scrub_retry_after_noscrub=5 \ --osd_scrub_retry_pg_state=5 \ --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a set of
dictionaries: +# - pg_primary_dict: a dictionary of pgid -> acting_primary +# - pg_acting_dict: a dictionary of pgid -> acting set +# - pg_pool_dict: a dictionary of pgid -> pool +# If the input file is '-', the function will fetch the dump directly from the ceph cluster. +function build_pg_dicts { + local dir=$1 + local -n pg_primary_dict=$2 + local -n pg_acting_dict=$3 + local -n pg_pool_dict=$4 + local infile=$5 + + local extr_dbg=0 # note: 3 and above leave some temp files around + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # if the infile name is '-', fetch the dump directly from the ceph cluster + if [[ $infile == "-" ]]; then + local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty" + local -r ceph_cmd_out=$(eval $ceph_cmd) + local -r ceph_cmd_rc=$? + if [[ $ceph_cmd_rc -ne 0 ]]; then + echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc" + fi + (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2 + l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' ` + else + l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile ` + fi + (( extr_dbg >= 2 )) && echo "L0: $l0" + + mapfile -t l1 < <(echo "$l0" | jq -c '.[]') + (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}" + + for item in "${l1[@]}"; do + pgid=$(echo "$item" | jq -r '.pgid') + acting=$(echo "$item" | jq -r '.acting | @sh') + pg_acting_dict["$pgid"]=$acting + acting_primary=$(echo "$item" | jq -r '.acting_primary') + pg_primary_dict["$pgid"]=$acting_primary + pool=$(echo "$item" | jq -r '.pool') + pg_pool_dict["$pgid"]=$pool + done + + if [[ -n "$saved_echo_flag" ]]; then set -x; fi +} + + +# a function that counts the number of common active-set elements between two PGs +# 1 - the first PG +# 2 - the second PG +# 3 - the dictionary of active sets +function count_common_active { + local pg1=$1 + local pg2=$2 + local -n pg_acting_dict=$3 + local -n res=$4 + + local -a a1=(${pg_acting_dict[$pg1]}) + local -a a2=(${pg_acting_dict[$pg2]}) + + local -i cnt=0 + for i in "${a1[@]}"; do + for j in "${a2[@]}"; do + if [[ $i -eq $j ]]; then + cnt=$((cnt+1)) + fi + done + done + + res=$cnt +} + + +# given a PG, find another one with a disjoint active set +# - but allow a possible common Primary +# 1 - the PG +# 2 - the dictionary of active sets +# 3 - [out] - the PG with a disjoint active set +function find_disjoint_but_primary { + local pg=$1 + local -n ac_dict=$2 + local -n p_dict=$3 + local -n res=$4 + + for cand in "${!ac_dict[@]}"; do + if [[ "$cand" != "$pg" ]]; then + local -i common=0 + count_common_active "$pg" "$cand" ac_dict common + if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" )]]; then + res=$cand + return + fi + fi + done +} + + + # A debug flag is set for the PG specified, causing the 'pg query' command to display # an additional 'scrub sessions counter' field. # diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa index fea2489fdf6..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/.qa +++ b/qa/suites/crimson-rados-experimental/.qa @@ -1 +1 @@ -../.qa
\ No newline at end of file +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml deleted file mode 120000 index bd9854e7029..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/supported/centos_latest.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml deleted file mode 100644 index d8e5898b99f..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - osd pool default size: 2 - osd crush chooseleaf type: 0 - osd pool default pg num: 128 - osd pool default pgp num: 128 - ceph: - conf: - osd: - osd shutdown pgref assert: true -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml deleted file mode 100644 index c22f08eecf8..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - install: - ceph: - flavor: crimson -tasks: -- install: -- ceph: - conf: - osd: - debug monc: 20 - mon: - mon min osdmap epochs: 50 - paxos service trim min: 10 - # prune full osdmaps regularly - mon osdmap full prune min: 15 - mon osdmap full prune interval: 2 - mon osdmap full prune txsize: 2 - flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml deleted file mode 120000 index 6a70c381709..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/config/seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml deleted file mode 100644 index ad8c921425b..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml +++ /dev/null @@ -1,28 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - reached quota - - but it is still running - - overall HEALTH_ - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(CACHE_POOL_NO_HIT_SET\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(PG_DEGRADED\) - conf: - client: - debug ms: 1 - mon: - mon warn on pool no app: false - osd: - osd class load list: "*" - osd class default list: "*" - osd blocked scrub grace period: 3600 -tasks: -- workunit: - clients: - client.0: - - rados/test.sh - - rados/test_pool_quota.sh diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml deleted file mode 100644 index 25efcdac83d..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - ceph: - crush_tunables: optimal - conf: - mon: - mon osd initial require min compat client: luminous - osd: - osd_discard_disconnected_ops: false -tasks: -- rados: - clients: [client.0] - ops: 4000 - objects: 500 - max_attr_len: 8192 - op_weights: - read: 45 - write: 45 - delete: 10 diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/suites/crimson-rados-experimental/thrash/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/% +++ b/qa/suites/crimson-rados-experimental/thrash/% diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled new file mode 120000 index 00000000000..5393a75548a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/2-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml new file mode 120000 index 00000000000..5ff70eadf75 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml @@ -0,0 +1 @@ +.qa/overrides/3-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled new file mode 120000 index 00000000000..47afd70202d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/more-active-recovery.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..0bbc72db754 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled new file mode 100644 index 00000000000..4aed086bcc3 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..88f15f2f691 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml index 9774de6887b..79641f695ab 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml @@ -6,6 +6,15 @@ overrides: conf: osd: osd shutdown pgref assert: true + crimson alien thread cpu cores: 6-7 + osd.0: + crimson seastar cpu cores: 0-2 + osd.1: + crimson seastar cpu cores: 3-5 + osd.2: + crimson seastar cpu cores: 0-2 + osd.3: + crimson seastar cpu cores: 3-5 global: ms cluster mode: crc ms service mode: crc diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled new file mode 100644 index 00000000000..e559d9126e8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled @@ -0,0 +1,4 @@ +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro new file mode 120000 index 00000000000..a5b729b9efa --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro @@ -0,0 +1 @@ +.qa/distros/crimson-supported-all-distro/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml index 2bf67af1b18..2bf67af1b18 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml new file mode 100644 index 00000000000..ecad09cfe3a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml @@ -0,0 +1,11 @@ +overrides: + install: + ceph: + flavor: crimson +tasks: +- install: +- ceph: + conf: + osd: + debug monc: 20 + flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled new file mode 100644 index 00000000000..0c2062240ee --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled @@ -0,0 +1,16 @@ +# no need to verify os + flavor + sha1 +verify_ceph_hash: false +tasks: +- cephadm: + conf: + mgr: + debug ms: 1 + debug mgr: 20 + debug osd: 10 +- cephadm.shell: + mon.a: + - ceph orch status + - ceph orch ps + - ceph orch ls + - ceph orch host ls + - ceph orch device ls diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml new file mode 100644 index 00000000000..aa44b6101ff --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml @@ -0,0 +1,34 @@ +overrides: + ceph: + log-ignorelist: + - but it is still running + - objects unfound and apparently lost + conf: + osd: + osd debug reject backfill probability: .3 + osd scrub min interval: 60 + osd scrub max interval: 120 + osd max backfills: 3 + osd snap trim sleep: 2 + osd delete sleep: 1 + mon: + mon min osdmap epochs: 50 + paxos service trim min: 10 + # prune full osdmaps regularly + mon osdmap full prune min: 15 + mon osdmap full prune interval: 2 + mon osdmap full prune txsize: 2 +tasks: +- thrashosds: + timeout: 2400 + dump_ops_enable: false + sighup_delay: 0 + min_in: 3 + noscrub_toggle_delay: 0 + chance_thrash_pg_upmap: 0 + reweight_osd: 0 + thrash_primary_affinity: false + ceph_objectstore_tool: false + chance_inject_pause_short: 0 + chance_thrash_cluster_full: 0 + chance_reset_purged_snaps_last: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml new file mode 120000 index 00000000000..9124eb1aa29 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +.qa/tasks/thrashosds-health.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml new file mode 100644 index 00000000000..8c9764ade84 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + client.0: + admin socket: /var/run/ceph/ceph-$name.asok +tasks: +- radosbench: + clients: [client.0] + time: 150 +- admin_socket: + client.0: + objecter_requests: + test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}" diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml new file mode 100644 index 00000000000..d35e8421ab4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml @@ -0,0 +1,20 @@ +overrides: + conf: + osd: + osd deep scrub update digest min age: 0 +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + pool_snaps: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml new file mode 100644 index 00000000000..902c4b56a1e --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml @@ -0,0 +1,49 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml new file mode 100644 index 00000000000..071f55e3928 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml new file mode 100644 index 00000000000..afe04229898 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + 
crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + balance_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml new file mode 100644 index 00000000000..445b582ea42 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + localize_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml new file mode 100644 index 00000000000..e7e8070fd76 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml @@ -0,0 +1,23 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml new file mode 100644 index 00000000000..1161c3cc253 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + balance_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml new file mode 100644 index 00000000000..80af0def0e4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + localize_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml new file mode 100644 index 00000000000..0694ffcd0d6 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml @@ -0,0 +1,14 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git 
a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml new file mode 100644 index 00000000000..606dcae6922 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml @@ -0,0 +1,8 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_fadvise_dontneed: true + op_weights: + write: 100 diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore deleted file mode 120000 index dbccf5ad928..00000000000 --- a/qa/suites/crimson-rados/singleton/objectstore +++ /dev/null @@ -1 +0,0 @@ -../thrash/objectstore
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml new file mode 120000 index 00000000000..481e393be4a --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml new file mode 120000 index 00000000000..abd86d7d986 --- /dev/null +++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1 @@ +.qa/overrides/short_pg_log.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml index aa966bff214..2dd668c9f88 100644 --- a/qa/suites/fs/nfs/tasks/nfs.yaml +++ b/qa/suites/fs/nfs/tasks/nfs.yaml @@ -1,3 +1,10 @@ +overrides: + install: + extra_system_packages: + rpm: + - fio + deb: + - fio tasks: - cephfs_test_runner: modules: diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml index 602d3416263..aa327b0cdf5 100644 --- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml +++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml @@ -5,6 +5,7 @@ overrides: - "mds.dir_split" tasks: - workunit: + timeout: 5h clients: all: - kernel_untar_build.sh diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml index 372bf2561fa..8b3c4c11ac6 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml @@ -15,6 +15,7 @@ overrides: # causing tests to fail due to health warns, even if # the tests themselves are successful. - \(OSDMAP_FLAGS\) + - \(PG_DEGRADED\) tasks: - workunit: clients: diff --git a/qa/suites/rados/verify/clusters/fixed-4.yaml b/qa/suites/rados/verify/clusters/fixed-4.yaml new file mode 120000 index 00000000000..aa88300715a --- /dev/null +++ b/qa/suites/rados/verify/clusters/fixed-4.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-4.yaml
\ No newline at end of file diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index e2dc29b5f7e..17cf141b0cd 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -27,6 +27,7 @@ overrides: - \(SLOW_OPS\) - slow request - OSD bench result + - OSD_DOWN valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/qa/suites/rgw/lua/tasks/0-install.yaml b/qa/suites/rgw/lua/tasks/0-install.yaml index fa6e279145c..d85ebcc5998 100644 --- a/qa/suites/rgw/lua/tasks/0-install.yaml +++ b/qa/suites/rgw/lua/tasks/0-install.yaml @@ -3,7 +3,7 @@ tasks: - ceph: - openssl_keys: - rgw: [client.0] -- tox: [client.0] +- tox: [client.0] overrides: ceph: @@ -11,3 +11,11 @@ overrides: global: osd_min_pg_log_entries: 10 osd_max_pg_log_entries: 10 + install: + ceph: + extra_system_packages: + rpm: + - luarocks + deb: + - liblua5.3-dev + - luarocks diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml new file mode 100644 index 00000000000..5c83d5c0d23 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml @@ -0,0 +1,20 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: + client.0: + +overrides: + install: + ceph: + extra_system_packages: + rpm: + - java + deb: + - default-jre + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros new file mode 120000 index 00000000000..46280a42a96 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros @@ -0,0 +1 @@ +../../.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml new file mode 100644 index 00000000000..01d6fc637de --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml @@ -0,0 +1,8 @@ +tasks: +- kafka-failover: + client.0: + kafka_version: 3.8.1 +- notification-tests: + client.0: + extra_attr: ["kafka_failover"] + rgw_server: client.0 diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml index 40fbcefe728..62fb6427f72 100644 --- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml @@ -32,13 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml index e27c7c0f092..f7167975aa9 100644 --- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml @@ -1,11 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml index 005514292ce..5641471629e 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml @@ -1,17 +1,25 @@ overrides: ceph: log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS - - PG_AVAILABILITY + - OSD_UPGRADE_FINISHED tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 146bd57960d..62fb6427f72 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -32,4 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - PG_DEGRADED + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index ce4e0cc228b..b5160c2dd00 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ 
b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -1,12 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY - - PG_DEGRADED + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml +++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml index 992f9e1bc36..59ccfe2cd02 100644 --- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml @@ -1,11 +1,25 @@ overrides: ceph: log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 9b04e3dc675..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1206,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if 
first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index dab61c2c700..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -475,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py index 346f139874b..468378fce3d 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -153,6 +153,8 @@ class TestExportPin(CephFSTestCase): # vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap # to be written out every event. Yuck! self.config_set('mds', 'mds_debug_subtrees', False) + # make sure ESubtreeMap is written frequently enough: + self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4') self.config_rm('mds', 'mds bal split size') # don't split /top self.mount_a.run_shell_payload("rm -rf 1") diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = 
float(m.group(1))
+                        time_duration = int(m.group(3))
+                        pending_duration = int(m.group(4))
+                        log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+                    if completion_percentage >= 50:
+                        return (completion_percentage, time_duration, pending_duration)
+        _, _, pending_duration = get_estimated_time()
+        # wait for 25% more time to avoid false negative failures
+        self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
 class TestFailover(CephFSTestCase):
     CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 2
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
index faa35be6926..0a1c07dce04 100644
--- a/qa/tasks/cephfs/test_nfs.py
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -369,6 +369,45 @@ class TestNFS(MgrTestCase):
         except CommandFailedError as e:
             self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}")
 
+    def _mnt_nfs(self, pseudo_path, port, ip):
+        '''
+        Mount created export
+        :param pseudo_path: It is the pseudo root name
+        :param port: Port of deployed nfs cluster
+        :param ip: IP of deployed nfs cluster
+        '''
+        tries = 3
+        while True:
+            try:
+                self.ctx.cluster.run(
+                    args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}',
+                          f'{ip}:{pseudo_path}', '/mnt'])
+                break
+            except CommandFailedError:
+                if tries:
+                    tries -= 1
+                    time.sleep(2)
+                    continue
+                raise
+
+        self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
+
+    def _test_fio(self, pseudo_path, port, ip):
+        '''
+        Run fio with the libaio engine against the export mounted at /mnt/fio
+        :param pseudo_path: pseudo root name of the export to mount
+        '''
+        try:
+            self._mnt_nfs(pseudo_path, port, ip)
+            self.ctx.cluster.run(args=['mkdir', '/mnt/fio'])
+            fio_cmd = ['sudo', 'fio', '--ioengine=libaio', '--directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', '--time_based', '--runtime=300', '--percentage_random=0', '--rw=randrw', '--rwmixread=50']
+            self.ctx.cluster.run(args=fio_cmd)
+        except CommandFailedError as e:
+            self.fail(f"expected fio to be successful but failed with {e.exitstatus}")
+        finally:
+            self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio'])
+            self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+
     def _write_to_read_only_export(self, pseudo_path, port, ip):
         '''
         Check if write to read only export fails
@@ -627,6 +666,18 @@ class TestNFS(MgrTestCase):
         self._test_data_read_write(self.pseudo_path, port, ip)
         self._test_delete_cluster()
 
+    def test_async_io_fio(self):
+        '''
+        Test async io using fio.
+        Expect completion without hang or crash.
+        '''
+        self._test_create_cluster()
+        self._create_export(export_id='1', create_fs=True,
+                            extra_cmd=['--pseudo-path', self.pseudo_path])
+        port, ip = self._get_port_ip_info()
+        self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+        self._test_fio(self.pseudo_path, port, ip)
+        self._test_delete_cluster()
+
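A note on the arithmetic that TestFailoverBeaconHealth (above) relies on: the MDS_ESTIMATED_REPLAY_TIME detail message appears to be a linear extrapolation from replay progress so far, which is what makes `pending_duration * 1.25` a reasonable timeout bound. A minimal sketch of that estimate, assuming linear extrapolation (the helper below is illustrative only, not part of the QA suite):

def estimated_remaining_secs(completion_pct: float, elapsed_secs: float) -> float:
    """If X% of the journal replayed in T seconds, linear extrapolation
    puts the remaining (100 - X)% at roughly T * (100 - X) / X seconds."""
    if completion_pct <= 0:
        raise ValueError("no progress yet; estimate undefined")
    return elapsed_secs * (100.0 - completion_pct) / completion_pct

# The sample warning quoted in the test (50.0446% complete, 582s elapsed,
# 581s remaining) is consistent with this: 582 * 49.9554 / 50.0446 ~= 581.
assert round(estimated_remaining_secs(50.0446, 582)) == 581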
     def test_cluster_info(self):
         '''
         Test cluster info outputs correct ip and hostname
diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py
new file mode 100644
index 00000000000..3ca60ab84fc
--- /dev/null
+++ b/qa/tasks/kafka_failover.py
@@ -0,0 +1,244 @@
+"""
+Deploy and configure Kafka for Teuthology
+"""
+import contextlib
+import logging
+import time
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def get_kafka_version(config):
+    for client, client_config in config.items():
+        if 'kafka_version' in client_config:
+            kafka_version = client_config.get('kafka_version')
+            return kafka_version
+
+kafka_prefix = 'kafka_2.13-'
+
+def get_kafka_dir(ctx, config):
+    kafka_version = get_kafka_version(config)
+    current_version = kafka_prefix + kafka_version
+    return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx), ver=current_version)
+
+
+@contextlib.contextmanager
+def install_kafka(ctx, config):
+    """
+    Download and unpack the Kafka tarball, and prepare the config for a second broker.
+    """
+    assert isinstance(config, dict)
+    log.info('Installing Kafka...')
+
+    # programmatically find a nearby mirror so as not to hammer archive.apache.org
+    apache_mirror_cmd = "curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+        "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+    log.info("determining apache mirror by running: " + apache_mirror_cmd)
+    apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+    log.info("chosen apache mirror is " + apache_mirror_url_front)
+
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        test_dir = teuthology.get_testdir(ctx)
+        current_version = get_kafka_version(config)
+
+        kafka_file = kafka_prefix + current_version + '.tgz'
+
+        link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+            current_version + '/' + kafka_file
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file],
+        )
+
+        kafka_dir = get_kafka_dir(ctx, config)
+        # create config for second broker
+        second_broker_config_name = "server2.properties"
+        second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+        second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/")
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'),
+                  'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir)
+                  ],
+        )
+
+        # edit config
+        ctx.cluster.only(client).run(
+            args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+                  'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name)
+                  ]
+        )
+
+    try:
+        yield
+    finally:
+        log.info('Removing packaged dependencies of Kafka...')
+        test_dir = get_kafka_dir(ctx, config)
+        current_version = get_kafka_version(config)
+        for (client, _) in config.items():
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)],
+            )
+
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', test_dir],
+            )
+
+            ctx.cluster.only(client).run(
+                args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx), doc=kafka_file)],
+            )
+
+
+@contextlib.contextmanager
+def run_kafka(ctx, config):
+    """
+    This includes two parts:
+    1. Starting the Zookeeper service
+    2. Starting two Kafka brokers
+    """
+    assert isinstance(config, dict)
+    log.info('Bringing up Zookeeper and Kafka services...')
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        kafka_dir = get_kafka_dir(ctx, config)
+
+        second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+        second_broker_java_log_dir = "{}/java_logs".format(second_broker_data)
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  './zookeeper-server-start.sh',
+                  '{tdir}/config/zookeeper.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  './kafka-server-start.sh',
+                  '{tdir}/config/server.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+                  run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)),
+                  './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir),
+                  run.Raw('&'), 'exit'
+                  ],
+        )
+
+    try:
+        yield
+    finally:
+        log.info('Stopping Zookeeper and Kafka services...')
+
+        for (client, _) in config.items():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+
+            ctx.cluster.only(client).run(
+                args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                      './kafka-server-stop.sh',
+                      '{tdir}/config/server.properties'.format(tdir=get_kafka_dir(ctx, config)),
+                      ],
+            )
+
+            time.sleep(5)
+
+            ctx.cluster.only(client).run(
+                args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                      './zookeeper-server-stop.sh',
+                      '{tdir}/config/zookeeper.properties'.format(tdir=get_kafka_dir(ctx, config)),
+                      ],
+            )
+
+            time.sleep(5)
+
+            ctx.cluster.only(client).run(args=['killall', '-9', 'java'])
+
+
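run_kafka backgrounds Zookeeper and both brokers and returns immediately, so a failover run can race a not-yet-listening second broker. A small readiness probe along these lines could precede the notification tests; this is a sketch, not part of the task (the ports follow the configs above: 9092 for broker.id=0, 19092 for broker.id=1):

import socket

def brokers_listening(host='localhost', ports=(9092, 19092), timeout=5):
    """Return True only if every broker port accepts a TCP connection."""
    for port in ports:
        try:
            with socket.create_connection((host, port), timeout=timeout):
                pass  # connected fine; socket closes on exiting the with-block
        except OSError:
            return False
    return True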
+@contextlib.contextmanager
+def run_admin_cmds(ctx, config):
+    """
+    Run Kafka admin commands to check the working of
+    producer and consumer, and the creation of a topic.
+    """
+    assert isinstance(config, dict)
+    log.info('Checking kafka server through producer/consumer commands...')
+    for (client, _) in config.items():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                './kafka-topics.sh', '--create', '--topic', 'quickstart-events',
+                '--bootstrap-server', 'localhost:9092'
+            ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                'echo', "First", run.Raw('|'),
+                './kafka-console-producer.sh', '--topic', 'quickstart-events',
+                '--bootstrap-server', 'localhost:9092'
+            ],
+        )
+
+        ctx.cluster.only(client).run(
+            args=[
+                'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+                './kafka-console-consumer.sh', '--topic', 'quickstart-events',
+                '--from-beginning',
+                '--bootstrap-server', 'localhost:9092',
+                run.Raw('&'), 'exit'
+            ],
+        )
+
+    try:
+        yield
+    finally:
+        pass
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    The following is an example of how to run the kafka-failover task::
+        tasks:
+        - kafka-failover:
+            client.0:
+              kafka_version: 2.6.0
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task kafka-failover only supports a list or dictionary for configuration"
+
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    log.debug('Kafka config is %s', config)
+
+    with contextutil.nested(
+        lambda: install_kafka(ctx=ctx, config=config),
+        lambda: run_kafka(ctx=ctx, config=config),
+        lambda: run_admin_cmds(ctx=ctx, config=config),
+    ):
+        yield
+
diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py
index b4697a6f797..f1eae3c89c4 100644
--- a/qa/tasks/notification_tests.py
+++ b/qa/tasks/notification_tests.py
@@ -220,7 +220,7 @@ def run_tests(ctx, config):
     for client, client_config in config.items():
         (remote,) = ctx.cluster.only(client).remotes.keys()
 
-        attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
+        attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
 
         if 'extra_attr' in client_config:
             attr = client_config.get('extra_attr')
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index c58a7267b4e..691a6f7dd86 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -315,7 +315,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
     def _get_devices(self, remote):
         GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \
-            "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'"
+            "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'"
         devices = remote.sh(GET_DEVICE_CMD).split()
         return devices
diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py
index e83a54efc2b..f93ca017fa2 100644
--- a/qa/tasks/rgw_multisite.py
+++ b/qa/tasks/rgw_multisite.py
@@ -361,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config):
     if endpoints:
         # replace client names with their gateway endpoints
config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) + if not config.get('api_name'): # otherwise it will be set to an empty string + config['api_name'] = config['name'] zonegroup = multisite.ZoneGroup(config['name'], period) # `zonegroup set` needs --default on command line, and 'is_master' in json args = is_default_arg(config) diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index 794353348b4..9e7a1f5134e 100755 --- a/qa/workunits/nvmeof/basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -39,7 +39,7 @@ connect_all() { sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600 sleep 5 expected_devices_count=$1 - actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l) + actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l) if [ "$actual_devices" -ne "$expected_devices_count" ]; then sudo nvme list --output-format=json return 1 @@ -74,7 +74,7 @@ test_run connect test_run list_subsys 1 test_run disconnect_all test_run list_subsys 0 -devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT)) +devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT )) test_run connect_all $devices_count gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 03fb58693bd..f7f783afc67 100755 --- a/qa/workunits/nvmeof/fio_test.sh +++ b/qa/workunits/nvmeof/fio_test.sh @@ -34,7 +34,7 @@ done fio_file=$(mktemp -t nvmeof-fio-XXXX) all_drives_list=$(sudo nvme list --output-format=json | - jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath') + jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace') # When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), # then fio runs on namespaces only in the defined range (which is 1 to 3 here). 
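The jq rewrites in nvmeof.py, basic_tests.sh, and fio_test.sh above all adapt to the same nvme-cli change: newer releases nest namespaces under .Devices[].Subsystems[] instead of exposing flat .DevicePath entries. For reference, an equivalent traversal in Python (the document shape is inferred from the jq path, not from nvme-cli documentation):

import json

def ceph_namespaces(nvme_list_output: str, model='Ceph bdev Controller'):
    """Mirror the jq filter: keep subsystems whose controllers all report
    the given model, then list namespaces sorted by NSID."""
    doc = json.loads(nvme_list_output)
    names = []
    for device in doc.get('Devices', []):
        for subsys in device.get('Subsystems', []):
            controllers = subsys.get('Controllers', [])
            # like jq's all(), an empty controller list passes the filter
            if all(c.get('ModelNumber') == model for c in controllers):
                namespaces = sorted(subsys.get('Namespaces', []),
                                    key=lambda ns: ns['NSID'])
                names.extend(ns['NameSpace'] for ns in namespaces)
    return names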
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 2aa27d3d655..0ceb9ff54cf 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -914,6 +914,11 @@ test_namespace() { rbd group create rbd/test1/group1 rbd group image add rbd/test1/group1 rbd/test1/image1 + rbd group image add --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image2 + rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image1 + rbd group image rm rbd/test1/group1 rbd/test1/image2 rbd group rm rbd/test1/group1 rbd trash move rbd/test1/image1 |
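One pattern worth noting across the ceph.py, cephadm.py, and rook.py hunks above: each now captures the log-scan's stdout as bytes, decodes it, and falls back to stderr, so a broken grep pipeline surfaces its error message instead of being mistaken for "no ERR/WRN/SEC lines found". Condensed into a single sketch (a hypothetical helper; the hunks inline this logic rather than share it):

from io import BytesIO, StringIO

def first_match_or_error(remote, args):
    """Run the grep pipeline on `remote`; return the first matching log line,
    else the pipeline's stderr if any, else None for a clean no-match."""
    r = remote.run(stdout=BytesIO(), args=args, stderr=StringIO())
    stdout = r.stdout.getvalue().decode()
    if stdout:
        return stdout
    stderr = r.stderr.getvalue()
    if stderr:
        return stderr
    return None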