Diffstat (limited to 'qa')
217 files changed, 3348 insertions, 629 deletions
diff --git a/qa/Makefile b/qa/Makefile index ad655b7e743..05dc834adbd 100644 --- a/qa/Makefile +++ b/qa/Makefile @@ -1,4 +1,4 @@ -DIRS= workunits btrfs +DIRS= workunits all: for d in $(DIRS) ; do ( cd $$d ; $(MAKE) all ) ; done diff --git a/qa/btrfs/.gitignore b/qa/btrfs/.gitignore deleted file mode 100644 index 530c1b5b4ed..00000000000 --- a/qa/btrfs/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/clone_range -/test_async_snap -/create_async_snap diff --git a/qa/btrfs/Makefile b/qa/btrfs/Makefile deleted file mode 100644 index be95ecfd3cd..00000000000 --- a/qa/btrfs/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -CFLAGS = -Wall -Wextra -D_GNU_SOURCE - -TARGETS = clone_range test_async_snap create_async_snap - -.c: - $(CC) $(CFLAGS) $@.c -o $@ - -all: $(TARGETS) - -clean: - rm $(TARGETS) diff --git a/qa/btrfs/clone_range.c b/qa/btrfs/clone_range.c deleted file mode 100644 index 0a88e160131..00000000000 --- a/qa/btrfs/clone_range.c +++ /dev/null @@ -1,35 +0,0 @@ -#include <fcntl.h> -#include <stdlib.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" -#include <stdio.h> -#include <errno.h> - -int main(int argc, char **argv) -{ - struct btrfs_ioctl_clone_range_args ca; - int dfd; - int r; - - if (argc < 6) { - printf("usage: %s <srcfn> <srcoffset> <srclen> <destfn> <destoffset>\n", argv[0]); - exit(1); - } - - ca.src_fd = open(argv[1], O_RDONLY); - ca.src_offset = atoi(argv[2]); - ca.src_length = atoi(argv[3]); - dfd = open(argv[4], O_WRONLY|O_CREAT); - ca.dest_offset = atoi(argv[5]); - - r = ioctl(dfd, BTRFS_IOC_CLONE_RANGE, &ca); - printf("clone_range %s %lld %lld~%lld to %s %d %lld = %d %s\n", - argv[1], ca.src_fd, - ca.src_offset, ca.src_length, - argv[4], dfd, - ca.dest_offset, r, strerror(errno)); - return r; -} diff --git a/qa/btrfs/create_async_snap.c b/qa/btrfs/create_async_snap.c deleted file mode 100644 index 2ef22af7b45..00000000000 --- a/qa/btrfs/create_async_snap.c +++ /dev/null @@ -1,34 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; - -int main(int argc, char **argv) -{ - int fd; - int r; - - if (argc != 3) { - printf("usage: %s <source subvol> <name>\n", argv[0]); - return 1; - } - printf("creating snap ./%s from %s\n", argv[2], argv[1]); - fd = open(".", O_RDONLY); - va.fd = open(argv[1], O_RDONLY); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - strcpy(va.name, argv[2]); - r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va); - printf("result %d\n", r ? 
-errno:0); - return r; -} diff --git a/qa/btrfs/test_async_snap.c b/qa/btrfs/test_async_snap.c deleted file mode 100644 index 211be95a61c..00000000000 --- a/qa/btrfs/test_async_snap.c +++ /dev/null @@ -1,83 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; -struct btrfs_ioctl_vol_args vold; -int max = 4; - -void check_return(int r) -{ - if (r < 0) { - printf("********* failed with %d %s ********\n", errno, strerror(errno)); - exit(1); - } -} - -int main(int argc, char **argv) -{ - int num = 1000; - - if (argc > 1) - num = atoi(argv[1]); - printf("will do %d iterations\n", num); - - int cwd = open(".", O_RDONLY); - printf("cwd = %d\n", cwd); - while (num-- > 0) { - if (rand() % 10 == 0) { - __u64 transid; - int r; - printf("sync starting\n"); - r = ioctl(cwd, BTRFS_IOC_START_SYNC, &transid); - check_return(r); - printf("sync started, transid %lld, waiting\n", transid); - r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &transid); - check_return(r); - printf("sync finished\n"); - } - - int i = rand() % max; - struct stat st; - va.fd = cwd; - sprintf(va.name, "test.%d", i); - va.transid = 0; - int r = stat(va.name, &st); - if (r < 0) { - if (rand() % 3 == 0) { - printf("snap create (sync) %s\n", va.name); - va.flags = 0; - r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va); - check_return(r); - } else { - printf("snap create (async) %s\n", va.name); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va); - check_return(r); - printf("snap created, transid %lld\n", va.transid); - if (rand() % 2 == 0) { - printf("waiting for async snap create\n"); - r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &va.transid); - check_return(r); - } - } - } else { - printf("snap remove %s\n", va.name); - vold.fd = va.fd; - strcpy(vold.name, va.name); - r = ioctl(cwd, BTRFS_IOC_SNAP_DESTROY, &vold); - check_return(r); - } - } - return 0; -} diff --git a/qa/btrfs/test_rmdir_async_snap.c b/qa/btrfs/test_rmdir_async_snap.c deleted file mode 100644 index 5dafaacaaeb..00000000000 --- a/qa/btrfs/test_rmdir_async_snap.c +++ /dev/null @@ -1,62 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; -struct btrfs_ioctl_vol_args vold; - -int main(int argc, char **argv) -{ - int num = 1000; - int i, r, fd; - char buf[30]; - - if (argc > 1) - num = atoi(argv[1]); - printf("will do %d iterations\n", num); - - fd = open(".", O_RDONLY); - vold.fd = 0; - strcpy(vold.name, "current"); - r = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&vold); - printf("create current ioctl got %d\n", r ? errno:0); - if (r) - return 1; - - for (i=0; i<num; i++) { - sprintf(buf, "current/dir.%d", i); - r = mkdir(buf, 0755); - printf("mkdir got %d\n", r ? errno:0); - if (r) - return 1; - } - - va.fd = open("current", O_RDONLY); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - for (i=0; i<num; i++) { - system("/bin/cp /boot/vmlinuz-3.2.0-ceph-00142-g9e98323 current/foo"); - sprintf(buf, "current/dir.%d", i); - r = rmdir(buf); - printf("rmdir got %d\n", r ? 
errno:0); - if (r) - return 1; - - if (i % 10) continue; - sprintf(va.name, "snap.%d", i); - r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va); - printf("ioctl got %d\n", r ? errno:0); - if (r) - return 1; - } - return 0; -} diff --git a/qa/cephfs/begin/3-kernel.yaml b/qa/cephfs/begin/3-kernel.yaml new file mode 100644 index 00000000000..e94a0d87dc8 --- /dev/null +++ b/qa/cephfs/begin/3-kernel.yaml @@ -0,0 +1,23 @@ +# When the --kernel option is given to teuthology-suite, the kernel is set for +# all nodes (also, the kernel is "distro" when the --kernel option is not set). +# We don't generally want to use a custom kernel for all tests, so unset it. +# The k-testing.yaml will set it, if given, for only the client nodes. +# +# Allow overriding this by using a branch ending in "-all". + +teuthology: + postmerge: + - | + local branch = yaml.kernel.branch + if branch and not yaml.kernel.branch:find "-all$" then + log.debug("removing default kernel specification: %s", yaml.kernel) + py_attrgetter(yaml.kernel).pop('branch', nil) + py_attrgetter(yaml.kernel).pop('deb', nil) + py_attrgetter(yaml.kernel).pop('flavor', nil) + py_attrgetter(yaml.kernel).pop('kdb', nil) + py_attrgetter(yaml.kernel).pop('koji', nil) + py_attrgetter(yaml.kernel).pop('koji_task', nil) + py_attrgetter(yaml.kernel).pop('rpm', nil) + py_attrgetter(yaml.kernel).pop('sha1', nil) + py_attrgetter(yaml.kernel).pop('tag', nil) + end diff --git a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml index 2ee219125e7..048cd5ce8b9 100644 --- a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml +++ b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml @@ -1,3 +1,12 @@ +teuthology: + premerge: | + log.debug("base kernel %s", base_config.kernel) + local kernel = base_config.kernel + if kernel.branch ~= "distro" then + log.debug("overriding testing kernel with %s", kernel) + yaml_fragment.kernel.client = kernel + end + kernel: client: branch: testing diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index 94b42579777..5ac25a8f790 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -24,3 +24,4 @@ overrides: - BLUESTORE_SLOW_OP_ALERT - slow operation indications in BlueStore - experiencing slow operations in BlueStore + - MGR_MODULE_ERROR diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml new file mode 100644 index 00000000000..d5ba487b9bf --- /dev/null +++ b/qa/config/crimson_bluestore.yaml @@ -0,0 +1,25 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: bluestore + debug alienstore: 20 + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + bluestore compression algorithm: snappy + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bluestore rocksdb cf: false + log to stderr: true + err to stderr: true + log flush on exit: true + log to file: false diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml index fa8f49a4986..a10c59d77cc 100644 --- a/qa/config/crimson_qa_overrides.yaml +++ b/qa/config/crimson_qa_overrides.yaml @@ -9,6 +9,7 @@ 
overrides: osd pool default crimson: true osd: crimson osd obc lru size: 10 + debug ms: 20 flavor: crimson workunit: env: diff --git a/qa/config/seastore.yaml b/qa/config/crimson_seastore.yaml index 6158563eedf..d1919456ab1 100644 --- a/qa/config/seastore.yaml +++ b/qa/config/crimson_seastore.yaml @@ -1,13 +1,13 @@ overrides: ceph: - fs: xfs conf: osd: - osd objectstore: seastore + # crimson's osd objectstore option + crimson osd objectstore: seastore debug seastore: 20 debug seastore onode: 20 debug seastore odata: 20 - debug seastore ompap: 20 + debug seastore omap: 20 debug seastore tm: 20 debug seastore t: 20 debug seastore cleaner: 20 diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index c979e5b105f..c558a1382ef 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -52,7 +52,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority 08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority 16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority -24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority ## ********** windows tests on main branch - weekly # 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e $CEPH_QA_EMAIL @@ -122,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820 24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820 32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820 -40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820 ### upgrade runs for reef release ###### on smithi diff --git a/qa/rgw/s3tests-branch.yaml b/qa/rgw/s3tests-branch.yaml index ef6819c87e0..8710ce35893 100644 --- a/qa/rgw/s3tests-branch.yaml +++ b/qa/rgw/s3tests-branch.yaml @@ -1,4 +1,4 @@ overrides: s3tests: - force-branch: ceph-master - # git_remote: https://github.com/ceph/ + force-branch: ceph-master + # git_remote: https://github.com/ceph/ diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh index 82bf7391a7d..72d70ca7ad5 100755 --- a/qa/standalone/ceph-helpers.sh +++ b/qa/standalone/ceph-helpers.sh @@ -1888,7 +1888,6 @@ function repair() { local last_scrub=$(get_last_scrub_stamp $pgid) ceph pg repair $pgid wait_for_scrub $pgid "$last_scrub" - sleep 2 } function test_repair() { @@ -1902,7 +1901,7 @@ function test_repair() { wait_for_clean || return 1 repair 1.0 || return 1 kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 repair 1.0 || return 1 + ! TIMEOUT=2 repair 1.0 || return 1 teardown $dir || return 1 } ####################################################################### @@ -1949,7 +1948,7 @@ function test_pg_scrub() { wait_for_clean || return 1 pg_scrub 1.0 || return 1 kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 pg_scrub 1.0 || return 1 + ! TIMEOUT=2 pg_scrub 1.0 || return 1 teardown $dir || return 1 } @@ -2089,7 +2088,7 @@ function test_wait_for_scrub() { wait_for_scrub $pgid "$last_scrub" || return 1 kill_daemons $dir KILL osd || return 1 last_scrub=$(get_last_scrub_stamp $pgid) - ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 + ! 
TIMEOUT=2 wait_for_scrub $pgid "$last_scrub" || return 1 teardown $dir || return 1 } diff --git a/qa/standalone/mon/mon-cluster-log.sh b/qa/standalone/mon/mon-cluster-log.sh index 863a97c7cab..7b9adda0af6 100755 --- a/qa/standalone/mon/mon-cluster-log.sh +++ b/qa/standalone/mon/mon-cluster-log.sh @@ -62,7 +62,7 @@ function TEST_cluster_log_level() { ceph config set mon.a mon_cluster_log_level info ceph osd down 0 TIMEOUT=20 wait_for_osd up 0 || return 1 - grep -q "cluster [[]INF[]] osd.0.*boot" $dir/log + TIMEOUT=60 wait_for_string $dir/log "cluster [[]INF[]] osd.0.*boot" return_code=$? if [ $return_code -ne 0 ]; then echo "Failed : Could not find INF log in the cluster log file" @@ -145,9 +145,17 @@ function TEST_journald_cluster_log_level() { ceph osd down 0 TIMEOUT=20 wait_for_osd up 0 || return 1 search_str="osd.0.*boot" - sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log - grep -q "$search_str" $dir/journal.log - return_code=$? + return_code=1 + RETRY_DURATION=60 + for ((i=0; i < $RETRY_DURATION; i++)); do + sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log + if ! grep "$search_str" $dir/journal.log; then + sleep 1 + else + return_code=0 + break + fi + done if [ $return_code -ne 0 ]; then echo "Failed : Could not find INF log in the journalctl log file" ERRORS=$(($ERRORS + 1)) diff --git a/qa/standalone/osd-backfill/osd-backfill-space.sh b/qa/standalone/osd-backfill/osd-backfill-space.sh index 6a5c69412f4..84b9703bbfc 100755 --- a/qa/standalone/osd-backfill/osd-backfill-space.sh +++ b/qa/standalone/osd-backfill/osd-backfill-space.sh @@ -609,9 +609,16 @@ function TEST_backfill_grow() { wait_for_clean || return 1 + #Capture the timestamp after complete cleanup or finish the recovery progress + current_timestamp=$(date +"%Y-%m-%dT%H:%M:%S") + delete_pool $poolname kill_daemons $dir || return 1 - ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1 + + #Ignore the num_bytes mismatch messages before calling wait_cleanup + if ! awk -v ts="$current_timestamp" '$0 >= ts && /num_bytes mismatch/' $dir/osd.*.log > /dev/null; then + return 1 + fi } # Create a 5 shard EC pool on 6 OSD cluster diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh index 6fea441b3a9..a34f4a47189 100755 --- a/qa/standalone/osd/osd-rep-recov-eio.sh +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -219,6 +219,18 @@ function TEST_rados_repair_warning() { ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1 set +o pipefail + ceph health unmute OSD_TOO_MANY_REPAIRS + ceph tell osd.$primary clear_shards_repaired + sleep 10 + + set -o pipefail + # Should clear this + ceph health | $(! 
grep -q "Too many repaired reads on 1 OSDs") || return 1 + set +o pipefail + + ceph tell osd.$primary clear_shards_repaired $OBJS + sleep 10 + for i in $(seq 1 $OBJS) do inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1 @@ -235,7 +247,7 @@ function TEST_rados_repair_warning() { COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") test "$COUNT" = "$(expr $OBJS \* 3)" || return 1 - # Give mon a chance to notice additional OSD and unmute + # Give mon a chance to notice additional OSD and reset num_shards_repaired # The default tick time is 5 seconds CHECKTIME=10 LOOPS=0 diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 843e9b9901b..7b77a60f35b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -163,7 +163,7 @@ function wait_for_scrub_mod() { fi sleep 1 # are we still the primary? - local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` if [ $orig_primary != $current_primary ]; then echo $orig_primary no longer primary for $pgid return 0 @@ -194,7 +194,7 @@ function pg_scrub_mod() { local last_scrub=$(get_last_scrub_stamp $pgid) # locate the primary - local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` local recovery=false ceph pg scrub $pgid #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 491e46603f7..6dd5b10ae8f 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -5833,7 +5833,7 @@ function TEST_periodic_scrub_replicated() { flush_pg_stats # Request a regular scrub and it will be done - pg_schedule_scrub $pg + pg_scrub $pg grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1 # deep-scrub error is no longer present diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index 8015e023bdd..385479258f2 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() { --osd_op_queue=wpq \ --osd_stats_update_period_not_scrubbing=1 \ --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=1 \ + --osd_scrub_retry_pg_state=2 \ + --osd_scrub_retry_delay=2 \ --osd_scrub_sleep=0.2" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() { declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" ) wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1 - sleep 2 - # # step 2: set noscrub and request a "periodic scrub". 
Watch for the change in the 'is the scrub # scheduled for the future' value # - ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 - ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 ceph osd set noscrub || return 1 sleep 2 + ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 + ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 + sleep 8 saved_last_stamp=${sched_data['query_last_stamp']} ceph tell $pgid schedule-scrub @@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() { teardown $dir || return 1 } +function wait_initial_scrubs() { + local -n pg_to_prim_dict=$1 + local extr_dbg=1 # note: 3 and above leave some temp files around + + # set a long schedule for the periodic scrubs. Wait for the + # initial 'no previous scrub is known' scrubs to finish for all PGs. + ceph tell osd.* config set osd_scrub_min_interval 7200 + ceph tell osd.* config set osd_deep_scrub_interval 14400 + ceph tell osd.* config set osd_max_scrubs 32 + ceph tell osd.* config set osd_scrub_sleep 0 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 10 + ceph tell osd.* config set osd_scrub_chunk_max 10 + + for pg in "${!pg_to_prim_dict[@]}"; do + (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg" + ceph tell $pg scrub || return 1 + done + + sleep 1 + (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + + tout=20 + while [ $tout -gt 0 ] ; do + sleep 0.5 + (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + not_done=$(ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l ) + # note that we should ignore a header line + if [ "$not_done" -le 1 ]; then + break + fi + not_done=$(( (not_done - 2) / 4 )) + echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)" + tout=$((tout - 1)) + done + (( tout == 0 )) && return 1 + return 0 +} + + +# Whenever a PG is being scrubbed at a regular, periodic, urgency, and is queued +# for its replicas: +# if the operator is requesting a scrub of the same PG, the operator's request +# should trigger an abort of the ongoing scrub. +# +# The test process: +# - a periodic scrub is initiated of a PG. That scrub is set to be a very slow one. +# - a second PG, which shares some of its replicas, is intrcuted to be scrubbed. That one +# should be stuck in replica reservation. We will verify that. +# - now - the operator is requesting that second PG to be scrubbed. The original (pending) +# scrub should be aborted. 
We would check for: +# - the new, operator's scrub to be scheduled +# - the replicas' reservers to be released +function TEST_abort_periodic_for_operator() { + local dir=$1 + local -A cluster_conf=( + ['osds_num']="5" + ['pgs_in_pool']="16" + ['pool_name']="test" + ) + local extr_dbg=1 # note: 3 and above leave some temp files around + + standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1 + local poolid=${cluster_conf['pool_id']} + local poolname=${cluster_conf['pool_name']} + echo "Pool: $poolname : $poolid" + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # fill the pool with some data + TESTDATA="testdata.$$" + dd if=/dev/urandom of=$TESTDATA bs=320 count=1 + for i in $( seq 1 256 ) + do + rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null + done + rm -f $TESTDATA + if [[ -n "$saved_echo_flag" ]]; then set -x; fi + + # create the dictionary of the PGs in the pool + declare -A pg_pr + declare -A pg_ac + declare -A pg_po + build_pg_dicts "$dir" pg_pr pg_ac pg_po "-" + (( extr_dbg >= 2 )) && echo "PGs table:" + for pg in "${!pg_pr[@]}"; do + (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}" + done + + wait_initial_scrubs pg_pr || return 1 + + # limit all OSDs to one scrub at a time + ceph tell osd.* config set osd_max_scrubs 1 + ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1 + + # configure for slow scrubs + ceph tell osd.* config set osd_scrub_sleep 3 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 2 + ceph tell osd.* config set osd_scrub_chunk_max 2 + (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty + + # the first PG to work with: + local pg1="1.0" + # and another one, that shares its primary, and at least one more active set member + local pg2="" + for pg in "${!pg_pr[@]}"; do + if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then + local -i common=0 + count_common_active $pg $pg1 pg_ac common + if [[ $common -gt 1 ]]; then + pg2=$pg + break + fi + fi + done + if [[ -z "$pg2" ]]; then + # \todo handle the case when no such PG is found + echo "No PG found with the same primary as $pg1" + return 1 + fi + + # the common primary is allowed two concurrent scrubs + ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2 + echo "The two PGs to manipulate are $pg1 and $pg2" + + set_query_debug "$pg1" + # wait till the information published by pg1 is updated to show it as + # not being scrubbed + local is_act + for i in $( seq 1 3 ) + do + is_act=$(ceph pg "$pg1" query | jq '.scrubber.active') + if [[ "$is_act" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to finish scrubbing" + sleep 0.7 + done + ceph pg dump pgs + if [[ "$is_act" != "false" ]]; then + ceph pg "$pg1" query + echo "PG $pg1 appears to be still scrubbing" + return 1 + fi + sleep 0.5 + + echo "Initiating a periodic scrub of $pg1" + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + ceph tell $pg1 schedule-deep-scrub || return 1 + sleep 1 + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + for i in $( seq 1 14 ) + do + sleep 0.5 + stt=$(ceph pg "$pg1" query | jq '.scrubber') + is_active=$(echo $stt | jq '.active') + is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas') + if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to start scrubbing: $stt" + done + if [[ "$is_active" != "true" || 
"$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - soem of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the 2'nd scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index 49b8346b8d2..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,9 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ --osd_scrub_retry_after_noscrub=5 \ --osd_scrub_retry_pg_state=5 \ --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a set of 
dictionaries: +# - pg_primary_dict: a dictionary of pgid -> acting_primary +# - pg_acting_dict: a dictionary of pgid -> acting set +# - pg_pool_dict: a dictionary of pgid -> pool +# If the input file is '-', the function will fetch the dump directly from the ceph cluster. +function build_pg_dicts { + local dir=$1 + local -n pg_primary_dict=$2 + local -n pg_acting_dict=$3 + local -n pg_pool_dict=$4 + local infile=$5 + + local extr_dbg=0 # note: 3 and above leave some temp files around + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # if the infile name is '-', fetch the dump directly from the ceph cluster + if [[ $infile == "-" ]]; then + local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty" + local -r ceph_cmd_out=$(eval $ceph_cmd) + local -r ceph_cmd_rc=$? + if [[ $ceph_cmd_rc -ne 0 ]]; then + echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc" + fi + (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2 + l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' ` + else + l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile ` + fi + (( extr_dbg >= 2 )) && echo "L0: $l0" + + mapfile -t l1 < <(echo "$l0" | jq -c '.[]') + (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}" + + for item in "${l1[@]}"; do + pgid=$(echo "$item" | jq -r '.pgid') + acting=$(echo "$item" | jq -r '.acting | @sh') + pg_acting_dict["$pgid"]=$acting + acting_primary=$(echo "$item" | jq -r '.acting_primary') + pg_primary_dict["$pgid"]=$acting_primary + pool=$(echo "$item" | jq -r '.pool') + pg_pool_dict["$pgid"]=$pool + done + + if [[ -n "$saved_echo_flag" ]]; then set -x; fi +} + + +# a function that counts the number of common active-set elements between two PGs +# 1 - the first PG +# 2 - the second PG +# 3 - the dictionary of active sets +function count_common_active { + local pg1=$1 + local pg2=$2 + local -n pg_acting_dict=$3 + local -n res=$4 + + local -a a1=(${pg_acting_dict[$pg1]}) + local -a a2=(${pg_acting_dict[$pg2]}) + + local -i cnt=0 + for i in "${a1[@]}"; do + for j in "${a2[@]}"; do + if [[ $i -eq $j ]]; then + cnt=$((cnt+1)) + fi + done + done + + res=$cnt +} + + +# given a PG, find another one with a disjoint active set +# - but allow a possible common Primary +# 1 - the PG +# 2 - the dictionary of active sets +# 3 - [out] - the PG with a disjoint active set +function find_disjoint_but_primary { + local pg=$1 + local -n ac_dict=$2 + local -n p_dict=$3 + local -n res=$4 + + for cand in "${!ac_dict[@]}"; do + if [[ "$cand" != "$pg" ]]; then + local -i common=0 + count_common_active "$pg" "$cand" ac_dict common + if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" )]]; then + res=$cand + return + fi + fi + done +} + + + # A debug flag is set for the PG specified, causing the 'pg query' command to display # an additional 'scrub sessions counter' field. 
# diff --git a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml index 7e7ede3e334..5be06bc6732 100644 --- a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml +++ b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml @@ -21,7 +21,6 @@ overrides: ceph_repository: dev ceph_mgr_modules: - status - - restful cephfs_pools: - name: "cephfs_data" pg_num: "64" diff --git a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml b/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml deleted file mode 100644 index 8e389134b92..00000000000 --- a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml +++ /dev/null @@ -1,15 +0,0 @@ -tasks: -- exec: - mgr.x: - - systemctl stop ceph-mgr.target - - sleep 5 - - ceph -s -- exec: - mon.a: - - ceph restful create-key admin - - ceph restful create-self-signed-cert - - ceph restful restart -- workunit: - clients: - client.0: - - rest/test-restful.sh diff --git a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml index 309f5060045..53e2b7fdbc8 100644 --- a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml +++ b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml @@ -20,7 +20,6 @@ overrides: ceph_repository: dev ceph_mgr_modules: - status - - restful cephfs_pools: - name: "cephfs_data" pg_num: "64" diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa index fea2489fdf6..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/.qa +++ b/qa/suites/crimson-rados-experimental/.qa @@ -1 +1 @@ -../.qa
\ No newline at end of file +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml deleted file mode 120000 index bd9854e7029..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/supported/centos_latest.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml deleted file mode 100644 index d8e5898b99f..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - osd pool default size: 2 - osd crush chooseleaf type: 0 - osd pool default pg num: 128 - osd pool default pgp num: 128 - ceph: - conf: - osd: - osd shutdown pgref assert: true -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml deleted file mode 100644 index c22f08eecf8..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - install: - ceph: - flavor: crimson -tasks: -- install: -- ceph: - conf: - osd: - debug monc: 20 - mon: - mon min osdmap epochs: 50 - paxos service trim min: 10 - # prune full osdmaps regularly - mon osdmap full prune min: 15 - mon osdmap full prune interval: 2 - mon osdmap full prune txsize: 2 - flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml deleted file mode 120000 index 6a70c381709..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/config/seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml deleted file mode 100644 index ad8c921425b..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml +++ /dev/null @@ -1,28 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - reached quota - - but it is still running - - overall HEALTH_ - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(CACHE_POOL_NO_HIT_SET\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(PG_DEGRADED\) - conf: - client: - debug ms: 1 - mon: - mon warn on pool no app: false - osd: - osd class load list: "*" - osd class default list: "*" - osd blocked scrub grace period: 3600 -tasks: -- workunit: - clients: - client.0: - - rados/test.sh - - rados/test_pool_quota.sh diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml deleted file mode 100644 index 25efcdac83d..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - ceph: - crush_tunables: optimal - conf: - mon: - mon osd initial require min compat client: luminous - osd: - osd_discard_disconnected_ops: false -tasks: -- rados: - clients: [client.0] - ops: 4000 - objects: 500 - max_attr_len: 8192 - op_weights: - read: 45 - write: 45 - delete: 10 diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/suites/crimson-rados-experimental/thrash/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/% +++ b/qa/suites/crimson-rados-experimental/thrash/% diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled new file mode 120000 index 00000000000..5393a75548a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/2-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml new file mode 120000 index 00000000000..5ff70eadf75 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml @@ -0,0 +1 @@ +.qa/overrides/3-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa diff --git a/qa/suites/fs/thrash/workloads/overrides/+ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/fs/thrash/workloads/overrides/+ +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/rados/rest/% b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/rados/rest/% +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled new file mode 120000 index 00000000000..47afd70202d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/more-active-recovery.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..0bbc72db754 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled new file mode 100644 index 00000000000..4aed086bcc3 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..88f15f2f691 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml index 9774de6887b..79641f695ab 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml @@ -6,6 +6,15 @@ overrides: conf: osd: osd shutdown pgref assert: true + crimson alien thread cpu cores: 6-7 + osd.0: + crimson seastar cpu cores: 0-2 + osd.1: + crimson seastar cpu cores: 3-5 + osd.2: + crimson seastar cpu cores: 0-2 + osd.3: + crimson seastar cpu cores: 3-5 global: ms cluster mode: crc ms service mode: crc diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled new file mode 100644 index 00000000000..e559d9126e8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled @@ -0,0 +1,4 @@ +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro new file mode 120000 index 00000000000..a5b729b9efa --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro @@ -0,0 +1 @@ +.qa/distros/crimson-supported-all-distro/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml index 2bf67af1b18..2bf67af1b18 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml new file mode 100644 index 00000000000..ecad09cfe3a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml @@ -0,0 +1,11 @@ +overrides: + install: + ceph: + flavor: crimson +tasks: +- install: +- ceph: + conf: + osd: + debug monc: 20 + flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled new file mode 100644 index 00000000000..0c2062240ee --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled @@ -0,0 +1,16 @@ +# no need to verify os + flavor + sha1 +verify_ceph_hash: false +tasks: +- cephadm: + conf: + mgr: + debug ms: 1 + debug mgr: 20 + debug osd: 10 +- cephadm.shell: + mon.a: + - ceph orch status + - ceph orch ps + - ceph orch ls + - ceph orch host ls + - ceph orch device ls diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/rados/rest/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/rados/rest/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml new file mode 100644 index 00000000000..aa44b6101ff --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml @@ -0,0 +1,34 @@ +overrides: + ceph: + log-ignorelist: + - but it is still running + - objects unfound and apparently lost + conf: + osd: + osd debug reject backfill probability: .3 + osd scrub min interval: 60 + osd scrub max interval: 120 + osd max backfills: 3 + osd snap trim sleep: 2 + osd delete sleep: 1 + mon: + mon min osdmap epochs: 50 + paxos service trim min: 10 + # prune full osdmaps regularly + mon osdmap full prune min: 15 + mon osdmap full prune interval: 2 + mon osdmap full prune txsize: 2 +tasks: +- thrashosds: + timeout: 2400 + dump_ops_enable: false + sighup_delay: 0 + min_in: 3 + noscrub_toggle_delay: 0 + chance_thrash_pg_upmap: 0 + reweight_osd: 0 + thrash_primary_affinity: false + ceph_objectstore_tool: false + chance_inject_pause_short: 0 + chance_thrash_cluster_full: 0 + chance_reset_purged_snaps_last: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml new file mode 120000 index 00000000000..9124eb1aa29 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +.qa/tasks/thrashosds-health.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml new file mode 100644 index 00000000000..8c9764ade84 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + client.0: + admin socket: /var/run/ceph/ceph-$name.asok +tasks: +- radosbench: + clients: [client.0] + time: 150 +- admin_socket: + client.0: + objecter_requests: + test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}" diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml new file mode 100644 index 00000000000..d35e8421ab4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml @@ -0,0 +1,20 @@ +overrides: + conf: + osd: + osd deep scrub update digest min age: 0 +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + pool_snaps: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml new file mode 100644 index 00000000000..902c4b56a1e --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml @@ -0,0 +1,49 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml new file mode 100644 index 00000000000..071f55e3928 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml new file mode 100644 index 00000000000..afe04229898 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + 
crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + balance_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml new file mode 100644 index 00000000000..445b582ea42 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + localize_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml index af0ac39310e..e7e8070fd76 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml @@ -1,3 +1,6 @@ +overrides: + ceph: + crush_tunables: jewel tasks: - rados: clients: [client.0] @@ -6,16 +9,15 @@ tasks: max_in_flight: 64 objects: 1024 size: 16384 - ec_pool: true - balanced_reads: true + max_attr_len: 8192 op_weights: read: 100 - write: 0 - append: 100 + write: 100 delete: 50 snap_create: 50 snap_remove: 50 - rollback: 50 - copy_from: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 setattr: 25 rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml new file mode 100644 index 00000000000..1161c3cc253 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + balance_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml new file mode 100644 index 00000000000..80af0def0e4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + localize_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml new file mode 100644 index 00000000000..0694ffcd0d6 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml @@ -0,0 +1,14 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + op_weights: + read: 100 + write: 100 
+ delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml new file mode 100644 index 00000000000..606dcae6922 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml @@ -0,0 +1,8 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_fadvise_dontneed: true + op_weights: + write: 100 diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml index a6af2957119..1302e14f21a 100644 --- a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml +++ b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml @@ -17,4 +17,4 @@ tasks: timeout: 1h clients: client.0: - - rados/test_python.sh -m 'not (tier or ec)' + - rados/test_python.sh -m 'not (wait or tier or ec)' diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore deleted file mode 120000 index dbccf5ad928..00000000000 --- a/qa/suites/crimson-rados/singleton/objectstore +++ /dev/null @@ -1 +0,0 @@ -../thrash/objectstore
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml new file mode 120000 index 00000000000..481e393be4a --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml new file mode 120000 index 00000000000..abd86d7d986 --- /dev/null +++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1 @@ +.qa/overrides/short_pg_log.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml index aa966bff214..2dd668c9f88 100644 --- a/qa/suites/fs/nfs/tasks/nfs.yaml +++ b/qa/suites/fs/nfs/tasks/nfs.yaml @@ -1,3 +1,10 @@ +overrides: + install: + extra_system_packages: + rpm: + - fio + deb: + - fio tasks: - cephfs_test_runner: modules: diff --git a/qa/suites/fs/thrash/workloads/overrides/% b/qa/suites/fs/thrash/workloads/overrides/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/fs/thrash/workloads/overrides/% diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml index 91b45367934..91b45367934 100644 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml index bd202f988c8..bd202f988c8 100644 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml diff --git a/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/nofs/kernel.yaml b/qa/suites/fs/upgrade/nofs/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/nofs/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/workload/begin/3-kernel.yaml b/qa/suites/fs/workload/begin/3-kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/workload/begin/3-kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml index 602d3416263..aa327b0cdf5 100644 --- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml +++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml @@ -5,6 +5,7 @@ overrides: - "mds.dir_split" tasks: - workunit: + timeout: 5h clients: all: - kernel_untar_build.sh diff --git a/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml index e8f390c3b78..7f20f9f04a8 100644 --- a/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml +++ b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml @@ -28,3 +28,5 @@ overrides: mon: # cephadm can take up to 5 minutes to bring up remaining mons mon down mkfs grace: 300 + log-ignorelist: + - NVMEOF_SINGLE_GATEWAY diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml index 7c97edae552..0416ae2ea4e 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml @@ -1,7 +1,8 @@ +# runs on default nvmeof image (i.e. DEFAULT_NVMEOF_IMAGE) tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: default # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml new file mode 100644 index 00000000000..8eb4f6dc63c --- /dev/null +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml @@ -0,0 +1,36 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 3 + namespaces_count: 20 + cli_image: quay.io/ceph/nvmeof-cli:latest + create_mtls_secrets: true + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --rbd_iostat + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage + IOSTAT_INTERVAL: '10' + RUNTIME: '60' + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/mtls_test.sh diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml index 9ef37004427..dfe31380bb6 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml @@ -18,6 +18,7 @@ tasks: clients: client.0: - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh env: RBD_POOL: mypool RBD_IMAGE_PREFIX: myimage @@ -27,7 +28,6 @@ tasks: timeout: 30m clients: client.0: - - nvmeof/basic_tests.sh - nvmeof/fio_test.sh --rbd_iostat client.1: - nvmeof/basic_tests.sh diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml index 12cb50b408d..d66b6fc8093 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml +++ 
b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -31,8 +31,11 @@ tasks: no_coverage_and_limits: true timeout: 30m clients: - client.0: + client.3: - nvmeof/scalability_test.sh nvmeof.a,nvmeof.b - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d + - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c env: SCALING_DELAYS: '50' + RBD_POOL: mypool + NVMEOF_GROUP: mygroup0 diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml new file mode 100644 index 00000000000..83d54cdf5c3 --- /dev/null +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml @@ -0,0 +1,37 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 10 + namespaces_count: 90 # each subsystem + cli_image: quay.io/ceph/nvmeof-cli:latest + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- cephadm.exec: + host.a: + - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml + - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml + - "sed -i '/ pool: mypool/a\\ spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml" + - cat /tmp/nvmeof-no-huge-page.yaml + - ceph orch ls --refresh + - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml + - ceph orch redeploy nvmeof.mypool.mygroup0 + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml index b4755a6433b..0f7ac011a60 100644 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml @@ -6,8 +6,8 @@ tasks: pool_name: mypool image_name_prefix: myimage gateway_config: - subsystems_count: 3 - namespaces_count: 20 # each subsystem + subsystems_count: 120 + namespaces_count: 8 # each subsystem cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml index 6a5bd1d754e..46037784d31 100644 --- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml +++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml @@ -8,6 +8,10 @@ overrides: - out of quorum # nvmeof daemon thrashing - CEPHADM_FAILED_DAEMON + - NVMEOF_SINGLE_GATEWAY + - NVMEOF_GATEWAY_DOWN + - are in unavailable state + - is unavailable - is in error state - failed cephadm daemon diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml index 422c821536a..b58dc14d87b 100644 --- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml +++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml @@ -3,9 +3,14 @@ overrides: log-ignorelist: # nvmeof daemon thrashing - CEPHADM_FAILED_DAEMON + - NVMEOF_SINGLE_GATEWAY + - NVMEOF_GATEWAY_DOWN + - are in unavailable state + - is unavailable - is in error state - failed cephadm daemon tasks: - nvmeof.thrash: checker_host: 
'client.0' + randomize: False diff --git a/qa/suites/nvmeof/thrash/workloads/fio.yaml b/qa/suites/nvmeof/thrash/workloads/fio.yaml index b042b92d6ae..f9a0d0ebde5 100644 --- a/qa/suites/nvmeof/thrash/workloads/fio.yaml +++ b/qa/suites/nvmeof/thrash/workloads/fio.yaml @@ -1,11 +1,11 @@ tasks: - workunit: no_coverage_and_limits: true - timeout: 30m + timeout: 60m clients: client.0: - - nvmeof/fio_test.sh --rbd_iostat + - nvmeof/fio_test.sh --random_devices 200 env: RBD_POOL: mypool IOSTAT_INTERVAL: '10' - RUNTIME: '600' + RUNTIME: '1800' diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml new file mode 100644 index 00000000000..3bbf30ea427 --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml @@ -0,0 +1,91 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as a domain controller and smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.configure_samba_client_container: + role: host.d +- vip: + count: 1 +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + - cmd: sleep 30 + - cmd: > + ceph smb cluster create modusr1 user + --define-user-pass=user1%t3stP4ss1 + --placement=count:3 + --clustering=default + --public_addrs={{VIP0}}/{{VIPPREFIXLEN}} + - cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1 + - cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2 +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.modusr1 + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.modusr1\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' /tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test the assigned VIP +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb share rm modusr1 share2 + - cmd: ceph smb share rm modusr1 share1 + - cmd: ceph smb cluster rm modusr1 +# Wait for the smb service to 
be removed +- cephadm.wait_for_service_not_present: + service: smb.modusr1 diff --git a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml index 1eb4a184dca..e2a2ca03cc9 100644 --- a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml +++ b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml @@ -6,7 +6,6 @@ overrides: - objects misplaced - Synthetic exception in serve - influxdb python module not found - - \(MGR_ZABBIX_ - foo bar - Failed to open Telegraf - evicting unresponsive client diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml index 372bf2561fa..8b3c4c11ac6 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml @@ -15,6 +15,7 @@ overrides: # causing tests to fail due to health warns, even if # the tests themselves are successful. - \(OSDMAP_FLAGS\) + - \(PG_DEGRADED\) tasks: - workunit: clients: diff --git a/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml new file mode 100644 index 00000000000..7cd47898544 --- /dev/null +++ b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml @@ -0,0 +1,8 @@ +roles: +- [mon.a, mgr.x, osd.0, osd.1, client.0] +tasks: +- install: +- exec: + client.0: + - mkdir $TESTDIR/ceph_test_bluefs && cd $TESTDIR/ceph_test_bluefs && ceph_test_bluefs --log-file $TESTDIR/archive/ceph_test_bluefs.log --debug-bluefs 5/20 --gtest_catch_exceptions=0 + - rm -rf $TESTDIR/ceph_test_bluefs diff --git a/qa/suites/rados/rest/mgr-restful.yaml b/qa/suites/rados/rest/mgr-restful.yaml deleted file mode 100644 index 4901f401d30..00000000000 --- a/qa/suites/rados/rest/mgr-restful.yaml +++ /dev/null @@ -1,31 +0,0 @@ -openstack: -- volumes: # attached to each instance - count: 3 - size: 10 # GB -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, mds.a, client.a] -tasks: -- install: -- ceph: - log-ignorelist: - - overall HEALTH_ - - \(MGR_DOWN\) - - \(PG_ - - \(OSD_ - - \(OBJECT_ - - \(OSDMAP_FLAGS\) - - \(POOL_APP_NOT_ENABLED\) -- exec: - mon.a: - - ceph restful create-key admin - - ceph restful create-self-signed-cert - - ceph restful restart -- workunit: - clients: - client.a: - - rest/test-restful.sh -- exec: - mon.a: - - ceph restful delete-key admin - - ceph restful list-keys | jq ".admin" | grep null - diff --git a/qa/suites/rados/rest/supported-random-distro$ b/qa/suites/rados/rest/supported-random-distro$ deleted file mode 120000 index 7cef21eeffd..00000000000 --- a/qa/suites/rados/rest/supported-random-distro$ +++ /dev/null @@ -1 +0,0 @@ -../basic/supported-random-distro$
\ No newline at end of file diff --git a/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml new file mode 100644 index 00000000000..69a54b0f1b7 --- /dev/null +++ b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml @@ -0,0 +1,57 @@ +roles: +- - mon.a + - mon.b + - mgr.a + - mgr.b + - osd.0 + - osd.1 + - osd.2 + - osd.3 +- - mon.c + - mon.d + - mgr.c + - mgr.d + - osd.4 + - osd.5 + - osd.6 + - osd.7 +- - mon.e +- - client.0 + +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +overrides: + ceph: + conf: + global: + osd pool default size: 3 + osd pool default min size: 2 + mon: + debug mon: 30 +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - \(POOL_ + - \(CACHE_POOL_ + - overall HEALTH_ + - \(PG_AVAILABILITY\) + - Reduced data availability + - \(PG_DEGRADED\) + - \(MON_DOWN\) + - \(OSD_DATACENTER_DOWN\) + - \(OSD_DOWN\) + - \(OSD_HOST_DOWN\) + + +- workunit: + clients: + client.0: + - mon/mon-stretch-mode-5-mons-8-osds.sh +- cephfs_test_runner: + modules: + - tasks.stretch_mode_disable_enable diff --git a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml index a8bbbafece0..b916bed1475 100644 --- a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml +++ b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml @@ -2,6 +2,9 @@ meta: - desc: | rbd object class functional tests tasks: -- exec: - client.2: - - ceph_test_cls_rbd --gtest_filter=-TestClsRbd.get_features:TestClsRbd.parents:TestClsRbd.mirror +- workunit: + clients: + client.2: + - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' diff --git a/qa/suites/rados/valgrind-leaks/1-start.yaml b/qa/suites/rados/valgrind-leaks/1-start.yaml index 1cdd8a688e8..cc8c8e53766 100644 --- a/qa/suites/rados/valgrind-leaks/1-start.yaml +++ b/qa/suites/rados/valgrind-leaks/1-start.yaml @@ -12,6 +12,7 @@ overrides: - overall HEALTH_ - \(PG_ - \(POOL_APP_NOT_ENABLED\) + - OSD bench result conf: global: osd heartbeat grace: 40 diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index c70893893fd..17cf141b0cd 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -26,6 +26,8 @@ overrides: - \(MON_DOWN\) - \(SLOW_OPS\) - slow request + - OSD bench result + - OSD_DOWN valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml new file mode 100644 index 00000000000..d2072c41a68 --- /dev/null +++ b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml @@ -0,0 +1,8 @@ +tasks: + - exec: + client.0: + - mkdir /home/ubuntu/cephtest/migration + - qemu-img create -f qcow2 /home/ubuntu/cephtest/migration/empty.qcow2 1G + - echo '{"type":"qcow","stream":{"type":"http","url":"https://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0 + - rbd migration prepare --import-only --source-spec '{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.1 + - rbd migration prepare --import-only --source-spec 
'{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.2 diff --git a/qa/suites/rgw/bucket-logging/% b/qa/suites/rgw/bucket-logging/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/bucket-logging/% diff --git a/qa/suites/rgw/bucket-logging/.qa b/qa/suites/rgw/bucket-logging/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/0-install.yaml b/qa/suites/rgw/bucket-logging/0-install.yaml new file mode 100644 index 00000000000..6cf82f57476 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/0-install.yaml @@ -0,0 +1,13 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: [client.0] +- tox: [client.0] + +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/bucket-logging/beast.yaml b/qa/suites/rgw/bucket-logging/beast.yaml new file mode 120000 index 00000000000..09ced62c42a --- /dev/null +++ b/qa/suites/rgw/bucket-logging/beast.yaml @@ -0,0 +1 @@ +.qa/rgw_frontend/beast.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/fixed-1.yaml b/qa/suites/rgw/bucket-logging/fixed-1.yaml new file mode 120000 index 00000000000..02df5dd0cd0 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/fixed-1.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-1.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml new file mode 120000 index 00000000000..32340b1fa8b --- /dev/null +++ b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml @@ -0,0 +1 @@ +.qa/rgw/ignore-pg-availability.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/overrides.yaml b/qa/suites/rgw/bucket-logging/overrides.yaml new file mode 100644 index 00000000000..a448a323d36 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/overrides.yaml @@ -0,0 +1,10 @@ +overrides: + ceph: + conf: + client: + setuser: ceph + setgroup: ceph + debug rgw: 20 + rgw bucket logging obj roll time: 5 + rgw: + storage classes: LUKEWARM, FROZEN diff --git a/qa/suites/rgw/bucket-logging/s3tests-branch.yaml b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml new file mode 120000 index 00000000000..bdcaca48ae0 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml @@ -0,0 +1 @@ +.qa/rgw/s3tests-branch.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/supported-distros b/qa/suites/rgw/bucket-logging/supported-distros new file mode 120000 index 00000000000..78f2991b407 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/supported-distros @@ -0,0 +1 @@ +.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/tasks/+ b/qa/suites/rgw/bucket-logging/tasks/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/bucket-logging/tasks/+ diff --git a/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml new file mode 100644 index 00000000000..c1d3b7192e1 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml @@ -0,0 +1,6 @@ +tasks: +- s3tests: + client.0: + boto3_extensions: True + rgw_server: client.0 + extra_attrs: ["bucket_logging"] diff --git a/qa/suites/rgw/crypt/2-kms/barbican.yaml b/qa/suites/rgw/crypt/2-kms/barbican.yaml index 9bf5fb81131..e3f78810416 100644 --- a/qa/suites/rgw/crypt/2-kms/barbican.yaml +++ b/qa/suites/rgw/crypt/2-kms/barbican.yaml @@ -27,7 +27,7 @@ tasks: - tox: [ client.0 ] - keystone: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 services: - name: swift type: object-store @@ -68,7 +68,7 @@ tasks: project: s3 - barbican: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 use-keystone-role: client.0 keystone_authtoken: auth_plugin: password diff --git a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml index 5e4234236a9..ac2104cdd05 100644 --- a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled +++ b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml @@ -28,4 +28,4 @@ overrides: - name: b2 endpoints: [c2.client.1] rgw-multisite-tests: - args: [tests.py] + args: [tests.py, -a, '!fails_with_rgw'] diff --git a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml index 462570e7727..303f98d540e 100644 --- a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml +++ b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml @@ -1,7 +1,7 @@ tasks: - kafka: client.0: - kafka_version: 2.6.0 + kafka_version: 3.8.1 - notification-tests: client.0: extra_attr: ["kafka_test", "data_path_v2_kafka_test"] diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml new file mode 100644 index 00000000000..5c83d5c0d23 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml @@ -0,0 +1,20 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: + client.0: + +overrides: + install: + ceph: + extra_system_packages: + rpm: + - java + deb: + - default-jre + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros new file mode 120000 index 00000000000..46280a42a96 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros @@ -0,0 +1 @@ +../../.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml new file mode 100644 index 00000000000..01d6fc637de --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml @@ -0,0 +1,8 @@ +tasks: +- kafka-failover: + client.0: + kafka_version: 3.8.1 +- notification-tests: + client.0: + extra_attr: ["kafka_failover"] + rgw_server: client.0 diff --git a/qa/suites/rgw/sts/auth-order/.qa b/qa/suites/rgw/sts/auth-order/.qa new file mode 120000 index 00000000000..fea2489fdf6 --- /dev/null +++ b/qa/suites/rgw/sts/auth-order/.qa @@ -0,0 +1 @@ +../.qa
\ No newline at end of file diff --git a/qa/suites/rgw/sts/auth-order/local-sts.yaml b/qa/suites/rgw/sts/auth-order/local-sts.yaml new file mode 100644 index 00000000000..2f7dcc6b128 --- /dev/null +++ b/qa/suites/rgw/sts/auth-order/local-sts.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: local, sts, external diff --git a/qa/suites/rgw/sts/auth-order/sts-local.yaml b/qa/suites/rgw/sts/auth-order/sts-local.yaml new file mode 100644 index 00000000000..a7b00d00f0b --- /dev/null +++ b/qa/suites/rgw/sts/auth-order/sts-local.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, local, external diff --git a/qa/suites/rgw/tempest/0-install.yaml b/qa/suites/rgw/tempest/0-install.yaml index f968db20c2b..b6ef17de4ee 100644 --- a/qa/suites/rgw/tempest/0-install.yaml +++ b/qa/suites/rgw/tempest/0-install.yaml @@ -4,7 +4,7 @@ tasks: - tox: [ client.0 ] - keystone: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 services: - name: swift type: object-store diff --git a/qa/suites/rgw/tempest/tasks/s3/% b/qa/suites/rgw/tempest/tasks/s3/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/% diff --git a/qa/suites/rgw/tempest/tasks/s3/.qa b/qa/suites/rgw/tempest/tasks/s3/.qa new file mode 120000 index 00000000000..fea2489fdf6 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/.qa @@ -0,0 +1 @@ +../.qa
\ No newline at end of file diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa new file mode 120000 index 00000000000..fea2489fdf6 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa @@ -0,0 +1 @@ +../.qa
\ No newline at end of file diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml new file mode 100644 index 00000000000..c46a51e0958 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, external, local diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml new file mode 100644 index 00000000000..a7b00d00f0b --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, local, external diff --git a/qa/suites/rgw/tempest/tasks/s3tests.yaml b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml index 4efb579fa83..4efb579fa83 100644 --- a/qa/suites/rgw/tempest/tasks/s3tests.yaml +++ b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml diff --git a/qa/suites/rgw/verify/tasks/s3tests-java.yaml b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml index 9ad89cc6790..9ad89cc6790 100644 --- a/qa/suites/rgw/verify/tasks/s3tests-java.yaml +++ b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml index 57e455ba78d..a0adaecf9b2 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml @@ -19,6 +19,20 @@ overrides: - \(MGR_DOWN\) - slow request - \(MON_MSGR2_NOT_ENABLED\) + - \(OSD_DOWN\) + - \(OSD_HOST_DOWN\) + - \(POOL_APP_NOT_ENABLED\) + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - OSDMAP_FLAGS + - OSD_ROOT_DOWN + conf: global: enable experimental unrecoverable data corrupting features: "*" @@ -30,4 +44,3 @@ roles: - mgr.x - osd.0 - osd.1 - - osd.2
\ No newline at end of file diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml index e4897db4d35..48cfa2f756f 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml @@ -18,9 +18,6 @@ tasks: mon: mon_warn_on_insecure_global_id_reclaim: false mon_warn_on_insecure_global_id_reclaim_allowed: false - log-ignorelist: - - Not found or unloadable - - evicting unresponsive client - exec: osd.0: - ceph osd require-osd-release quincy @@ -30,14 +27,3 @@ overrides: conf: mon: mon warn on osd down out interval zero: false - log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) - - OSD_DOWN - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_DEGRADED - - Reduced data availability - - Degraded data redundancy - - OSDMAP_FLAGS diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml index 6aa429f18b5..fe4ff9bb113 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml @@ -3,14 +3,13 @@ meta: install upgrade ceph/-x on cluster restart : mons, osd.* tasks: +- print: "**** start install.upgrade of nodes" - install.upgrade: - mon.a: -- exec: - osd.0: - - ceph osd require-osd-release quincy + all: - print: "**** done install.upgrade of nodes" +- print: "**** start ceph.restart of all osds" - ceph.restart: - daemons: [mon.a,mgr.x,osd.0,osd.1,osd.2] + daemons: [osd.0,osd.1,osd.2] mon-health-to-clog: false wait-for-healthy: false wait-for-osds-up: false diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml index 40fbcefe728..62fb6427f72 100644 --- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml @@ -32,13 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml index e27c7c0f092..f7167975aa9 100644 --- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml @@ -1,11 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml index 9c2ff9da185..9a0585cc074 100644 --- a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml @@ -9,4 +9,6 @@ workload: clients: client.0: - cls + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done end rados_api.yaml" diff --git 
a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml index 005514292ce..5641471629e 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml @@ -1,17 +1,25 @@ overrides: ceph: log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS - - PG_AVAILABILITY + - OSD_UPGRADE_FINISHED tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml index b722f187361..a55dddf46f7 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ first-half-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml index 649b024a476..d54ba8039d0 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ stress-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 146bd57960d..62fb6427f72 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -32,4 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - PG_DEGRADED + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index ce4e0cc228b..b5160c2dd00 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -1,12 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY - - PG_DEGRADED + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml new file mode 100644 index 00000000000..fa93b2f2ece --- /dev/null +++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml @@ -0,0 +1,19 @@ +overrides: + ceph: + log-ignorelist: + - 
MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME + - reached quota + - running out of quota + - overall HEALTH_ + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED + - slow request + - noscrub + - nodeep-scrub diff --git a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml index a46e34db5dd..79cf1a96601 100644 --- a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml +++ b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml @@ -9,4 +9,6 @@ workload: clients: client.0: - cls + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done end rados_api.yaml" diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml index 992f9e1bc36..59ccfe2cd02 100644 --- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml @@ -1,11 +1,25 @@ overrides: ceph: log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml index f092096f444..79ad2af8ea1 100644 --- a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ first-half-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml index 05bb672b3ac..166327a58f9 100644 --- a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ stress-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml new file mode 100644 index 00000000000..fa93b2f2ece --- /dev/null +++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml @@ -0,0 +1,19 @@ +overrides: + ceph: + log-ignorelist: + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME + - reached quota + - running out of quota + - overall HEALTH_ + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED + - slow request + - noscrub + - nodeep-scrub diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 9b04e3dc675..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1206,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - 
stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 7005c8db0ff..57d22f3b5e6 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -2796,6 +2796,59 @@ class CephManager: num += 1 return num + def _print_not_active_clean_pg(self, pgs): + """ + Print the PGs that are not active+clean. + """ + for pg in pgs: + if not (pg['state'].count('active') and + pg['state'].count('clean') and + not pg['state'].count('stale')): + log.debug( + "PG %s is not active+clean, but %s", + pg['pgid'], pg['state'] + ) + + def pg_all_active_clean(self): + """ + Check if all pgs are active+clean + return: True if all pgs are active+clean else False + """ + pgs = self.get_pg_stats() + result = self._get_num_active_clean(pgs) == len(pgs) + if result: + log.debug("All PGs are active+clean") + else: + log.debug("Not all PGs are active+clean") + self._print_not_active_clean_pg(pgs) + return result + + def _print_not_active_pg(self, pgs): + """ + Print the PGs that are not active. + """ + for pg in pgs: + if not (pg['state'].count('active') + and not pg['state'].count('stale')): + log.debug( + "PG %s is not active, but %s", + pg['pgid'], pg['state'] + ) + + def pg_all_active(self): + """ + Check if all pgs are active + return: True if all pgs are active else False + """ + pgs = self.get_pg_stats() + result = self._get_num_active(pgs) == len(pgs) + if result: + log.debug("All PGs are active") + else: + log.debug("Not all PGs are active") + self._print_not_active_pg(pgs) + return result + def is_clean(self): """ True if all pgs are clean @@ -3237,6 +3290,26 @@ class CephManager: self.make_admin_daemon_dir(remote) self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart() + def get_crush_rule_id(self, crush_rule_name): + """ + Get crush rule id by name + :returns: int -- crush rule id + """ + out = self.raw_cluster_cmd('osd', 'crush', 'rule', 'dump', '--format=json') + j = json.loads('\n'.join(out.split('\n')[1:])) + for rule in j: + if rule['rule_name'] == crush_rule_name: + return rule['rule_id'] + assert False, 'rule %s not found' % crush_rule_name + + def get_mon_dump_json(self): + """ + mon dump --format=json converted to a python object + :returns: the python object + """ + out = self.raw_cluster_cmd('mon', 'dump', '--format=json') + return json.loads('\n'.join(out.split('\n')[1:])) + def get_mon_status(self, mon): """ Extract all the monitor status information from the cluster @@ -3340,6 +3413,23 @@ class CephManager: self.log(task_status) return task_status + # Stretch mode related functions + def is_degraded_stretch_mode(self): + """ + Return whether the cluster is in degraded stretch mode + """ + try: + osdmap = self.get_osd_dump_json() + stretch_mode = osdmap.get('stretch_mode', {}) + degraded_stretch_mode = stretch_mode.get('degraded_stretch_mode', 0) + self.log("is_degraded_stretch_mode: {0}".format(degraded_stretch_mode)) + return degraded_stretch_mode == 1 + except (TypeError, AttributeError) as e: + # Log the error or handle it as needed + self.log("Error accessing degraded_stretch_mode: {0}".format(e)) + return False + + def utility_task(name): """ Generate ceph_manager subtask corresponding to ceph_manager 
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index dab61c2c700..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -475,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py index c1312ec5efc..21b96d2b22b 100644 --- a/qa/tasks/cephfs/cephfs_test_case.py +++ b/qa/tasks/cephfs/cephfs_test_case.py @@ -252,8 +252,8 @@ class CephFSTestCase(CephTestCase): def get_session_data(self, client_id): return self._session_by_id(client_id) - def _session_list(self): - ls_data = self.fs.mds_asok(['session', 'ls']) + def _session_list(self, rank=None, status=None): + ls_data = self.fs.rank_asok(['session', 'ls'], rank=rank, status=status) ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] return ls_data @@ -269,9 +269,9 @@ class CephFSTestCase(CephTestCase): def perf_dump(self, rank=None, status=None): return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status) - def wait_until_evicted(self, client_id, timeout=30): + def wait_until_evicted(self, client_id, rank=None, timeout=30): def is_client_evicted(): - ls = self._session_list() + ls = self._session_list(rank=rank) for s in ls: if s['id'] == client_id: return False diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 2b7fd2ee569..3846ef23f97 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -649,6 +649,8 @@ class FilesystemBase(MDSClusterBase): def set_session_timeout(self, timeout): self.set_var("session_timeout", "%d" % timeout) + def set_session_autoclose(self, autoclose_time): + self.set_var("session_autoclose", "%d" % autoclose_time) def set_allow_standby_replay(self, yes): self.set_var("allow_standby_replay", yes) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index 00a68dd0183..beb41019e6d 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -2740,3 +2740,184 @@ class TestFSSetMaxMDS(TestAdminCommands): ''' self.fs.set_max_mds(2, confirm=True) self.assertEqual(self.fs.get_var('max_mds'), 2) + + +class TestToggleVolumes(CephFSTestCase): + ''' + Contains code for enabling/disabling mgr/volumes plugin. + ''' + + VOL_MOD_NAME = 'volumes' + CONFIRM = '--yes-i-really-mean-it' + + def tearDown(self): + ''' + Ensure that the volumes plugin is enabled after the test has finished + running since not doing so might affect tearDown() of CephFSTestCase or + other superclasses. + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + if 'volumes' in json_output['force_disabled_modules']: + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + + super(TestToggleVolumes, self).tearDown() + + def test_force_disable_with_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes + --yes-i-really-mean-it" successfully disables volumes plugin. + + Also test "ceph mgr module ls" output after this. 
+ ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + def test_force_disable_fails_without_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes" fails with + EPERM when confirmation flag is not passed along. + + Also test that output of this command suggests user to pass + --yes-i-really-mean-it. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + # ensure that the confirmation flag was recommended + self.assertIn(self.CONFIRM, proc_stderr) + + def test_force_disable_idempotency(self): + ''' + Test that running "ceph mgr module force disable volumes" passes when + volumes plugin was already force disabled. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + sleep(5) + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format ' + 'json-pretty') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + # XXX: this this test, running this command 2nd time should pass. + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}') + + def test_force_disable_nonexistent_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable abcd {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_force_disable_non_alwayson_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + json_output = self.get_ceph_cmd_stdout( + 'mgr module ls --format json-pretty', check_status=False, + stderr=StringIO()) + output_dict = json.loads(json_output) + some_non_alwayson_mod = output_dict['enabled_modules'][0] + + proc = self.run_ceph_cmd( + f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_enabled_by_default(self): + ''' + Test that volumes plugin is enabled by default and is also reported as + "always on". 
+ ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + def test_disable_fails(self): + ''' + Test that running "ceph mgr module disable volumes" fails with EPERM. + + This is expected since volumes is an always-on module and therefore + it can only be disabled using command "ceph mgr module force disable + volumes". + ''' + proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + + def test_enable_idempotency(self): + ''' + Test that enabling volumes plugin when it is already enabled doesn't + exit with non-zero return value. + + Also test that it reports plugin as already enabled. + ''' + proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}', + stderr=StringIO()) + self.assertEqual(proc.returncode, 0) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('already enabled', proc_stderr) + self.assertIn('always-on', proc_stderr) + + def test_enable_post_disabling(self): + ''' + Test that enabling volumes plugin after (force-)disabling it works + successfully. + + Alo test "ceph mgr module ls" output for volumes plugin afterwards. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + # give bit of time for plugin to be disabled. + sleep(5) + + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + # give bit of time for plugin to be functional again + sleep(5) + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + # plugin is reported properly by "ceph mgr module ls" command, check if + # it is also working fine. + self.run_ceph_cmd('fs volume ls') diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py index 16de379f54f..468378fce3d 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -4,6 +4,7 @@ import time from tasks.cephfs.fuse_mount import FuseMount from tasks.cephfs.cephfs_test_case import CephFSTestCase from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while, MaxWhileTries log = logging.getLogger(__name__) @@ -152,6 +153,8 @@ class TestExportPin(CephFSTestCase): # vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap # to be written out every event. Yuck! 
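# A minimal sketch of how the TestToggleVolumes assertions above classify the
# volumes plugin from `ceph mgr module ls --format json`; `module_ls` is
# assumed to be the already-parsed JSON dict and the helper name is
# hypothetical.
def volumes_plugin_state(module_ls):
    if 'volumes' in module_ls['force_disabled_modules']:
        return 'force-disabled'
    if 'volumes' in module_ls['always_on_modules']:
        return 'always-on'   # the normal case: enabled, yet never listed in enabled_modules
    return 'unexpected'      # volumes should not appear in enabled/disabled_modules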
self.config_set('mds', 'mds_debug_subtrees', False) + # make sure ESubtreeMap is written frequently enough: + self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4') self.config_rm('mds', 'mds bal split size') # don't split /top self.mount_a.run_shell_payload("rm -rf 1") @@ -628,3 +631,186 @@ done log.info("{0} migrations have occured due to the cluster resizing".format(count)) # rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2 self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget + +class TestDumpExportStates(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + + EXPORT_STATES = ['locking', 'discovering', 'freezing', 'prepping', 'warning', 'exporting'] + + def setUp(self): + super().setUp() + + self.fs.set_max_mds(self.MDSS_REQUIRED) + self.status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload('mkdir -p test/export') + + def tearDown(self): + super().tearDown() + + def _wait_for_export_target(self, source, target, sleep=2, timeout=10): + try: + with safe_while(sleep=sleep, tries=timeout//sleep) as proceed: + while proceed(): + info = self.fs.getinfo().get_rank(self.fs.id, source) + log.info(f'waiting for rank {target} to be added to the export target') + if target in info['export_targets']: + return + except MaxWhileTries as e: + raise RuntimeError(f'rank {target} has not been added to export target after {timeout}s') from e + + def _dump_export_state(self, rank): + states = self.fs.rank_asok(['dump_export_states'], rank=rank, status=self.status) + self.assertTrue(type(states) is list) + self.assertEqual(len(states), 1) + return states[0] + + def _test_base(self, path, source, target, state_index, kill): + self.fs.rank_asok(['config', 'set', 'mds_kill_import_at', str(kill)], rank=target, status=self.status) + + self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status) + self._wait_for_export_target(source, target) + + target_rank = self.fs.get_rank(rank=target, status=self.status) + self.delete_mds_coredump(target_rank['name']) + + state = self._dump_export_state(source) + + self.assertTrue(type(state['tid']) is int) + self.assertEqual(state['path'], path) + self.assertEqual(state['state'], self.EXPORT_STATES[state_index]) + self.assertEqual(state['peer'], target) + + return state + + def _test_state_history(self, state): + history = state['state_history'] + self.assertTrue(type(history) is dict) + size = 0 + for name in self.EXPORT_STATES: + self.assertTrue(type(history[name]) is dict) + size += 1 + if name == state['state']: + break + self.assertEqual(len(history), size) + + def _test_freeze_tree(self, state, waiters): + self.assertTrue(type(state['freeze_tree_time']) is float) + self.assertEqual(state['unfreeze_tree_waiters'], waiters) + + def test_discovering(self): + state = self._test_base('/test', 0, 1, 1, 1) + + self._test_state_history(state) + self._test_freeze_tree(state, 0) + + self.assertEqual(state['last_cum_auth_pins'], 0) + self.assertEqual(state['num_remote_waiters'], 0) + + def test_prepping(self): + client_id = self.mount_a.get_global_id() + + state = self._test_base('/test', 0, 1, 3, 3) + + self._test_state_history(state) + self._test_freeze_tree(state, 0) + + self.assertEqual(state['flushed_clients'], [client_id]) + self.assertTrue(type(state['warning_ack_waiting']) is list) + + def test_exporting(self): + state = self._test_base('/test', 0, 1, 5, 5) + + self._test_state_history(state) + 
self._test_freeze_tree(state, 0) + + self.assertTrue(type(state['notify_ack_waiting']) is list) + +class TestKillExports(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + + def setUp(self): + CephFSTestCase.setUp(self) + + self.fs.set_max_mds(self.MDSS_REQUIRED) + self.status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload('mkdir -p test/export') + + def tearDown(self): + super().tearDown() + + def _kill_export_as(self, rank, kill): + self.fs.rank_asok(['config', 'set', 'mds_kill_export_at', str(kill)], rank=rank, status=self.status) + + def _export_dir(self, path, source, target): + self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status) + + def _wait_failover(self): + self.wait_until_true(lambda: self.fs.status().hadfailover(self.status), timeout=self.fs.beacon_timeout) + + def _clear_coredump(self, rank): + crash_rank = self.fs.get_rank(rank=rank, status=self.status) + self.delete_mds_coredump(crash_rank['name']) + + def _run_kill_export(self, kill_at, exporter_rank=0, importer_rank=1, restart=True): + self._kill_export_as(exporter_rank, kill_at) + self._export_dir("/test", exporter_rank, importer_rank) + self._wait_failover() + self._clear_coredump(exporter_rank) + + if restart: + self.fs.rank_restart(rank=exporter_rank, status=self.status) + self.status = self.fs.wait_for_daemons() + + def test_session_cleanup(self): + """ + Test importer's session cleanup after an export subtree task is interrupted. + Set 'mds_kill_export_at' to 9 or 10 so that the importer will wait for the exporter + to restart while the state is 'acking'. + + See https://tracker.ceph.com/issues/61459 + """ + + kill_export_at = [9, 10] + + exporter_rank = 0 + importer_rank = 1 + + for kill in kill_export_at: + log.info(f"kill_export_at: {kill}") + self._run_kill_export(kill, exporter_rank, importer_rank) + + if len(self._session_list(importer_rank, self.status)) > 0: + client_id = self.mount_a.get_global_id() + self.fs.rank_asok(['session', 'evict', "%s" % client_id], rank=importer_rank, status=self.status) + + # timeout if buggy + self.wait_until_evicted(client_id, importer_rank) + + # for multiple tests + self.mount_a.remount() + + def test_client_eviction(self): + # modify the timeout so that we don't have to wait too long + timeout = 30 + self.fs.set_session_timeout(timeout) + self.fs.set_session_autoclose(timeout + 5) + + kill_export_at = [9, 10] + + exporter_rank = 0 + importer_rank = 1 + + for kill in kill_export_at: + log.info(f"kill_export_at: {kill}") + self._run_kill_export(kill, exporter_rank, importer_rank) + + client_id = self.mount_a.get_global_id() + self.wait_until_evicted(client_id, importer_rank, timeout + 10) + time.sleep(1) + + # failed if buggy + self.mount_a.ls() diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + 
self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = float(m.group(1)) + time_duration = int(m.group(3)) + pending_duration = int(m.group(4)) + log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}") + if completion_percentage >= 50: + return (completion_percentage, time_duration, pending_duration) + _, _, pending_duration = get_estimated_time() + # wait for 25% more time to avoid false negative failures + self.wait_until_true(replay_complete, timeout=pending_duration * 1.25) + class TestFailover(CephFSTestCase): CLIENTS_REQUIRED = 1 MDSS_REQUIRED = 2 diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 19076ea44b3..0a1c07dce04 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -55,7 +55,7 @@ class TestNFS(MgrTestCase): "squash": "none", "security_label": True, "protocols": [ - 4 + 3, 4 ], "transports": [ "TCP" @@ -369,6 +369,45 @@ class TestNFS(MgrTestCase): except CommandFailedError as e: self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + def _mnt_nfs(self, pseudo_path, port, ip): + ''' + Mount created export + :param pseudo_path: It is the pseudo root name + :param port: Port of deployed nfs cluster + :param ip: IP of deployed nfs cluster + ''' + tries = 3 + while True: + try: + self.ctx.cluster.run( + args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}', + f'{ip}:{pseudo_path}', '/mnt']) + break + except CommandFailedError: + if tries: + tries -= 1 + time.sleep(2) + continue + raise + + self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) + + def _test_fio(self, pseudo_path, port, ip): + ''' + run fio with libaio on /mnt/fio + :param mnt_path: nfs mount point + ''' + try: + self._mnt_nfs(pseudo_path, port, ip) + self.ctx.cluster.run(args=['mkdir', '/mnt/fio']) + fio_cmd=['sudo', 'fio', '--ioengine=libaio', '-directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', 
'--time_based', '--runtime=300', '--ramp_time=1s', '--percentage_random=0', '--rw=randrw', '--rwmixread=50'] + self.ctx.cluster.run(args=fio_cmd) + except CommandFailedError as e: + self.fail(f"expected fio to be successful but failed with {e.exitstatus}") + finally: + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio']) + self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + def _write_to_read_only_export(self, pseudo_path, port, ip): ''' Check if write to read only export fails @@ -627,6 +666,18 @@ class TestNFS(MgrTestCase): self._test_data_read_write(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_async_io_fio(self): + ''' + Test async io using fio. Expect completion without hang or crash + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_fio(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py index b5691c83852..ae1c1f2056c 100644 --- a/qa/tasks/cephfs/test_quota.py +++ b/qa/tasks/cephfs/test_quota.py @@ -115,9 +115,11 @@ class TestQuota(CephFSTestCase): readable_values = {"10K": "10240", "100Ki": "102400", + "100KiB": "102400", "10M": "10485760", "100Mi": "104857600", "2G": "2147483648", + "2GB": "2147483648", "4Gi": "4294967296", "1T": "1099511627776", "2Ti": "2199023255552"} @@ -135,7 +137,8 @@ class TestQuota(CephFSTestCase): self.mount_a.run_shell(["mkdir", "subdir"]) - invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1"] + invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1", + "1GT", "2MM", "5Di", "8Bi", "i", "7iB"] for invalid_value in invalid_values: with self.assertRaises(CommandFailedError): self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", diff --git a/qa/tasks/check_counter.py b/qa/tasks/check_counter.py index 40818f3f475..1f63b6a0bd4 100644 --- a/qa/tasks/check_counter.py +++ b/qa/tasks/check_counter.py @@ -1,11 +1,14 @@ import logging import json +import errno from teuthology.task import Task from teuthology import misc from tasks import ceph_manager +from tasks.cephfs.filesystem import MDSCluster +from teuthology.exceptions import CommandFailedError log = logging.getLogger(__name__) @@ -61,6 +64,9 @@ class CheckCounter(Task): mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager')) active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"] + mds_cluster = MDSCluster(self.ctx) + status = mds_cluster.status() + for daemon_type, counters in targets.items(): # List of 'a', 'b', 'c'... 
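# A rough sketch of the size-suffix arithmetic implied by the readable_values
# table in test_quota.py above (all suffixes are powers of 1024, with optional
# "i" and "B"); the real parsing happens inside Ceph when the vxattr is set,
# so this helper and its name are purely illustrative.
import re

_UNIT = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3, 'T': 1024 ** 4}

def human_quota_to_bytes(value):
    m = re.fullmatch(r'(\d+)([KMGT])i?B?', value)
    if not m:
        raise ValueError('invalid quota value: %r' % value)
    return int(m.group(1)) * _UNIT[m.group(2)]

assert human_quota_to_bytes('100KiB') == 102400
assert human_quota_to_bytes('2GB') == 2147483648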
daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type)) @@ -80,13 +86,31 @@ class CheckCounter(Task): else: log.debug("Getting stats from {0}".format(daemon_id)) - manager = self.ctx.managers[cluster_name] - proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) - response_data = proc.stdout.getvalue().strip() + if daemon_type == 'mds': + mds_info = status.get_mds(daemon_id) + if not mds_info: + continue + mds = f"mds.{mds_info['gid']}" + if mds_info['state'] != "up:active": + log.debug(f"skipping {mds}") + continue + log.debug(f"Getting stats from {mds}") + try: + proc = mon_manager.raw_cluster_cmd("tell", mds, "perf", "dump", + "--format=json-pretty") + response_data = proc.strip() + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.debug(f"Failed to do 'perf dump' on {mds}") + continue + else: + manager = self.ctx.managers[cluster_name] + proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) + response_data = proc.stdout.getvalue().strip() if response_data: perf_dump = json.loads(response_data) else: - log.warning("No admin socket response from {0}, skipping".format(daemon_id)) + log.warning("No response from {0}, skipping".format(daemon_id)) continue minval = '' diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py index 2ac92439de6..d955d232c2c 100644 --- a/qa/tasks/fwd_scrub.py +++ b/qa/tasks/fwd_scrub.py @@ -33,6 +33,8 @@ class ForwardScrubber(ThrasherGreenlet): def _run(self): try: self.do_scrub() + except ThrasherGreenlet.Stopped: + pass except Exception as e: self.set_thrasher_exception(e) self.logger.exception("exception:") diff --git a/qa/tasks/kafka.py b/qa/tasks/kafka.py index 5e6c208ca30..833f03babf6 100644 --- a/qa/tasks/kafka.py +++ b/qa/tasks/kafka.py @@ -4,6 +4,7 @@ Deploy and configure Kafka for Teuthology import contextlib import logging import time +import os from teuthology import misc as teuthology from teuthology import contextutil @@ -33,6 +34,13 @@ def install_kafka(ctx, config): assert isinstance(config, dict) log.info('Installing Kafka...') + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + for (client, _) in config.items(): (remote,) = ctx.cluster.only(client).remotes.keys() test_dir=teuthology.get_testdir(ctx) @@ -40,7 +48,8 @@ def install_kafka(ctx, config): kafka_file = kafka_prefix + current_version + '.tgz' - link1 = 'https://archive.apache.org/dist/kafka/' + current_version + '/' + kafka_file + link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + current_version + '/' + kafka_file ctx.cluster.only(client).run( args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1], ) diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py new file mode 100644 index 00000000000..3ca60ab84fc --- /dev/null +++ b/qa/tasks/kafka_failover.py @@ -0,0 +1,244 @@ +""" +Deploy and configure Kafka for Teuthology +""" +import contextlib +import logging +import time +import os + +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.orchestra import run + +log = 
logging.getLogger(__name__) + +def get_kafka_version(config): + for client, client_config in config.items(): + if 'kafka_version' in client_config: + kafka_version = client_config.get('kafka_version') + return kafka_version + +kafka_prefix = 'kafka_2.13-' + +def get_kafka_dir(ctx, config): + kafka_version = get_kafka_version(config) + current_version = kafka_prefix + kafka_version + return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx),ver=current_version) + + +@contextlib.contextmanager +def install_kafka(ctx, config): + """ + Downloading the kafka tar file. + """ + assert isinstance(config, dict) + log.info('Installing Kafka...') + + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + test_dir=teuthology.get_testdir(ctx) + current_version = get_kafka_version(config) + + kafka_file = kafka_prefix + current_version + '.tgz' + + link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + current_version + '/' + kafka_file + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file], + ) + + kafka_dir = get_kafka_dir(ctx, config) + # create config for second broker + second_broker_config_name = "server2.properties" + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/") + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'), + 'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir) + ], + ) + + # edit config + ctx.cluster.only(client).run( + args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name) + ] + ) + + try: + yield + finally: + 
log.info('Removing packaged dependencies of Kafka...') + test_dir=get_kafka_dir(ctx, config) + current_version = get_kafka_version(config) + for (client,_) in config.items(): + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', test_dir], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx),doc=kafka_file)], + ) + + +@contextlib.contextmanager +def run_kafka(ctx,config): + """ + This includes two parts: + 1. Starting Zookeeper service + 2. Starting Kafka service + """ + assert isinstance(config, dict) + log.info('Bringing up Zookeeper and Kafka services...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + kafka_dir = get_kafka_dir(ctx, config) + + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_java_log_dir = "{}/java_logs".format(second_broker_data) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './zookeeper-server-start.sh', + '{tir}/config/zookeeper.properties'.format(tir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './kafka-server-start.sh', + '{tir}/config/server.properties'.format(tir=get_kafka_dir(ctx, config)), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)), + './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + log.info('Stopping Zookeeper and Kafka Services...') + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-server-stop.sh', + '{tir}/config/kafka.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './zookeeper-server-stop.sh', + '{tir}/config/zookeeper.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run(args=['killall', '-9', 'java']) + + +@contextlib.contextmanager +def run_admin_cmds(ctx,config): + """ + Running Kafka Admin commands in order to check the working of producer anf consumer and creation of topic. 
+ """ + assert isinstance(config, dict) + log.info('Checking kafka server through producer/consumer commands...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-topics.sh', '--create', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + 'echo', "First", run.Raw('|'), + './kafka-console-producer.sh', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-console-consumer.sh', '--topic', 'quickstart-events', + '--from-beginning', + '--bootstrap-server', 'localhost:9092', + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + pass + + +@contextlib.contextmanager +def task(ctx,config): + """ + Following is the way how to run kafka:: + tasks: + - kafka: + client.0: + kafka_version: 2.6.0 + """ + assert config is None or isinstance(config, list) \ + or isinstance(config, dict), \ + "task kafka only supports a list or dictionary for configuration" + + all_clients = ['client.{id}'.format(id=id_) + for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] + if config is None: + config = all_clients + if isinstance(config, list): + config = dict.fromkeys(config) + + log.debug('Kafka config is %s', config) + + with contextutil.nested( + lambda: install_kafka(ctx=ctx, config=config), + lambda: run_kafka(ctx=ctx, config=config), + lambda: run_admin_cmds(ctx=ctx, config=config), + ): + yield + diff --git a/qa/tasks/mgr/dashboard/helper.py b/qa/tasks/mgr/dashboard/helper.py index e6a7c35a23d..55355048a36 100644 --- a/qa/tasks/mgr/dashboard/helper.py +++ b/qa/tasks/mgr/dashboard/helper.py @@ -220,13 +220,11 @@ class DashboardTestCase(MgrTestCase): # To avoid any issues with e.g. unlink bugs, we destroy and recreate # the filesystem rather than just doing a rm -rf of files - cls.mds_cluster.mds_stop() - cls.mds_cluster.mds_fail() cls.mds_cluster.delete_all_filesystems() + cls.mds_cluster.mds_restart() # to reset any run-time configs, etc. cls.fs = None # is now invalid! cls.fs = cls.mds_cluster.newfs(create=True) - cls.fs.mds_restart() # In case some test messed with auth caps, reset them # pylint: disable=not-an-iterable diff --git a/qa/tasks/mgr/dashboard/test_mgr_module.py b/qa/tasks/mgr/dashboard/test_mgr_module.py index d6a368905b6..1dbdef23d34 100644 --- a/qa/tasks/mgr/dashboard/test_mgr_module.py +++ b/qa/tasks/mgr/dashboard/test_mgr_module.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import logging import requests +from urllib3.exceptions import MaxRetryError from .helper import (DashboardTestCase, JLeaf, JList, JObj, module_options_object_schema, module_options_schema, @@ -24,10 +25,11 @@ class MgrModuleTestCase(DashboardTestCase): def _check_connection(): try: # Try reaching an API endpoint successfully. 
+ logger.info('Trying to reach the REST API endpoint') self._get('/api/mgr/module') if self._resp.status_code == 200: return True - except requests.ConnectionError: + except (MaxRetryError, requests.ConnectionError): pass return False diff --git a/qa/tasks/mgr/dashboard/test_rbd.py b/qa/tasks/mgr/dashboard/test_rbd.py index a872645e33e..83b3bf520c2 100644 --- a/qa/tasks/mgr/dashboard/test_rbd.py +++ b/qa/tasks/mgr/dashboard/test_rbd.py @@ -869,7 +869,19 @@ class RbdTest(DashboardTestCase): self.assertEqual(clone_format_version, 2) self.assertStatus(200) + # if empty list is sent, then the config will remain as it is value = [] + res = [{'section': "global", 'value': "2"}] + self._post('/api/cluster_conf', { + 'name': config_name, + 'value': value + }) + self.wait_until_equal( + lambda: _get_config_by_name(config_name), + res, + timeout=60) + + value = [{'section': "global", 'value': ""}] self._post('/api/cluster_conf', { 'name': config_name, 'value': value diff --git a/qa/tasks/mgr/dashboard/test_rgw.py b/qa/tasks/mgr/dashboard/test_rgw.py index 5c7b0329675..a9071bc2a3a 100644 --- a/qa/tasks/mgr/dashboard/test_rgw.py +++ b/qa/tasks/mgr/dashboard/test_rgw.py @@ -785,7 +785,7 @@ class RgwUserSubuserTest(RgwTestCase): 'access': 'readwrite', 'key_type': 'swift' }) - self.assertStatus(200) + self.assertStatus(201) data = self.jsonBody() subuser = self.find_object_in_list('id', 'teuth-test-user:tux', data) self.assertIsInstance(subuser, object) @@ -808,7 +808,7 @@ class RgwUserSubuserTest(RgwTestCase): 'access_key': 'yyy', 'secret_key': 'xxx' }) - self.assertStatus(200) + self.assertStatus(201) data = self.jsonBody() subuser = self.find_object_in_list('id', 'teuth-test-user:hugo', data) self.assertIsInstance(subuser, object) diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py index 74b1e9d850c..4a5506391f2 100644 --- a/qa/tasks/mgr/mgr_test_case.py +++ b/qa/tasks/mgr/mgr_test_case.py @@ -1,5 +1,6 @@ import json import logging +import socket from unittest import SkipTest @@ -108,7 +109,7 @@ class MgrTestCase(CephTestCase): # Unload all non-default plugins loaded = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd( "mgr", "module", "ls", "--format=json-pretty"))['enabled_modules'] - unload_modules = set(loaded) - {"cephadm", "restful"} + unload_modules = set(loaded) - {"cephadm"} for m in unload_modules: cls.mgr_cluster.mon_manager.raw_cluster_cmd( @@ -137,7 +138,7 @@ class MgrTestCase(CephTestCase): raise SkipTest( "Only have {0} manager daemons, {1} are required".format( len(cls.mgr_cluster.mgr_ids), cls.MGRS_REQUIRED)) - + # We expect laggy OSDs in this testing environment so turn off this warning. # See https://tracker.ceph.com/issues/61907 cls.mgr_cluster.mon_manager.raw_cluster_cmd('config', 'set', 'mds', @@ -229,15 +230,22 @@ class MgrTestCase(CephTestCase): """ # Start handing out ports well above Ceph's range. 
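# Standalone sketch of the port probing the MgrTestCase port-assignment loop
# gains above: connect_ex() returns 0 when something already listens on
# (ip, port), so the scan keeps walking until a free port is found. The helper
# name is illustrative.
import socket

def next_free_port(ip_addr, start_port):
    port = start_port
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex((ip_addr, port)) != 0:
                return port
        port += 1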
assign_port = min_port + ip_addr = cls.mgr_cluster.get_mgr_map()['active_addr'].split(':')[0] for mgr_id in cls.mgr_cluster.mgr_ids: cls.mgr_cluster.mgr_stop(mgr_id) cls.mgr_cluster.mgr_fail(mgr_id) + for mgr_id in cls.mgr_cluster.mgr_ids: - log.debug("Using port {0} for {1} on mgr.{2}".format( - assign_port, module_name, mgr_id - )) + # Find a port that isn't in use + while True: + if not cls.is_port_in_use(ip_addr, assign_port): + break + log.debug(f"Port {assign_port} in use, trying next") + assign_port += 1 + + log.debug(f"Using port {assign_port} for {module_name} on mgr.{mgr_id}") cls.mgr_cluster.set_module_localized_conf(module_name, mgr_id, config_name, str(assign_port), @@ -255,3 +263,8 @@ class MgrTestCase(CephTestCase): mgr_map['active_name'], mgr_map['active_gid'])) return done cls.wait_until_true(is_available, timeout=30) + + @classmethod + def is_port_in_use(cls, ip_addr: str, port: int) -> bool: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex((ip_addr, port)) == 0 diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py index 7ac2960371c..c41a95c71f7 100644 --- a/qa/tasks/mgr/test_module_selftest.py +++ b/qa/tasks/mgr/test_module_selftest.py @@ -36,13 +36,6 @@ class TestModuleSelftest(MgrTestCase): self.mgr_cluster.mon_manager.raw_cluster_cmd( "mgr", "self-test", "module", module_name) - def test_zabbix(self): - # Set these mandatory config fields so that the zabbix module - # won't trigger health/log errors on load/serve. - self.mgr_cluster.set_module_conf("zabbix", "zabbix_host", "localhost") - self.mgr_cluster.set_module_conf("zabbix", "identifier", "foo") - self._selftest_plugin("zabbix") - def test_prometheus(self): self._assign_ports("prometheus", "server_port", min_port=8100) self._selftest_plugin("prometheus") diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py index b4697a6f797..f1eae3c89c4 100644 --- a/qa/tasks/notification_tests.py +++ b/qa/tasks/notification_tests.py @@ -220,7 +220,7 @@ def run_tests(ctx, config): for client, client_config in config.items(): (remote,) = ctx.cluster.only(client).remotes.keys() - attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] + attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] if 'extra_attr' in client_config: attr = client_config.get('extra_attr') diff --git a/qa/tasks/nvme_loop.py b/qa/tasks/nvme_loop.py index fef270ea085..fdec467a16d 100644 --- a/qa/tasks/nvme_loop.py +++ b/qa/tasks/nvme_loop.py @@ -70,7 +70,7 @@ def task(ctx, config): remote.run(args=['lsblk'], stdout=StringIO()) p = remote.run(args=['sudo', 'nvme', 'list', '-o', 'json'], stdout=StringIO()) new_devs = [] - # `nvme list -o json` will return the following output: + # `nvme list -o json` will return one of the following output: '''{ "Devices" : [ { @@ -91,13 +91,112 @@ def task(ctx, config): } ] }''' + '''{ + "Devices":[ + { + "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4", + "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c", + "Subsystems":[ + { + "Subsystem":"nvme-subsys0", + "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT623300LN400BGN INTEL SSDPEDMD400G4", + "Controllers":[ + { + "Controller":"nvme0", + "Cntlid":"0", + "SerialNumber":"CVFT623300LN400BGN", + "ModelNumber":"INTEL 
SSDPEDMD400G4", + "Firmware":"8DV101H0", + "Transport":"pcie", + "Address":"0000:02:00.0", + "Slot":"2", + "Namespaces":[ + { + "NameSpace":"nvme0n1", + "Generic":"ng0n1", + "NSID":1, + "UsedBytes":400088457216, + "MaximumLBA":781422768, + "PhysicalSize":400088457216, + "SectorSize":512 + } + ], + "Paths":[ + ] + } + ], + "Namespaces":[ + ] + } + ] + } + ] + } + ''' + '''{ + "Devices":[ + { + "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4", + "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c", + "Subsystems":[ + { + "Subsystem":"nvme-subsys0", + "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT534400C2400BGN INTEL SSDPEDMD400G4", + "Controllers":[ + { + "Controller":"nvme0", + "Cntlid":"0", + "SerialNumber":"CVFT534400C2400BGN", + "ModelNumber":"INTEL SSDPEDMD400G4", + "Firmware":"8DV101H0", + "Transport":"pcie", + "Address":"0000:02:00.0", + "Slot":"2", + "Namespaces":[ + { + "NameSpace":"nvme0n1", + "Generic":"ng0n1", + "NSID":1, + "UsedBytes":400088457216, + "MaximumLBA":781422768, + "PhysicalSize":400088457216, + "SectorSize":512 + } + ], + "Paths":[ + ] + } + ], + "Namespaces":[ + ] + } + ] + } + ] + } + ''' nvme_list = json.loads(p.stdout.getvalue()) for device in nvme_list['Devices']: - dev = device['DevicePath'] - vendor = device['ModelNumber'] - if dev.startswith('/dev/') and vendor == 'Linux': - new_devs.append(dev) - bluestore_zap(remote, dev) + try: + # first try format 1 / older format + dev = device['DevicePath'] + vendor = device['ModelNumber'] + if dev.startswith('/dev/') and vendor == 'Linux': + new_devs.append(dev) + bluestore_zap(remote, dev) + except KeyError: + for subsystem in device['Subsystems']: + # format 2 + if 'Namespaces' in subsystem and subsystem['Namespaces']: + dev = '/dev/' + subsystem['Namespaces'][0]['NameSpace'] + # try format 3 last + else: + dev = '/dev/' + subsystem['Controllers'][0]['Namespaces'][0]['NameSpace'] + # vendor is the same for format 2 and 3 + vendor = subsystem['Controllers'][0]['ModelNumber'] + if vendor == 'Linux': + new_devs.append(dev) + bluestore_zap(remote, dev) log.info(f'new_devs {new_devs}') assert len(new_devs) <= len(devs) if len(new_devs) == len(devs): diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py index b89f123c97e..691a6f7dd86 100644 --- a/qa/tasks/nvmeof.py +++ b/qa/tasks/nvmeof.py @@ -32,6 +32,7 @@ class Nvmeof(Task): gateway_config: namespaces_count: 10 cli_version: latest + create_mtls_secrets: False """ @@ -69,6 +70,7 @@ class Nvmeof(Task): self.serial = gateway_config.get('serial', 'SPDK00000000000001') self.port = gateway_config.get('port', '4420') self.srport = gateway_config.get('srport', '5500') + self.create_mtls_secrets = gateway_config.get('create_mtls_secrets', False) def deploy_nvmeof(self): """ @@ -126,12 +128,11 @@ class Nvmeof(Task): total_images = int(self.namespaces_count) * int(self.subsystems_count) log.info(f'[nvmeof]: creating {total_images} images') + rbd_create_cmd = [] for i in range(1, total_images + 1): imagename = self.image_name_prefix + str(i) - log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}') - _shell(self.ctx, self.cluster_name, self.remote, [ - 'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}' - ]) + rbd_create_cmd += ['rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}', run.Raw(';')] + _shell(self.ctx, self.cluster_name, self.remote, rbd_create_cmd) for role, i in daemons.items(): remote, id_ = i @@ -147,7 +148,38 @@ class Nvmeof(Task): started=True, ) log.info("[nvmeof]: 
executed deploy_nvmeof successfully!") - + + def write_mtls_config(self, gateway_ips): + log.info("[nvmeof]: writing mtls config...") + allowed_ips = "" + for ip in gateway_ips: + allowed_ips += ("IP:" + ip + ",") + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/server.key", + "-out", "/etc/ceph/server.crt", "-days", "3650", "-subj", "/CN=my.server", "-addext", f"subjectAltName={allowed_ips[:-1]}" + ] + ) + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/client.key", + "-out", "/etc/ceph/client.crt", "-days", "3650", "-subj", "/CN=client1" + ] + ) + secrets_files = {"/etc/ceph/server.key": None, + "/etc/ceph/server.crt": None, + "/etc/ceph/client.key": None, + "/etc/ceph/client.crt": None, + } + for file in secrets_files.keys(): + secrets_files[file] = self.remote.read_file(path=file, sudo=True) + + for remote in self.ctx.cluster.remotes.keys(): + for remote_file in secrets_files.keys(): + data = secrets_files[remote_file] + remote.sudo_write_file(path=remote_file, data=data, mode='0644') + log.info("[nvmeof]: written mtls config!") + def set_gateway_cfg(self): log.info('[nvmeof]: running set_gateway_cfg...') ip_address = self.remote.ip_address @@ -174,6 +206,8 @@ class Nvmeof(Task): data=conf_data, sudo=True ) + if self.create_mtls_secrets: + self.write_mtls_config(gateway_ips) log.info("[nvmeof]: executed set_gateway_cfg successfully!") @@ -216,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet): daemon_max_thrash_times: For now, NVMeoF daemons have limitation that each daemon can - be thrashed only 3 times in span of 30 mins. This option + be thrashed only 5 times in span of 30 mins. This option allows to set the amount of times it could be thrashed in a period - of time. (default: 3) + of time. (default: 5) daemon_max_thrash_period: This option goes with the above option. 
It sets the period of time over which each daemons can be thrashed for daemon_max_thrash_times @@ -271,17 +305,17 @@ class NvmeofThrasher(Thrasher, Greenlet): self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1)) # Limits on thrashing each daemon - self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3)) + self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 5)) self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60)) self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30)) - self.min_revive_delay = int(self.config.get('min_revive_delay', 100)) + self.min_revive_delay = int(self.config.get('min_revive_delay', 60)) self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30)) def _get_devices(self, remote): GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \ - "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'" + "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'" devices = remote.sh(GET_DEVICE_CMD).split() return devices @@ -312,6 +346,7 @@ class NvmeofThrasher(Thrasher, Greenlet): run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', run.Raw('&&'), 'ceph', 'health', 'detail', run.Raw('&&'), 'ceph', '-s', + run.Raw('&&'), 'sudo', 'nvme', 'list', ] for dev in self.devices: check_cmd += [ @@ -386,13 +421,11 @@ class NvmeofThrasher(Thrasher, Greenlet): while not self.stopping.is_set(): killed_daemons = defaultdict(list) - weight = 1.0 / len(self.daemons) - count = 0 + thrash_daemon_num = self.rng.randint(1, self.max_thrash_daemons) + selected_daemons = self.rng.sample(self.daemons, thrash_daemon_num) for daemon in self.daemons: - skip = self.rng.uniform(0.0, 1.0) - if weight <= skip: - self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format( - label=daemon.id_, skip=skip, weight=weight)) + if daemon not in selected_daemons: + self.log(f'skipping daemon {daemon.id_} ...') continue # For now, nvmeof daemons can only be thrashed 3 times in last 30mins. 
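# Minimal sketch of the selection scheme the thrasher switches to above:
# instead of skipping each daemon independently with probability 1 - 1/N,
# it picks how many daemons to thrash this round and samples them without
# replacement. The function name is illustrative.
import random

def pick_daemons_to_thrash(daemons, max_thrash, rng=random):
    count = rng.randint(1, max_thrash)   # at least one, at most max_thrash
    return rng.sample(daemons, count)    # distinct daemons for this round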
@@ -410,17 +443,11 @@ class NvmeofThrasher(Thrasher, Greenlet): continue self.log('kill {label}'.format(label=daemon.id_)) - # daemon.stop() kill_method = self.kill_daemon(daemon) killed_daemons[kill_method].append(daemon) daemons_thrash_history[daemon.id_] += [datetime.now()] - # only thrash max_thrash_daemons amount of daemons - count += 1 - if count >= self.max_thrash_daemons: - break - if killed_daemons: iteration_summary = "thrashed- " for kill_method in killed_daemons: @@ -433,7 +460,7 @@ class NvmeofThrasher(Thrasher, Greenlet): self.log(f'waiting for {revive_delay} secs before reviving') time.sleep(revive_delay) # blocking wait - self.log('done waiting before reviving') + self.log(f'done waiting before reviving - iteration #{len(summary)}: {iteration_summary}') self.do_checks() self.switch_task() @@ -452,7 +479,7 @@ class NvmeofThrasher(Thrasher, Greenlet): if thrash_delay > 0.0: self.log(f'waiting for {thrash_delay} secs before thrashing') time.sleep(thrash_delay) # blocking - self.log('done waiting before thrashing') + self.log('done waiting before thrashing - everything should be up now') self.do_checks() self.switch_task() diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py index d8eac5d886f..96bcc770511 100644 --- a/qa/tasks/rados.py +++ b/qa/tasks/rados.py @@ -36,6 +36,8 @@ def task(ctx, config): write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED. This mean data don't access in the near future. Let osd backend don't keep data in cache. + pct_update_delay: delay before primary propogates pct on write pause, + defaults to 5s if balance_reads is set For example:: @@ -139,6 +141,7 @@ def task(ctx, config): object_size = int(config.get('object_size', 4000000)) op_weights = config.get('op_weights', {}) testdir = teuthology.get_testdir(ctx) + pct_update_delay = None args = [ 'adjust-ulimits', 'ceph-coverage', @@ -166,6 +169,7 @@ def task(ctx, config): args.extend(['--pool-snaps']) if config.get('balance_reads', False): args.extend(['--balance-reads']) + pct_update_delay = config.get('pct_update_delay', 5); if config.get('localize_reads', False): args.extend(['--localize-reads']) if config.get('max_attr_len', None): @@ -274,6 +278,10 @@ def task(ctx, config): if config.get('fast_read', False): manager.raw_cluster_cmd( 'osd', 'pool', 'set', pool, 'fast_read', 'true') + if pct_update_delay: + manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool, + 'pct_update_delay', str(pct_update_delay)); min_size = config.get('min_size', None); if min_size is not None: manager.raw_cluster_cmd( diff --git a/qa/tasks/radosgw_admin.py b/qa/tasks/radosgw_admin.py index 3b98702acca..fb82378761b 100644 --- a/qa/tasks/radosgw_admin.py +++ b/qa/tasks/radosgw_admin.py @@ -16,6 +16,7 @@ import logging import time import datetime import sys +import errno from io import StringIO from queue import Queue @@ -725,6 +726,40 @@ def task(ctx, config): (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--tenant', tenant_name, '--uid', 'tenanteduser'], check_status=True) + account_id = 'RGW12312312312312312' + account_name = 'testacct' + rgwadmin(ctx, client, [ + 'account', 'create', + '--account-id', account_id, + '--account-name', account_name, + ], check_status=True) + rgwadmin(ctx, client, [ + 'user', 'create', + '--account-id', account_id, + '--uid', 'testacctuser', + '--display-name', 'accountuser', + '--gen-access-key', + '--gen-secret', + ], check_status=True) + + # TESTCASE 'bucket link', 'bucket', 'account user', 'fails' + (err, out) = rgwadmin(ctx, client, ['bucket', 'link', 
'--bucket', bucket_name, '--uid', 'testacctuser']) + assert err == errno.EINVAL + + rgwadmin(ctx, client, ['user', 'rm', '--uid', 'testacctuser'], check_status=True) + + # TESTCASE 'bucket link', 'bucket', 'account', 'succeeds' + rgwadmin(ctx, client, + ['bucket', 'link', '--bucket', bucket_name, '--account-id', account_id], + check_status=True) + + # relink the bucket to the first user and delete the account + rgwadmin(ctx, client, + ['bucket', 'link', '--bucket', bucket_name, '--uid', user1], + check_status=True) + rgwadmin(ctx, client, ['account', 'rm', '--account-id', account_id], + check_status=True) + # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' # upload an object diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py index e83a54efc2b..f93ca017fa2 100644 --- a/qa/tasks/rgw_multisite.py +++ b/qa/tasks/rgw_multisite.py @@ -361,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config): if endpoints: # replace client names with their gateway endpoints config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) + if not config.get('api_name'): # otherwise it will be set to an empty string + config['api_name'] = config['name'] zonegroup = multisite.ZoneGroup(config['name'], period) # `zonegroup set` needs --default on command line, and 'is_master' in json args = is_default_arg(config) diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/qa/tasks/s3a_hadoop.py b/qa/tasks/s3a_hadoop.py index 7b77359fcf2..4518a6f397c 100644 --- a/qa/tasks/s3a_hadoop.py +++ b/qa/tasks/s3a_hadoop.py @@ -1,5 +1,6 @@ import contextlib import logging +import os from teuthology import misc from teuthology.orchestra import run @@ -40,7 +41,7 @@ def task(ctx, config): # get versions maven_major = config.get('maven-major', 'maven-3') - maven_version = config.get('maven-version', '3.6.3') + maven_version = config.get('maven-version', '3.9.9') hadoop_ver = config.get('hadoop-version', '2.9.2') bucket_name = config.get('bucket-name', 's3atest') access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F') @@ -48,11 +49,19 @@ def task(ctx, config): 'secret-key', 'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb') + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + # set versions for cloning the repo apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format( maven_version=maven_version) - maven_link = 'http://archive.apache.org/dist/maven/' + \ - '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, 
maven_version=maven_version) + apache_maven + maven_link = '{apache_mirror_url_front}/maven/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + \ + apache_maven hadoop_git = 'https://github.com/apache/hadoop' hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver) if hadoop_ver == 'trunk': @@ -204,6 +213,7 @@ def run_s3atest(client, maven_version, testdir, test_options): run.Raw('&&'), run.Raw(rm_test), run.Raw('&&'), + run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'), run.Raw(run_test), run.Raw(test_options) ] diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index 6d7b39d5892..85ab97d23cd 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -57,6 +57,17 @@ def download(ctx, config): 'git', 'reset', '--hard', sha1, ], ) + if client_config.get('boto3_extensions'): + ctx.cluster.only(client).run( + args=['mkdir', + '-p', + '/home/ubuntu/.aws/models/s3/2006-03-01/'] + ) + (remote,) = ctx.cluster.only(client).remotes.keys() + remote_file = '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json' + local_file = '{qadir}/../examples/rgw/boto3/service-2.sdk-extras.json'.format(qadir=ctx.config.get('suite_path')) + remote.put_file(local_file, remote_file) + try: yield finally: @@ -70,6 +81,17 @@ def download(ctx, config): '{tdir}/s3-tests-{client}'.format(tdir=testdir, client=client), ], ) + if client_config.get('boto3_extensions'): + ctx.cluster.only(client).run( + args=[ + 'rm', '-rf', '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json', + ], + ) + ctx.cluster.only(client).run( + args=[ + 'cd', '/home/ubuntu/', run.Raw('&&'), 'rmdir', '-p', '.aws/models/s3/2006-03-01/', + ], + ) def _config_user(s3tests_conf, section, user, email): @@ -444,8 +466,10 @@ def run_tests(ctx, config): attrs += ['not fails_with_subdomain'] if not client_config.get('with-sse-s3'): attrs += ['not sse_s3'] - + attrs += client_config.get('extra_attrs', []) + if 'bucket_logging' not in attrs: + attrs += ['not bucket_logging'] if 'unit_test_scan' in client_config and client_config['unit_test_scan']: xmlfile_id = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S--") + str(uuid.uuid4()) xmlpath= f'{testdir}/archive/s3test-{xmlfile_id}.xml' diff --git a/qa/tasks/s3tests_java.py b/qa/tasks/s3tests_java.py index 3e20e10d06c..a58aa6cf0b4 100644 --- a/qa/tasks/s3tests_java.py +++ b/qa/tasks/s3tests_java.py @@ -284,6 +284,7 @@ class S3tests_java(Task): args = ['cd', '{tdir}/s3-tests-java'.format(tdir=testdir), run.Raw('&&'), + run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'), '/opt/gradle/gradle/bin/gradle', 'clean', 'test', '--rerun-tasks', '--no-build-cache', ] diff --git a/qa/tasks/stretch_mode_disable_enable.py b/qa/tasks/stretch_mode_disable_enable.py new file mode 100644 index 00000000000..a84a85bb307 --- /dev/null +++ b/qa/tasks/stretch_mode_disable_enable.py @@ -0,0 +1,547 @@ +import logging +from tasks.mgr.mgr_test_case import MgrTestCase + +log = logging.getLogger(__name__) + +class TestStretchMode(MgrTestCase): + """ + Test the stretch mode feature of Ceph + """ + POOL = 'stretch_pool' + CLUSTER = "ceph" + WRITE_PERIOD = 10 + RECOVERY_PERIOD = WRITE_PERIOD * 6 + SUCCESS_HOLD_TIME = 7 + STRETCH_CRUSH_RULE = 'stretch_rule' + STRETCH_CRUSH_RULE_ID = None + STRETCH_BUCKET_TYPE = 'datacenter' + TIEBREAKER_MON_NAME = 'e' + DEFAULT_POOL_TYPE = 'replicated' + 
DEFAULT_POOL_CRUSH_RULE = 'replicated_rule' + DEFAULT_POOL_SIZE = 3 + DEFAULT_POOL_MIN_SIZE = 2 + DEFAULT_POOL_CRUSH_RULE_ID = None + # This dictionary maps the datacenter to the osd ids and hosts + DC_OSDS = { + 'dc1': { + "host01": [0, 1], + "host02": [2, 3], + }, + 'dc2': { + "host03": [4, 5], + "host04": [6, 7], + }, + } + DC_MONS = { + 'dc1': { + "host01": ['a'], + "host02": ['b'], + }, + 'dc2': { + "host03": ['c'], + "host04": ['d'], + }, + 'dc3': { + "host05": ['e'], + } + } + def _osd_count(self): + """ + Get the number of OSDs in the cluster. + """ + osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json() + return len(osd_map['osds']) + + def setUp(self): + """ + Setup the cluster and + ensure we have a clean condition before the test. + """ + # Ensure we have at least 6 OSDs + super(TestStretchMode, self).setUp() + self.DEFAULT_POOL_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.DEFAULT_POOL_CRUSH_RULE) + self.STRETCH_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.STRETCH_CRUSH_RULE) + if self._osd_count() < 4: + self.skipTest("Not enough OSDS!") + + # Remove any filesystems so that we can remove their pools + if self.mds_cluster: + self.mds_cluster.mds_stop() + self.mds_cluster.mds_fail() + self.mds_cluster.delete_all_filesystems() + + # Remove all other pools + for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']: + try: + self.mgr_cluster.mon_manager.remove_pool(pool['pool_name']) + except: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'delete', + pool['pool_name'], + pool['pool_name'], + '--yes-i-really-really-mean-it') + + def _setup_pool( + self, + pool_name=POOL, + pg_num=16, + pool_type=DEFAULT_POOL_TYPE, + crush_rule=DEFAULT_POOL_CRUSH_RULE, + size=None, + min_size=None + ): + """ + Create a pool, set its size and pool if specified. + """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', pool_name, str(pg_num), pool_type, crush_rule) + + if size is not None: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool_name, 'size', str(size)) + + if min_size is not None: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool_name, 'min_size', str(min_size)) + + def _write_some_data(self, t): + """ + Write some data to the pool to simulate a workload. + """ + args = [ + "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"] + self.mgr_cluster.admin_remote.run(args=args, wait=True) + + def _get_all_mons_from_all_dc(self): + """ + Get all mons from all datacenters. + """ + return [mon for dc in self.DC_MONS.values() for mons in dc.values() for mon in mons] + + def _bring_back_mon(self, mon): + """ + Bring back the mon. + """ + try: + self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart() + except Exception: + log.error("Failed to bring back mon.{}".format(str(mon))) + pass + + def _get_host(self, osd): + """ + Get the host of the osd. + """ + for dc, nodes in self.DC_OSDS.items(): + for node, osds in nodes.items(): + if osd in osds: + return node + return None + + def _move_osd_back_to_host(self, osd): + """ + Move the osd back to the host. + """ + host = self._get_host(osd) + assert host is not None, "The host of osd {} is not found.".format(osd) + log.debug("Moving osd.%d back to %s", osd, host) + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'crush', 'move', 'osd.{}'.format(str(osd)), + 'host={}'.format(host) + ) + + def tearDown(self): + """ + Clean up the cluster after the test. 
+ """ + # Remove the pool + if self.POOL in self.mgr_cluster.mon_manager.pools: + self.mgr_cluster.mon_manager.remove_pool(self.POOL) + + osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json() + for osd in osd_map['osds']: + # mark all the osds in + if osd['weight'] == 0.0: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'in', str(osd['osd'])) + # Bring back all the osds and move it back to the host. + if osd['up'] == 0: + self.mgr_cluster.mon_manager.revive_osd(osd['osd']) + self._move_osd_back_to_host(osd['osd']) + + # Bring back all the mons + mons = self._get_all_mons_from_all_dc() + for mon in mons: + self._bring_back_mon(mon) + super(TestStretchMode, self).tearDown() + + def _kill_osd(self, osd): + """ + Kill the osd. + """ + try: + self.ctx.daemons.get_daemon('osd', osd, self.CLUSTER).stop() + except Exception: + log.error("Failed to stop osd.{}".format(str(osd))) + pass + + def _get_osds_data(self, want_osds): + """ + Get the osd data + """ + all_osds_data = \ + self.mgr_cluster.mon_manager.get_osd_dump_json()['osds'] + return [ + osd_data for osd_data in all_osds_data + if int(osd_data['osd']) in want_osds + ] + + def _get_osds_by_dc(self, dc): + """ + Get osds by datacenter. + """ + ret = [] + for host, osds in self.DC_OSDS[dc].items(): + ret.extend(osds) + return ret + + def _fail_over_all_osds_in_dc(self, dc): + """ + Fail over all osds in specified <datacenter> + """ + if not isinstance(dc, str): + raise ValueError("dc must be a string") + if dc not in self.DC_OSDS: + raise ValueError( + "dc must be one of the following: %s" % self.DC_OSDS.keys() + ) + log.debug("Failing over all osds in %s", dc) + osds = self._get_osds_by_dc(dc) + # fail over all the OSDs in the DC + log.debug("OSDs to failed over: %s", osds) + for osd_id in osds: + self._kill_osd(osd_id) + # wait until all the osds are down + self.wait_until_true( + lambda: all([int(osd['up']) == 0 + for osd in self._get_osds_data(osds)]), + timeout=self.RECOVERY_PERIOD + ) + + def _check_mons_out_of_quorum(self, want_mons): + """ + Check if the mons are not in quorum. + """ + quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names() + return all([mon not in quorum_names for mon in want_mons]) + + def _kill_mon(self, mon): + """ + Kill the mon. + """ + try: + self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).stop() + except Exception: + log.error("Failed to stop mon.{}".format(str(mon))) + pass + + def _get_mons_by_dc(self, dc): + """ + Get mons by datacenter. + """ + ret = [] + for host, mons in self.DC_MONS[dc].items(): + ret.extend(mons) + return ret + + def _fail_over_all_mons_in_dc(self, dc): + """ + Fail over all mons in the specified <datacenter> + """ + if not isinstance(dc, str): + raise ValueError("dc must be a string") + if dc not in self.DC_MONS: + raise ValueError("dc must be one of the following: %s" % + ", ".join(self.DC_MONS.keys())) + log.debug("Failing over all mons %s", dc) + mons = self._get_mons_by_dc(dc) + log.debug("Mons to be failed over: %s", mons) + for mon in mons: + self._kill_mon(mon) + # wait until all the mons are out of quorum + self.wait_until_true( + lambda: self._check_mons_out_of_quorum(mons), + timeout=self.RECOVERY_PERIOD + ) + + def _stretch_mode_enabled_correctly(self): + """ + Evaluate whether the stretch mode is enabled correctly. + by checking the OSDMap and MonMap. 
+ """ + # Checking the OSDMap + osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json() + for pool in osdmap['pools']: + # expects crush_rule to be stretch_rule + self.assertEqual( + self.STRETCH_CRUSH_RULE_ID, + pool['crush_rule'] + ) + # expects pool size to be 4 + self.assertEqual( + 4, + pool['size'] + ) + # expects pool min_size to be 2 + self.assertEqual( + 2, + pool['min_size'] + ) + # expects pool is_stretch_pool flag to be true + self.assertEqual( + True, + pool['is_stretch_pool'] + ) + # expects peering_crush_bucket_count = 2 (always this value for stretch mode) + self.assertEqual( + 2, + pool['peering_crush_bucket_count'] + ) + # expects peering_crush_bucket_target = 2 (always this value for stretch mode) + self.assertEqual( + 2, + pool['peering_crush_bucket_target'] + ) + # expects peering_crush_bucket_barrier = 8 (crush type of datacenter is 8) + self.assertEqual( + 8, + pool['peering_crush_bucket_barrier'] + ) + # expects stretch_mode_enabled to be True + self.assertEqual( + True, + osdmap['stretch_mode']['stretch_mode_enabled'] + ) + # expects stretch_mode_bucket_count to be 2 + self.assertEqual( + 2, + osdmap['stretch_mode']['stretch_bucket_count'] + ) + # expects degraded_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['degraded_stretch_mode'] + ) + # expects recovering_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['recovering_stretch_mode'] + ) + # expects stretch_mode_bucket to be 8 (datacenter crush type = 8) + self.assertEqual( + 8, + osdmap['stretch_mode']['stretch_mode_bucket'] + ) + # Checking the MonMap + monmap = self.mgr_cluster.mon_manager.get_mon_dump_json() + # expects stretch_mode to be True + self.assertEqual( + True, + monmap['stretch_mode'] + ) + # expects disallowed_leaders to be tiebreaker_mon + self.assertEqual( + self.TIEBREAKER_MON_NAME, + monmap['disallowed_leaders'] + ) + # expects tiebreaker_mon to be tiebreaker_mon + self.assertEqual( + self.TIEBREAKER_MON_NAME, + monmap['tiebreaker_mon'] + ) + + def _stretch_mode_disabled_correctly(self): + """ + Evaluate whether the stretch mode is disabled correctly. + by checking the OSDMap and MonMap. 
+ """ + # Checking the OSDMap + osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json() + for pool in osdmap['pools']: + # expects crush_rule to be default + self.assertEqual( + self.DEFAULT_POOL_CRUSH_RULE_ID, + pool['crush_rule'] + ) + # expects pool size to be default + self.assertEqual( + self.DEFAULT_POOL_SIZE, + pool['size'] + ) + # expects pool min_size to be default + self.assertEqual( + self.DEFAULT_POOL_MIN_SIZE, + pool['min_size'] + ) + # expects pool is_stretch_pool flag to be false + self.assertEqual( + False, + pool['is_stretch_pool'] + ) + # expects peering_crush_bucket_count = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_count'] + ) + # expects peering_crush_bucket_target = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_target'] + ) + # expects peering_crush_bucket_barrier = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_barrier'] + ) + # expects stretch_mode_enabled to be False + self.assertEqual( + False, + osdmap['stretch_mode']['stretch_mode_enabled'] + ) + # expects stretch_mode_bucket to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['stretch_bucket_count'] + ) + # expects degraded_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['degraded_stretch_mode'] + ) + # expects recovering_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['recovering_stretch_mode'] + ) + # expects stretch_mode_bucket to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['stretch_mode_bucket'] + ) + # Checking the MonMap + monmap = self.mgr_cluster.mon_manager.get_mon_dump_json() + # expects stretch_mode to be False + self.assertEqual( + False, + monmap['stretch_mode'] + ) + # expects disallowed_leaders to be empty + self.assertEqual( + "", + monmap['disallowed_leaders'] + ) + # expects tiebreaker_mon to be empty + self.assertEqual( + "", + monmap['tiebreaker_mon'] + ) + + def test_disable_stretch_mode(self): + """ + Test disabling stretch mode with the following scenario: + 1. Healthy Stretch Mode + 2. 
Degraded Stretch Mode + """ + # Create a pool + self._setup_pool(self.POOL, 16, 'replicated', self.STRETCH_CRUSH_RULE, 4, 2) + # Write some data to the pool + self._write_some_data(self.WRITE_PERIOD) + # disable stretch mode without --yes-i-really-mean-it (expects -EPERM 1) + self.assertEqual( + 1, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode' + )) + # Disable stretch mode with non-existent crush rule (expects -EINVAL 22) + self.assertEqual( + 22, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + 'non_existent_rule', + '--yes-i-really-mean-it' + )) + # Disable stretch mode with the current stretch rule (expect -EINVAL 22) + self.assertEqual( + 22, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + self.STRETCH_CRUSH_RULE, + '--yes-i-really-mean-it', + + )) + # Disable stretch mode without crush rule (expect success 0) + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + '--yes-i-really-mean-it' + )) + # Check if stretch mode is disabled correctly + self._stretch_mode_disabled_correctly() + # all PGs are active + clean + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # write some data to the pool + self._write_some_data(self.WRITE_PERIOD) + # Enable stretch mode + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'enable_stretch_mode', + self.TIEBREAKER_MON_NAME, + self.STRETCH_CRUSH_RULE, + self.STRETCH_BUCKET_TYPE + )) + self._stretch_mode_enabled_correctly() + # all PGs are active + clean + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # write some data to the pool + # self._write_some_data(self.WRITE_PERIOD) + # Bring down dc1 + self._fail_over_all_osds_in_dc('dc1') + self._fail_over_all_mons_in_dc('dc1') + # should be in degraded stretch mode + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.is_degraded_stretch_mode(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # Disable stretch mode with valid crush rule (expect success 0) + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + self.DEFAULT_POOL_CRUSH_RULE, + '--yes-i-really-mean-it' + )) + # Check if stretch mode is disabled correctly + self._stretch_mode_disabled_correctly() + # all PGs are active + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml index b70583a75e1..dbde1ced0db 100644 --- a/qa/tasks/thrashosds-health.yaml +++ b/qa/tasks/thrashosds-health.yaml @@ -30,3 +30,4 @@ overrides: - out of quorum - noscrub - nodeep-scrub + - is down diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py index ca929ba05b4..2ed21431330 100644 --- a/qa/tasks/vstart_runner.py +++ b/qa/tasks/vstart_runner.py @@ -233,6 +233,11 @@ class LocalRemoteProcess(object): else: self.stderr.write(err) + def _handle_subprocess_output(self, output, stream): + if isinstance(stream, StringIO): + return rm_nonascii_chars(output) + return output + def wait(self, 
timeout=None): # Null subproc.stdin so communicate() does not try flushing/closing it # again. @@ -250,7 +255,8 @@ class LocalRemoteProcess(object): return out, err = self.subproc.communicate(timeout=timeout) - out, err = rm_nonascii_chars(out), rm_nonascii_chars(err) + out = self._handle_subprocess_output(out, self.stdout) + err = self._handle_subprocess_output(err, self.stderr) self._write_stdout(out) self._write_stderr(err) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index cdfff17d837..ad5950367e9 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -63,7 +63,7 @@ function retry_eagain() for count in $(seq 1 $max) ; do status=0 "$@" > $tmpfile 2>&1 || status=$? - if test $status = 0 || + if test $status = 0 || ! grep --quiet EAGAIN $tmpfile ; then break fi @@ -108,7 +108,7 @@ function check_response() exit 1 fi - if ! grep --quiet -- "$expected_string" $TMPFILE ; then + if ! grep --quiet -- "$expected_string" $TMPFILE ; then echo "Didn't find $expected_string in output" >&2 cat $TMPFILE >&2 exit 1 @@ -696,7 +696,7 @@ function test_auth_profiles() ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-ro ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-rw - + # add a new role-definer with the existing role-definer ceph -n client.xx-profile-rd -k client.xx.keyring \ auth add client.xx-profile-rd2 mon 'allow profile role-definer' @@ -730,7 +730,7 @@ function test_mon_caps() ceph-authtool -n client.bug --cap mon '' $TEMP_DIR/ceph.client.bug.keyring ceph auth add client.bug -i $TEMP_DIR/ceph.client.bug.keyring rados lspools --no-mon-config --keyring $TEMP_DIR/ceph.client.bug.keyring -n client.bug >& $TMPFILE || true - check_response "Permission denied" + check_response "Permission denied" } function test_mon_misc() @@ -780,7 +780,6 @@ function test_mon_misc() ceph mgr dump ceph mgr dump | jq -e '.active_clients[0].name' ceph mgr module ls - ceph mgr module enable restful expect_false ceph mgr module enable foodne ceph mgr module enable foodne --force ceph mgr module disable foodne @@ -1650,7 +1649,7 @@ function test_mon_osd() dump_json=$(ceph osd dump --format=json | \ jq -cM '.osds[] | select(.osd == 0)') [[ "${info_json}" == "${dump_json}" ]] - + info_plain="$(ceph osd info)" dump_plain="$(ceph osd dump | grep '^osd')" [[ "${info_plain}" == "${dump_plain}" ]] @@ -2244,7 +2243,7 @@ function test_mon_pg() # tell osd version # ceph tell osd.0 version - expect_false ceph tell osd.9999 version + expect_false ceph tell osd.9999 version expect_false ceph tell osd.foo version # back to pg stuff @@ -2336,7 +2335,7 @@ function test_mon_osd_pool_set() ceph osd pool get $TEST_POOL_GETSET deep_scrub_interval | expect_false grep '.' ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.' - ceph osd pool set $TEST_POOL_GETSET recovery_priority 5 + ceph osd pool set $TEST_POOL_GETSET recovery_priority 5 ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: 5' ceph osd pool set $TEST_POOL_GETSET recovery_priority -5 ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: -5' @@ -2346,13 +2345,13 @@ function test_mon_osd_pool_set() expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 11 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.' 
- ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5 + ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | grep 'recovery_op_priority: 5' ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 0 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.' ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.' - ceph osd pool set $TEST_POOL_GETSET scrub_priority 5 + ceph osd pool set $TEST_POOL_GETSET scrub_priority 5 ceph osd pool get $TEST_POOL_GETSET scrub_priority | grep 'scrub_priority: 5' ceph osd pool set $TEST_POOL_GETSET scrub_priority 0 ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.' @@ -2386,10 +2385,10 @@ function test_mon_osd_pool_set() ceph osd pool set $TEST_POOL_GETSET size 2 wait_for_clean ceph osd pool set $TEST_POOL_GETSET min_size 2 - + expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 0 ceph osd pool set $TEST_POOL_GETSET hashpspool 0 --yes-i-really-mean-it - + expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 1 ceph osd pool set $TEST_POOL_GETSET hashpspool 1 --yes-i-really-mean-it @@ -2587,7 +2586,7 @@ function test_mon_osd_misc() ceph osd map 2>$TMPFILE; check_response 'pool' $? 22 # expect error about unused argument foo - ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22 + ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22 # expect "not in range" for invalid overload percentage ceph osd reweight-by-utilization 80 2>$TMPFILE; check_response 'higher than 100' $? 22 diff --git a/qa/workunits/erasure-code/bench.sh b/qa/workunits/erasure-code/bench.sh index fc75830dfd0..87e997c3500 100755 --- a/qa/workunits/erasure-code/bench.sh +++ b/qa/workunits/erasure-code/bench.sh @@ -17,7 +17,8 @@ # # Test that it works from sources with: # -# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ +# TOTAL_SIZE=$((4 * 1024 * 1024)) SIZE=4096 \ +# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=build/lib \ # qa/workunits/erasure-code/bench.sh fplot jerasure | # tee qa/workunits/erasure-code/bench.js @@ -34,10 +35,14 @@ # firefox qa/workunits/erasure-code/bench.html # # Once it is confirmed to work, it can be run with a more significant -# volume of data so that the measures are more reliable: +# volume of data so that the measures are more reliable. Ideally the size +# of the buffers (SIZE) should be larger than the L3 cache to avoid cache hits. +# The following example uses an 80MB (80 * 1024 * 1024) buffer. 
+# A larger buffer with fewer iterations (iterations = TOTAL SIZE / SIZE) should result in +# more time spent encoding/decoding and less time allocating/aligning buffers: # -# TOTAL_SIZE=$((4 * 1024 * 1024 * 1024)) \ -# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ +# TOTAL_SIZE=$((100 * 80 * 1024 * 1024)) SIZE=$((80 * 1024 * 1024)) \ +# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=build/lib \ # qa/workunits/erasure-code/bench.sh fplot jerasure | # tee qa/workunits/erasure-code/bench.js @@ -51,8 +56,8 @@ export PATH=/sbin:$PATH : ${PLUGIN_DIRECTORY:=/usr/lib/ceph/erasure-code} : ${PLUGINS:=isa jerasure} : ${TECHNIQUES:=vandermonde cauchy liberation reed_sol_r6_op blaum_roth liber8tion} -: ${TOTAL_SIZE:=$((1024 * 1024))} -: ${SIZE:=4096} +: ${TOTAL_SIZE:=$((100 * 80 * 1024 * 1024))} #TOTAL_SIZE / SIZE = number of encode or decode iterations to run +: ${SIZE:=$((80 * 1024 * 1024))} #size of buffer to encode/decode : ${PARAMETERS:=--parameter jerasure-per-chunk-alignment=true} declare -rA isa_techniques=( diff --git a/qa/workunits/fs/misc/fallocate.sh b/qa/workunits/fs/misc/fallocate.sh new file mode 100755 index 00000000000..253e6cb7a37 --- /dev/null +++ b/qa/workunits/fs/misc/fallocate.sh @@ -0,0 +1,17 @@ +#!/bin/sh -x + +# fallocate with mode 0 should fail with EOPNOTSUPP +set -e +mkdir -p testdir +cd testdir + +expect_failure() { + if "$@"; then return 1; else return 0; fi +} + +expect_failure fallocate -l 1M preallocated.txt +rm -f preallocated.txt + +cd .. +rmdir testdir +echo OK diff --git a/qa/workunits/fs/snaps/snaptest-git-ceph.sh b/qa/workunits/fs/snaps/snaptest-git-ceph.sh index 2b38720c9a5..6079ba8945b 100755 --- a/qa/workunits/fs/snaps/snaptest-git-ceph.sh +++ b/qa/workunits/fs/snaps/snaptest-git-ceph.sh @@ -4,7 +4,14 @@ set -e # increase the cache size sudo git config --global http.sslVerify false -sudo git config --global http.postBuffer 1048576000 +sudo git config --global http.postBuffer 1024MB # default is 1MB +sudo git config --global http.maxRequestBuffer 100M # default is 10MB +sudo git config --global core.compression 0 + +# enable the debug logs for git clone +export GIT_TRACE_PACKET=1 +export GIT_TRACE=1 +export GIT_CURL_VERBOSE=1 # try it again if the clone is slow and the second time retried=false @@ -19,6 +26,11 @@ timeout 1800 git clone https://git.ceph.com/ceph.git trap - EXIT cd ceph +# disable the debug logs for git clone +export GIT_TRACE_PACKET=0 +export GIT_TRACE=0 +export GIT_CURL_VERBOSE=0 + versions=`seq 1 90` for v in $versions diff --git a/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh new file mode 100755 index 00000000000..827fb0a0b13 --- /dev/null +++ b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh @@ -0,0 +1,72 @@ +#!/bin/bash -ex + +# A bash script for setting up stretch mode with 5 monitors and 8 OSDs. 
+ +NUM_OSDS_UP=$(ceph osd df | grep "up" | wc -l) + +if [ $NUM_OSDS_UP -lt 8 ]; then + echo "test requires at least 8 OSDs up and running" + exit 1 +fi + +# ensure election strategy is set to "connectivity" +# See https://tracker.ceph.com/issues/69107 +ceph mon set election_strategy connectivity + +for dc in dc1 dc2 + do + ceph osd crush add-bucket $dc datacenter + ceph osd crush move $dc root=default + done + +ceph osd crush add-bucket host01 host +ceph osd crush add-bucket host02 host +ceph osd crush add-bucket host03 host +ceph osd crush add-bucket host04 host + +ceph osd crush move host01 datacenter=dc1 +ceph osd crush move host02 datacenter=dc1 +ceph osd crush move host03 datacenter=dc2 +ceph osd crush move host04 datacenter=dc2 + +ceph osd crush move osd.0 host=host01 +ceph osd crush move osd.1 host=host01 +ceph osd crush move osd.2 host=host02 +ceph osd crush move osd.3 host=host02 +ceph osd crush move osd.4 host=host03 +ceph osd crush move osd.5 host=host03 +ceph osd crush move osd.6 host=host04 +ceph osd crush move osd.7 host=host04 + +# set location for monitors +ceph mon set_location a datacenter=dc1 host=host01 +ceph mon set_location b datacenter=dc1 host=host02 +ceph mon set_location c datacenter=dc2 host=host03 +ceph mon set_location d datacenter=dc2 host=host04 + +# set location for tiebreaker monitor +ceph mon set_location e datacenter=dc3 host=host05 + +# remove the current host from crush map +hostname=$(hostname -s) +ceph osd crush remove $hostname +# create a new crush rule with stretch rule +ceph osd getcrushmap > crushmap +crushtool --decompile crushmap > crushmap.txt +sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt +cat >> crushmap_modified.txt << EOF +rule stretch_rule { + id 2 + type replicated + step take default + step choose firstn 2 type datacenter + step chooseleaf firstn 2 type host + step emit +} +# end crush map +EOF + +crushtool --compile crushmap_modified.txt -o crushmap.bin +ceph osd setcrushmap -i crushmap.bin + +ceph mon enable_stretch_mode e stretch_rule datacenter diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index dc6fd1669da..9e7a1f5134e 100755 --- a/qa/workunits/nvmeof/basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -38,8 +38,10 @@ disconnect_all() { connect_all() { sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600 sleep 5 - output=$(sudo nvme list --output-format=json) - if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then + expected_devices_count=$1 + actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l) + if [ "$actual_devices" -ne "$expected_devices_count" ]; then + sudo nvme list --output-format=json return 1 fi } @@ -72,11 +74,13 @@ test_run connect test_run list_subsys 1 test_run disconnect_all test_run list_subsys 0 -test_run connect_all +devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT )) +test_run connect_all $devices_count gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) test_run list_subsys $multipath_count + echo "-------------Test Summary-------------" echo "[nvmeof] All nvmeof basic tests passed!" 
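For context on the connect_all() change above: the test no longer just greps the SPDK controller string, it derives the expected namespace count from the suite environment and compares it against what the initiator actually sees. Below is a minimal standalone sketch of that same check, assuming /etc/ceph/nvmeof.env exports NVMEOF_SUBSYSTEMS_COUNT, NVMEOF_NAMESPACES_COUNT and SPDK_CONTROLLER as the suite does; the jq filter is the one used in the patch.

    #!/bin/bash
    # Sketch: verify every expected namespace is visible on the initiator.
    source /etc/ceph/nvmeof.env
    expected=$(( NVMEOF_NAMESPACES_COUNT * NVMEOF_SUBSYSTEMS_COUNT ))
    # Count namespaces whose controllers all report the SPDK model string.
    actual=$(sudo nvme list --output-format=json |
        jq -r ".Devices[].Subsystems[]
               | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\"))
               | .Namespaces[].NameSpace" | wc -l)
    if [ "$actual" -ne "$expected" ]; then
        echo "expected $expected namespaces, found $actual" >&2
        exit 1
    fi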
diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 57d355a6318..f7f783afc67 100755 --- a/qa/workunits/nvmeof/fio_test.sh +++ b/qa/workunits/nvmeof/fio_test.sh @@ -5,6 +5,7 @@ sudo yum -y install sysstat namespace_range_start= namespace_range_end= +random_devices_count= rbd_iostat=false while [[ $# -gt 0 ]]; do @@ -17,6 +18,10 @@ while [[ $# -gt 0 ]]; do namespace_range_end=$2 shift 2 ;; + --random_devices) + random_devices_count=$2 + shift 2 + ;; --rbd_iostat) rbd_iostat=true shift @@ -29,7 +34,7 @@ done fio_file=$(mktemp -t nvmeof-fio-XXXX) all_drives_list=$(sudo nvme list --output-format=json | - jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath') + jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace') # When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), # then fio runs on namespaces only in the defined range (which is 1 to 3 here). @@ -37,6 +42,8 @@ all_drives_list=$(sudo nvme list --output-format=json | # run on first 3 namespaces here. if [ "$namespace_range_start" ] || [ "$namespace_range_end" ]; then selected_drives=$(echo "${all_drives_list[@]}" | sed -n "${namespace_range_start},${namespace_range_end}p") +elif [ "$random_devices_count" ]; then + selected_drives=$(echo "${all_drives_list[@]}" | shuf -n $random_devices_count) else selected_drives="${all_drives_list[@]}" fi diff --git a/qa/workunits/nvmeof/mtls_test.sh b/qa/workunits/nvmeof/mtls_test.sh new file mode 100755 index 00000000000..e13ca530e8d --- /dev/null +++ b/qa/workunits/nvmeof/mtls_test.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -ex +source /etc/ceph/nvmeof.env + +# install yq +wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /tmp/yq && chmod +x /tmp/yq + +subjectAltName=$(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | sed 's/,/,IP:/g') + +# create mtls spec files +ceph orch ls nvmeof --export > /tmp/gw-conf-original.yaml +sudo /tmp/yq ".spec.enable_auth=true | \ + .spec.root_ca_cert=\"mountcert\" | \ + .spec.client_cert = load_str(\"/etc/ceph/client.crt\") | \ + .spec.client_key = load_str(\"/etc/ceph/client.key\") | \ + .spec.server_cert = load_str(\"/etc/ceph/server.crt\") | \ + .spec.server_key = load_str(\"/etc/ceph/server.key\")" /tmp/gw-conf-original.yaml > /tmp/gw-conf-with-mtls.yaml +cp /tmp/gw-conf-original.yaml /tmp/gw-conf-without-mtls.yaml +sudo /tmp/yq '.spec.enable_auth=false' -i /tmp/gw-conf-without-mtls.yaml + +wait_for_service() { + MAX_RETRIES=30 + for ((RETRY_COUNT=1; RETRY_COUNT<=MAX_RETRIES; RETRY_COUNT++)); do + + if ceph orch ls --refresh | grep -q "nvmeof"; then + echo "Found nvmeof in the output!" + break + fi + if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then + echo "Reached maximum retries ($MAX_RETRIES). Exiting." 
+ break + fi + sleep 5 + done + ceph orch ps + ceph orch ls --refresh +} + +# deploy mtls +cat /tmp/gw-conf-with-mtls.yaml +ceph orch apply -i /tmp/gw-conf-with-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -v /etc/ceph/server.crt:/server.crt:z -v /etc/ceph/client.crt:/client.crt:z \ + -v /etc/ceph/client.key:/client.key:z \ + -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --client-key /client.key --client-cert /client.crt --server-cert /server.crt --format json subsystem list +done + + +# remove mtls +cat /tmp/gw-conf-without-mtls.yaml +ceph orch apply -i /tmp/gw-conf-without-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --format json subsystem list +done + diff --git a/qa/workunits/nvmeof/scalability_test.sh b/qa/workunits/nvmeof/scalability_test.sh index 5a26b6284f7..8ede4b7eda2 100755 --- a/qa/workunits/nvmeof/scalability_test.sh +++ b/qa/workunits/nvmeof/scalability_test.sh @@ -3,37 +3,64 @@ GATEWAYS=$1 # exmaple "nvmeof.a,nvmeof.b" DELAY="${SCALING_DELAYS:-50}" +POOL="${RBD_POOL:-mypool}" +GROUP="${NVMEOF_GROUP:-mygroup0}" +source /etc/ceph/nvmeof.env if [ -z "$GATEWAYS" ]; then echo "At least one gateway needs to be defined for scalability test" exit 1 fi -pip3 install yq - status_checks() { - ceph nvme-gw show mypool '' - ceph orch ls - ceph orch ps - ceph -s + expected_count=$1 + + output=$(ceph nvme-gw show $POOL $GROUP) + nvme_show=$(echo $output | grep -o '"AVAILABLE"' | wc -l) + if [ "$nvme_show" -ne "$expected_count" ]; then + return 1 + fi + + orch_ls=$(ceph orch ls) + if ! echo "$orch_ls" | grep -q "$expected_count/$expected_count"; then + return 1 + fi + + output=$(ceph orch ps --service-name nvmeof.$POOL.$GROUP) + orch_ps=$(echo $output | grep -o 'running' | wc -l) + if [ "$orch_ps" -ne "$expected_count" ]; then + return 1 + fi + + ceph_status=$(ceph -s) + if ! echo "$ceph_status" | grep -q "HEALTH_OK"; then + return 1 + fi } +total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) +scaled_down_gateways_count=$(( total_gateways_count - $(echo "$GATEWAYS" | tr -cd ',' | wc -c) - 1 )) + echo "[nvmeof.scale] Setting up config to remove gateways ${GATEWAYS}" +ceph orch ls --service-name nvmeof.$POOL.$GROUP --export > /tmp/nvmeof-gw.yaml ceph orch ls nvmeof --export > /tmp/nvmeof-gw.yaml cat /tmp/nvmeof-gw.yaml -yq "del(.placement.hosts[] | select(. 
| test(\".*($(echo $GATEWAYS | sed 's/,/|/g'))\")))" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml + +pattern=$(echo $GATEWAYS | sed 's/,/\\|/g') +sed "/$pattern/d" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml cat /tmp/nvmeof-gw-new.yaml echo "[nvmeof.scale] Starting scale testing by removing ${GATEWAYS}" -status_checks -ceph orch rm nvmeof.mypool && sleep 20 # temp workaround +status_checks $total_gateways_count ceph orch apply -i /tmp/nvmeof-gw-new.yaml # downscale +ceph orch redeploy nvmeof.$POOL.$GROUP sleep $DELAY -status_checks -ceph orch rm nvmeof.mypool && sleep 20 # temp workaround +status_checks $scaled_down_gateways_count +echo "[nvmeof.scale] Downscale complete - removed gateways (${GATEWAYS}); now scaling back up" ceph orch apply -i /tmp/nvmeof-gw.yaml #upscale +ceph orch redeploy nvmeof.$POOL.$GROUP sleep $DELAY -status_checks +status_checks $total_gateways_count echo "[nvmeof.scale] Scale testing passed for ${GATEWAYS}" diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh index cc4024323eb..b573647b1e3 100755 --- a/qa/workunits/nvmeof/setup_subsystem.sh +++ b/qa/workunits/nvmeof/setup_subsystem.sh @@ -26,14 +26,21 @@ list_subsystems () { done } +list_namespaces () { + for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do + subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn + done +} + +echo "[nvmeof] Starting subsystem setup..." + # add all subsystems for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append done -list_subsystems - # add all gateway listeners for i in "${!gateway_ips[@]}" do @@ -65,11 +72,5 @@ done list_subsystems -# list namespaces -for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do - subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn -done - echo "[nvmeof] Subsystem setup done" diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 2aa27d3d655..0ceb9ff54cf 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -914,6 +914,11 @@ test_namespace() { rbd group create rbd/test1/group1 rbd group image add rbd/test1/group1 rbd/test1/image1 + rbd group image add --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image2 + rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image1 + rbd group image rm rbd/test1/group1 rbd/test1/image2 rbd group rm rbd/test1/group1 rbd trash move rbd/test1/image1 diff --git a/qa/workunits/rbd/cli_migration.sh b/qa/workunits/rbd/cli_migration.sh index b044e747cbb..3af19420957 100755 --- a/qa/workunits/rbd/cli_migration.sh +++ b/qa/workunits/rbd/cli_migration.sh @@ -5,12 +5,16 @@ TEMPDIR= IMAGE1=image1 IMAGE2=image2 IMAGE3=image3 -IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3}" +NAMESPACE1=namespace1 +NAMESPACE2=namespace2 +NAMESPACES="${NAMESPACE1} ${NAMESPACE2}" +IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3} rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}" cleanup() { 
kill_nbd_server cleanup_tempdir remove_images + remove_namespaces } setup_tempdir() { @@ -42,8 +46,11 @@ create_base_image() { export_raw_image() { local image=$1 - rm -rf "${TEMPDIR}/${image}" - rbd export ${image} "${TEMPDIR}/${image}" + # Replace slashes (/) with underscores (_) for namespace images + local export_image="${image//\//_}" + + rm -rf "${TEMPDIR}/${export_image}" + rbd export "${image}" "${TEMPDIR}/${export_image}" } export_base_image() { @@ -69,6 +76,13 @@ remove_images() { done } +remove_namespaces() { + for namespace in ${NAMESPACES} + do + rbd namespace remove rbd/${namespace} || true + done +} + kill_nbd_server() { pkill -9 qemu-nbd || true } @@ -90,6 +104,11 @@ compare_images() { local ret=0 export_raw_image ${dst_image} + + # Replace slashes (/) with underscores (_) for namespace images + src_image="${src_image//\//_}" + dst_image="${dst_image//\//_}" + if ! cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" then show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" @@ -99,18 +118,26 @@ compare_images() { } test_import_native_format() { - local base_image=$1 - local dest_image=$2 + local base_image_spec=$1 + local dest_image_spec=$2 + + # if base image is from namespace + local base_namespace="" + local base_image=${base_image_spec} + if [[ "${base_image_spec}" == rbd/*/* ]]; then + base_namespace=$(basename "$(dirname "${base_image_spec}")") + base_image=$(basename "${base_image_spec}") + fi - rbd migration prepare --import-only "rbd/${base_image}@2" ${dest_image} - rbd migration abort ${dest_image} + rbd migration prepare --import-only "${base_image_spec}@2" ${dest_image_spec} + rbd migration abort ${dest_image_spec} local pool_id=$(ceph osd pool ls detail --format xml | xmlstarlet sel -t -v "//pools/pool[pool_name='rbd']/pool_id") cat > ${TEMPDIR}/spec.json <<EOF { "type": "native", "pool_id": ${pool_id}, - "pool_namespace": "", + "pool_namespace": "${base_namespace}", "image_name": "${base_image}", "snap_name": "2" } @@ -118,85 +145,85 @@ EOF cat ${TEMPDIR}/spec.json rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - rbd migration abort ${dest_image} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} - rbd migration execute ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} + rbd migration execute ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - rbd migration abort ${dest_image} + rbd migration abort ${dest_image_spec} # no snap name or snap id expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\"}" \ + ${dest_image_spec} # invalid source spec JSON expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": 
${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \ + ${dest_image_spec} # non-existing snap name expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \ + ${dest_image_spec} # invalid snap name expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": 123456}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": 123456}" \ + ${dest_image_spec} # non-existing snap id passed as int expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \ + ${dest_image_spec} # non-existing snap id passed as string expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \ + ${dest_image_spec} # invalid snap id expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \ + ${dest_image_spec} # snap id passed as int - local snap_id=$(rbd snap ls ${base_image} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id") + local snap_id=$(rbd snap ls ${base_image_spec} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id") rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} # snap id passed as string rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \ + ${dest_image_spec} + rbd migration abort 
${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration execute ${dest_image} - rbd migration commit ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + ${dest_image_spec} + rbd migration execute ${dest_image_spec} + rbd migration commit ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - remove_image "${dest_image}" + remove_image "${dest_image_spec}" } test_import_qcow_format() { @@ -337,12 +364,12 @@ EOF cat ${TEMPDIR}/spec.json cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \ - --source-spec-path - ${dest_image} + --source-spec-path - ${dest_image} compare_images ${base_image} ${dest_image} rbd migration abort ${dest_image} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} rbd migration execute ${dest_image} rbd migration commit ${dest_image} @@ -587,4 +614,18 @@ test_import_nbd_stream_qcow2 ${IMAGE2} ${IMAGE3} test_import_raw_format ${IMAGE1} ${IMAGE2} test_import_nbd_stream_raw ${IMAGE1} ${IMAGE2} +rbd namespace create rbd/${NAMESPACE1} +rbd namespace create rbd/${NAMESPACE2} +create_base_image rbd/${NAMESPACE1}/${IMAGE1} +export_base_image rbd/${NAMESPACE1}/${IMAGE1} + +# Migration from namespace to namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} + +# Migration from namespace to non-namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} ${IMAGE2} + +# Migration from non-namespace to namespace +test_import_native_format ${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} + echo OK diff --git a/qa/workunits/rest/test-restful.sh b/qa/workunits/rest/test-restful.sh deleted file mode 100755 index fde0d107a0b..00000000000 --- a/qa/workunits/rest/test-restful.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -ex - -mydir=`dirname $0` - -secret=`ceph config-key get mgr/restful/keys/admin` -url=$(ceph mgr dump|jq -r .services.restful|sed -e 's/\/$//') -echo "url $url secret $secret" -$mydir/test_mgr_rest_api.py $url $secret - -echo $0 OK |
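The cli_migration.sh changes above extend the native-format import test to RBD namespaces: the source-spec JSON now carries pool_namespace, and the test matrix covers namespace to namespace, namespace to non-namespace, and non-namespace to namespace migrations. A minimal sketch of that flow follows; the namespace and image names are placeholders, and a plain rbd create stands in for the script's create_base_image helper.

    #!/bin/bash
    # Sketch: namespace-aware native-format migration, mirroring the spec.json
    # written by test_import_native_format (pool_namespace names the source namespace).
    rbd namespace create rbd/namespace1
    rbd namespace create rbd/namespace2
    rbd create --size 1G rbd/namespace1/image1
    rbd snap create rbd/namespace1/image1@2
    rbd migration prepare --import-only \
        --source-spec '{"type": "native", "pool_name": "rbd", "pool_namespace": "namespace1", "image_name": "image1", "snap_name": "2"}' \
        rbd/namespace2/image2
    rbd migration execute rbd/namespace2/image2
    rbd migration commit rbd/namespace2/image2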