Diffstat (limited to 'qa')
354 files changed, 7696 insertions, 1402 deletions
diff --git a/qa/Makefile b/qa/Makefile index ad655b7e743..05dc834adbd 100644 --- a/qa/Makefile +++ b/qa/Makefile @@ -1,4 +1,4 @@ -DIRS= workunits btrfs +DIRS= workunits all: for d in $(DIRS) ; do ( cd $$d ; $(MAKE) all ) ; done diff --git a/qa/README b/qa/README index f9b8988c6f9..a6a95c479bc 100644 --- a/qa/README +++ b/qa/README @@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or ubuntu, chosen randomly. The teuthology code can be found in https://github.com/ceph/teuthology.git + +Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git +CBT will not support cosbench beyond release tag v0.3, therefore no qa suite should use cosbench. +cosbench support has been removed from qa/tasks/cbt.py. + diff --git a/qa/btrfs/.gitignore b/qa/btrfs/.gitignore deleted file mode 100644 index 530c1b5b4ed..00000000000 --- a/qa/btrfs/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/clone_range -/test_async_snap -/create_async_snap diff --git a/qa/btrfs/Makefile b/qa/btrfs/Makefile deleted file mode 100644 index be95ecfd3cd..00000000000 --- a/qa/btrfs/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -CFLAGS = -Wall -Wextra -D_GNU_SOURCE - -TARGETS = clone_range test_async_snap create_async_snap - -.c: - $(CC) $(CFLAGS) $@.c -o $@ - -all: $(TARGETS) - -clean: - rm $(TARGETS) diff --git a/qa/btrfs/clone_range.c b/qa/btrfs/clone_range.c deleted file mode 100644 index 0a88e160131..00000000000 --- a/qa/btrfs/clone_range.c +++ /dev/null @@ -1,35 +0,0 @@ -#include <fcntl.h> -#include <stdlib.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" -#include <stdio.h> -#include <errno.h> - -int main(int argc, char **argv) -{ - struct btrfs_ioctl_clone_range_args ca; - int dfd; - int r; - - if (argc < 6) { - printf("usage: %s <srcfn> <srcoffset> <srclen> <destfn> <destoffset>\n", argv[0]); - exit(1); - } - - ca.src_fd = open(argv[1], O_RDONLY); - ca.src_offset = atoi(argv[2]); - ca.src_length = atoi(argv[3]); - dfd = open(argv[4], O_WRONLY|O_CREAT); - ca.dest_offset = atoi(argv[5]); - - r = ioctl(dfd, BTRFS_IOC_CLONE_RANGE, &ca); - printf("clone_range %s %lld %lld~%lld to %s %d %lld = %d %s\n", - argv[1], ca.src_fd, - ca.src_offset, ca.src_length, - argv[4], dfd, - ca.dest_offset, r, strerror(errno)); - return r; -} diff --git a/qa/btrfs/create_async_snap.c b/qa/btrfs/create_async_snap.c deleted file mode 100644 index 2ef22af7b45..00000000000 --- a/qa/btrfs/create_async_snap.c +++ /dev/null @@ -1,34 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; - -int main(int argc, char **argv) -{ - int fd; - int r; - - if (argc != 3) { - printf("usage: %s <source subvol> <name>\n", argv[0]); - return 1; - } - printf("creating snap ./%s from %s\n", argv[2], argv[1]); - fd = open(".", O_RDONLY); - va.fd = open(argv[1], O_RDONLY); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - strcpy(va.name, argv[2]); - r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va); - printf("result %d\n", r ? 
-errno:0); - return r; -} diff --git a/qa/btrfs/test_async_snap.c b/qa/btrfs/test_async_snap.c deleted file mode 100644 index 211be95a61c..00000000000 --- a/qa/btrfs/test_async_snap.c +++ /dev/null @@ -1,83 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; -struct btrfs_ioctl_vol_args vold; -int max = 4; - -void check_return(int r) -{ - if (r < 0) { - printf("********* failed with %d %s ********\n", errno, strerror(errno)); - exit(1); - } -} - -int main(int argc, char **argv) -{ - int num = 1000; - - if (argc > 1) - num = atoi(argv[1]); - printf("will do %d iterations\n", num); - - int cwd = open(".", O_RDONLY); - printf("cwd = %d\n", cwd); - while (num-- > 0) { - if (rand() % 10 == 0) { - __u64 transid; - int r; - printf("sync starting\n"); - r = ioctl(cwd, BTRFS_IOC_START_SYNC, &transid); - check_return(r); - printf("sync started, transid %lld, waiting\n", transid); - r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &transid); - check_return(r); - printf("sync finished\n"); - } - - int i = rand() % max; - struct stat st; - va.fd = cwd; - sprintf(va.name, "test.%d", i); - va.transid = 0; - int r = stat(va.name, &st); - if (r < 0) { - if (rand() % 3 == 0) { - printf("snap create (sync) %s\n", va.name); - va.flags = 0; - r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va); - check_return(r); - } else { - printf("snap create (async) %s\n", va.name); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va); - check_return(r); - printf("snap created, transid %lld\n", va.transid); - if (rand() % 2 == 0) { - printf("waiting for async snap create\n"); - r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &va.transid); - check_return(r); - } - } - } else { - printf("snap remove %s\n", va.name); - vold.fd = va.fd; - strcpy(vold.name, va.name); - r = ioctl(cwd, BTRFS_IOC_SNAP_DESTROY, &vold); - check_return(r); - } - } - return 0; -} diff --git a/qa/btrfs/test_rmdir_async_snap.c b/qa/btrfs/test_rmdir_async_snap.c deleted file mode 100644 index 5dafaacaaeb..00000000000 --- a/qa/btrfs/test_rmdir_async_snap.c +++ /dev/null @@ -1,62 +0,0 @@ -#include <stdlib.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <sys/ioctl.h> -#include <string.h> - -#include <linux/ioctl.h> -#include <linux/types.h> -#include "../../src/os/btrfs_ioctl.h" - -struct btrfs_ioctl_vol_args_v2 va; -struct btrfs_ioctl_vol_args vold; - -int main(int argc, char **argv) -{ - int num = 1000; - int i, r, fd; - char buf[30]; - - if (argc > 1) - num = atoi(argv[1]); - printf("will do %d iterations\n", num); - - fd = open(".", O_RDONLY); - vold.fd = 0; - strcpy(vold.name, "current"); - r = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&vold); - printf("create current ioctl got %d\n", r ? errno:0); - if (r) - return 1; - - for (i=0; i<num; i++) { - sprintf(buf, "current/dir.%d", i); - r = mkdir(buf, 0755); - printf("mkdir got %d\n", r ? errno:0); - if (r) - return 1; - } - - va.fd = open("current", O_RDONLY); - va.flags = BTRFS_SUBVOL_CREATE_ASYNC; - for (i=0; i<num; i++) { - system("/bin/cp /boot/vmlinuz-3.2.0-ceph-00142-g9e98323 current/foo"); - sprintf(buf, "current/dir.%d", i); - r = rmdir(buf); - printf("rmdir got %d\n", r ? 
errno:0); - if (r) - return 1; - - if (i % 10) continue; - sprintf(va.name, "snap.%d", i); - r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va); - printf("ioctl got %d\n", r ? errno:0); - if (r) - return 1; - } - return 0; -} diff --git a/qa/cephfs/begin/3-kernel.yaml b/qa/cephfs/begin/3-kernel.yaml new file mode 100644 index 00000000000..e94a0d87dc8 --- /dev/null +++ b/qa/cephfs/begin/3-kernel.yaml @@ -0,0 +1,23 @@ +# When the --kernel option is given to teuthology-suite, the kernel is set for +# all nodes (also, the kernel is "distro" when the --kernel option is not set). +# We don't generally want to use a custom kernel for all tests, so unset it. +# The k-testing.yaml will set it, if given, for only the client nodes. +# +# Allow overriding this by using a branch ending in "-all". + +teuthology: + postmerge: + - | + local branch = yaml.kernel.branch + if branch and not yaml.kernel.branch:find "-all$" then + log.debug("removing default kernel specification: %s", yaml.kernel) + py_attrgetter(yaml.kernel).pop('branch', nil) + py_attrgetter(yaml.kernel).pop('deb', nil) + py_attrgetter(yaml.kernel).pop('flavor', nil) + py_attrgetter(yaml.kernel).pop('kdb', nil) + py_attrgetter(yaml.kernel).pop('koji', nil) + py_attrgetter(yaml.kernel).pop('koji_task', nil) + py_attrgetter(yaml.kernel).pop('rpm', nil) + py_attrgetter(yaml.kernel).pop('sha1', nil) + py_attrgetter(yaml.kernel).pop('tag', nil) + end diff --git a/qa/cephfs/begin/3-modules.yaml b/qa/cephfs/begin/3-modules.yaml deleted file mode 100644 index 25947342569..00000000000 --- a/qa/cephfs/begin/3-modules.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Enable mgr modules now before any CephFS mounts are created by the mgr. This -# avoids the potential race of the mgr mounting CephFS and then getting failed -# over by the monitors before the monitors have a chance to note the new client -# session from the mgr beacon. In that case, the monitors will not blocklist -# that client mount automatically so the MDS will eventually do the eviction -# (and create a cluster log warning which we want to avoid). -# -# Note: ideally the mgr would gently stop mgr modules before respawning so that -# the client mounts can be unmounted but this caused issues historically with -# modules like the dashboard so an abrupt restart was chosen instead. - -mgrmodules: - sequential: - - print: "Enabling mgr modules" - # other fragments append to this - -tasks: - - sequential: - - mgrmodules diff --git a/qa/cephfs/conf/mgr.yaml b/qa/cephfs/conf/mgr.yaml index fb6e9b09fa1..2b053f8bdcf 100644 --- a/qa/cephfs/conf/mgr.yaml +++ b/qa/cephfs/conf/mgr.yaml @@ -1,7 +1,9 @@ overrides: ceph: - conf: + cluster-conf: mgr: + client mount timeout: 30 debug client: 20 debug mgr: 20 debug ms: 1 + mon warn on pool no app: false diff --git a/qa/cephfs/conf/mon.yaml b/qa/cephfs/conf/mon.yaml index e33437ae404..9bc2eb852b3 100644 --- a/qa/cephfs/conf/mon.yaml +++ b/qa/cephfs/conf/mon.yaml @@ -3,7 +3,6 @@ overrides: cluster-conf: mon: mon op complaint time: 120 - mon warn on pool no app: false # cephadm can take up to 5 minutes to bring up remaining mons # This needs to be set before cluster-conf configs are applied. 
conf: diff --git a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml index 2ee219125e7..048cd5ce8b9 100644 --- a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml +++ b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml @@ -1,3 +1,12 @@ +teuthology: + premerge: | + log.debug("base kernel %s", base_config.kernel) + local kernel = base_config.kernel + if kernel.branch ~= "distro" then + log.debug("overriding testing kernel with %s", kernel) + yaml_fragment.kernel.client = kernel + end + kernel: client: branch: testing diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index a3f3a010d43..5ac25a8f790 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -2,13 +2,17 @@ overrides: ceph: log-ignorelist: - FS_DEGRADED + - fs.*is degraded + - filesystem is degraded - FS_INLINE_DATA_DEPRECATED - FS_WITH_FAILED_MDS - MDS_ALL_DOWN + - filesystem is offline - MDS_DAMAGE - MDS_DEGRADED - MDS_FAILED - MDS_INSUFFICIENT_STANDBY + - insufficient standby MDS daemons available - MDS_UP_LESS_THAN_MAX - online, but wants - filesystem is online with fewer MDS than max_mds @@ -17,3 +21,7 @@ overrides: - overall HEALTH_ - Replacing daemon - deprecated feature inline_data + - BLUESTORE_SLOW_OP_ALERT + - slow operation indications in BlueStore + - experiencing slow operations in BlueStore + - MGR_MODULE_ERROR diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml index 1740134a2e0..07ca62e01fb 100644 --- a/qa/cephfs/overrides/pg_health.yaml +++ b/qa/cephfs/overrides/pg_health.yaml @@ -9,3 +9,5 @@ overrides: - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml new file mode 100644 index 00000000000..d5ba487b9bf --- /dev/null +++ b/qa/config/crimson_bluestore.yaml @@ -0,0 +1,25 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: bluestore + debug alienstore: 20 + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + bluestore compression algorithm: snappy + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bluestore rocksdb cf: false + log to stderr: true + err to stderr: true + log flush on exit: true + log to file: false diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml index fa8f49a4986..a10c59d77cc 100644 --- a/qa/config/crimson_qa_overrides.yaml +++ b/qa/config/crimson_qa_overrides.yaml @@ -9,6 +9,7 @@ overrides: osd pool default crimson: true osd: crimson osd obc lru size: 10 + debug ms: 20 flavor: crimson workunit: env: diff --git a/qa/config/crimson_seastore.yaml b/qa/config/crimson_seastore.yaml new file mode 100644 index 00000000000..d1919456ab1 --- /dev/null +++ b/qa/config/crimson_seastore.yaml @@ -0,0 +1,20 @@ +overrides: + ceph: + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: seastore + debug seastore: 20 + debug seastore onode: 20 + debug seastore odata: 20 + debug seastore omap: 20 + debug seastore tm: 20 + debug seastore t: 20 + debug seastore 
cleaner: 20 + debug seastore epm: 20 + debug seastore lba: 20 + debug seastore fixedkv tree: 20 + debug seastore cache: 20 + debug seastore journal: 20 + debug seastore device: 20 + debug seastore backref: 20 diff --git a/qa/config/seastore.yaml b/qa/config/seastore.yaml deleted file mode 100644 index 713d9322584..00000000000 --- a/qa/config/seastore.yaml +++ /dev/null @@ -1,6 +0,0 @@ -overrides: - ceph: - fs: xfs - conf: - osd: - osd objectstore: seastore diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index ea328eb22c7..c558a1382ef 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -52,16 +52,11 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority 08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority 16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority -24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority ## ********** windows tests on main branch - weekly # 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e $CEPH_QA_EMAIL -## ********** crimson tests on main branch - weekly -# 01 01 * * 0 CEPH_BRANCH=main; MACHINE_NAME=smithi; SUITE_NAME=crimson-rados; KERNEL=distro; $CW $SCHEDULE 100000 $CEPH_BRANCH $MACHINE_NAME $SUITE_NAME $CEPH_QA_EMAIL $KERNEL - - ## ********** teuthology/nop on main branch - daily @daily $CW $SS 1 --ceph main --suite teuthology/nop -p 1 --force-priority @@ -78,9 +73,10 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 32 20 * * 4 $CW $SS 4 --ceph main --suite powercycle -p 950 40 20 * * 5 $CW $SS 1 --ceph main --suite rgw -p 950 48 20 * * 6 $CW $SS 4 --ceph main --suite krbd -p 950 --kernel testing +56 20 * * 6 $CW $SS 1 --ceph main --suite crimson-rados -p 101 --force-priority --flavor crimson -## squid branch runs - twice weekly +## squid branch runs - twice weekly (crimson-rados is run weekly) ## suites rados and rbd use --subset arg and must be call with schedule_subset.sh ## see script in https://github.com/ceph/ceph/tree/main/qa/machine_types @@ -93,6 +89,7 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 32 21 * * 4,1 $CW $SS 4 --ceph squid --suite powercycle -p 100 --force-priority 40 21 * * 5,2 $CW $SS 1 --ceph squid --suite rgw -p 100 --force-priority 48 21 * * 6,3 $CW $SS 4 --ceph squid --suite krbd -p 100 --force-priority --kernel testing +56 21 * * 6 $CW $SS 1 --ceph squid --suite crimson-rados -p 100 --force-priority --flavor crimson ## reef branch runs - weekly ## suites rados and rbd use --subset arg and must be call with schedule_subset.sh @@ -124,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820 24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820 32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820 -40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820 ### upgrade runs for reef release ###### on smithi diff --git a/qa/distros/container-hosts/centos_9.stream.yaml b/qa/distros/container-hosts/centos_9.stream.yaml index 425cb144b1d..d2eafe6f0a9 100644 --- a/qa/distros/container-hosts/centos_9.stream.yaml +++ 
b/qa/distros/container-hosts/centos_9.stream.yaml @@ -9,4 +9,7 @@ overrides: tasks: - pexec: all: + # in order to work around a possible nvme-cli <-> libnvme linking issue + # See https://tracker.ceph.com/issues/67684 + - sudo dnf remove nvme-cli -y - sudo dnf install nvmetcli nvme-cli -y diff --git a/qa/distros/container-hosts/centos_9.stream_runc.yaml b/qa/distros/container-hosts/centos_9.stream_runc.yaml index 0f3f21d8ad4..d147851ec98 100644 --- a/qa/distros/container-hosts/centos_9.stream_runc.yaml +++ b/qa/distros/container-hosts/centos_9.stream_runc.yaml @@ -8,6 +8,9 @@ overrides: tasks: - pexec: all: + # in order to work around a possible nvme-cli <-> libnvme linking issue + # See https://tracker.ceph.com/issues/67684 + - sudo dnf remove nvme-cli -y - sudo dnf install runc nvmetcli nvme-cli -y - sudo sed -i 's/^#runtime = "crun"/runtime = "runc"/g' /usr/share/containers/containers.conf - sudo sed -i 's/runtime = "crun"/#runtime = "crun"/g' /usr/share/containers/containers.conf diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/objectstore_debug/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/% +++ b/qa/objectstore_debug/% diff --git a/qa/objectstore_debug/bluestore-options/write$/write_random.yaml b/qa/objectstore_debug/bluestore-options/write$/write_random.yaml new file mode 100644 index 00000000000..d14f561c72a --- /dev/null +++ b/qa/objectstore_debug/bluestore-options/write$/write_random.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + osd: + bluestore write v2 random: true diff --git a/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml b/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml new file mode 100644 index 00000000000..4b20e8e52ca --- /dev/null +++ b/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + osd: + bluestore write v2: false diff --git a/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml b/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml new file mode 100644 index 00000000000..238973b1165 --- /dev/null +++ b/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + osd: + bluestore write v2: true diff --git a/qa/objectstore_debug/bluestore-bitmap.yaml b/qa/objectstore_debug/bluestore/bluestore-bitmap.yaml index b18e04bee32..b18e04bee32 100644 --- a/qa/objectstore_debug/bluestore-bitmap.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-bitmap.yaml diff --git a/qa/objectstore_debug/bluestore-comp-lz4.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml index 46f993e686c..46f993e686c 100644 --- a/qa/objectstore_debug/bluestore-comp-lz4.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml diff --git a/qa/objectstore_debug/bluestore-comp-snappy.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml index b5d58414e3f..b5d58414e3f 100644 --- a/qa/objectstore_debug/bluestore-comp-snappy.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml diff --git a/qa/objectstore_debug/bluestore-comp-zlib.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-zlib.yaml index b47ebbb7c62..b47ebbb7c62 100644 --- a/qa/objectstore_debug/bluestore-comp-zlib.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-comp-zlib.yaml diff --git a/qa/objectstore_debug/bluestore-comp-zstd.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-zstd.yaml index e2f5e4e5ba6..e2f5e4e5ba6 100644 --- a/qa/objectstore_debug/bluestore-comp-zstd.yaml +++ 
b/qa/objectstore_debug/bluestore/bluestore-comp-zstd.yaml diff --git a/qa/objectstore_debug/bluestore-hybrid.yaml b/qa/objectstore_debug/bluestore/bluestore-hybrid.yaml index 68b9bc4279f..68b9bc4279f 100644 --- a/qa/objectstore_debug/bluestore-hybrid.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-hybrid.yaml diff --git a/qa/objectstore_debug/bluestore-low-osd-mem-target.yaml b/qa/objectstore_debug/bluestore/bluestore-low-osd-mem-target.yaml index b2a49790bc3..b2a49790bc3 100644 --- a/qa/objectstore_debug/bluestore-low-osd-mem-target.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-low-osd-mem-target.yaml diff --git a/qa/objectstore_debug/bluestore-stupid.yaml b/qa/objectstore_debug/bluestore/bluestore-stupid.yaml index ca811f131a7..ca811f131a7 100644 --- a/qa/objectstore_debug/bluestore-stupid.yaml +++ b/qa/objectstore_debug/bluestore/bluestore-stupid.yaml diff --git a/qa/rbd/krbd_discard_granularity.t b/qa/rbd/krbd_discard_granularity.t index 844643baedb..8001786b0ab 100644 --- a/qa/rbd/krbd_discard_granularity.t +++ b/qa/rbd/krbd_discard_granularity.t @@ -1,11 +1,13 @@ +Default object size: + $ rbd create --size 20M img $ DEV=$(sudo rbd map img) $ blockdev --getiomin $DEV 65536 $ blockdev --getioopt $DEV - 65536 + 4194304 $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity 65536 $ sudo rbd unmap $DEV @@ -14,7 +16,7 @@ $ blockdev --getiomin $DEV 512 $ blockdev --getioopt $DEV - 512 + 4194304 $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity 512 $ sudo rbd unmap $DEV @@ -38,3 +40,45 @@ $ sudo rbd unmap $DEV $ rbd rm --no-progress img + +Custom object size: + + $ rbd create --size 20M --object-size 1M img + + $ DEV=$(sudo rbd map img) + $ blockdev --getiomin $DEV + 65536 + $ blockdev --getioopt $DEV + 1048576 + $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity + 65536 + $ sudo rbd unmap $DEV + + $ DEV=$(sudo rbd map -o alloc_size=512 img) + $ blockdev --getiomin $DEV + 512 + $ blockdev --getioopt $DEV + 1048576 + $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity + 512 + $ sudo rbd unmap $DEV + + $ DEV=$(sudo rbd map -o alloc_size=1048576 img) + $ blockdev --getiomin $DEV + 1048576 + $ blockdev --getioopt $DEV + 1048576 + $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity + 1048576 + $ sudo rbd unmap $DEV + + $ DEV=$(sudo rbd map -o alloc_size=2097152 img) + $ blockdev --getiomin $DEV + 1048576 + $ blockdev --getioopt $DEV + 1048576 + $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity + 1048576 + $ sudo rbd unmap $DEV + + $ rbd rm --no-progress img diff --git a/qa/rgw/s3tests-branch.yaml b/qa/rgw/s3tests-branch.yaml index ef6819c87e0..8710ce35893 100644 --- a/qa/rgw/s3tests-branch.yaml +++ b/qa/rgw/s3tests-branch.yaml @@ -1,4 +1,4 @@ overrides: s3tests: - force-branch: ceph-master - # git_remote: https://github.com/ceph/ + force-branch: ceph-master + # git_remote: https://github.com/ceph/ diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh index f9c6924ce04..72d70ca7ad5 100755 --- a/qa/standalone/ceph-helpers.sh +++ b/qa/standalone/ceph-helpers.sh @@ -25,15 +25,6 @@ TMPDIR=${TMPDIR:-/tmp} CEPH_BUILD_VIRTUALENV=${TMPDIR} TESTDIR=${TESTDIR:-${TMPDIR}} -if type xmlstarlet > /dev/null 2>&1; then - XMLSTARLET=xmlstarlet -elif type xml > /dev/null 2>&1; then - XMLSTARLET=xml -else - echo "Missing xmlstarlet binary!" 
- exit 1 -fi - if [ `uname` = FreeBSD ]; then SED=gsed AWK=gawk @@ -1572,6 +1563,20 @@ function test_is_clean() { ####################################################################### +## +# Predicate checking if the named PG is in state "active+clean" +# +# @return 0 if the PG is active & clean, 1 otherwise +# +function is_pg_clean() { + local pgid=$1 + local pg_state + pg_state=$(ceph pg $pgid query 2>/dev/null | jq -r ".state ") + [[ "$pg_state" == "active+clean"* ]] +} + +####################################################################### + calc() { $AWK "BEGIN{print $*}"; } ## @@ -1688,6 +1693,33 @@ function test_wait_for_clean() { } ## +# Wait until the named PG becomes clean or until a timeout of +# $WAIT_FOR_CLEAN_TIMEOUT seconds. +# +# @return 0 if the PG is clean, 1 otherwise +# +function wait_for_pg_clean() { + local pg_id=$1 + local -a delays=($(get_timeout_delays $WAIT_FOR_CLEAN_TIMEOUT 1 3)) + local -i loop=0 + + flush_pg_stats || return 1 + + while true ; do + echo "#---------- $pgid loop $loop" + is_pg_clean $pg_id && break + if (( $loop >= ${#delays[*]} )) ; then + ceph report + echo "PG $pg_id is not clean after $loop iterations" + return 1 + fi + sleep ${delays[$loop]} + loop+=1 + done + return 0 +} + +## # Wait until the cluster becomes peered or if it does not make progress # for $WAIT_FOR_CLEAN_TIMEOUT seconds. # Progress is measured either via the **get_is_making_recovery_progress** @@ -1869,7 +1901,7 @@ function test_repair() { wait_for_clean || return 1 repair 1.0 || return 1 kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 repair 1.0 || return 1 + ! TIMEOUT=2 repair 1.0 || return 1 teardown $dir || return 1 } ####################################################################### @@ -1889,6 +1921,8 @@ function test_repair() { # function pg_scrub() { local pgid=$1 + # do not issue the scrub command unless the PG is clean + wait_for_pg_clean $pgid || return 1 local last_scrub=$(get_last_scrub_stamp $pgid) ceph pg scrub $pgid wait_for_scrub $pgid "$last_scrub" @@ -1896,6 +1930,8 @@ function pg_scrub() { function pg_deep_scrub() { local pgid=$1 + # do not issue the scrub command unless the PG is clean + wait_for_pg_clean $pgid || return 1 local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp) ceph pg deep-scrub $pgid wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp @@ -1912,7 +1948,7 @@ function test_pg_scrub() { wait_for_clean || return 1 pg_scrub 1.0 || return 1 kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 pg_scrub 1.0 || return 1 + ! 
TIMEOUT=2 pg_scrub 1.0 || return 1 teardown $dir || return 1 } @@ -1931,15 +1967,19 @@ function test_pg_scrub() { # function pg_schedule_scrub() { local pgid=$1 + # do not issue the scrub command unless the PG is clean + wait_for_pg_clean $pgid || return 1 local last_scrub=$(get_last_scrub_stamp $pgid) - ceph pg scrub $pgid + ceph tell $pgid schedule-scrub wait_for_scrub $pgid "$last_scrub" } function pg_schedule_deep_scrub() { local pgid=$1 + # do not issue the scrub command unless the PG is clean + wait_for_pg_clean $pgid || return 1 local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp) - ceph pg deep-scrub $pgid + ceph tell $pgid schedule-deep-scrub wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp } @@ -1948,13 +1988,11 @@ function test_pg_schedule_scrub() { setup $dir || return 1 run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 run_osd $dir 0 || return 1 create_rbd_pool || return 1 wait_for_clean || return 1 pg_schedule_scrub 1.0 || return 1 - kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 pg_scrub 1.0 || return 1 teardown $dir || return 1 } @@ -2050,7 +2088,7 @@ function test_wait_for_scrub() { wait_for_scrub $pgid "$last_scrub" || return 1 kill_daemons $dir KILL osd || return 1 last_scrub=$(get_last_scrub_stamp $pgid) - ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 + ! TIMEOUT=2 wait_for_scrub $pgid "$last_scrub" || return 1 teardown $dir || return 1 } @@ -2341,7 +2379,7 @@ function run_tests() { shopt -s -o xtrace PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' - export .:$PATH # make sure program from sources are preferred + export PATH=./bin:.:$PATH # make sure program from sources are preferred export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one export CEPH_ARGS diff --git a/qa/standalone/crush/crush-classes.sh b/qa/standalone/crush/crush-classes.sh index 558aabe6d93..a0662c3f1ee 100755 --- a/qa/standalone/crush/crush-classes.sh +++ b/qa/standalone/crush/crush-classes.sh @@ -52,7 +52,7 @@ function get_osds_up() { local objectname=$2 local osds=$(ceph --format xml osd map $poolname $objectname 2>/dev/null | \ - $XMLSTARLET sel -t -m "//up/osd" -v . -o ' ') + xmlstarlet sel -t -m "//up/osd" -v . -o ' ') # get rid of the trailing space echo $osds } diff --git a/qa/standalone/mon/mon-cluster-log.sh b/qa/standalone/mon/mon-cluster-log.sh index 863a97c7cab..7b9adda0af6 100755 --- a/qa/standalone/mon/mon-cluster-log.sh +++ b/qa/standalone/mon/mon-cluster-log.sh @@ -62,7 +62,7 @@ function TEST_cluster_log_level() { ceph config set mon.a mon_cluster_log_level info ceph osd down 0 TIMEOUT=20 wait_for_osd up 0 || return 1 - grep -q "cluster [[]INF[]] osd.0.*boot" $dir/log + TIMEOUT=60 wait_for_string $dir/log "cluster [[]INF[]] osd.0.*boot" return_code=$? if [ $return_code -ne 0 ]; then echo "Failed : Could not find INF log in the cluster log file" @@ -145,9 +145,17 @@ function TEST_journald_cluster_log_level() { ceph osd down 0 TIMEOUT=20 wait_for_osd up 0 || return 1 search_str="osd.0.*boot" - sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log - grep -q "$search_str" $dir/journal.log - return_code=$? 
+ return_code=1 + RETRY_DURATION=60 + for ((i=0; i < $RETRY_DURATION; i++)); do + sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log + if ! grep "$search_str" $dir/journal.log; then + sleep 1 + else + return_code=0 + break + fi + done if [ $return_code -ne 0 ]; then echo "Failed : Could not find INF log in the journalctl log file" ERRORS=$(($ERRORS + 1)) diff --git a/qa/standalone/osd-backfill/osd-backfill-space.sh b/qa/standalone/osd-backfill/osd-backfill-space.sh index 6a5c69412f4..84b9703bbfc 100755 --- a/qa/standalone/osd-backfill/osd-backfill-space.sh +++ b/qa/standalone/osd-backfill/osd-backfill-space.sh @@ -609,9 +609,16 @@ function TEST_backfill_grow() { wait_for_clean || return 1 + #Capture the timestamp after complete cleanup or finish the recovery progress + current_timestamp=$(date +"%Y-%m-%dT%H:%M:%S") + delete_pool $poolname kill_daemons $dir || return 1 - ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1 + + #Ignore the num_bytes mismatch messages before calling wait_cleanup + if ! awk -v ts="$current_timestamp" '$0 >= ts && /num_bytes mismatch/' $dir/osd.*.log > /dev/null; then + return 1 + fi } # Create a 5 shard EC pool on 6 OSD cluster diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh index aedfbc9b5cb..f7424de8ce1 100755 --- a/qa/standalone/osd/osd-bluefs-volume-ops.sh +++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh @@ -72,7 +72,7 @@ function TEST_bluestore() { truncate $dir/0/block -s 4294967296 # 4GB ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1 - truncate $dir/1/block -s 4311744512 # 4GB + 16MB + truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 10737418240 ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1 truncate $dir/2/block -s 4295099392 # 4GB + 129KB ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1 diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh index 6fea441b3a9..a34f4a47189 100755 --- a/qa/standalone/osd/osd-rep-recov-eio.sh +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -219,6 +219,18 @@ function TEST_rados_repair_warning() { ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1 set +o pipefail + ceph health unmute OSD_TOO_MANY_REPAIRS + ceph tell osd.$primary clear_shards_repaired + sleep 10 + + set -o pipefail + # Should clear this + ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1 + set +o pipefail + + ceph tell osd.$primary clear_shards_repaired $OBJS + sleep 10 + for i in $(seq 1 $OBJS) do inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1 @@ -235,7 +247,7 @@ function TEST_rados_repair_warning() { COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") test "$COUNT" = "$(expr $OBJS \* 3)" || return 1 - # Give mon a chance to notice additional OSD and unmute + # Give mon a chance to notice additional OSD and reset num_shards_repaired # The default tick time is 5 seconds CHECKTIME=10 LOOPS=0 diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 3d3121fe8d8..7b77a60f35b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -99,11 +99,11 @@ function TEST_recovery_scrub_1() { kill_daemons $dir #|| return 1 declare -a err_strings - err_strings[0]="recovery in progress. Only high priority scrubs allowed." 
+ err_strings[0]="recovery in progress.*scrubs" for osd in $(seq 0 $(expr $OSDS - 1)) do - grep "recovery in progress. Only high priority scrubs allowed." $dir/osd.${osd}.log + grep "recovery in progress.*scrubs" $dir/osd.${osd}.log done for err_string in "${err_strings[@]}" do @@ -163,7 +163,7 @@ function wait_for_scrub_mod() { fi sleep 1 # are we still the primary? - local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` if [ $orig_primary != $current_primary ]; then echo $orig_primary no longer primary for $pgid return 0 @@ -187,9 +187,14 @@ function wait_for_scrub_mod() { # function pg_scrub_mod() { local pgid=$1 + # wait for 'clean' state of the PG. Operator scrub commands are rejected + # *and not remembered* if the PG is not clean + wait_for_pg_clean $pgid + wait_for_pg_clean $pgid || return 1 + local last_scrub=$(get_last_scrub_stamp $pgid) # locate the primary - local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` local recovery=false ceph pg scrub $pgid #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" @@ -229,138 +234,6 @@ function wait_background_check() { return $return_code } -# osd_scrub_during_recovery=true make sure scrub happens -function TEST_recovery_scrub_2() { - local dir=$1 - local poolname=test - - TESTDATA="testdata.$$" - OSDS=8 - PGS=32 - OBJECTS=40 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1 - run_mgr $dir x || return 1 - local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 " - ceph_osd_args+="--osd_scrub_backoff_ratio=0 " - ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 " - ceph_osd_args+="--osd_stats_update_period_scrubbing=2" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 \ - $ceph_osd_args || return 1 - done - - # Create a pool with $PGS pgs - create_pool $poolname $PGS $PGS - wait_for_clean || return 1 - poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') - - dd if=/dev/urandom of=$TESTDATA bs=1M count=50 - for i in $(seq 1 $OBJECTS) - do - rados -p $poolname put obj${i} $TESTDATA - done - rm -f $TESTDATA - - ceph osd pool set $poolname size 3 - - ceph pg dump pgs - - # note that the following will be needed if the mclock scheduler is specified - #ceph tell osd.* config get osd_mclock_override_recovery_settings - - # the '_max_active' is expected to be 0 - ceph tell osd.1 config get osd_recovery_max_active - # both next parameters are expected to be >=3 - ceph tell osd.1 config get osd_recovery_max_active_hdd - ceph tell osd.1 config get osd_recovery_max_active_ssd - - # Wait for recovery to start - count=0 - while(true) - do - #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]' - if test $(ceph --format json pg dump pgs | - jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2 - then - break - fi - sleep 2 - if test "$count" -eq "10" - then - echo "Not enough recovery started simultaneously" - return 1 - fi - count=$(expr $count + 1) - done - ceph pg dump pgs - - pids="" - recov_scrub_count=0 - for pg in $(seq 0 $(expr $PGS - 1)) - do - run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg) - done - wait_background_check pids - return_code=$? 
- if [ $return_code -ne 0 ]; then return $return_code; fi - - ERRORS=0 - if test $recov_scrub_count -eq 0 - then - echo "No scrubs occurred while PG recovering" - ERRORS=$(expr $ERRORS + 1) - fi - - pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') - pid=$(cat $pidfile) - if ! kill -0 $pid - then - echo "OSD crash occurred" - #tail -100 $dir/osd.0.log - ERRORS=$(expr $ERRORS + 1) - fi - - # Work around for http://tracker.ceph.com/issues/38195 - kill_daemons $dir #|| return 1 - - declare -a err_strings - err_strings[0]="not scheduling scrubs due to active recovery" - - for osd in $(seq 0 $(expr $OSDS - 1)) - do - grep "not scheduling scrubs" $dir/osd.${osd}.log - done - for err_string in "${err_strings[@]}" - do - found=false - for osd in $(seq 0 $(expr $OSDS - 1)) - do - if grep "$err_string" $dir/osd.${osd}.log > /dev/null; - then - found=true - fi - done - if [ "$found" = "true" ]; then - echo "Found log message not expected '$err_string'" - ERRORS=$(expr $ERRORS + 1) - fi - done - - teardown $dir || return 1 - - if [ $ERRORS != "0" ]; - then - echo "TEST FAILED WITH $ERRORS ERRORS" - return 1 - fi - - echo "TEST PASSED" - return 0 -} - main osd-recovery-scrub "$@" # Local Variables: diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index b717026e191..6dd5b10ae8f 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() { ['pool_name']="testpool" ['extras']=" --osd_scrub_auto_repair=true" ) - local extr_dbg=3 standard_scrub_cluster $dir cluster_conf local poolid=${cluster_conf['pool_id']} local poolname=${cluster_conf['pool_name']} @@ -5754,11 +5753,13 @@ function TEST_corrupt_scrub_erasure_overwrites() { # # Test to make sure that a periodic scrub won't cause deep-scrub info to be lost +# Update 2024: this functionality was removed from the code. The test will be skipped. # function TEST_periodic_scrub_replicated() { local dir=$1 local poolname=psr_pool local objname=POBJ + return 0 run_mon $dir a --osd_pool_default_size=2 || return 1 run_mgr $dir x || return 1 @@ -5795,12 +5796,13 @@ function TEST_periodic_scrub_replicated() { flush_pg_stats local last_scrub=$(get_last_scrub_stamp $pg) - # Fake a schedule scrub + # Fake a scheduled deep scrub ceph tell $pg schedule-scrub || return 1 # Wait for schedule regular scrub wait_for_scrub $pg "$last_scrub" # It needed to be upgraded + # update 2024: the "upgrade" functionality has been removed grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1 # Bad object still known @@ -5831,7 +5833,7 @@ function TEST_periodic_scrub_replicated() { flush_pg_stats # Request a regular scrub and it will be done - pg_schedule_scrub $pg + pg_scrub $pg grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1 # deep-scrub error is no longer present @@ -6249,6 +6251,254 @@ function TEST_request_scrub_priority() { grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 } +# +# Testing the "split scrub store" feature: shallow scrubs do not +# purge deep errors from the store. +# +# Corrupt one copy of a replicated pool, creating both shallow and deep errors. +# Then shallow-scrub the pool and verify that the deep errors are still present. 
+# +function TEST_dual_store_replicated_cluster() { + local dir=$1 + local poolname=csr_pool + local total_objs=19 + local extr_dbg=1 # note: 3 and above leave some temp files around + + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " + ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 " + for osd in $(seq 0 1) + do + run_osd $dir $osd $ceph_osd_args || return 1 + done + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_pool foo 1 || return 1 + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph osd pool set $poolname noscrub 1 + ceph osd pool set $poolname nodeep-scrub 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + # Increase file 1 MB + 1KB + dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025 + rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1 + rm -f $dir/new.ROBJ19 + + local pg=$(get_pg $poolname ROBJ0) + local primary=$(get_primary $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > 
$dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + ;; + + 16) + objectstore_tool $dir 0 $objname rm-attr snapset || return 1 + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1 + ;; + + 17) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ17 + echo $payload > $dir/new.ROBJ17 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1 + ;; + + 18) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ18 + echo $payload > $dir/new.ROBJ18 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1 + # Make one replica have a different object info, so a full repair must happen too + objectstore_tool $dir $osd $objname corrupt-info || return 1 + ;; + + 19) + # Set osd-max-object-size smaller than this object's size + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + ceph tell osd.\* injectargs -- --osd-max-object-size=1048576 + + inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + + # first sequence: the final shallow scrub should not override any of the deep errors + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_1b_s.json + + pg_deep_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_2s.json + + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_3.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_3s.json + + diff -u $dir/dp_results.json $dir/sh2_results.json || return 1 + + # inject a read error, which is a special case: the scrub encountering the read error + # would override the previously collected shard info. + inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + + pg_deep_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json + # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s.json + + pg_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \ + $dir/sh2Part2_w13_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json + + # the shallow scrub results should differ from the results of the deep + # scrub preceding it, but the difference should be limited to ROBJ13 + diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1 + diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1 + + ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it + return 0 +} + main osd-scrub-repair "$@" diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index ec0066d955f..385479258f2 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -57,7 +57,7 @@ function TEST_scrub_test() { TESTDATA="testdata.$$" run_mon $dir a --osd_pool_default_size=3 || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " ceph_osd_args+="--osd_stats_update_period_scrubbing=2" @@ -160,7 
+160,7 @@ function TEST_interval_changes() { # This min scrub interval results in 30 seconds backoff time run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 for osd in $(seq 0 $(expr $OSDS - 1)) do run_osd $dir $osd --osd_scrub_min_interval=$min_interval --osd_scrub_max_interval=$max_interval --osd_scrub_interval_randomize_ratio=0 || return 1 @@ -205,7 +205,9 @@ function TEST_interval_changes() { perf_counters $dir $OSDS } -function TEST_scrub_extended_sleep() { +# RRR 6aug24: this test cannot work as expected, following the changes in the +# scrub type to overrides matrix. Disabled for now. +function NO_scrub_extended_sleep() { local dir=$1 local poolname=test local OSDS=3 @@ -224,7 +226,7 @@ function TEST_scrub_extended_sleep() { DAY_END=$(expr $DAY + 3) run_mon $dir a --osd_pool_default_size=3 || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " @@ -312,7 +314,7 @@ function _scrub_abort() { fi run_mon $dir a --osd_pool_default_size=3 || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 for osd in $(seq 0 $(expr $OSDS - 1)) do # Set scheduler to "wpq" until there's a reliable way to query scrub @@ -424,7 +426,7 @@ function TEST_scrub_permit_time() { TESTDATA="testdata.$$" run_mon $dir a --osd_pool_default_size=3 || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local scrub_begin_hour=$(date -d '2 hour ago' +"%H" | sed 's/^0//') local scrub_end_hour=$(date -d '1 hour ago' +"%H" | sed 's/^0//') for osd in $(seq 0 $(expr $OSDS - 1)) @@ -531,7 +533,7 @@ function TEST_dump_scrub_schedule() { TESTDATA="testdata.$$" run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 # Set scheduler to "wpq" until there's a reliable way to query scrub states # with "--osd-scrub-sleep" set to 0. The "mclock_scheduler" overrides the @@ -542,6 +544,9 @@ function TEST_dump_scrub_schedule() { --osd_op_queue=wpq \ --osd_stats_update_period_not_scrubbing=1 \ --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=1 \ + --osd_scrub_retry_pg_state=2 \ + --osd_scrub_retry_delay=2 \ --osd_scrub_sleep=0.2" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -598,17 +603,16 @@ function TEST_dump_scrub_schedule() { declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" ) wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1 - sleep 2 - # # step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub # scheduled for the future' value # - ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 - ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 ceph osd set noscrub || return 1 sleep 2 + ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 + ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 + sleep 8 saved_last_stamp=${sched_data['query_last_stamp']} ceph tell $pgid schedule-scrub @@ -638,7 +642,8 @@ function TEST_dump_scrub_schedule() { # missed it. 
declare -A cond_active_dmp=( ['dmp_state_has_scrubbing']="true" ['query_active']="false" ) sched_data=() - wait_any_cond $pgid 10 $saved_last_stamp cond_active_dmp "WaitingActive " sched_data || return 1 + wait_any_cond $pgid 10 $saved_last_stamp cond_active_dmp "WaitingActive " sched_data + sleep 4 perf_counters $dir $OSDS } @@ -653,7 +658,7 @@ function TEST_pg_dump_objects_scrubbed() { setup $dir || return 1 run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 for osd in $(seq 0 $(expr $OSDS - 1)) do run_osd $dir $osd || return 1 @@ -680,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() { teardown $dir || return 1 } +function wait_initial_scrubs() { + local -n pg_to_prim_dict=$1 + local extr_dbg=1 # note: 3 and above leave some temp files around + + # set a long schedule for the periodic scrubs. Wait for the + # initial 'no previous scrub is known' scrubs to finish for all PGs. + ceph tell osd.* config set osd_scrub_min_interval 7200 + ceph tell osd.* config set osd_deep_scrub_interval 14400 + ceph tell osd.* config set osd_max_scrubs 32 + ceph tell osd.* config set osd_scrub_sleep 0 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 10 + ceph tell osd.* config set osd_scrub_chunk_max 10 + + for pg in "${!pg_to_prim_dict[@]}"; do + (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg" + ceph tell $pg scrub || return 1 + done + + sleep 1 + (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + + tout=20 + while [ $tout -gt 0 ] ; do + sleep 0.5 + (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + not_done=$(ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l ) + # note that we should ignore a header line + if [ "$not_done" -le 1 ]; then + break + fi + not_done=$(( (not_done - 2) / 4 )) + echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)" + tout=$((tout - 1)) + done + (( tout == 0 )) && return 1 + return 0 +} + + +# Whenever a PG is being scrubbed at a regular, periodic, urgency, and is queued +# for its replicas: +# if the operator is requesting a scrub of the same PG, the operator's request +# should trigger an abort of the ongoing scrub. +# +# The test process: +# - a periodic scrub is initiated of a PG. That scrub is set to be a very slow one. +# - a second PG, which shares some of its replicas, is intrcuted to be scrubbed. That one +# should be stuck in replica reservation. We will verify that. +# - now - the operator is requesting that second PG to be scrubbed. The original (pending) +# scrub should be aborted. 
We would check for: +# - the new, operator's scrub to be scheduled +# - the replicas' reservers to be released +function TEST_abort_periodic_for_operator() { + local dir=$1 + local -A cluster_conf=( + ['osds_num']="5" + ['pgs_in_pool']="16" + ['pool_name']="test" + ) + local extr_dbg=1 # note: 3 and above leave some temp files around + + standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1 + local poolid=${cluster_conf['pool_id']} + local poolname=${cluster_conf['pool_name']} + echo "Pool: $poolname : $poolid" + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # fill the pool with some data + TESTDATA="testdata.$$" + dd if=/dev/urandom of=$TESTDATA bs=320 count=1 + for i in $( seq 1 256 ) + do + rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null + done + rm -f $TESTDATA + if [[ -n "$saved_echo_flag" ]]; then set -x; fi + + # create the dictionary of the PGs in the pool + declare -A pg_pr + declare -A pg_ac + declare -A pg_po + build_pg_dicts "$dir" pg_pr pg_ac pg_po "-" + (( extr_dbg >= 2 )) && echo "PGs table:" + for pg in "${!pg_pr[@]}"; do + (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}" + done + + wait_initial_scrubs pg_pr || return 1 + + # limit all OSDs to one scrub at a time + ceph tell osd.* config set osd_max_scrubs 1 + ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1 + + # configure for slow scrubs + ceph tell osd.* config set osd_scrub_sleep 3 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 2 + ceph tell osd.* config set osd_scrub_chunk_max 2 + (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty + + # the first PG to work with: + local pg1="1.0" + # and another one, that shares its primary, and at least one more active set member + local pg2="" + for pg in "${!pg_pr[@]}"; do + if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then + local -i common=0 + count_common_active $pg $pg1 pg_ac common + if [[ $common -gt 1 ]]; then + pg2=$pg + break + fi + fi + done + if [[ -z "$pg2" ]]; then + # \todo handle the case when no such PG is found + echo "No PG found with the same primary as $pg1" + return 1 + fi + + # the common primary is allowed two concurrent scrubs + ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2 + echo "The two PGs to manipulate are $pg1 and $pg2" + + set_query_debug "$pg1" + # wait till the information published by pg1 is updated to show it as + # not being scrubbed + local is_act + for i in $( seq 1 3 ) + do + is_act=$(ceph pg "$pg1" query | jq '.scrubber.active') + if [[ "$is_act" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to finish scrubbing" + sleep 0.7 + done + ceph pg dump pgs + if [[ "$is_act" != "false" ]]; then + ceph pg "$pg1" query + echo "PG $pg1 appears to be still scrubbing" + return 1 + fi + sleep 0.5 + + echo "Initiating a periodic scrub of $pg1" + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + ceph tell $pg1 schedule-deep-scrub || return 1 + sleep 1 + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + for i in $( seq 1 14 ) + do + sleep 0.5 + stt=$(ceph pg "$pg1" query | jq '.scrubber') + is_active=$(echo $stt | jq '.active') + is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas') + if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to start scrubbing: $stt" + done + if [[ "$is_active" != "true" || 
"$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - soem of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the 2'nd scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index b0922892a4a..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,6 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=5 \ + --osd_scrub_retry_pg_state=5 \ + --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -294,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a 
set of dictionaries: +# - pg_primary_dict: a dictionary of pgid -> acting_primary +# - pg_acting_dict: a dictionary of pgid -> acting set +# - pg_pool_dict: a dictionary of pgid -> pool +# If the input file is '-', the function will fetch the dump directly from the ceph cluster. +function build_pg_dicts { + local dir=$1 + local -n pg_primary_dict=$2 + local -n pg_acting_dict=$3 + local -n pg_pool_dict=$4 + local infile=$5 + + local extr_dbg=0 # note: 3 and above leave some temp files around + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # if the infile name is '-', fetch the dump directly from the ceph cluster + if [[ $infile == "-" ]]; then + local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty" + local -r ceph_cmd_out=$(eval $ceph_cmd) + local -r ceph_cmd_rc=$? + if [[ $ceph_cmd_rc -ne 0 ]]; then + echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc" + fi + (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2 + l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' ` + else + l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile ` + fi + (( extr_dbg >= 2 )) && echo "L0: $l0" + + mapfile -t l1 < <(echo "$l0" | jq -c '.[]') + (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}" + + for item in "${l1[@]}"; do + pgid=$(echo "$item" | jq -r '.pgid') + acting=$(echo "$item" | jq -r '.acting | @sh') + pg_acting_dict["$pgid"]=$acting + acting_primary=$(echo "$item" | jq -r '.acting_primary') + pg_primary_dict["$pgid"]=$acting_primary + pool=$(echo "$item" | jq -r '.pool') + pg_pool_dict["$pgid"]=$pool + done + + if [[ -n "$saved_echo_flag" ]]; then set -x; fi +} + + +# a function that counts the number of common active-set elements between two PGs +# 1 - the first PG +# 2 - the second PG +# 3 - the dictionary of active sets +function count_common_active { + local pg1=$1 + local pg2=$2 + local -n pg_acting_dict=$3 + local -n res=$4 + + local -a a1=(${pg_acting_dict[$pg1]}) + local -a a2=(${pg_acting_dict[$pg2]}) + + local -i cnt=0 + for i in "${a1[@]}"; do + for j in "${a2[@]}"; do + if [[ $i -eq $j ]]; then + cnt=$((cnt+1)) + fi + done + done + + res=$cnt +} + + +# given a PG, find another one with a disjoint active set +# - but allow a possible common Primary +# 1 - the PG +# 2 - the dictionary of active sets +# 3 - [out] - the PG with a disjoint active set +function find_disjoint_but_primary { + local pg=$1 + local -n ac_dict=$2 + local -n p_dict=$3 + local -n res=$4 + + for cand in "${!ac_dict[@]}"; do + if [[ "$cand" != "$pg" ]]; then + local -i common=0 + count_common_active "$pg" "$cand" ac_dict common + if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" )]]; then + res=$cand + return + fi + fi + done +} + + + # A debug flag is set for the PG specified, causing the 'pg query' command to display # an additional 'scrub sessions counter' field. 
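For orientation only (not part of the diff): a minimal sketch of how the scrub-helpers routines added above might be driven from a standalone test. The helper names and the scrub-helpers.sh path come from the hunks above; the PG ids, the test-directory value, and the caller-side variable names are illustrative placeholders, and a running test cluster is assumed.

#!/usr/bin/env bash
# Sketch only: assumes a standalone test cluster is already up and that
# qa/standalone/scrub/scrub-helpers.sh (and whatever it sources) is available.
source qa/standalone/scrub/scrub-helpers.sh

dir=/tmp/scrub_demo            # placeholder; build_pg_dicts ignores it when querying the live cluster
declare -A pg_primary          # pgid -> acting primary
declare -A pg_acting           # pgid -> acting set
declare -A pg_pool             # pgid -> pool id

# '-' asks build_pg_dicts to fetch 'pg dump pgs_brief' from the live cluster
build_pg_dicts "$dir" pg_primary pg_acting pg_pool "-"

# count how many acting-set OSDs two PGs share (pgids are examples)
shared_cnt=0
count_common_active "1.0" "1.1" pg_acting shared_cnt
echo "PGs 1.0 and 1.1 share $shared_cnt acting-set OSDs"

# look for a PG whose acting set is disjoint from 1.0's (a shared primary is tolerated)
candidate=""
find_disjoint_but_primary "1.0" pg_acting pg_primary candidate
echo "disjoint candidate for 1.0: ${candidate:-none found}"

Note that the helpers take the dictionaries by name (bash local -n namerefs), so the caller-side variable names here are deliberately different from the parameter names used inside the helpers.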
# diff --git a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml index 7e7ede3e334..5be06bc6732 100644 --- a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml +++ b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml @@ -21,7 +21,6 @@ overrides: ceph_repository: dev ceph_mgr_modules: - status - - restful cephfs_pools: - name: "cephfs_data" pg_num: "64" diff --git a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml b/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml deleted file mode 100644 index 8e389134b92..00000000000 --- a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml +++ /dev/null @@ -1,15 +0,0 @@ -tasks: -- exec: - mgr.x: - - systemctl stop ceph-mgr.target - - sleep 5 - - ceph -s -- exec: - mon.a: - - ceph restful create-key admin - - ceph restful create-self-signed-cert - - ceph restful restart -- workunit: - clients: - client.0: - - rest/test-restful.sh diff --git a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml index 309f5060045..53e2b7fdbc8 100644 --- a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml +++ b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml @@ -20,7 +20,6 @@ overrides: ceph_repository: dev ceph_mgr_modules: - status - - restful cephfs_pools: - name: "cephfs_data" pg_num: "64" diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa index fea2489fdf6..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/.qa +++ b/qa/suites/crimson-rados-experimental/.qa @@ -1 +1 @@ -../.qa
\ No newline at end of file +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml deleted file mode 120000 index bd9854e7029..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/supported/centos_latest.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml deleted file mode 100644 index d8e5898b99f..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - osd pool default size: 2 - osd crush chooseleaf type: 0 - osd pool default pg num: 128 - osd pool default pgp num: 128 - ceph: - conf: - osd: - osd shutdown pgref assert: true -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml deleted file mode 100644 index c22f08eecf8..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - install: - ceph: - flavor: crimson -tasks: -- install: -- ceph: - conf: - osd: - debug monc: 20 - mon: - mon min osdmap epochs: 50 - paxos service trim min: 10 - # prune full osdmaps regularly - mon osdmap full prune min: 15 - mon osdmap full prune interval: 2 - mon osdmap full prune txsize: 2 - flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml deleted file mode 120000 index 6a70c381709..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/config/seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml deleted file mode 100644 index ad8c921425b..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml +++ /dev/null @@ -1,28 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - reached quota - - but it is still running - - overall HEALTH_ - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(CACHE_POOL_NO_HIT_SET\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(PG_DEGRADED\) - conf: - client: - debug ms: 1 - mon: - mon warn on pool no app: false - osd: - osd class load list: "*" - osd class default list: "*" - osd blocked scrub grace period: 3600 -tasks: -- workunit: - clients: - client.0: - - rados/test.sh - - rados/test_pool_quota.sh diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml deleted file mode 100644 index 25efcdac83d..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - ceph: - crush_tunables: optimal - conf: - mon: - mon osd initial require min compat client: luminous - osd: - osd_discard_disconnected_ops: false -tasks: -- rados: - clients: [client.0] - ops: 4000 - objects: 500 - max_attr_len: 8192 - op_weights: - read: 45 - write: 45 - delete: 10 diff --git a/qa/suites/rados/rest/% b/qa/suites/crimson-rados-experimental/thrash/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/rados/rest/% +++ b/qa/suites/crimson-rados-experimental/thrash/% diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled new file mode 120000 index 00000000000..5393a75548a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/2-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml new file mode 120000 index 00000000000..5ff70eadf75 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml @@ -0,0 +1 @@ +.qa/overrides/3-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa diff --git a/qa/suites/fs/thrash/workloads/overrides/+ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/fs/thrash/workloads/overrides/+ +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled new file mode 120000 index 00000000000..47afd70202d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/more-active-recovery.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..0bbc72db754 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled new file mode 100644 index 00000000000..4aed086bcc3 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..88f15f2f691 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml index 9774de6887b..79641f695ab 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml @@ -6,6 +6,15 @@ overrides: conf: osd: osd shutdown pgref assert: true + crimson alien thread cpu cores: 6-7 + osd.0: + crimson seastar cpu cores: 0-2 + osd.1: + crimson seastar cpu cores: 3-5 + osd.2: + crimson seastar cpu cores: 0-2 + osd.3: + crimson seastar cpu cores: 3-5 global: ms cluster mode: crc ms service mode: crc diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled new file mode 100644 index 00000000000..e559d9126e8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled @@ -0,0 +1,4 @@ +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro new file mode 120000 index 00000000000..a5b729b9efa --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro @@ -0,0 +1 @@ +.qa/distros/crimson-supported-all-distro/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml index 2bf67af1b18..2bf67af1b18 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml new file mode 100644 index 00000000000..ecad09cfe3a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml @@ -0,0 +1,11 @@ +overrides: + install: + ceph: + flavor: crimson +tasks: +- install: +- ceph: + conf: + osd: + debug monc: 20 + flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled new file mode 100644 index 00000000000..0c2062240ee --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled @@ -0,0 +1,16 @@ +# no need to verify os + flavor + sha1 +verify_ceph_hash: false +tasks: +- cephadm: + conf: + mgr: + debug ms: 1 + debug mgr: 20 + debug osd: 10 +- cephadm.shell: + mon.a: + - ceph orch status + - ceph orch ps + - ceph orch ls + - ceph orch host ls + - ceph orch device ls diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/rados/rest/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/rados/rest/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml new file mode 100644 index 00000000000..aa44b6101ff --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml @@ -0,0 +1,34 @@ +overrides: + ceph: + log-ignorelist: + - but it is still running + - objects unfound and apparently lost + conf: + osd: + osd debug reject backfill probability: .3 + osd scrub min interval: 60 + osd scrub max interval: 120 + osd max backfills: 3 + osd snap trim sleep: 2 + osd delete sleep: 1 + mon: + mon min osdmap epochs: 50 + paxos service trim min: 10 + # prune full osdmaps regularly + mon osdmap full prune min: 15 + mon osdmap full prune interval: 2 + mon osdmap full prune txsize: 2 +tasks: +- thrashosds: + timeout: 2400 + dump_ops_enable: false + sighup_delay: 0 + min_in: 3 + noscrub_toggle_delay: 0 + chance_thrash_pg_upmap: 0 + reweight_osd: 0 + thrash_primary_affinity: false + ceph_objectstore_tool: false + chance_inject_pause_short: 0 + chance_thrash_cluster_full: 0 + chance_reset_purged_snaps_last: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml new file mode 120000 index 00000000000..9124eb1aa29 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +.qa/tasks/thrashosds-health.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml new file mode 100644 index 00000000000..8c9764ade84 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + client.0: + admin socket: /var/run/ceph/ceph-$name.asok +tasks: +- radosbench: + clients: [client.0] + time: 150 +- admin_socket: + client.0: + objecter_requests: + test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}" diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml new file mode 100644 index 00000000000..d35e8421ab4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml @@ -0,0 +1,20 @@ +overrides: + conf: + osd: + osd deep scrub update digest min age: 0 +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + pool_snaps: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml new file mode 100644 index 00000000000..902c4b56a1e --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml @@ -0,0 +1,49 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml new file mode 100644 index 00000000000..071f55e3928 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml new file mode 100644 index 00000000000..afe04229898 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + 
crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + balance_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml new file mode 100644 index 00000000000..445b582ea42 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + localize_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml index af0ac39310e..e7e8070fd76 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml @@ -1,3 +1,6 @@ +overrides: + ceph: + crush_tunables: jewel tasks: - rados: clients: [client.0] @@ -6,16 +9,15 @@ tasks: max_in_flight: 64 objects: 1024 size: 16384 - ec_pool: true - balanced_reads: true + max_attr_len: 8192 op_weights: read: 100 - write: 0 - append: 100 + write: 100 delete: 50 snap_create: 50 snap_remove: 50 - rollback: 50 - copy_from: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 setattr: 25 rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml new file mode 100644 index 00000000000..1161c3cc253 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + balance_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml new file mode 100644 index 00000000000..80af0def0e4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + localize_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml new file mode 100644 index 00000000000..0694ffcd0d6 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml @@ -0,0 +1,14 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + op_weights: + read: 100 + write: 100 
+ delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml new file mode 100644 index 00000000000..606dcae6922 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml @@ -0,0 +1,8 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_fadvise_dontneed: true + op_weights: + write: 100 diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml index 06d475e2165..1302e14f21a 100644 --- a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml +++ b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml @@ -17,4 +17,4 @@ tasks: timeout: 1h clients: client.0: - - rados/test_python.sh -m 'not (tier or ec or bench)' + - rados/test_python.sh -m 'not (wait or tier or ec)' diff --git a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml index 0f6021975a4..50d170f5022 100644 --- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml +++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml @@ -10,3 +10,4 @@ tasks: osd: debug monc: 20 flavor: crimson +- ssh_keys: diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore deleted file mode 120000 index dbccf5ad928..00000000000 --- a/qa/suites/crimson-rados/singleton/objectstore +++ /dev/null @@ -1 +0,0 @@ -../thrash/objectstore
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml new file mode 120000 index 00000000000..481e393be4a --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml new file mode 120000 index 00000000000..abd86d7d986 --- /dev/null +++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1 @@ +.qa/overrides/short_pg_log.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml b/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml deleted file mode 120000 index 09cfdb59eda..00000000000 --- a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml
\ No newline at end of file diff --git a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml b/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml deleted file mode 120000 index 5a4de14e7e0..00000000000 --- a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml
\ No newline at end of file diff --git a/qa/suites/fs/functional/tasks/test_snap_schedule/% b/qa/suites/fs/functional/tasks/test_snap_schedule/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/fs/functional/tasks/test_snap_schedule/% diff --git a/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$ diff --git a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v1.yaml index 120b2bf04be..120b2bf04be 100644 --- a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml +++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v1.yaml diff --git a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v2.yaml index c8bcf95c056..c8bcf95c056 100644 --- a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml +++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v2.yaml diff --git a/qa/suites/fs/functional/tasks/snap-schedule.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/snap-schedule.yaml index 26922abeda4..7d7f62f16a8 100644 --- a/qa/suites/fs/functional/tasks/snap-schedule.yaml +++ b/qa/suites/fs/functional/tasks/test_snap_schedule/snap-schedule.yaml @@ -15,6 +15,7 @@ overrides: - is full \(reached quota - POOL_FULL - POOL_BACKFILLFULL + - cluster \[WRN\] evicting unresponsive client tasks: - cephfs_test_runner: diff --git a/qa/suites/fs/functional/tasks/uninlining.yaml b/qa/suites/fs/functional/tasks/uninlining.yaml new file mode 100644 index 00000000000..1c5da558b2a --- /dev/null +++ b/qa/suites/fs/functional/tasks/uninlining.yaml @@ -0,0 +1,26 @@ +overrides: + ceph: + conf: + mgr: + debug mgr: 20 + debug ms: 1 + debug finisher: 20 + debug client: 20 + mds: + # to force replication without waiting for hit ratio to ramp up + # this helps with quicker testing against replicas + mds_bal_replicate_threshold: 1 + log-whitelist: + - OSD full dropping all updates + - OSD near full + - pausewr flag + - failsafe engaged, dropping updates + - failsafe disengaged, no longer dropping + - is full \(reached quota + - POOL_FULL + - POOL_BACKFILLFULL + +tasks: + - cephfs_test_runner: + modules: + - tasks.cephfs.test_uninlining diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml index da841373220..42ca9336c8e 100644 --- a/qa/suites/fs/libcephfs/tasks/client.yaml +++ b/qa/suites/fs/libcephfs/tasks/client.yaml @@ -12,3 +12,4 @@ tasks: clients: client.0: - client/test.sh + - client/test_oc_disabled.sh diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml index aa966bff214..2dd668c9f88 100644 --- a/qa/suites/fs/nfs/tasks/nfs.yaml +++ b/qa/suites/fs/nfs/tasks/nfs.yaml @@ -1,3 +1,10 @@ +overrides: + install: + extra_system_packages: + rpm: + - fio + deb: + - fio tasks: - cephfs_test_runner: modules: diff --git 
a/qa/suites/fs/thrash/workloads/overrides/% b/qa/suites/fs/thrash/workloads/overrides/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/fs/thrash/workloads/overrides/% diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml index 91b45367934..91b45367934 100644 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml index bd202f988c8..bd202f988c8 100644 --- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml +++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml diff --git a/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml index 713adb9628a..96e4353e99c 100644 --- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml @@ -2,3 +2,4 @@ overrides: ceph: log-ignorelist: - OSD_DOWN + - osd.*is down diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml index fd8e5c9221e..4a5f54dc8c3 100644 --- a/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml @@ -11,8 +11,7 @@ tasks: - cephadm: image: quay.ceph.io/ceph-ci/ceph:squid roleless: true - cephadm_branch: squid - cephadm_git_url: https://github.com/ceph/ceph + compiled_cephadm_branch: squid conf: osd: #set config option for which cls modules are allowed to be loaded / used diff --git a/qa/suites/fs/upgrade/nofs/kernel.yaml b/qa/suites/fs/upgrade/nofs/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/nofs/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml b/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml new file mode 100644 index 00000000000..db0ec6db8b9 --- /dev/null +++ b/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml @@ -0,0 +1,5 @@ +tasks: + - cephfs_test_runner: + fail_on_skip: false + modules: + - tasks.cephfs.test_volumes.TestCloneProgressReporter diff --git a/qa/suites/fs/workload/begin/3-kernel.yaml b/qa/suites/fs/workload/begin/3-kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/workload/begin/3-kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/workload/begin/3-modules.yaml b/qa/suites/fs/workload/begin/3-modules.yaml deleted file mode 120000 index 1eba706a59d..00000000000 --- a/qa/suites/fs/workload/begin/3-modules.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/cephfs/begin/3-modules.yaml
\ No newline at end of file diff --git a/qa/suites/fs/workload/tasks/3-snaps/yes.yaml b/qa/suites/fs/workload/tasks/3-snaps/yes.yaml index dee81778942..51bbe2a3dbf 100644 --- a/qa/suites/fs/workload/tasks/3-snaps/yes.yaml +++ b/qa/suites/fs/workload/tasks/3-snaps/yes.yaml @@ -1,8 +1,3 @@ -mgrmodules: - sequential: - - exec: - mon.a: - - ceph mgr module enable snap_schedule overrides: ceph: mgr-modules: diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml index 602d3416263..aa327b0cdf5 100644 --- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml +++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml @@ -5,6 +5,7 @@ overrides: - "mds.dir_split" tasks: - workunit: + timeout: 5h clients: all: - kernel_untar_build.sh diff --git a/qa/suites/nvmeof/basic/base/install.yaml b/qa/suites/nvmeof/basic/base/install.yaml index 64b754e4270..88974f0e638 100644 --- a/qa/suites/nvmeof/basic/base/install.yaml +++ b/qa/suites/nvmeof/basic/base/install.yaml @@ -3,8 +3,7 @@ tasks: - install: extra_packages: - nvme-cli -- cephadm: - watchdog_setup: +- cephadm: - cephadm.shell: host.a: # get state before nvmeof deployment diff --git a/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml index 56e6cc0992a..7f20f9f04a8 100644 --- a/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml +++ b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml @@ -1,21 +1,26 @@ roles: +- - client.0 +- - client.1 - - host.a - mon.a - mgr.x - osd.0 - - osd.1 - - client.0 + - client.2 - ceph.nvmeof.nvmeof.a - - host.b - mon.b + - osd.1 + - client.3 + - ceph.nvmeof.nvmeof.b +- - host.c - mon.c - osd.2 + - client.4 + - ceph.nvmeof.nvmeof.c +- - host.d - osd.3 - - osd.4 - - client.1 - - ceph.nvmeof.nvmeof.b -- - client.2 -- - client.3 + - client.5 + - ceph.nvmeof.nvmeof.d overrides: ceph: @@ -23,3 +28,5 @@ overrides: mon: # cephadm can take up to 5 minutes to bring up remaining mons mon down mkfs grace: 300 + log-ignorelist: + - NVMEOF_SINGLE_GATEWAY diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml index 1532c944452..0416ae2ea4e 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml @@ -1,23 +1,24 @@ +# runs on default nvmeof image (i.e. 
DEFAULT_NVMEOF_IMAGE) tasks: - nvmeof: - client: client.0 - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + installer: host.a + gw_image: default # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: - service: nvmeof.mypool + service: nvmeof.mypool.mygroup0 - workunit: no_coverage_and_limits: true clients: - client.2: - - rbd/nvmeof_setup_subsystem.sh + client.0: + - nvmeof/setup_subsystem.sh env: RBD_POOL: mypool RBD_IMAGE_PREFIX: myimage @@ -26,12 +27,12 @@ tasks: no_coverage_and_limits: true timeout: 30m clients: - client.2: - - rbd/nvmeof_basic_tests.sh - - rbd/nvmeof_fio_test.sh --start_ns 1 --end_ns 30 --rbd_iostat - client.3: - - rbd/nvmeof_basic_tests.sh - - rbd/nvmeof_fio_test.sh --start_ns 31 --end_ns 60 + client.0: + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --start_ns 1 --end_ns 30 --rbd_iostat + client.1: + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --start_ns 31 --end_ns 60 env: RBD_POOL: mypool IOSTAT_INTERVAL: '10' diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml new file mode 100644 index 00000000000..8eb4f6dc63c --- /dev/null +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml @@ -0,0 +1,36 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 3 + namespaces_count: 20 + cli_image: quay.io/ceph/nvmeof-cli:latest + create_mtls_secrets: true + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --rbd_iostat + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage + IOSTAT_INTERVAL: '10' + RUNTIME: '60' + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/mtls_test.sh diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml new file mode 100644 index 00000000000..dfe31380bb6 --- /dev/null +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml @@ -0,0 +1,39 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 3 + namespaces_count: 20 + cli_image: quay.io/ceph/nvmeof-cli:latest + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/fio_test.sh --rbd_iostat + client.1: + - nvmeof/basic_tests.sh + - nvmeof/namespace_test.sh + env: + RBD_POOL: mypool + IOSTAT_INTERVAL: '10' + RUNTIME: '120' + NEW_NAMESPACES_COUNT: '5' diff --git 
a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml new file mode 100644 index 00000000000..d66b6fc8093 --- /dev/null +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -0,0 +1,41 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 3 + namespaces_count: 20 + cli_image: quay.io/ceph/nvmeof-cli:latest + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --rbd_iostat + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage + IOSTAT_INTERVAL: '10' + RUNTIME: '60' + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.3: + - nvmeof/scalability_test.sh nvmeof.a,nvmeof.b + - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d + - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c + env: + SCALING_DELAYS: '50' + RBD_POOL: mypool + NVMEOF_GROUP: mygroup0 diff --git a/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml b/qa/suites/nvmeof/thrash/clusters/4-gateways-1-initiator.yaml index afe0ed726fe..37c727ed37c 100644 --- a/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml +++ b/qa/suites/nvmeof/thrash/clusters/4-gateways-1-initiator.yaml @@ -1,26 +1,30 @@ roles: +- - client.0 # initiator - - host.a - mon.a - mgr.x - osd.0 - osd.1 - - client.0 + - client.1 - ceph.nvmeof.nvmeof.a - - host.b - mon.b - osd.2 - osd.3 - osd.4 - - client.1 + - client.2 - ceph.nvmeof.nvmeof.b - - host.c - mon.c - osd.5 - osd.6 - osd.7 - - client.2 + - client.3 - ceph.nvmeof.nvmeof.c -- - client.3 # initiator +- - host.d + - client.4 + - ceph.nvmeof.nvmeof.d + overrides: ceph: diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml new file mode 100644 index 00000000000..83d54cdf5c3 --- /dev/null +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml @@ -0,0 +1,37 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 10 + namespaces_count: 90 # each subsystem + cli_image: quay.io/ceph/nvmeof-cli:latest + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- cephadm.exec: + host.a: + - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml + - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml + - "sed -i '/ pool: mypool/a\\ spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml" + - cat /tmp/nvmeof-no-huge-page.yaml + - ceph orch ls --refresh + - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml + - ceph orch redeploy nvmeof.mypool.mygroup0 + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml 
b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml new file mode 100644 index 00000000000..0f7ac011a60 --- /dev/null +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml @@ -0,0 +1,24 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 120 + namespaces_count: 8 # each subsystem + cli_image: quay.io/ceph/nvmeof-cli:latest + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml deleted file mode 100644 index 3e5262f95df..00000000000 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml +++ /dev/null @@ -1,24 +0,0 @@ -tasks: -- nvmeof: - client: client.0 - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" - rbd: - pool_name: mypool - image_name_prefix: myimage - gateway_config: - subsystems_count: 3 - namespaces_count: 20 # each subsystem - cli_image: quay.io/ceph/nvmeof-cli:1.2 - -- cephadm.wait_for_service: - service: nvmeof.mypool - -- workunit: - no_coverage_and_limits: true - clients: - client.3: - - rbd/nvmeof_setup_subsystem.sh - - rbd/nvmeof_basic_tests.sh - env: - RBD_POOL: mypool - RBD_IMAGE_PREFIX: myimage diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml index 4306de99e4d..46037784d31 100644 --- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml +++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml @@ -8,12 +8,16 @@ overrides: - out of quorum # nvmeof daemon thrashing - CEPHADM_FAILED_DAEMON + - NVMEOF_SINGLE_GATEWAY + - NVMEOF_GATEWAY_DOWN + - are in unavailable state + - is unavailable - is in error state - failed cephadm daemon tasks: - nvmeof.thrash: - checker_host: 'client.3' + checker_host: 'client.0' switch_thrashers: True - mon_thrash: diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml index 0271e410f7c..b58dc14d87b 100644 --- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml +++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml @@ -3,9 +3,14 @@ overrides: log-ignorelist: # nvmeof daemon thrashing - CEPHADM_FAILED_DAEMON + - NVMEOF_SINGLE_GATEWAY + - NVMEOF_GATEWAY_DOWN + - are in unavailable state + - is unavailable - is in error state - failed cephadm daemon tasks: - nvmeof.thrash: - checker_host: 'client.3' + checker_host: 'client.0' + randomize: False diff --git a/qa/suites/nvmeof/thrash/workloads/fio.yaml b/qa/suites/nvmeof/thrash/workloads/fio.yaml index fa7153d2ed9..f9a0d0ebde5 100644 --- a/qa/suites/nvmeof/thrash/workloads/fio.yaml +++ b/qa/suites/nvmeof/thrash/workloads/fio.yaml @@ -1,11 +1,11 @@ tasks: - workunit: no_coverage_and_limits: true - timeout: 30m + timeout: 60m clients: - client.3: - - rbd/nvmeof_fio_test.sh --rbd_iostat + client.0: + - nvmeof/fio_test.sh --random_devices 200 env: RBD_POOL: mypool IOSTAT_INTERVAL: '10' - RUNTIME: '600' + RUNTIME: 
'1800' diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml index e57b7763661..18f3ed374ea 100644 --- a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml @@ -39,6 +39,7 @@ tasks: ceph smb cluster create modusr1 user --define-user-pass=user1%t3stP4ss1 --define-user-pass=user2%t3stP4ss2 + --placement=count:1 - cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1 - cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2 # Wait for the smb service to start diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml new file mode 100644 index 00000000000..3bbf30ea427 --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml @@ -0,0 +1,91 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as a domain controller and smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.configure_samba_client_container: + role: host.d +- vip: + count: 1 +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + - cmd: sleep 30 + - cmd: > + ceph smb cluster create modusr1 user + --define-user-pass=user1%t3stP4ss1 + --placement=count:3 + --clustering=default + --public_addrs={{VIP0}}/{{VIPPREFIXLEN}} + - cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1 + - cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2 +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.modusr1 + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.modusr1\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' /tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test the assigned VIP +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb share rm modusr1 share2 + - 
cmd: ceph smb share rm modusr1 share1 + - cmd: ceph smb cluster rm modusr1 +# Wait for the smb service to be removed +- cephadm.wait_for_service_not_present: + service: smb.modusr1 diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml new file mode 100644 index 00000000000..b9b0ec0d6f1 --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml @@ -0,0 +1,135 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as an smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.configure_samba_client_container: + role: host.d +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + # TODO: replace sleep with poll of mgr state? + - cmd: sleep 30 + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: uctdb1 + auth_mode: user + user_group_settings: + - {source_type: resource, ref: ug1} + placement: + count: 3 + - resource_type: ceph.smb.usersgroups + users_groups_id: ug1 + values: + users: + - {name: user1, password: t3stP4ss1} + - {name: user2, password: t3stP4ss2} + groups: [] + - resource_type: ceph.smb.share + cluster_id: uctdb1 + share_id: share1 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv1 + path: / + - resource_type: ceph.smb.share + cluster_id: uctdb1 + share_id: share2 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv2 + path: / + # --- End Embedded YAML +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.uctdb1 +# Since this is a true cluster there should be a clustermeta in rados +- cephadm.shell: + host.a: + - cmd: rados --pool=.smb -N uctdb1 get cluster.meta.json /dev/stdout + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user2%t3stP4ss2 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.uctdb1\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' /tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test a different host in the cluster +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.c'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - 
"{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user2%t3stP4ss2 //{{'host.c'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: uctdb1 + intent: removed + - resource_type: ceph.smb.usersgroups + users_groups_id: ug1 + intent: removed + - resource_type: ceph.smb.share + cluster_id: uctdb1 + share_id: share1 + intent: removed + - resource_type: ceph.smb.share + cluster_id: uctdb1 + share_id: share2 + intent: removed + # --- End Embedded YAML +# Wait for the smb service to be removed +- cephadm.wait_for_service_not_present: + service: smb.uctdb1 diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml new file mode 100644 index 00000000000..b74593058e2 --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml @@ -0,0 +1,138 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as a domain controller and smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.deploy_samba_ad_dc: + role: host.d +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + # TODO: replace sleep with poll of mgr state? + - cmd: sleep 30 + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: adctdb1 + auth_mode: active-directory + domain_settings: + realm: DOMAIN1.SINK.TEST + join_sources: + - source_type: resource + ref: join1-admin + custom_dns: + - "{{ctx.samba_ad_dc_ip}}" + placement: + count: 3 + - resource_type: ceph.smb.join.auth + auth_id: join1-admin + auth: + username: Administrator + password: Passw0rd + - resource_type: ceph.smb.share + cluster_id: adctdb1 + share_id: share1 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv1 + path: / + - resource_type: ceph.smb.share + cluster_id: adctdb1 + share_id: share2 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv2 + path: / + # --- End Embedded YAML +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.adctdb1 +# Since this is a true cluster there should be a clustermeta in rados +- cephadm.shell: + host.a: + - cmd: rados --pool=.smb -N adctdb1 get cluster.meta.json /dev/stdout + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. 
//{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.adctdb1\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' /tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test a different host in the cluster +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.c'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.c'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: adctdb1 + intent: removed + - resource_type: ceph.smb.join.auth + auth_id: join1-admin + intent: removed + - resource_type: ceph.smb.share + cluster_id: adctdb1 + share_id: share1 + intent: removed + - resource_type: ceph.smb.share + cluster_id: adctdb1 + share_id: share2 + intent: removed + # --- End Embedded YAML +# Wait for the smb service to be removed +- cephadm.wait_for_service_not_present: + service: smb.adctdb1 diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml new file mode 100644 index 00000000000..0aa55a53a3d --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml @@ -0,0 +1,145 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as a domain controller and smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.deploy_samba_ad_dc: + role: host.d +- vip: + count: 2 +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + # TODO: replace sleep with poll of mgr state? 
+ - cmd: sleep 30 + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: adipctdb + auth_mode: active-directory + domain_settings: + realm: DOMAIN1.SINK.TEST + join_sources: + - source_type: resource + ref: join1-admin + custom_dns: + - "{{ctx.samba_ad_dc_ip}}" + public_addrs: + - address: {{VIP0}}/{{VIPPREFIXLEN}} + - address: {{VIP1}}/{{VIPPREFIXLEN}} + placement: + count: 3 + - resource_type: ceph.smb.join.auth + auth_id: join1-admin + auth: + username: Administrator + password: Passw0rd + - resource_type: ceph.smb.share + cluster_id: adipctdb + share_id: share1 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv1 + path: / + - resource_type: ceph.smb.share + cluster_id: adipctdb + share_id: share2 + cephfs: + volume: cephfs + subvolumegroup: smb + subvolume: sv2 + path: / + # --- End Embedded YAML +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.adipctdb +# Since this is a true cluster there should be a clustermeta in rados +- cephadm.shell: + host.a: + - cmd: rados --pool=.smb -N adipctdb get cluster.meta.json /dev/stdout + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.adipctdb\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' /tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test the two assigned VIPs +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP0}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP1}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP0}}/share2 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. 
//{{VIP1}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb apply -i - + stdin: | + # --- Begin Embedded YAML + - resource_type: ceph.smb.cluster + cluster_id: adipctdb + intent: removed + - resource_type: ceph.smb.join.auth + auth_id: join1-admin + intent: removed + - resource_type: ceph.smb.share + cluster_id: adipctdb + share_id: share1 + intent: removed + - resource_type: ceph.smb.share + cluster_id: adipctdb + share_id: share2 + intent: removed + # --- End Embedded YAML +# Wait for the smb service to be removed +- cephadm.wait_for_service_not_present: + service: smb.adipctdb diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml index ce08d40bb58..f07c298c9fc 100644 --- a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml @@ -40,6 +40,7 @@ tasks: --domain-realm=domain1.sink.test --domain-join-user-pass=Administrator%Passw0rd --custom-dns={{ctx.samba_ad_dc_ip}} + --placement=count:1 - cmd: ceph smb share create modtest1 share1 cephfs / --subvolume=smb/sv1 - cmd: ceph smb share create modtest1 share2 cephfs / --subvolume=smb/sv2 # Wait for the smb service to start diff --git a/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml b/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml index 4c5e267408b..8509fcc14e3 100644 --- a/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml +++ b/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml @@ -3,6 +3,6 @@ tasks: host.a: - ceph osd pool create foo - rbd pool init foo - - ceph orch apply nvmeof foo + - ceph orch apply nvmeof foo default - cephadm.wait_for_service: - service: nvmeof.foo + service: nvmeof.foo.default diff --git a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml index 0080d3bf730..c6bec082843 100644 --- a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml +++ b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml @@ -131,8 +131,10 @@ tasks: - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo - while ceph orch upgrade status | jq '.in_progress' | grep true && ! 
ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done - ceph orch ps + - ceph versions # verify all rgw daemons on same version and version hash matches what we are upgrading to - - ceph versions | jq -e '.rgw | length == 1' + # `ceph versions` might not get updated immediately for rgw so retry this + - time timeout 60 bash -c "until ceph versions | jq -e '.rgw | length == 1'; do sleep 2; done" - ceph versions | jq -e '.rgw | keys' | grep $sha1 - ceph orch upgrade status - ceph health detail diff --git a/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml b/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml index 74acebd7037..8c56e41756a 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml @@ -25,3 +25,4 @@ tasks: client.0: - cephadm/test_iscsi_pids_limit.sh - cephadm/test_iscsi_etc_hosts.sh + - cephadm/test_iscsi_setup.sh diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml new file mode 100644 index 00000000000..5207fd415b7 --- /dev/null +++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml @@ -0,0 +1,77 @@ +overrides: + ceph: + log-ignorelist: + - CEPHADM_FAILED_DAEMON + log-only-match: + - CEPHADM_ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mon.b + - mgr.b + - osd.1 +- - host.c + - mon.c + - osd.2 +tasks: +- install: +- cephadm: +- cephadm.shell: + host.c: + - | + set -ex + # Deploy monitoring stack + ceph orch apply node-exporter + ceph orch apply grafana + ceph orch apply alertmanager + ceph orch apply prometheus + sleep 240 + # generate SSL certificate + openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*" + # Generate a mgmt.spec template + cat << EOT > /tmp/mgmt.spec + service_type: mgmt-gateway + service_id: foo + placement: + hosts: + - ${HOSTNAME} + spec: + ssl_protocols: + - TLSv1.2 + - TLSv1.3 + ssl_ciphers: + - AES128-SHA + - AES256-SHA + enable_health_check_endpoint: True + EOT + # Add generated certificates to spec file + echo " ssl_certificate: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec + echo " ssl_certificate_key: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec + # Apply spec + ceph orch apply -i /tmp/mgmt.spec +- cephadm.wait_for_service: + service: mgmt-gateway +- cephadm.shell: + host.a: + - | + set -ex + # retrieve mgmt hostname and ip + MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname') + MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr') + # check mgmt-gateway health + curl -k -s https://${MGMT_GTW_IP}/health + curl -k -s https://${MGMT_GTW_IP}:29443/health + # wait for background services to be reconfigured following mgmt-gateway installation + sleep 180 + # check grafana endpoints are responsive and database health is okay + curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"' + # check prometheus endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status 
== "success"' + # check alertmanager endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status + diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml index 89733dabead..515293ea83a 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml @@ -61,6 +61,6 @@ tasks: curl -s http://${PROM_IP}:9095/api/v1/alerts curl -s http://${PROM_IP}:9095/api/v1/alerts | jq -e '.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"' # check alertmanager endpoints are responsive and mon down alert is active - curl -s http://${ALERTM_IP}:9093/api/v1/status - curl -s http://${ALERTM_IP}:9093/api/v1/alerts - curl -s http://${ALERTM_IP}:9093/api/v1/alerts | jq -e '.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' + curl -s http://${ALERTM_IP}:9093/api/v2/status + curl -s http://${ALERTM_IP}:9093/api/v2/alerts + curl -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e '.[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' diff --git a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml index 1eb4a184dca..e2a2ca03cc9 100644 --- a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml +++ b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml @@ -6,7 +6,6 @@ overrides: - objects misplaced - Synthetic exception in serve - influxdb python module not found - - \(MGR_ZABBIX_ - foo bar - Failed to open Telegraf - evicting unresponsive client diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml index 372bf2561fa..8b3c4c11ac6 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml @@ -15,6 +15,7 @@ overrides: # causing tests to fail due to health warns, even if # the tests themselves are successful. 
- \(OSDMAP_FLAGS\) + - \(PG_DEGRADED\) tasks: - workunit: clients: diff --git a/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml new file mode 100644 index 00000000000..7cd47898544 --- /dev/null +++ b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml @@ -0,0 +1,8 @@ +roles: +- [mon.a, mgr.x, osd.0, osd.1, client.0] +tasks: +- install: +- exec: + client.0: + - mkdir $TESTDIR/ceph_test_bluefs && cd $TESTDIR/ceph_test_bluefs && ceph_test_bluefs --log-file $TESTDIR/archive/ceph_test_bluefs.log --debug-bluefs 5/20 --gtest_catch_exceptions=0 + - rm -rf $TESTDIR/ceph_test_bluefs diff --git a/qa/suites/rados/rest/mgr-restful.yaml b/qa/suites/rados/rest/mgr-restful.yaml deleted file mode 100644 index 4901f401d30..00000000000 --- a/qa/suites/rados/rest/mgr-restful.yaml +++ /dev/null @@ -1,31 +0,0 @@ -openstack: -- volumes: # attached to each instance - count: 3 - size: 10 # GB -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, mds.a, client.a] -tasks: -- install: -- ceph: - log-ignorelist: - - overall HEALTH_ - - \(MGR_DOWN\) - - \(PG_ - - \(OSD_ - - \(OBJECT_ - - \(OSDMAP_FLAGS\) - - \(POOL_APP_NOT_ENABLED\) -- exec: - mon.a: - - ceph restful create-key admin - - ceph restful create-self-signed-cert - - ceph restful restart -- workunit: - clients: - client.a: - - rest/test-restful.sh -- exec: - mon.a: - - ceph restful delete-key admin - - ceph restful list-keys | jq ".admin" | grep null - diff --git a/qa/suites/rados/rest/supported-random-distro$ b/qa/suites/rados/rest/supported-random-distro$ deleted file mode 120000 index 7cef21eeffd..00000000000 --- a/qa/suites/rados/rest/supported-random-distro$ +++ /dev/null @@ -1 +0,0 @@ -../basic/supported-random-distro$
\ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
index 66cf2bc7593..58e253bf6f4 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-bitmap.yaml
\ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
\ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
index da2e2598c33..d694c94945f 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-lz4.yaml
\ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml
\ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
index f75b0e1b48e..d7defabaa3c 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-snappy.yaml
\ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml
\ No newline at end of file diff --git a/qa/suites/rados/singleton/all/mon-connection-score.yaml b/qa/suites/rados/singleton/all/mon-connection-score.yaml new file mode 100644 index 00000000000..f9e0ba3452d --- /dev/null +++ b/qa/suites/rados/singleton/all/mon-connection-score.yaml @@ -0,0 +1,40 @@ +roles: +- - mon.a + - mon.b + - mon.c + - osd.0 + - osd.1 + - osd.2 + - mgr.x + - client.0 + +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - overall HEALTH_ + - \(OSDMAP_FLAGS\) + - \(OSD_ + - \(PG_ + - \(POOL_ + - \(CACHE_POOL_ + - \(OBJECT_ + - \(SLOW_OPS\) + - \(REQUEST_SLOW\) + - \(TOO_FEW_PGS\) + - slow request + - \(POOL_APP_NOT_ENABLED\) + - overall HEALTH_ + - \(MGR_DOWN\) + - \(MON_DOWN\) + - \(PG_AVAILABILITY\) + - \(SLOW_OPS\) +- cephfs_test_runner: + modules: + - tasks.mon_connection_score
\ No newline at end of file diff --git a/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml new file mode 100644 index 00000000000..69a54b0f1b7 --- /dev/null +++ b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml @@ -0,0 +1,57 @@ +roles: +- - mon.a + - mon.b + - mgr.a + - mgr.b + - osd.0 + - osd.1 + - osd.2 + - osd.3 +- - mon.c + - mon.d + - mgr.c + - mgr.d + - osd.4 + - osd.5 + - osd.6 + - osd.7 +- - mon.e +- - client.0 + +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +overrides: + ceph: + conf: + global: + osd pool default size: 3 + osd pool default min size: 2 + mon: + debug mon: 30 +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - \(POOL_ + - \(CACHE_POOL_ + - overall HEALTH_ + - \(PG_AVAILABILITY\) + - Reduced data availability + - \(PG_DEGRADED\) + - \(MON_DOWN\) + - \(OSD_DATACENTER_DOWN\) + - \(OSD_DOWN\) + - \(OSD_HOST_DOWN\) + + +- workunit: + clients: + client.0: + - mon/mon-stretch-mode-5-mons-8-osds.sh +- cephfs_test_runner: + modules: + - tasks.stretch_mode_disable_enable diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml index 635085f7fc8..08070caa387 120000 --- a/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml +++ b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml @@ -1 +1 @@ -../thrash-erasure-code/objectstore/bluestore-bitmap.yaml
\ No newline at end of file
+../thrash-erasure-code/objectstore/bluestore/bluestore-bitmap.yaml
\ No newline at end of file diff --git a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml index a8bbbafece0..b916bed1475 100644 --- a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml +++ b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml @@ -2,6 +2,9 @@ meta: - desc: | rbd object class functional tests tasks: -- exec: - client.2: - - ceph_test_cls_rbd --gtest_filter=-TestClsRbd.get_features:TestClsRbd.parents:TestClsRbd.mirror +- workunit: + clients: + client.2: + - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' diff --git a/qa/suites/rados/valgrind-leaks/1-start.yaml b/qa/suites/rados/valgrind-leaks/1-start.yaml index 1cdd8a688e8..cc8c8e53766 100644 --- a/qa/suites/rados/valgrind-leaks/1-start.yaml +++ b/qa/suites/rados/valgrind-leaks/1-start.yaml @@ -12,6 +12,7 @@ overrides: - overall HEALTH_ - \(PG_ - \(POOL_APP_NOT_ENABLED\) + - OSD bench result conf: global: osd heartbeat grace: 40 diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index c70893893fd..17cf141b0cd 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -26,6 +26,8 @@ overrides: - \(MON_DOWN\) - \(SLOW_OPS\) - slow request + - OSD bench result + - OSD_DOWN valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/qa/suites/rbd/iscsi/0-single-container-host.yaml b/qa/suites/rbd/iscsi/0-single-container-host.yaml deleted file mode 120000 index 7406e749cf5..00000000000 --- a/qa/suites/rbd/iscsi/0-single-container-host.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/single-container-host.yaml
\ No newline at end of file diff --git a/qa/suites/rbd/iscsi/base/install.yaml b/qa/suites/rbd/iscsi/base/install.yaml index 5c5a6c31f60..cca178cafe8 100644 --- a/qa/suites/rbd/iscsi/base/install.yaml +++ b/qa/suites/rbd/iscsi/base/install.yaml @@ -9,6 +9,10 @@ tasks: - ceph orch host ls - ceph orch device ls - install: - extra_packages: + extra_system_packages: + deb: + - open-iscsi + - multipath-tools + rpm: - iscsi-initiator-utils - device-mapper-multipath diff --git a/qa/suites/rbd/iscsi/supported-container-hosts$ b/qa/suites/rbd/iscsi/supported-container-hosts$ new file mode 120000 index 00000000000..30a61f1575f --- /dev/null +++ b/qa/suites/rbd/iscsi/supported-container-hosts$ @@ -0,0 +1 @@ +.qa/distros/supported-container-hosts/
\ No newline at end of file
diff --git a/qa/suites/rbd/iscsi/supported-container-hosts$ b/qa/suites/rbd/iscsi/supported-container-hosts$
new file mode 120000
index 00000000000..30a61f1575f
--- /dev/null
+++ b/qa/suites/rbd/iscsi/supported-container-hosts$
@@ -0,0 +1 @@
+.qa/distros/supported-container-hosts/
\ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/% b/qa/suites/rbd/migration-external/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rbd/migration-external/%
diff --git a/qa/suites/rbd/migration-external/.qa b/qa/suites/rbd/migration-external/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/.qa
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/1-base/.qa b/qa/suites/rbd/migration-external/1-base/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/1-base/.qa
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file diff --git a/qa/suites/rbd/migration-external/2-clusters/2-node.yaml b/qa/suites/rbd/migration-external/2-clusters/2-node.yaml new file mode 100644 index 00000000000..848e63055e9 --- /dev/null +++ b/qa/suites/rbd/migration-external/2-clusters/2-node.yaml @@ -0,0 +1,15 @@ +meta: +- desc: 2 ceph clusters with 1 mon and 3 osds each +roles: +- - cluster1.mon.a + - cluster1.mgr.x + - cluster1.osd.0 + - cluster1.osd.1 + - cluster1.osd.2 + - cluster1.client.0 +- - cluster2.mon.a + - cluster2.mgr.x + - cluster2.osd.0 + - cluster2.osd.1 + - cluster2.osd.2 + - cluster2.client.0 diff --git a/qa/suites/rbd/migration-external/3-objectstore b/qa/suites/rbd/migration-external/3-objectstore new file mode 120000 index 00000000000..c40bd326145 --- /dev/null +++ b/qa/suites/rbd/migration-external/3-objectstore @@ -0,0 +1 @@ +.qa/objectstore
\ No newline at end of file diff --git a/qa/suites/rbd/migration-external/4-supported-random-distro$ b/qa/suites/rbd/migration-external/4-supported-random-distro$ new file mode 120000 index 00000000000..0862b4457b3 --- /dev/null +++ b/qa/suites/rbd/migration-external/4-supported-random-distro$ @@ -0,0 +1 @@ +.qa/distros/supported-random-distro$
\ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/4-supported-random-distro$ b/qa/suites/rbd/migration-external/4-supported-random-distro$
new file mode 120000
index 00000000000..0862b4457b3
--- /dev/null
+++ b/qa/suites/rbd/migration-external/4-supported-random-distro$
@@ -0,0 +1 @@
+.qa/distros/supported-random-distro$
\ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/5-data-pool/.qa b/qa/suites/rbd/migration-external/5-data-pool/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/5-data-pool/.qa
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file diff --git a/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml b/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml new file mode 100644 index 00000000000..2ca92dccfde --- /dev/null +++ b/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml @@ -0,0 +1,29 @@ +tasks: + - exec: + cluster2.client.0: + - echo '{"type":"qcow","stream":{"type":"http","url":"http://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd --cluster cluster2 migration prepare --import-only --source-spec-path - client.0.0-src + - rbd --cluster cluster2 migration execute client.0.0-src + - rbd --cluster cluster2 migration commit client.0.0-src + - rbd --cluster cluster2 snap create client.0.0-src@snap + - rbd --cluster cluster2 snap protect client.0.0-src@snap + - rbd --cluster cluster2 clone client.0.0-src@snap client.0.0 + - rbd --cluster cluster2 snap create client.0.0@snap + - rbd --cluster cluster2 create --size 1G client.0.1-src + - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.1-src + - rbd --cluster cluster2 snap create client.0.1-src@snap + - rbd --cluster cluster2 snap protect client.0.1-src@snap + - rbd --cluster cluster2 clone client.0.1-src@snap client.0.1 + - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.1 + - rbd --cluster cluster2 snap create client.0.1@snap + - rbd --cluster cluster2 create --size 1G client.0.2-src + - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.2-src + - rbd --cluster cluster2 snap create client.0.2-src@snap + - rbd --cluster cluster2 snap protect client.0.2-src@snap + - rbd --cluster cluster2 clone client.0.2-src@snap client.0.2 + - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.2 + - rbd --cluster cluster2 snap create client.0.2@snap + - exec: + cluster1.client.0: + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.0","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.0 + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.1","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.1 + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.2","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.2 diff --git a/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml b/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml new file mode 100644 index 00000000000..5fdf4d35c26 --- /dev/null +++ b/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml @@ -0,0 +1,18 @@ +tasks: + - exec: + cluster2.client.0: + - echo '{"type":"qcow","stream":{"type":"http","url":"http://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd --cluster cluster2 migration prepare --import-only --source-spec-path - client.0.0 + - rbd --cluster cluster2 migration execute client.0.0 + - rbd --cluster cluster2 migration commit client.0.0 + - rbd --cluster cluster2 snap create client.0.0@snap + - rbd --cluster cluster2 create --size 1G client.0.1 + - rbd --cluster cluster2 bench --io-type write 
--io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.1 + - rbd --cluster cluster2 snap create client.0.1@snap + - rbd --cluster cluster2 create --size 1G client.0.2 + - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.2 + - rbd --cluster cluster2 snap create client.0.2@snap + - exec: + cluster1.client.0: + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.0","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.0 + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.1","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.1 + - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.2","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.2 diff --git a/qa/suites/rbd/migration-external/7-io-workloads/.qa b/qa/suites/rbd/migration-external/7-io-workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/rbd/migration-external/7-io-workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/7-io-workloads/.qa b/qa/suites/rbd/migration-external/7-io-workloads/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/7-io-workloads/.qa
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file diff --git a/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml b/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml new file mode 100644 index 00000000000..d0afe7175a1 --- /dev/null +++ b/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml @@ -0,0 +1,14 @@ +tasks: + - parallel: + - io_workload + - migrate_workload +migrate_workload: + sequential: + - exec: + cluster1.client.0: + - sleep $((RANDOM % 600)) + - rbd --cluster cluster1 migration execute client.0.0 + - sleep $((RANDOM % 600)) + - rbd --cluster cluster1 migration commit client.0.0 + - sleep $((RANDOM % 600)) + - rbd --cluster cluster1 migration execute client.0.1 diff --git a/qa/suites/rbd/migration-external/conf b/qa/suites/rbd/migration-external/conf new file mode 120000 index 00000000000..4bc0fe86c63 --- /dev/null +++ b/qa/suites/rbd/migration-external/conf @@ -0,0 +1 @@ +.qa/rbd/conf
\ No newline at end of file diff --git a/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml new file mode 100644 index 00000000000..d2072c41a68 --- /dev/null +++ b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml @@ -0,0 +1,8 @@ +tasks: + - exec: + client.0: + - mkdir /home/ubuntu/cephtest/migration + - qemu-img create -f qcow2 /home/ubuntu/cephtest/migration/empty.qcow2 1G + - echo '{"type":"qcow","stream":{"type":"http","url":"https://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0 + - rbd migration prepare --import-only --source-spec '{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.1 + - rbd migration prepare --import-only --source-spec '{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.2 diff --git a/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml b/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml new file mode 100644 index 00000000000..b0e8af4d933 --- /dev/null +++ b/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml @@ -0,0 +1,12 @@ +tasks: + - exec: + client.0: + - mkdir /home/ubuntu/cephtest/migration + - wget -nv -O /home/ubuntu/cephtest/migration/base.client.0.qcow2 http://download.ceph.com/qa/ubuntu-12.04.qcow2 + - qemu-img create -f qcow2 /home/ubuntu/cephtest/migration/empty.qcow2 1G + - qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork /home/ubuntu/cephtest/migration/base.client.0.qcow2 + - qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork --socket /home/ubuntu/cephtest/migration/qemu-nbd-empty /home/ubuntu/cephtest/migration/empty.qcow2 + - chmod 0777 /home/ubuntu/cephtest/migration/qemu-nbd-empty + - echo '{"type":"raw","stream":{"type":"nbd","uri":"nbd://localhost"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0 + - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.1 + - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.2 diff --git a/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml b/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml new file mode 100644 index 00000000000..d5c2e60fed9 --- /dev/null +++ b/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml @@ -0,0 +1,13 @@ +tasks: + - exec: + client.0: + - mkdir /home/ubuntu/cephtest/migration + - wget -nv -O /home/ubuntu/cephtest/migration/base.client.0.qcow2 http://download.ceph.com/qa/ubuntu-12.04.qcow2 + - qemu-img convert -f qcow2 -O raw /home/ubuntu/cephtest/migration/base.client.0.qcow2 /home/ubuntu/cephtest/migration/base.client.0.raw + - dd if=/dev/zero of=/home/ubuntu/cephtest/migration/empty.raw count=1 bs=1G + - qemu-nbd -f raw --read-only --shared 10 --persistent --fork /home/ubuntu/cephtest/migration/base.client.0.raw + - qemu-nbd -f raw --read-only --shared 10 --persistent --fork --socket /home/ubuntu/cephtest/migration/qemu-nbd-empty /home/ubuntu/cephtest/migration/empty.raw + - chmod 0777 /home/ubuntu/cephtest/migration/qemu-nbd-empty + - echo '{"type":"raw","stream":{"type":"nbd","uri":"nbd://localhost"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0 + - rbd migration prepare --import-only --source-spec 
'{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.1 + - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.2 diff --git a/qa/suites/rbd/migration/9-cleanup/cleanup.yaml b/qa/suites/rbd/migration/9-cleanup/cleanup.yaml index 18c2bb5f4c4..1d724d09086 100644 --- a/qa/suites/rbd/migration/9-cleanup/cleanup.yaml +++ b/qa/suites/rbd/migration/9-cleanup/cleanup.yaml @@ -1,4 +1,5 @@ tasks: - exec: client.0: + - pkill -9 qemu-nbd || true - rm -rf /home/ubuntu/cephtest/migration diff --git a/qa/suites/rgw/bucket-logging/% b/qa/suites/rgw/bucket-logging/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/bucket-logging/% diff --git a/qa/suites/rgw/bucket-logging/.qa b/qa/suites/rgw/bucket-logging/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/0-install.yaml b/qa/suites/rgw/bucket-logging/0-install.yaml new file mode 100644 index 00000000000..6cf82f57476 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/0-install.yaml @@ -0,0 +1,13 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: [client.0] +- tox: [client.0] + +overrides: + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/bucket-logging/beast.yaml b/qa/suites/rgw/bucket-logging/beast.yaml new file mode 120000 index 00000000000..09ced62c42a --- /dev/null +++ b/qa/suites/rgw/bucket-logging/beast.yaml @@ -0,0 +1 @@ +.qa/rgw_frontend/beast.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/fixed-1.yaml b/qa/suites/rgw/bucket-logging/fixed-1.yaml new file mode 120000 index 00000000000..02df5dd0cd0 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/fixed-1.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-1.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml new file mode 120000 index 00000000000..32340b1fa8b --- /dev/null +++ b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml @@ -0,0 +1 @@ +.qa/rgw/ignore-pg-availability.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/overrides.yaml b/qa/suites/rgw/bucket-logging/overrides.yaml new file mode 100644 index 00000000000..a448a323d36 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/overrides.yaml @@ -0,0 +1,10 @@ +overrides: + ceph: + conf: + client: + setuser: ceph + setgroup: ceph + debug rgw: 20 + rgw bucket logging obj roll time: 5 + rgw: + storage classes: LUKEWARM, FROZEN diff --git a/qa/suites/rgw/bucket-logging/s3tests-branch.yaml b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml new file mode 120000 index 00000000000..bdcaca48ae0 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml @@ -0,0 +1 @@ +.qa/rgw/s3tests-branch.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/supported-distros b/qa/suites/rgw/bucket-logging/supported-distros new file mode 120000 index 00000000000..78f2991b407 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/supported-distros @@ -0,0 +1 @@ +.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/bucket-logging/tasks/+ b/qa/suites/rgw/bucket-logging/tasks/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/bucket-logging/tasks/+ diff --git a/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml new file mode 100644 index 00000000000..c1d3b7192e1 --- /dev/null +++ b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml @@ -0,0 +1,6 @@ +tasks: +- s3tests: + client.0: + boto3_extensions: True + rgw_server: client.0 + extra_attrs: ["bucket_logging"] diff --git a/qa/suites/rgw/crypt/2-kms/barbican.yaml b/qa/suites/rgw/crypt/2-kms/barbican.yaml index 9bf5fb81131..e3f78810416 100644 --- a/qa/suites/rgw/crypt/2-kms/barbican.yaml +++ b/qa/suites/rgw/crypt/2-kms/barbican.yaml @@ -27,7 +27,7 @@ tasks: - tox: [ client.0 ] - keystone: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 services: - name: swift type: object-store @@ -68,7 +68,7 @@ tasks: project: s3 - barbican: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 use-keystone-role: client.0 keystone_authtoken: auth_plugin: password diff --git a/qa/suites/rgw/multifs/0-install.yaml b/qa/suites/rgw/multifs/0-install.yaml new file mode 100644 index 00000000000..7e83140e64a --- /dev/null +++ b/qa/suites/rgw/multifs/0-install.yaml @@ -0,0 +1,5 @@ +tasks: +- install: +- ceph: +- rgw: [client.0] +- tox: [client.0] diff --git a/qa/suites/rgw/multifs/tasks/+ b/qa/suites/rgw/multifs/tasks/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/multifs/tasks/+ diff --git a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml index e07c8b5ccfe..d9526c365c1 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_bucket_quota.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml index bac4f401626..ae32e928661 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_multipart_upload.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml index 66bdff817f5..184555660dc 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml @@ -1,8 +1,4 @@ tasks: -- install: -- ceph: -- rgw: [client.0] -- tox: [client.0] - ragweed: client.0: default-branch: ceph-master diff --git a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml index 92355f04963..573cffbc30a 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml @@ -1,8 +1,4 @@ tasks: -- install: -- ceph: -- rgw: [client.0] -- tox: [client.0] - s3tests: client.0: rgw_server: client.0 diff --git a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml index 92c63d2e850..393180e5c17 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml +++ 
b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_user_quota.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled b/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled index 06f4cb48909..1266cf9c9c4 100644 --- a/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled +++ b/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled @@ -2,7 +2,7 @@ overrides: rgw-multisite: realm: name: test-realm - is default: true + is_default: true zonegroups: - name: test-zonegroup is_master: true diff --git a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml index 0836a953d74..ac2104cdd05 100644 --- a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled +++ b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml @@ -2,7 +2,7 @@ overrides: rgw-multisite: realm: name: test-realm - is default: true + is_default: true zonegroups: - name: a is_master: true @@ -28,4 +28,4 @@ overrides: - name: b2 endpoints: [c2.client.1] rgw-multisite-tests: - args: [tests.py] + args: [tests.py, -a, '!fails_with_rgw'] diff --git a/qa/suites/rgw/multisite/realms/two-zones.yaml b/qa/suites/rgw/multisite/realms/two-zones.yaml index 1bea381077c..9da708bc95e 100644 --- a/qa/suites/rgw/multisite/realms/two-zones.yaml +++ b/qa/suites/rgw/multisite/realms/two-zones.yaml @@ -2,7 +2,7 @@ overrides: rgw-multisite: realm: name: test-realm - is default: true + is_default: true zonegroups: - name: test-zonegroup is_master: true diff --git a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml index 462570e7727..303f98d540e 100644 --- a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml +++ b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml @@ -1,7 +1,7 @@ tasks: - kafka: client.0: - kafka_version: 2.6.0 + kafka_version: 3.8.1 - notification-tests: client.0: extra_attr: ["kafka_test", "data_path_v2_kafka_test"] diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml new file mode 100644 index 00000000000..5c83d5c0d23 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml @@ -0,0 +1,20 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: + client.0: + +overrides: + install: + ceph: + extra_system_packages: + rpm: + - java + deb: + - default-jre + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros new file mode 120000 index 00000000000..46280a42a96 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros @@ -0,0 +1 @@ +../../.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml new file mode 100644 index 00000000000..01d6fc637de --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml @@ -0,0 +1,8 @@ +tasks: +- kafka-failover: + client.0: + kafka_version: 3.8.1 +- notification-tests: + client.0: + extra_attr: ["kafka_failover"] + rgw_server: client.0 diff --git a/qa/suites/fs/functional/subvol_versions/.qa b/qa/suites/rgw/sts/auth-order/.qa index fea2489fdf6..fea2489fdf6 120000 --- a/qa/suites/fs/functional/subvol_versions/.qa +++ b/qa/suites/rgw/sts/auth-order/.qa diff --git a/qa/suites/rgw/sts/auth-order/local-sts.yaml b/qa/suites/rgw/sts/auth-order/local-sts.yaml new file mode 100644 index 00000000000..2f7dcc6b128 --- /dev/null +++ b/qa/suites/rgw/sts/auth-order/local-sts.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: local, sts, external diff --git a/qa/suites/rgw/sts/auth-order/sts-local.yaml b/qa/suites/rgw/sts/auth-order/sts-local.yaml new file mode 100644 index 00000000000..a7b00d00f0b --- /dev/null +++ b/qa/suites/rgw/sts/auth-order/sts-local.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, local, external diff --git a/qa/suites/rgw/tempest/0-install.yaml b/qa/suites/rgw/tempest/0-install.yaml index f968db20c2b..b6ef17de4ee 100644 --- a/qa/suites/rgw/tempest/0-install.yaml +++ b/qa/suites/rgw/tempest/0-install.yaml @@ -4,7 +4,7 @@ tasks: - tox: [ client.0 ] - keystone: client.0: - force-branch: stable/2023.1 + force-branch: stable/2024.1 services: - name: swift type: object-store diff --git a/qa/suites/rgw/tempest/tasks/s3/% b/qa/suites/rgw/tempest/tasks/s3/% new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/% diff --git a/qa/suites/rgw/tempest/tasks/s3/.qa b/qa/suites/rgw/tempest/tasks/s3/.qa new file mode 120000 index 00000000000..fea2489fdf6 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/.qa @@ -0,0 +1 @@ +../.qa
\ No newline at end of file diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa new file mode 120000 index 00000000000..fea2489fdf6 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa @@ -0,0 +1 @@ +../.qa
\ No newline at end of file diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml new file mode 100644 index 00000000000..c46a51e0958 --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, external, local diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml new file mode 100644 index 00000000000..a7b00d00f0b --- /dev/null +++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rgw s3 auth order: sts, local, external diff --git a/qa/suites/rgw/tempest/tasks/s3tests.yaml b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml index 4efb579fa83..4efb579fa83 100644 --- a/qa/suites/rgw/tempest/tasks/s3tests.yaml +++ b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml diff --git a/qa/suites/rgw/verify/overrides.yaml b/qa/suites/rgw/verify/overrides.yaml index 1b3b5abd7ad..afc368fc98c 100644 --- a/qa/suites/rgw/verify/overrides.yaml +++ b/qa/suites/rgw/verify/overrides.yaml @@ -14,6 +14,7 @@ overrides: rgw bucket counters cache: true rgw sts key: abcdefghijklmnop rgw s3 auth use sts: true + rgw reshard progress judge interval: 10 rgw: compression type: random storage classes: LUKEWARM, FROZEN diff --git a/qa/suites/rgw/verify/tasks/cls.yaml b/qa/suites/rgw/verify/tasks/cls.yaml index 8034715353f..26f948d42ec 100644 --- a/qa/suites/rgw/verify/tasks/cls.yaml +++ b/qa/suites/rgw/verify/tasks/cls.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + conf: + osd: + debug objclass: 20 tasks: - workunit: clients: diff --git a/qa/suites/rgw/verify/tasks/s3tests-java.yaml b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml index 9ad89cc6790..9ad89cc6790 100644 --- a/qa/suites/rgw/verify/tasks/s3tests-java.yaml +++ b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml diff --git a/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml b/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml index 66cf2bc7593..58e253bf6f4 120000 --- a/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml +++ b/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml @@ -1 +1 @@ -.qa/objectstore_debug/bluestore-bitmap.yaml
\ No newline at end of file +.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
\ No newline at end of file diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml index 57e455ba78d..a0adaecf9b2 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml @@ -19,6 +19,20 @@ overrides: - \(MGR_DOWN\) - slow request - \(MON_MSGR2_NOT_ENABLED\) + - \(OSD_DOWN\) + - \(OSD_HOST_DOWN\) + - \(POOL_APP_NOT_ENABLED\) + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - OSDMAP_FLAGS + - OSD_ROOT_DOWN + conf: global: enable experimental unrecoverable data corrupting features: "*" @@ -30,4 +44,3 @@ roles: - mgr.x - osd.0 - osd.1 - - osd.2
\ No newline at end of file diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml index e4897db4d35..48cfa2f756f 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml @@ -18,9 +18,6 @@ tasks: mon: mon_warn_on_insecure_global_id_reclaim: false mon_warn_on_insecure_global_id_reclaim_allowed: false - log-ignorelist: - - Not found or unloadable - - evicting unresponsive client - exec: osd.0: - ceph osd require-osd-release quincy @@ -30,14 +27,3 @@ overrides: conf: mon: mon warn on osd down out interval zero: false - log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) - - OSD_DOWN - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_DEGRADED - - Reduced data availability - - Degraded data redundancy - - OSDMAP_FLAGS diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml index 6aa429f18b5..fe4ff9bb113 100644 --- a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml +++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml @@ -3,14 +3,13 @@ meta: install upgrade ceph/-x on cluster restart : mons, osd.* tasks: +- print: "**** start install.upgrade of nodes" - install.upgrade: - mon.a: -- exec: - osd.0: - - ceph osd require-osd-release quincy + all: - print: "**** done install.upgrade of nodes" +- print: "**** start ceph.restart of all osds" - ceph.restart: - daemons: [mon.a,mgr.x,osd.0,osd.1,osd.2] + daemons: [osd.0,osd.1,osd.2] mon-health-to-clog: false wait-for-healthy: false wait-for-osds-up: false diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml index 40fbcefe728..62fb6427f72 100644 --- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml @@ -32,13 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml index e57e31f2fbe..f7167975aa9 100644 --- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-ignorelist: + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml index 9c2ff9da185..9a0585cc074 100644 --- a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml @@ -9,4 +9,6 @@ workload: clients: client.0: - cls + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done end rados_api.yaml" diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml 
b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml index a618ee77c11..5641471629e 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml @@ -1,16 +1,25 @@ overrides: ceph: log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml index b722f187361..a55dddf46f7 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ first-half-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml index 649b024a476..d54ba8039d0 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ stress-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 3814ea3efdb..62fb6427f72 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -31,3 +31,23 @@ overrides: conf: osd: osd shutdown pgref assert: true + log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index 299e3d1b9a0..b5160c2dd00 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -1,10 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml new file mode 100644 index 00000000000..fa93b2f2ece --- /dev/null +++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml @@ -0,0 +1,19 @@ +overrides: + ceph: + log-ignorelist: + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME + - reached quota + - running out of quota + - overall HEALTH_ + - 
CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED + - slow request + - noscrub + - nodeep-scrub diff --git a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml index a46e34db5dd..79cf1a96601 100644 --- a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml +++ b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml @@ -9,4 +9,6 @@ workload: clients: client.0: - cls + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done end rados_api.yaml" diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml index 4cd05432d5f..59ccfe2cd02 100644 --- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml @@ -1,3 +1,25 @@ +overrides: + ceph: + log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml index f092096f444..79ad2af8ea1 100644 --- a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ first-half-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml index 05bb672b3ac..166327a58f9 100644 --- a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml @@ -7,4 +7,6 @@ stress-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove' - print: "**** done cls/test_cls_rbd.sh 5-workload" diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml new file mode 100644 index 00000000000..fa93b2f2ece --- /dev/null +++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml @@ -0,0 +1,19 @@ +overrides: + ceph: + log-ignorelist: + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME + - reached quota + - running out of quota + - overall HEALTH_ + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED + - slow request + - noscrub + - nodeep-scrub diff --git a/qa/tasks/barbican.py b/qa/tasks/barbican.py index 771304fba92..c32277c3c09 100644 --- a/qa/tasks/barbican.py +++ b/qa/tasks/barbican.py @@ -88,6 +88,14 @@ def run_in_barbican_venv(ctx, client, args): run.Raw('&&') ] + args) +def get_constraints_url(cconf): + version = cconf.get('force-branch', 'master') + if '/' in version: + # split 
stable/<version> to <version> + version = str(version).split('/')[1] + url = f"https://releases.openstack.org/constraints/upper/{version}" + return url + @contextlib.contextmanager def setup_venv(ctx, config): """ @@ -95,13 +103,14 @@ def setup_venv(ctx, config): """ assert isinstance(config, dict) log.info('Setting up virtualenv for barbican...') - for (client, _) in config.items(): + for (client, cconf) in config.items(): run_in_barbican_dir(ctx, client, ['python3', '-m', 'venv', '.barbicanenv']) run_in_barbican_venv(ctx, client, ['pip', 'install', '--upgrade', 'pip']) + url = get_constraints_url(cconf) run_in_barbican_venv(ctx, client, - ['pip', 'install', 'pytz', + ['pip', 'install', f'-c{url}', 'pytz', '-e', get_barbican_dir(ctx)]) yield diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py index 84e096520b4..e6a9dc8223c 100644 --- a/qa/tasks/cbt.py +++ b/qa/tasks/cbt.py @@ -47,22 +47,11 @@ class CBT(Task): benchmark_config = self.config.get('benchmarks') benchmark_type = next(iter(benchmark_config.keys())) + if benchmark_type in ['librbdfio', 'fio']: testdir = misc.get_testdir(self.ctx) benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio') - if benchmark_type == 'cosbench': - # create cosbench_dir and cosbench_xml_dir - testdir = misc.get_testdir(self.ctx) - benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos') - benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml') - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']]) - benchmark_config['cosbench']['controller'] = osd_hosts[0] - - # set auth details - remotes_and_roles = self.ctx.cluster.remotes.items() - ips = [host for (host, port) in - (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0]) + client_endpoints_config = self.config.get('client_endpoints', None) monitoring_profiles = self.config.get('monitoring_profiles', {}) @@ -117,77 +106,6 @@ class CBT(Task): ] ) - if benchmark_type == 'cosbench': - # install cosbench - self.log.info('install dependencies for cosbench') - if system_type == 'rpm': - cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl'] - else: - cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl'] - self.first_mon.run(args=install_cmd + cosbench_depends) - testdir = misc.get_testdir(self.ctx) - cosbench_version = '0.4.2.c3' - cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip' - os_version = misc.get_system_type(self.first_mon, False, True) - - # additional requirements for bionic - if os_version == '18.04': - self.first_mon.run( - args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*']) - # use our own version of cosbench - cosbench_version = 'cosbench-0.4.2.c3.1' - # contains additional parameter "-N" to nc - cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip' - cosbench_dir = os.path.join(testdir, cosbench_version) - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir]) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version - ] - ) - else: - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', 
'{name}.zip'.format(name=cosbench_version) - ] - ) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'ln', '-s', cosbench_version, 'cos', - ] - ) - self.first_mon.run( - args=[ - 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'), - 'chmod', '+x', run.Raw('*.sh'), - ] - ) - - # start cosbench and check info - self.log.info('start cosbench') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'start-all.sh' - ] - ) - self.log.info('check cosbench info') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'cli.sh', 'info' - ] - ) - def checkout_cbt(self): testdir = misc.get_testdir(self.ctx) repo = self.config.get('repo', 'https://github.com/ceph/cbt.git') @@ -269,51 +187,6 @@ class CBT(Task): ] ) - if benchmark_type == 'cosbench': - os_version = misc.get_system_type(self.first_mon, False, True) - if os_version == '18.04': - cosbench_version = 'cosbench-0.4.2.c3.1' - else: - cosbench_version = '0.4.2.c3' - # note: stop-all requires 'nc' - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'stop-all.sh', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'sudo', 'killall', '-9', 'java', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/cos'.format(tdir=testdir), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/xml'.format(tdir=testdir), - ] - ) # Collect cbt performance data cbt_performance = CBTperformance() cbt_performance.collect(self.ctx, self.config) diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index b01fe370ec0..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -376,7 +376,7 @@ def module_setup(ctx, config): cluster_name, 'mgr', 'module', - 'emable', + 'enable', m, ] log.info("enabling module %s", m) @@ -414,6 +414,15 @@ def conf_setup(ctx, config): for p in procs: log.debug("waiting for %s", p) p.wait() + cmd = [ + 'sudo', + 'ceph', + '--cluster', + cluster_name, + 'config', + 'dump', + ] + mon_remote.run(args=cmd) yield @contextlib.contextmanager @@ -1197,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/ceph_iscsi_client.py b/qa/tasks/ceph_iscsi_client.py index 189b7fa31fe..0b0a355f925 100644 --- a/qa/tasks/ceph_iscsi_client.py +++ b/qa/tasks/ceph_iscsi_client.py @@ -31,8 +31,15 @@ def task(ctx, config): remote.run(args=['sudo', 'systemctl', 'restart', 'iscsid']) remote.run(args=['sudo', 'modprobe', 'dm_multipath']) - remote.run(args=['sudo', 'mpathconf', '--enable']) conf = dedent(''' + defaults { + user_friendly_names yes + find_multipaths yes + } + + blacklist { + } + devices { device { vendor "LIO-ORG" @@ -50,7 +57,7 @@ def task(ctx, config): } ''') path = 
"/etc/multipath.conf" - remote.sudo_write_file(path, conf, append=True) + remote.sudo_write_file(path, conf) remote.run(args=['sudo', 'systemctl', 'start', 'multipathd']) yield diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index ccf54648d43..57d22f3b5e6 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -2169,6 +2169,10 @@ class CephManager: when creating an erasure coded pool. """ with self.lock: + # msr rules require at least squid + if 'crush-osds-per-failure-domain' in profile: + self.raw_cluster_cmd( + 'osd', 'set-require-min-compat-client', 'squid') args = cmd_erasure_code_profile(profile_name, profile) self.raw_cluster_cmd(*args) @@ -2792,6 +2796,59 @@ class CephManager: num += 1 return num + def _print_not_active_clean_pg(self, pgs): + """ + Print the PGs that are not active+clean. + """ + for pg in pgs: + if not (pg['state'].count('active') and + pg['state'].count('clean') and + not pg['state'].count('stale')): + log.debug( + "PG %s is not active+clean, but %s", + pg['pgid'], pg['state'] + ) + + def pg_all_active_clean(self): + """ + Check if all pgs are active+clean + return: True if all pgs are active+clean else False + """ + pgs = self.get_pg_stats() + result = self._get_num_active_clean(pgs) == len(pgs) + if result: + log.debug("All PGs are active+clean") + else: + log.debug("Not all PGs are active+clean") + self._print_not_active_clean_pg(pgs) + return result + + def _print_not_active_pg(self, pgs): + """ + Print the PGs that are not active. + """ + for pg in pgs: + if not (pg['state'].count('active') + and not pg['state'].count('stale')): + log.debug( + "PG %s is not active, but %s", + pg['pgid'], pg['state'] + ) + + def pg_all_active(self): + """ + Check if all pgs are active + return: True if all pgs are active else False + """ + pgs = self.get_pg_stats() + result = self._get_num_active(pgs) == len(pgs) + if result: + log.debug("All PGs are active") + else: + log.debug("Not all PGs are active") + self._print_not_active_pg(pgs) + return result + def is_clean(self): """ True if all pgs are clean @@ -3233,6 +3290,26 @@ class CephManager: self.make_admin_daemon_dir(remote) self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart() + def get_crush_rule_id(self, crush_rule_name): + """ + Get crush rule id by name + :returns: int -- crush rule id + """ + out = self.raw_cluster_cmd('osd', 'crush', 'rule', 'dump', '--format=json') + j = json.loads('\n'.join(out.split('\n')[1:])) + for rule in j: + if rule['rule_name'] == crush_rule_name: + return rule['rule_id'] + assert False, 'rule %s not found' % crush_rule_name + + def get_mon_dump_json(self): + """ + mon dump --format=json converted to a python object + :returns: the python object + """ + out = self.raw_cluster_cmd('mon', 'dump', '--format=json') + return json.loads('\n'.join(out.split('\n')[1:])) + def get_mon_status(self, mon): """ Extract all the monitor status information from the cluster @@ -3336,6 +3413,23 @@ class CephManager: self.log(task_status) return task_status + # Stretch mode related functions + def is_degraded_stretch_mode(self): + """ + Return whether the cluster is in degraded stretch mode + """ + try: + osdmap = self.get_osd_dump_json() + stretch_mode = osdmap.get('stretch_mode', {}) + degraded_stretch_mode = stretch_mode.get('degraded_stretch_mode', 0) + self.log("is_degraded_stretch_mode: {0}".format(degraded_stretch_mode)) + return degraded_stretch_mode == 1 + except (TypeError, AttributeError) as e: + # Log the error or handle it as needed + self.log("Error 
accessing degraded_stretch_mode: {0}".format(e)) + return False + + def utility_task(name): """ Generate ceph_manager subtask corresponding to ceph_manager diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 8347b890629..7afcbc2f2eb 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd): while True: if condition(): success_time_elapsed = 0 - while success_time_elapsed < success_hold_time: - if condition(): - success_time_elapsed += 1 - time.sleep(1) - elapsed += 1 - else: - break + while success_time_elapsed < success_hold_time and condition(): + success_time_elapsed += 1 + time.sleep(1) + elapsed += 1 if success_time_elapsed == success_hold_time: log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time)) return diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index 166ea9537ee..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -209,7 +209,9 @@ def normalize_hostnames(ctx): def download_cephadm(ctx, config, ref): cluster_name = config['cluster'] - if 'cephadm_binary_url' in config: + if 'cephadm_from_container' in config: + _fetch_cephadm_from_container(ctx, config) + elif 'cephadm_binary_url' in config: url = config['cephadm_binary_url'] _download_cephadm(ctx, url) elif config.get('cephadm_mode') != 'cephadm-package': @@ -232,6 +234,36 @@ def download_cephadm(ctx, config, ref): _rm_cephadm(ctx) +def _fetch_cephadm_from_container(ctx, config): + image = config['image'] + cengine = 'podman' + try: + log.info("Testing if podman is available") + ctx.cluster.run(args=['sudo', cengine, '--help']) + except CommandFailedError: + log.info("Failed to find podman. Using docker") + cengine = 'docker' + + ctx.cluster.run(args=['sudo', cengine, 'pull', image]) + ctx.cluster.run(args=[ + 'sudo', cengine, 'run', '--rm', '--entrypoint=cat', image, '/usr/sbin/cephadm', + run.Raw('>'), + ctx.cephadm, + ]) + + # sanity-check the resulting file and set executable bit + cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm) + ctx.cluster.run( + args=[ + 'test', '-s', ctx.cephadm, + run.Raw('&&'), + 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'), + run.Raw('&&'), + 'chmod', '+x', ctx.cephadm, + ], + ) + + def _fetch_cephadm_from_rpm(ctx): log.info("Copying cephadm installed from an RPM package") # cephadm already installed from redhat.install task @@ -443,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log @@ -1817,6 +1853,12 @@ def conf_setup(ctx, config): for p in procs: log.debug("waiting for %s", p) p.wait() + cmd = [ + 'ceph', + 'config', + 'dump', + ] + _shell(ctx, cluster_name, remote, args=cmd) yield @contextlib.contextmanager diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py index c1312ec5efc..21b96d2b22b 100644 --- a/qa/tasks/cephfs/cephfs_test_case.py +++ b/qa/tasks/cephfs/cephfs_test_case.py @@ -252,8 +252,8 @@ class CephFSTestCase(CephTestCase): def get_session_data(self, client_id): return self._session_by_id(client_id) - def _session_list(self): - ls_data = self.fs.mds_asok(['session', 
'ls']) + def _session_list(self, rank=None, status=None): + ls_data = self.fs.rank_asok(['session', 'ls'], rank=rank, status=status) ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] return ls_data @@ -269,9 +269,9 @@ class CephFSTestCase(CephTestCase): def perf_dump(self, rank=None, status=None): return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status) - def wait_until_evicted(self, client_id, timeout=30): + def wait_until_evicted(self, client_id, rank=None, timeout=30): def is_client_evicted(): - ls = self._session_list() + ls = self._session_list(rank=rank) for s in ls: if s['id'] == client_id: return False diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 1c00a49077d..3846ef23f97 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -640,12 +640,17 @@ class FilesystemBase(MDSClusterBase): def set_joinable(self, joinable=True): self.set_var("joinable", joinable) - def set_max_mds(self, max_mds): - self.set_var("max_mds", "%d" % max_mds) + def set_max_mds(self, max_mds, confirm=True): + if confirm: + self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it') + else: + self.set_var("max_mds", f"{max_mds}",) def set_session_timeout(self, timeout): self.set_var("session_timeout", "%d" % timeout) + def set_session_autoclose(self, autoclose_time): + self.set_var("session_autoclose", "%d" % autoclose_time) def set_allow_standby_replay(self, yes): self.set_var("allow_standby_replay", yes) diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py index 3654cde9ca0..52362d853dc 100644 --- a/qa/tasks/cephfs/mount.py +++ b/qa/tasks/cephfs/mount.py @@ -775,6 +775,10 @@ class CephFSMountBase(object): return self.client_remote.run(args=args, **kwargs) + def get_shell_stdout(self, args, timeout=300, **kwargs): + return self.run_shell(args=args, timeout=timeout, **kwargs).stdout.\ + getvalue().strip() + def run_shell_payload(self, payload, wait=True, timeout=900, **kwargs): kwargs.setdefault('cwd', self.mountpoint) kwargs.setdefault('omit_sudo', False) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index ff9962e7310..beb41019e6d 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -324,6 +324,8 @@ class TestFsStatus(TestAdminCommands): Test "ceph fs status subcommand. """ + MDSS_REQUIRED = 3 + def test_fs_status(self): """ That `ceph fs status` command functions. @@ -338,6 +340,31 @@ class TestFsStatus(TestAdminCommands): mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"] self.assertEqual(mdsmap[0]["state"], "active") + def test_fs_status_standby_replay(self): + """ + That `ceph fs status` command functions. 
+ """ + + self.fs.set_allow_standby_replay(True) + + s = self.get_ceph_cmd_stdout("fs", "status") + self.assertTrue("active" in s) + self.assertTrue("standby-replay" in s) + self.assertTrue("0-s" in s) + self.assertTrue("standby" in s) + + mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json-pretty"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + self.assertEqual(mdsmap[1]["state"], "standby-replay") + self.assertEqual(mdsmap[1]["rank"], "0-s") + self.assertEqual(mdsmap[2]["state"], "standby") + + mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + self.assertEqual(mdsmap[1]["state"], "standby-replay") + self.assertEqual(mdsmap[1]["rank"], "0-s") + self.assertEqual(mdsmap[2]["state"], "standby") + class TestAddDataPool(TestAdminCommands): """ @@ -2178,9 +2205,6 @@ class TestFsAuthorizeUpdate(CephFSTestCase): caps mon = "allow r fsname=a" caps osd = "allow rw tag cephfs data=a" """ - self.skipTest('this test is broken ATM, see ' - 'https://tracker.ceph.com/issues/65808') - PERM, PATH = 'rw', 'dir1' self.mount_a.run_shell(f'mkdir {PATH}') self.captester = CapTester(self.mount_a, PATH) @@ -2659,3 +2683,241 @@ class TestMDSFail(TestAdminCommands): errmsgs=health_warn) self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it') self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it') + + +class TestFSSetMaxMDS(TestAdminCommands): + + def test_when_unhealthy_without_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num>" without the + confirmation flag (--yes-i-really-mean-it) fails when cluster is + unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_unhealthy_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully when cluster is unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_mds_trim_without_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num>" without the + confirmation flag (--yes-i-really-mean-it) fails when cluster has + MDS_TRIM health warning. + ''' + self.gen_health_warn_mds_trim() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_mds_trim_when_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM + health warning. + ''' + self.gen_health_warn_mds_trim() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_healthy_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully also when cluster is + healthy. + ''' + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + +class TestToggleVolumes(CephFSTestCase): + ''' + Contains code for enabling/disabling mgr/volumes plugin. 
+ ''' + + VOL_MOD_NAME = 'volumes' + CONFIRM = '--yes-i-really-mean-it' + + def tearDown(self): + ''' + Ensure that the volumes plugin is enabled after the test has finished + running since not doing so might affect tearDown() of CephFSTestCase or + other superclasses. + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + if 'volumes' in json_output['force_disabled_modules']: + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + + super(TestToggleVolumes, self).tearDown() + + def test_force_disable_with_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes + --yes-i-really-mean-it" successfully disables volumes plugin. + + Also test "ceph mgr module ls" output after this. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + def test_force_disable_fails_without_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes" fails with + EPERM when the confirmation flag is not passed along. + + Also test that the output of this command suggests the user pass + --yes-i-really-mean-it. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + # ensure that the confirmation flag was recommended + self.assertIn(self.CONFIRM, proc_stderr) + + def test_force_disable_idempotency(self): + ''' + Test that running "ceph mgr module force disable volumes" passes when + volumes plugin was already force disabled. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + sleep(5) + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format ' + 'json-pretty') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + # XXX: in this test, running this command a 2nd time should pass. + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}') + + def test_force_disable_nonexistent_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable abcd {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_force_disable_non_alwayson_mod(self): + ''' + Test that passing the name of a module that is not always-on to the + "ceph mgr module force disable" command leads to an error.
+ ''' + json_output = self.get_ceph_cmd_stdout( + 'mgr module ls --format json-pretty', check_status=False, + stderr=StringIO()) + output_dict = json.loads(json_output) + some_non_alwayson_mod = output_dict['enabled_modules'][0] + + proc = self.run_ceph_cmd( + f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_enabled_by_default(self): + ''' + Test that volumes plugin is enabled by default and is also reported as + "always on". + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + def test_disable_fails(self): + ''' + Test that running "ceph mgr module disable volumes" fails with EPERM. + + This is expected since volumes is an always-on module and therefore + it can only be disabled using command "ceph mgr module force disable + volumes". + ''' + proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + + def test_enable_idempotency(self): + ''' + Test that enabling volumes plugin when it is already enabled doesn't + exit with non-zero return value. + + Also test that it reports plugin as already enabled. + ''' + proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}', + stderr=StringIO()) + self.assertEqual(proc.returncode, 0) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('already enabled', proc_stderr) + self.assertIn('always-on', proc_stderr) + + def test_enable_post_disabling(self): + ''' + Test that enabling volumes plugin after (force-)disabling it works + successfully. + + Also test "ceph mgr module ls" output for volumes plugin afterwards. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + # give a bit of time for the plugin to be disabled. + sleep(5) + + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + # give a bit of time for the plugin to be functional again + sleep(5) + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + # plugin is reported properly by "ceph mgr module ls" command, check if + # it is also working fine.
+ self.run_ceph_cmd('fs volume ls') diff --git a/qa/tasks/cephfs/test_backtrace.py b/qa/tasks/cephfs/test_backtrace.py index 6b094569b7b..cd23c114bfb 100644 --- a/qa/tasks/cephfs/test_backtrace.py +++ b/qa/tasks/cephfs/test_backtrace.py @@ -100,3 +100,29 @@ class TestBacktrace(CephFSTestCase): # we don't update the layout in all the old pools whenever it changes old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name) self.assertEqual(old_pool_layout['object_size'], 4194304) + + def test_backtrace_flush_on_deleted_data_pool(self): + """ + that the MDS does not go read-only when handling backtrace update errors + when backtrace updates are batched and flushed to RADOS (during journal trim) + and some of the pool have been removed. + """ + data_pool = self.fs.get_data_pool_name() + extra_data_pool_name_1 = data_pool + '_extra1' + self.fs.add_data_pool(extra_data_pool_name_1) + + self.mount_a.run_shell(["mkdir", "dir_x"]) + self.mount_a.setfattr("dir_x", "ceph.dir.layout.pool", extra_data_pool_name_1) + self.mount_a.run_shell(["touch", "dir_x/file_x"]) + self.fs.flush() + + extra_data_pool_name_2 = data_pool + '_extra2' + self.fs.add_data_pool(extra_data_pool_name_2) + self.mount_a.setfattr("dir_x/file_x", "ceph.file.layout.pool", extra_data_pool_name_2) + self.mount_a.run_shell(["setfattr", "-x", "ceph.dir.layout", "dir_x"]) + self.run_ceph_cmd("fs", "rm_data_pool", self.fs.name, extra_data_pool_name_1) + self.fs.flush() + + # quick test to check if the mds has handled backtrace update failure + # on the deleted data pool without going read-only. + self.mount_a.run_shell(["mkdir", "dir_y"]) diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py index 16de379f54f..468378fce3d 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -4,6 +4,7 @@ import time from tasks.cephfs.fuse_mount import FuseMount from tasks.cephfs.cephfs_test_case import CephFSTestCase from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while, MaxWhileTries log = logging.getLogger(__name__) @@ -152,6 +153,8 @@ class TestExportPin(CephFSTestCase): # vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap # to be written out every event. Yuck! 
self.config_set('mds', 'mds_debug_subtrees', False) + # make sure ESubtreeMap is written frequently enough: + self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4') self.config_rm('mds', 'mds bal split size') # don't split /top self.mount_a.run_shell_payload("rm -rf 1") @@ -628,3 +631,186 @@ done log.info("{0} migrations have occured due to the cluster resizing".format(count)) # rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2 self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget + +class TestDumpExportStates(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + + EXPORT_STATES = ['locking', 'discovering', 'freezing', 'prepping', 'warning', 'exporting'] + + def setUp(self): + super().setUp() + + self.fs.set_max_mds(self.MDSS_REQUIRED) + self.status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload('mkdir -p test/export') + + def tearDown(self): + super().tearDown() + + def _wait_for_export_target(self, source, target, sleep=2, timeout=10): + try: + with safe_while(sleep=sleep, tries=timeout//sleep) as proceed: + while proceed(): + info = self.fs.getinfo().get_rank(self.fs.id, source) + log.info(f'waiting for rank {target} to be added to the export target') + if target in info['export_targets']: + return + except MaxWhileTries as e: + raise RuntimeError(f'rank {target} has not been added to export target after {timeout}s') from e + + def _dump_export_state(self, rank): + states = self.fs.rank_asok(['dump_export_states'], rank=rank, status=self.status) + self.assertTrue(type(states) is list) + self.assertEqual(len(states), 1) + return states[0] + + def _test_base(self, path, source, target, state_index, kill): + self.fs.rank_asok(['config', 'set', 'mds_kill_import_at', str(kill)], rank=target, status=self.status) + + self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status) + self._wait_for_export_target(source, target) + + target_rank = self.fs.get_rank(rank=target, status=self.status) + self.delete_mds_coredump(target_rank['name']) + + state = self._dump_export_state(source) + + self.assertTrue(type(state['tid']) is int) + self.assertEqual(state['path'], path) + self.assertEqual(state['state'], self.EXPORT_STATES[state_index]) + self.assertEqual(state['peer'], target) + + return state + + def _test_state_history(self, state): + history = state['state_history'] + self.assertTrue(type(history) is dict) + size = 0 + for name in self.EXPORT_STATES: + self.assertTrue(type(history[name]) is dict) + size += 1 + if name == state['state']: + break + self.assertEqual(len(history), size) + + def _test_freeze_tree(self, state, waiters): + self.assertTrue(type(state['freeze_tree_time']) is float) + self.assertEqual(state['unfreeze_tree_waiters'], waiters) + + def test_discovering(self): + state = self._test_base('/test', 0, 1, 1, 1) + + self._test_state_history(state) + self._test_freeze_tree(state, 0) + + self.assertEqual(state['last_cum_auth_pins'], 0) + self.assertEqual(state['num_remote_waiters'], 0) + + def test_prepping(self): + client_id = self.mount_a.get_global_id() + + state = self._test_base('/test', 0, 1, 3, 3) + + self._test_state_history(state) + self._test_freeze_tree(state, 0) + + self.assertEqual(state['flushed_clients'], [client_id]) + self.assertTrue(type(state['warning_ack_waiting']) is list) + + def test_exporting(self): + state = self._test_base('/test', 0, 1, 5, 5) + + self._test_state_history(state) + 
self._test_freeze_tree(state, 0) + + self.assertTrue(type(state['notify_ack_waiting']) is list) + +class TestKillExports(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + + def setUp(self): + CephFSTestCase.setUp(self) + + self.fs.set_max_mds(self.MDSS_REQUIRED) + self.status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload('mkdir -p test/export') + + def tearDown(self): + super().tearDown() + + def _kill_export_as(self, rank, kill): + self.fs.rank_asok(['config', 'set', 'mds_kill_export_at', str(kill)], rank=rank, status=self.status) + + def _export_dir(self, path, source, target): + self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status) + + def _wait_failover(self): + self.wait_until_true(lambda: self.fs.status().hadfailover(self.status), timeout=self.fs.beacon_timeout) + + def _clear_coredump(self, rank): + crash_rank = self.fs.get_rank(rank=rank, status=self.status) + self.delete_mds_coredump(crash_rank['name']) + + def _run_kill_export(self, kill_at, exporter_rank=0, importer_rank=1, restart=True): + self._kill_export_as(exporter_rank, kill_at) + self._export_dir("/test", exporter_rank, importer_rank) + self._wait_failover() + self._clear_coredump(exporter_rank) + + if restart: + self.fs.rank_restart(rank=exporter_rank, status=self.status) + self.status = self.fs.wait_for_daemons() + + def test_session_cleanup(self): + """ + Test importer's session cleanup after an export subtree task is interrupted. + Set 'mds_kill_export_at' to 9 or 10 so that the importer will wait for the exporter + to restart while the state is 'acking'. + + See https://tracker.ceph.com/issues/61459 + """ + + kill_export_at = [9, 10] + + exporter_rank = 0 + importer_rank = 1 + + for kill in kill_export_at: + log.info(f"kill_export_at: {kill}") + self._run_kill_export(kill, exporter_rank, importer_rank) + + if len(self._session_list(importer_rank, self.status)) > 0: + client_id = self.mount_a.get_global_id() + self.fs.rank_asok(['session', 'evict', "%s" % client_id], rank=importer_rank, status=self.status) + + # timeout if buggy + self.wait_until_evicted(client_id, importer_rank) + + # for multiple tests + self.mount_a.remount() + + def test_client_eviction(self): + # modify the timeout so that we don't have to wait too long + timeout = 30 + self.fs.set_session_timeout(timeout) + self.fs.set_session_autoclose(timeout + 5) + + kill_export_at = [9, 10] + + exporter_rank = 0 + importer_rank = 1 + + for kill in kill_export_at: + log.info(f"kill_export_at: {kill}") + self._run_kill_export(kill, exporter_rank, importer_rank) + + client_id = self.mount_a.get_global_id() + self.wait_until_evicted(client_id, importer_rank, timeout + 10) + time.sleep(1) + + # failed if buggy + self.mount_a.ls() diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + 
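+        # fail() takes the file system's ranks down; set_joinable() below lets a
+        # standby MDS take over, and the replay delay configured above keeps that
+        # daemon in up:replay long enough for the health warning to be observed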
self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = float(m.group(1)) + time_duration = int(m.group(3)) + pending_duration = int(m.group(4)) + log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}") + if completion_percentage >= 50: + return (completion_percentage, time_duration, pending_duration) + _, _, pending_duration = get_estimated_time() + # wait for 25% more time to avoid false negative failures + self.wait_until_true(replay_complete, timeout=pending_duration * 1.25) + class TestFailover(CephFSTestCase): CLIENTS_REQUIRED = 1 MDSS_REQUIRED = 2 diff --git a/qa/tasks/cephfs/test_fscrypt.py b/qa/tasks/cephfs/test_fscrypt.py index d327c43c1fc..c1405415c63 100644 --- a/qa/tasks/cephfs/test_fscrypt.py +++ b/qa/tasks/cephfs/test_fscrypt.py @@ -83,9 +83,11 @@ class TestFSCryptRecovery(FSCryptTestCase): self.fs.set_joinable() self.fs.wait_for_daemons() + # load all inodes into cache (may be cleared by journal reset) + self.mount_a.run_shell_payload(f"cd {self.path} && find") + verify_alternate_name() - self.mount_a.run_shell_payload(f"cd {self.path} && find") self.mount_a.run_shell_payload(f"cd {self.path} && stat {file}") diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py index 2f9ebe6b1d5..078db6a4a6d 100644 --- a/qa/tasks/cephfs/test_mirroring.py +++ b/qa/tasks/cephfs/test_mirroring.py @@ -204,6 +204,17 @@ class TestMirroring(CephFSTestCase): self.assertTrue(res[dir_name]['last_synced_snap']['name'] == expected_snap_name) self.assertTrue(res[dir_name]['snaps_synced'] == expected_snap_count) + def check_peer_status_idle(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_name, + expected_snap_count): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue(dir_name in res) + self.assertTrue('idle' == res[dir_name]['state']) + self.assertTrue(expected_snap_name == res[dir_name]['last_synced_snap']['name']) + self.assertTrue(expected_snap_count == res[dir_name]['snaps_synced']) + 
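Several tests added below poll the mirror daemon with open-coded safe_while loops until a directory reaches a particular sync state. A reusable helper along these lines could factor that pattern out (a sketch only, reusing the get_peer_uuid()/mirror_daemon_command() helpers of this class and assuming safe_while is imported in this module, as the loops further below do):

    def check_peer_status_state(self, fs_name, fs_id, peer_spec, dir_name,
                                states, tries=60, sleep=1):
        # poll 'fs mirror peer status' until dir_name reports one of the
        # requested states; return the full status map for further checks
        peer_uuid = self.get_peer_uuid(peer_spec)
        with safe_while(sleep=sleep, tries=tries,
                        action=f'wait for {states} on {dir_name}') as proceed:
            while proceed():
                res = self.mirror_daemon_command(
                    f'peer status for fs: {fs_name}',
                    'fs', 'mirror', 'peer', 'status',
                    f'{fs_name}@{fs_id}', peer_uuid)
                if res.get(dir_name, {}).get('state') in states:
                    return res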
def check_peer_status_deleted_snap(self, fs_name, fs_id, peer_spec, dir_name, expected_delete_count): peer_uuid = self.get_peer_uuid(peer_spec) @@ -421,6 +432,34 @@ class TestMirroring(CephFSTestCase): self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) self.mount_a.run_shell(["rmdir", "d1"]) + def test_directory_command_ls(self): + dir1 = 'dls1' + dir2 = 'dls2' + self.mount_a.run_shell(["mkdir", dir1]) + self.mount_a.run_shell(["mkdir", dir2]) + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}') + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}') + time.sleep(10) + dirs_list = json.loads(self.get_ceph_cmd_stdout("fs", "snapshot", "mirror", "ls", self.primary_fs_name)) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + dir_count = res['snap_dirs']['dir_count'] + self.assertTrue(len(dirs_list) == dir_count and f'/{dir1}' in dirs_list and f'/{dir2}' in dirs_list) + except CommandFailedError: + raise RuntimeError('Error listing directories') + except AssertionError: + raise RuntimeError('Wrong number of directories listed') + finally: + self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rmdir", dir1]) + self.mount_a.run_shell(["rmdir", dir2]) + def test_add_relative_directory_path(self): self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) try: @@ -549,7 +588,7 @@ class TestMirroring(CephFSTestCase): # create a bunch of files in a directory to snap self.mount_a.run_shell(["mkdir", "d0"]) - for i in range(50): + for i in range(100): self.mount_a.write_n_mb(os.path.join('d0', f'file.{i}'), 1) self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) @@ -563,7 +602,7 @@ class TestMirroring(CephFSTestCase): # take a snapshot self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - time.sleep(30) + time.sleep(60) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", '/d0', 'snap0', 1) self.verify_snapshot('d0', 'snap0') @@ -575,10 +614,10 @@ class TestMirroring(CephFSTestCase): self.assertGreater(second["counters"]["last_synced_start"], first["counters"]["last_synced_start"]) self.assertGreater(second["counters"]["last_synced_end"], second["counters"]["last_synced_start"]) self.assertGreater(second["counters"]["last_synced_duration"], 0) - self.assertEquals(second["counters"]["last_synced_bytes"], 52428800) # last_synced_bytes = 50 files of 1MB size each + self.assertEquals(second["counters"]["last_synced_bytes"], 104857600) # last_synced_bytes = 100 files of 1MB size each # some more IO - for i in range(75): + for i in range(150): self.mount_a.write_n_mb(os.path.join('d0', f'more_file.{i}'), 1) time.sleep(60) @@ -586,7 +625,7 @@ class TestMirroring(CephFSTestCase): # take another snapshot self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"]) - time.sleep(60) + time.sleep(120) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", '/d0', 'snap1', 2) self.verify_snapshot('d0', 'snap1') @@ -598,7 +637,7 @@ class TestMirroring(CephFSTestCase): self.assertGreater(third["counters"]["last_synced_start"], second["counters"]["last_synced_end"]) 
self.assertGreater(third["counters"]["last_synced_end"], third["counters"]["last_synced_start"]) self.assertGreater(third["counters"]["last_synced_duration"], 0) - self.assertEquals(third["counters"]["last_synced_bytes"], 78643200) # last_synced_bytes = 75 files of 1MB size each + self.assertEquals(third["counters"]["last_synced_bytes"], 157286400) # last_synced_bytes = 150 files of 1MB size each # delete a snapshot self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) @@ -1361,7 +1400,7 @@ class TestMirroring(CephFSTestCase): self.mount_b.umount_wait() self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) - # create a bunch of files in a directory to snap + # create some large files in 3 directories to snap self.mount_a.run_shell(["mkdir", "d0"]) self.mount_a.run_shell(["mkdir", "d1"]) self.mount_a.run_shell(["mkdir", "d2"]) @@ -1384,30 +1423,38 @@ class TestMirroring(CephFSTestCase): vbefore = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0] # take snapshots log.debug('taking snapshots') - self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"]) + snap_name = "snap0" + self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"]) - time.sleep(10) log.debug('checking snap in progress') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d0', 'snap0') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d1', 'snap0') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d2', 'snap0') + peer_spec = "client.mirror_remote@ceph" + peer_uuid = self.get_peer_uuid(peer_spec) + with safe_while(sleep=3, tries=100, action=f'wait for status: {peer_spec}') as proceed: + while proceed(): + res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{self.primary_fs_name}@{self.primary_fs_id}', + peer_uuid) + if ('syncing' == res["/d0"]['state'] and 'syncing' == res["/d1"]['state'] and \ + 'syncing' == res["/d2"]['state']): + break - log.debug('removing directories 1') + log.debug('removing directory 1') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') - log.debug('removing directories 2') + log.debug('removing directory 2') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1') - log.debug('removing directories 3') + log.debug('removing directory 3') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2') + # Wait a while for the sync backoff + time.sleep(500) + log.debug('removing snapshots') - self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) - self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["rmdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["rmdir", f"d2/.snap/{snap_name}"]) for i in range(4): filename = f'file.{i}' @@ -1427,26 +1474,27 @@ class TestMirroring(CephFSTestCase): self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2') log.debug('creating new snapshots...') - self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", 
"d2/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"]) + + # Wait for the threads to finish + time.sleep(500) - time.sleep(60) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d0', 'snap0', 1) - self.verify_snapshot('d0', 'snap0') + "client.mirror_remote@ceph", '/d0', f'{snap_name}', 1) + self.verify_snapshot('d0', f'{snap_name}') self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d1', 'snap0', 1) - self.verify_snapshot('d1', 'snap0') + "client.mirror_remote@ceph", '/d1', f'{snap_name}', 1) + self.verify_snapshot('d1', f'{snap_name}') self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d2', 'snap0', 1) - self.verify_snapshot('d2', 'snap0') + "client.mirror_remote@ceph", '/d2', f'{snap_name}', 1) + self.verify_snapshot('d2', f'{snap_name}') res = self.mirror_daemon_command(f'counter dump for fs: {self.primary_fs_name}', 'counter', 'dump') vafter = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0] self.assertGreater(vafter["counters"]["snaps_synced"], vbefore["counters"]["snaps_synced"]) - self.assertGreater(vafter["counters"]["snaps_deleted"], vbefore["counters"]["snaps_deleted"]) self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) @@ -1494,8 +1542,86 @@ class TestMirroring(CephFSTestCase): """ That get/set ceph.mirror.dirty_snap_id attribute succeeds in a remote filesystem. """ + log.debug('reconfigure client auth caps') + self.get_ceph_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), + self.backup_fs.get_data_pool_name())) + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + log.debug('setting ceph.mirror.dirty_snap_id attribute') self.mount_b.run_shell(["mkdir", "-p", "d1/d2/d3"]) attr = str(random.randint(1, 10)) self.mount_b.setfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id", attr) + log.debug('getting ceph.mirror.dirty_snap_id attribute') val = self.mount_b.getfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id") self.assertEqual(attr, val, f"Mismatch for ceph.mirror.dirty_snap_id value: {attr} vs {val}") + + def test_cephfs_mirror_remote_snap_corrupt_fails_synced_snapshot(self): + """ + That making manual changes to the remote .snap directory shows 'peer status' state: "failed" + for a synced snapshot and then restores to "idle" when those changes are reverted. 
+ """ + log.debug('reconfigure client auth caps') + self.get_ceph_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rwps', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), + self.backup_fs.get_data_pool_name())) + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + peer_spec = "client.mirror_remote@ceph" + self.peer_add(self.primary_fs_name, self.primary_fs_id, peer_spec, self.secondary_fs_name) + dir_name = 'd0' + self.mount_a.run_shell(['mkdir', dir_name]) + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir_name}') + + # take a snapshot + snap_name = "snap_a" + expected_snap_count = 1 + self.mount_a.run_shell(['mkdir', f'{dir_name}/.snap/{snap_name}']) + + time.sleep(30) + # confirm snapshot synced and status 'idle' + self.check_peer_status_idle(self.primary_fs_name, self.primary_fs_id, + peer_spec, f'/{dir_name}', snap_name, expected_snap_count) + + remote_snap_name = 'snap_b' + remote_snap_path = f'{dir_name}/.snap/{remote_snap_name}' + failure_reason = f"snapshot '{remote_snap_name}' has invalid metadata" + dir_name = f'/{dir_name}' + + # create a directory in the remote fs and check status 'failed' + self.mount_b.run_shell(['sudo', 'mkdir', remote_snap_path], omit_sudo=False) + peer_uuid = self.get_peer_uuid(peer_spec) + with safe_while(sleep=1, tries=60, action=f'wait for failed status: {peer_spec}') as proceed: + while proceed(): + res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{self.primary_fs_name}@{self.primary_fs_id}', peer_uuid) + if('failed' == res[dir_name]['state'] and \ + failure_reason == res.get(dir_name, {}).get('failure_reason', {}) and \ + snap_name == res[dir_name]['last_synced_snap']['name'] and \ + expected_snap_count == res[dir_name]['snaps_synced']): + break + # remove the directory in the remote fs and check status restores to 'idle' + self.mount_b.run_shell(['sudo', 'rmdir', remote_snap_path], omit_sudo=False) + with safe_while(sleep=1, tries=60, action=f'wait for idle status: {peer_spec}') as proceed: + while proceed(): + res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{self.primary_fs_name}@{self.primary_fs_id}', peer_uuid) + if('idle' == res[dir_name]['state'] and 'failure_reason' not in res and \ + snap_name == res[dir_name]['last_synced_snap']['name'] and \ + expected_snap_count == res[dir_name]['snaps_synced']): + break + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py index 7917bd9202f..14f54a784e7 100644 --- a/qa/tasks/cephfs/test_misc.py +++ b/qa/tasks/cephfs/test_misc.py @@ -558,16 +558,18 @@ class TestSessionClientEvict(CephFSTestCase): self.assertEqual(ce.exception.exitstatus, errno.EINVAL) def _evict_with_invalid_id(self, cmd): + info_initial = self.fs.rank_asok(cmd + ['ls']) # with invalid id - with self.assertRaises(CommandFailedError) as ce: - self.fs.rank_tell(cmd + ['evict', 'id=1']) - self.assertEqual(ce.exception.exitstatus, errno.ESRCH) + self.fs.rank_tell(cmd + ['evict', 'id=1']) + info = self.fs.rank_asok(cmd + ['ls']) + self.assertEqual(len(info), len(info_initial)) # session list is status-quo def 
_evict_with_negative_id(self, cmd): + info_initial = self.fs.rank_asok(cmd + ['ls']) # with negative id - with self.assertRaises(CommandFailedError) as ce: - self.fs.rank_tell(cmd + ['evict', 'id=-9']) - self.assertEqual(ce.exception.exitstatus, errno.ESRCH) + self.fs.rank_tell(cmd + ['evict', 'id=-9']) + info = self.fs.rank_asok(cmd + ['ls']) + self.assertEqual(len(info), len(info_initial)) # session list is status-quo def _evict_with_valid_id(self, cmd): info_initial = self.fs.rank_asok(cmd + ['ls']) diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 6d1c65dfb7d..0a1c07dce04 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -8,12 +8,15 @@ from io import BytesIO, StringIO from tasks.mgr.mgr_test_case import MgrTestCase from teuthology import contextutil from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import Raw log = logging.getLogger(__name__) NFS_POOL_NAME = '.nfs' # should match mgr_module.py # TODO Add test for cluster update when ganesha can be deployed on multiple ports. + + class TestNFS(MgrTestCase): def _cmd(self, *args): return self.get_ceph_cmd_stdout(args) @@ -52,15 +55,16 @@ class TestNFS(MgrTestCase): "squash": "none", "security_label": True, "protocols": [ - 4 + 3, 4 ], "transports": [ "TCP" ], "fsal": { "name": "CEPH", - "user_id": "nfs.test.1", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": self.fs_name, + "cmount_path": "/", }, "clients": [] } @@ -118,7 +122,7 @@ class TestNFS(MgrTestCase): return self.fail(fail_msg) - def _check_auth_ls(self, export_id=1, check_in=False): + def _check_auth_ls(self, fs_name, check_in=False, user_id=None): ''' Tests export user id creation or deletion. :param export_id: Denotes export number @@ -126,10 +130,12 @@ class TestNFS(MgrTestCase): ''' output = self._cmd('auth', 'ls') client_id = f'client.nfs.{self.cluster_id}' + search_id = f'client.{user_id}' if user_id else f'{client_id}.{fs_name}' + if check_in: - self.assertIn(f'{client_id}.{export_id}', output) + self.assertIn(search_id, output) else: - self.assertNotIn(f'{client_id}.{export_id}', output) + self.assertNotIn(search_id, output) def _test_idempotency(self, cmd_func, cmd_args): ''' @@ -216,7 +222,7 @@ class TestNFS(MgrTestCase): # Runs the nfs export create command self._cmd(*export_cmd) # Check if user id for export is created - self._check_auth_ls(export_id, check_in=True) + self._check_auth_ls(self.fs_name, check_in=True) res = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'get', f'export-{export_id}', '-']) # Check if export object is created @@ -230,12 +236,12 @@ class TestNFS(MgrTestCase): self._test_create_cluster() self._create_export(export_id='1', create_fs=True) - def _delete_export(self): + def _delete_export(self, pseduo_path=None, check_in=False, user_id=None): ''' Delete an export. 
''' - self._nfs_cmd('export', 'rm', self.cluster_id, self.pseudo_path) - self._check_auth_ls() + self._nfs_cmd('export', 'rm', self.cluster_id, pseduo_path if pseduo_path else self.pseudo_path) + self._check_auth_ls(self.fs_name, check_in, user_id) def _test_list_export(self): ''' @@ -256,26 +262,27 @@ class TestNFS(MgrTestCase): self.sample_export['export_id'] = 2 self.sample_export['pseudo'] = self.pseudo_path + '1' self.sample_export['access_type'] = 'RO' - self.sample_export['fsal']['user_id'] = f'{self.expected_name}.2' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603' self.assertDictEqual(self.sample_export, nfs_output[1]) # Export-3 for subvolume with r only self.sample_export['export_id'] = 3 self.sample_export['path'] = sub_vol_path self.sample_export['pseudo'] = self.pseudo_path + '2' - self.sample_export['fsal']['user_id'] = f'{self.expected_name}.3' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603' self.assertDictEqual(self.sample_export, nfs_output[2]) # Export-4 for subvolume self.sample_export['export_id'] = 4 self.sample_export['pseudo'] = self.pseudo_path + '3' self.sample_export['access_type'] = 'RW' - self.sample_export['fsal']['user_id'] = f'{self.expected_name}.4' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603' self.assertDictEqual(self.sample_export, nfs_output[3]) - def _get_export(self): + def _get_export(self, pseudo_path=None): ''' Returns export block in json format ''' - return json.loads(self._nfs_cmd('export', 'info', self.cluster_id, self.pseudo_path)) + return json.loads(self._nfs_cmd('export', 'info', self.cluster_id, + pseudo_path if pseudo_path else self.pseudo_path)) def _test_get_export(self): ''' @@ -313,7 +320,7 @@ class TestNFS(MgrTestCase): else: log.warning(f'{e}, retrying') - def _test_mnt(self, pseudo_path, port, ip, check=True): + def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False): ''' Test mounting of created exports :param pseudo_path: It is the pseudo root name @@ -341,10 +348,64 @@ class TestNFS(MgrTestCase): self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) try: + # Clean up volumes directory created by subvolume create by some tests + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes']) self.ctx.cluster.run(args=['touch', '/mnt/test']) out_mnt = self._sys_cmd(['ls', '/mnt']) self.assertEqual(out_mnt, b'test\n') + if datarw: + self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1']) + out_test1 = self._sys_cmd(['cat', '/mnt/test1']) + self.assertEqual(out_test1, b'test data\n') + finally: + self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + + def _test_data_read_write(self, pseudo_path, port, ip): + ''' + Check if read/write works fine + ''' + try: + self._test_mnt(pseudo_path, port, ip, True, True) + except CommandFailedError as e: + self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + + def _mnt_nfs(self, pseudo_path, port, ip): + ''' + Mount created export + :param pseudo_path: It is the pseudo root name + :param port: Port of deployed nfs cluster + :param ip: IP of deployed nfs cluster + ''' + tries = 3 + while True: + try: + self.ctx.cluster.run( + args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}', + f'{ip}:{pseudo_path}', '/mnt']) + break + except CommandFailedError: + if tries: + tries -= 1 + time.sleep(2) + continue + raise + + self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) + + def 
_test_fio(self, pseudo_path, port, ip): + ''' + run fio with libaio on /mnt/fio + :param mnt_path: nfs mount point + ''' + try: + self._mnt_nfs(pseudo_path, port, ip) + self.ctx.cluster.run(args=['mkdir', '/mnt/fio']) + fio_cmd=['sudo', 'fio', '--ioengine=libaio', '-directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', '--time_based', '--runtime=300', '--ramp_time=1s', '--percentage_random=0', '--rw=randrw', '--rwmixread=50'] + self.ctx.cluster.run(args=fio_cmd) + except CommandFailedError as e: + self.fail(f"expected fio to be successful but failed with {e.exitstatus}") finally: + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio']) self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) def _write_to_read_only_export(self, pseudo_path, port, ip): @@ -506,7 +567,7 @@ class TestNFS(MgrTestCase): self._test_delete_cluster() # Check if rados ganesha conf object is deleted self._check_export_obj_deleted(conf_obj=True) - self._check_auth_ls() + self._check_auth_ls(self.fs_name) def test_exports_on_mgr_restart(self): ''' @@ -593,6 +654,30 @@ class TestNFS(MgrTestCase): self._write_to_read_only_export(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_data_read_write(self): + ''' + Test date read and write on export. + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_data_read_write(self.pseudo_path, port, ip) + self._test_delete_cluster() + + def test_async_io_fio(self): + ''' + Test async io using fio. 
Expect completion without hang or crash + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_fio(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname @@ -935,7 +1020,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.1", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": self.fs_name } }, @@ -948,7 +1033,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.2", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": "invalid_fs_name" # invalid fs } }, @@ -961,7 +1046,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.3", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": self.fs_name } } @@ -1008,7 +1093,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.1", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": "invalid_fs_name" # invalid fs } } @@ -1048,7 +1133,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.1", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": self.fs_name } }, @@ -1061,7 +1146,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.2", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": self.fs_name } }, @@ -1075,7 +1160,7 @@ class TestNFS(MgrTestCase): "protocols": [4], "fsal": { "name": "CEPH", - "user_id": "nfs.test.3", + "user_id": "nfs.test.nfs-cephfs.3746f603", "fs_name": "invalid_fs_name" } } @@ -1211,3 +1296,65 @@ class TestNFS(MgrTestCase): finally: self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}/*']) self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode) + + def test_nfs_export_creation_without_cmount_path(self): + """ + Test that ensure cmount_path is present in FSAL block + """ + self._create_cluster_with_fs(self.fs_name) + + pseudo_path = '/test_without_cmount' + self._create_export(export_id='1', + extra_cmd=['--pseudo-path', pseudo_path]) + nfs_output = self._get_export(pseudo_path) + self.assertIn('cmount_path', nfs_output['fsal']) + + self._delete_export(pseudo_path) + + def test_nfs_exports_with_same_and_diff_user_id(self): + """ + Test that exports with same FSAL share same user_id + """ + self._create_cluster_with_fs(self.fs_name) + + pseudo_path_1 = '/test1' + pseudo_path_2 = '/test2' + pseudo_path_3 = '/test3' + + # Create subvolumes + self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol_1') + self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol_2') + + fs_path_1 = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol_1').strip() + fs_path_2 = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol_2').strip() + # Both exports should have same user_id(since cmount_path=/ & fs_name is same) + self._create_export(export_id='1', + extra_cmd=['--pseudo-path', pseudo_path_1, + '--path', fs_path_1]) + self._create_export(export_id='2', + extra_cmd=['--pseudo-path', pseudo_path_2, + '--path', fs_path_2]) + + nfs_output_1 = self._get_export(pseudo_path_1) + nfs_output_2 = self._get_export(pseudo_path_2) + # Check if both exports have same user_id + self.assertEqual(nfs_output_2['fsal']['user_id'], 
nfs_output_1['fsal']['user_id']) + self.assertEqual(nfs_output_1['fsal']['user_id'], 'nfs.test.nfs-cephfs.3746f603') + + cmount_path = '/volumes' + self._create_export(export_id='3', + extra_cmd=['--pseudo-path', pseudo_path_3, + '--path', fs_path_1, + '--cmount-path', cmount_path]) + + nfs_output_3 = self._get_export(pseudo_path_3) + self.assertNotEqual(nfs_output_3['fsal']['user_id'], nfs_output_1['fsal']['user_id']) + self.assertEqual(nfs_output_3['fsal']['user_id'], 'nfs.test.nfs-cephfs.32cd8545') + + # Deleting export with same user_id should not delete the user_id + self._delete_export(pseudo_path_1, True, nfs_output_1['fsal']['user_id']) + # Deleting export 22 should delete the user_id since it's only export left with that user_id + self._delete_export(pseudo_path_2, False, nfs_output_2['fsal']['user_id']) + + # Deleting export 23 should delete the user_id since it's only export with that user_id + self._delete_export(pseudo_path_3, False, nfs_output_3['fsal']['user_id']) diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py index b5691c83852..ae1c1f2056c 100644 --- a/qa/tasks/cephfs/test_quota.py +++ b/qa/tasks/cephfs/test_quota.py @@ -115,9 +115,11 @@ class TestQuota(CephFSTestCase): readable_values = {"10K": "10240", "100Ki": "102400", + "100KiB": "102400", "10M": "10485760", "100Mi": "104857600", "2G": "2147483648", + "2GB": "2147483648", "4Gi": "4294967296", "1T": "1099511627776", "2Ti": "2199023255552"} @@ -135,7 +137,8 @@ class TestQuota(CephFSTestCase): self.mount_a.run_shell(["mkdir", "subdir"]) - invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1"] + invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1", + "1GT", "2MM", "5Di", "8Bi", "i", "7iB"] for invalid_value in invalid_values: with self.assertRaises(CommandFailedError): self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", diff --git a/qa/tasks/cephfs/test_snap_schedules.py b/qa/tasks/cephfs/test_snap_schedules.py index 1fff047f468..bdfec3db540 100644 --- a/qa/tasks/cephfs/test_snap_schedules.py +++ b/qa/tasks/cephfs/test_snap_schedules.py @@ -1093,6 +1093,56 @@ class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper): self.mount_a.run_shell(['rmdir', TestSnapSchedulesSnapdir.TEST_DIRECTORY]) +class TestSnapSchedulesFetchForeignConfig(TestSnapSchedulesHelper): + def test_fetch_for_mds_max_snaps_per_dir(self): + """Test the correctness of snap directory name""" + dir_path = TestSnapSchedulesHelper.TEST_DIRECTORY + sdn = self.get_snap_dir_name() + + self.mount_a.run_shell(['mkdir', '-p', dir_path]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=dir_path, snap_schedule='1m') + + self.config_set('mds', 'mds_max_snaps_per_dir', 10) + + time.sleep(11*60) # wait for 9 snaps to be retained + + snap_path = f"{dir_path}/{sdn}" + snapshots = self.mount_a.ls(path=snap_path) + fs_count = len(snapshots) + + self.assertTrue(fs_count == 9) + + self.config_set('mds', 'mds_max_snaps_per_dir', 8) + + time.sleep(1*60 + 10) # wait for max_snaps_per_dir limit to be breached + + snap_path = f"{dir_path}/{sdn}" + snapshots = self.mount_a.ls(path=snap_path) + fs_count = len(snapshots) + + self.assertTrue(fs_count == 7) + + self.config_set('mds', 'mds_max_snaps_per_dir', 10) + + time.sleep(2*60 + 10) # wait for more snaps to be created + + snap_path = f"{dir_path}/{sdn}" + snapshots = self.mount_a.ls(path=snap_path) + fs_count = len(snapshots) + + self.assertTrue(fs_count == 9) + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=dir_path) + + 
# remove all scheduled snapshots + self.remove_snapshots(dir_path, sdn) + + self.mount_a.run_shell(['rmdir', dir_path]) + + """ Note that the class TestSnapSchedulesMandatoryFSArgument tests snap-schedule commands only for multi-fs scenario. Commands for a single default fs should diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py index ba3bc0fbd8a..c2184c41eff 100644 --- a/qa/tasks/cephfs/test_snapshots.py +++ b/qa/tasks/cephfs/test_snapshots.py @@ -376,6 +376,32 @@ class TestSnapshots(CephFSTestCase): self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")]) + def test_snapshot_check_access(self): + """ + """ + + self.mount_a.run_shell_payload("mkdir -p dir1/dir2") + self.mount_a.umount_wait(require_clean=True) + + newid = 'foo' + keyring = self.fs.authorize(newid, ('/dir1', 'rws')) + keyring_path = self.mount_a.client_remote.mktemp(data=keyring) + self.mount_a.remount(client_id=newid, client_keyring_path=keyring_path, cephfs_mntpt='/dir1') + + self.mount_a.run_shell_payload("pushd dir2; dd if=/dev/urandom of=file bs=4k count=1;") + self.mount_a.run_shell_payload("mkdir .snap/one") + self.mount_a.run_shell_payload("rm -rf dir2") + # ??? + # Session check_access path ~mds0/stray3/10000000001/file + # 2024-07-04T02:05:07.884+0000 7f319ce86640 20 Session check_access: [inode 0x10000000002 [2,2] ~mds0/stray2/10000000001/file ...] caller_uid=1141 caller_gid=1141 caller_gid_list=[1000,1141] + # 2024-07-04T02:05:07.884+0000 7f319ce86640 20 Session check_access path ~mds0/stray2/10000000001/file + # should be + # 2024-07-04T02:11:26.990+0000 7f6b14e71640 20 Session check_access: [inode 0x10000000002 [2,2] ~mds0/stray2/10000000001/file ...] caller_uid=1141 caller_gid=1141 caller_gid_list=[1000,1141] + # 2024-07-04T02:11:26.990+0000 7f6b14e71640 20 Session check_access stray_prior_path /dir1/dir2 + # 2024-07-04T02:11:26.990+0000 7f6b14e71640 10 MDSAuthCap is_capable inode(path /dir1/dir2 owner 1141:1141 mode 0100644) by caller 1141:1141 mask 1 new 0:0 cap: MDSAuthCaps[allow rws fsname=cephfs path="/dir1"] + self.mount_a.run_shell_payload("stat .snap/one/dir2/file") + + def test_multimds_mksnap(self): """ check if snapshot takes effect across authority subtrees diff --git a/qa/tasks/cephfs/test_uninlining.py b/qa/tasks/cephfs/test_uninlining.py new file mode 100644 index 00000000000..91d34a0e277 --- /dev/null +++ b/qa/tasks/cephfs/test_uninlining.py @@ -0,0 +1,332 @@ + +""" +Test that data is uninlined using scrubbing. + +The idea is to untar a linux-5.4.0 kernel tarball's kernel/ dir +consisting of about 8000 files and uninline about 5145 of those which are +less than or equal to client_max_inline_size bytes and can be inlined when +written to while the inline_data config option is enabled. + +This test runs across 1 or 2 active MDS, where a subset of the dirs under the +kernel/ dir are pinned to either of the MDS. 
+""" + +import os +import logging +import threading +import time +import json + +from io import StringIO +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.mount import CephFSMount + +log = logging.getLogger(__name__) + + +def remote_mntpt_cmd(mount, cmd): + final_cmd = f'cd {mount.hostfs_mntpt} && ' + cmd + out = mount.client_remote.sh(final_cmd, stdout=StringIO()) + return out.strip() + + +class InlineDataInfo: + def __init__(self, length: int, version: int): + self.inline_data_length = length + self.inline_data_version = version + + +class SnapshotterThread(threading.Thread): + def __init__(self, base_dir: str, snap_count: int, mount: CephFSMount): + super(SnapshotterThread, self).__init__() + self.base_dir: str = base_dir + self.snap_count: int = snap_count + self.mount = mount + + def run(self): + for i in range(self.snap_count): + cmd = f"mkdir {self.base_dir}/.snap/snap_{i}" + remote_mntpt_cmd(self.mount, cmd) + time.sleep(1) + + +class TestDataUninlining(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 2 + + # data version number of uninlined inode: ((1 << 64) - 1) + CEPH_INLINE_NONE = 18446744073709551615 + + NUM_SNAPS = 10 + DUMP_INODE_RETRIES = 10 + + def setUp(self): + super(TestDataUninlining, self).setUp() + self.cache_info = dict() + self.unmount_info = dict() + self.mount_openbg_info = dict() + self.multimds_info = dict() + self.snapshot_info = dict() + + self.cache_info[0] = "without clearing cache" + self.cache_info[1] = "clear cache before scrub" + self.cache_info[2] = "clear cache after scrub" + self.unmount_info[0] = "without unmount client" + self.unmount_info[1] = "unmount client before scrub" + self.unmount_info[2] = "unmount client after scrub" + self.mount_openbg_info[0] = "without mount.open_background" + self.mount_openbg_info[1] = "with mount.open_background" + self.multimds_info[0] = "without multimds" + self.multimds_info[1] = "with multimds" + self.snapshot_info[0] = "without snapshots" + self.snapshot_info[1] = "with snapshots" + + def tearDown(self): + super(TestDataUninlining, self).tearDown() + + def extract_inodes(self, files): + inodes = [] + for fil in files: + log.debug(f"getting inode for:{fil}") + cmd = f'ls -i {fil}' + o = remote_mntpt_cmd(self.mount_a, cmd) + inodes.append(o.split(' ')[0]) + return inodes + + def get_inline_data_info(self, inodes, files, dir_pins, num_mds): + def get_inode_dump(inode, rank, retries): + for i in range(retries): + log.debug(f"try #{i+1} - dump inode {inode}") + try: + json_out = self.fs.rank_tell(['dump', 'inode', inode], rank=rank) + if len(json_out) != 0: + return json_out + except json.decoder.JSONDecodeError: + time.sleep(1) + finally: + if len(json_out) == 0: + time.sleep(1) + raise json.decoder.JSONDecodeError(f'No JSON found after {retries} attempts', None, 0) + + info = [] + for i in range(len(inodes)): + inode = inodes[i] + log.debug(f"getting inode info #{i+1} of {len(inodes)}:{inode}") + path = os.path.dirname(files[i]) + rank = dir_pins[path] if path in dir_pins else 0 + r = rank + while r < rank + num_mds: + try: + json_out = get_inode_dump(inode, + r % num_mds, + self.DUMP_INODE_RETRIES) + break + except json.decoder.JSONDecodeError: + pass + finally: + r += 1 + self.assertTrue(json_out is not None) + self.assertTrue('inline_data_length' in json_out) + self.assertTrue('inline_data_version' in json_out) + info.append(InlineDataInfo(json_out['inline_data_length'], + json_out['inline_data_version'])) + return info + + def run_test_worker(self, + opt_clear_cache, + 
opt_unmount, + opt_mount_openbg, + opt_multimds, + opt_snapshot): + log.info("Running Data Uninlining test with: " + f"{self.cache_info[opt_clear_cache]}, " + f"{self.unmount_info[opt_unmount]}, " + f"{self.mount_openbg_info[opt_mount_openbg]}, " + f"{self.multimds_info[opt_multimds]}, " + f"{self.snapshot_info[opt_snapshot]}") + + # Set max_mds to 1 or 2 + num_mds = 2 if opt_multimds else 1 + log.debug(f"setting max_mds:{num_mds}") + self.fs.set_max_mds(num_mds) + + # Get configured max inline data size + log.debug("getting client_max_inline_size") + idsize = self.fs.fs_config.get('client_max_inline_size', 4096) + idsize = int(idsize) + log.debug(f"got client_max_inline_size:{idsize}") + + # IMPORTANT + # At this time, the kernel client doesn't work correctly if + # client_max_inline_size is greater tham 4096 + self.assertTrue(idsize == 4096) + + snapshotter = None + if opt_snapshot: + log.debug("starting snapshotter thread") + cmd = 'mkdir linux-5.4' + remote_mntpt_cmd(self.mount_b, cmd) + snapshotter = SnapshotterThread("linux-5.4", + self.NUM_SNAPS, + self.mount_b) + snapshotter.start() + + # Extract test data tarball + # FIXME + log.debug("extracting tarball") + cmd = 'tar -x -z -f linux-5.4.tar.gz linux-5.4/fs/ceph linux-5.4/fs/orangefs linux-5.4/fs/ext2' + # cmd = 'tar -x -z -f linux-5.4.tar.gz' + remote_mntpt_cmd(self.mount_a, cmd) + + bg_proc = None + # the data uninlining or snapshot should cause the caps to be revoked + # and get the data uninlined without any problems + if opt_mount_openbg: + log.debug("opening file in background") + cap_test_dir = "linux-5.4/fs/cap_revoke_test" + cmd = f"mkdir {cap_test_dir}" + remote_mntpt_cmd(self.mount_b, cmd) + test_file = f"{cap_test_dir}/test_file" + bg_proc = self.mount_b.open_background(test_file, True) + + # Get dirs under linux-5.4.0/kernel/ + # FIXME + log.debug("fetching dir list") + cmd = 'find linux-5.4/ -mindepth 2 -maxdepth 2 -type d' + # cmd = 'find linux-5.4/ -mindepth 1 -maxdepth 1 -type d' + o = remote_mntpt_cmd(self.mount_a, cmd) + dirs = o.split('\n') + + # Pin dirs alternately to available mds + dir_pins = {} + log.debug("distributing dir pins") + for i in range(len(dirs)): + self.mount_a.setfattr(dirs[i], 'ceph.dir.pin', str(i % num_mds)) + dir_pins[dirs[i]] = i % num_mds + + # Count files with size <= idsize + log.debug(f"listing files with size <= {idsize}") + cmd = f'find linux-5.4/ -type f -size -{idsize + 1}c' + o = remote_mntpt_cmd(self.mount_a, cmd) + files = o.split('\n') + + # Dump file count + log.info(f'Found {len(files)} inlined files') + + if opt_unmount == 1: + log.debug("unmounting mount_a before scrub") + self.mount_a.umount() + + if opt_clear_cache == 1: + log.debug("clearing cache") + for i in range(num_mds): + self.fs.rank_tell(['cache', 'drop'], rank=i) + + # Start recursive scrub on rank 0 + log.debug("starting scrub") + out_json = self.fs.run_scrub(["start", "/", "recursive"]) + log.debug(f"scrub start response: {out_json}") + + # Wait for scrub completion + log.debug("waiting for scrub to complete") + status = self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]) + self.assertEqual(status, True) + + if opt_unmount == 2: + log.debug("unmounting mount_a after scrub") + self.mount_a.umount() + + if opt_snapshot: + log.debug("joining snapshotter thread") + snapshotter.join() + for i in range(self.NUM_SNAPS): + cmd = f"rmdir linux-5.4/.snap/snap_{i}" + remote_mntpt_cmd(self.mount_b, cmd) + + if opt_clear_cache == 2: + log.debug("clearing cache") + for i in range(num_mds): + 
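+                # drop each active rank's in-memory cache so the inode dumps done
+                # later read the on-disk (uninlined) state rather than cached data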
self.fs.rank_tell(['cache', 'drop'], rank=i) + + if opt_unmount > 0: + log.debug("remounting mount_a") + self.mount_a.mount() + + # Extract inode numbers of inlined files + log.debug("extracting inodes") + inodes = self.extract_inodes(files) + + # Dump inode info of files with size <= idsize + self.assertEqual(len(files), len(inodes)) + + log.debug("getting inline data info") + info = self.get_inline_data_info(inodes, files, dir_pins, num_mds) + + # cleanup + if opt_mount_openbg: + log.debug("killing background open file process") + self.mount_b.kill_background(bg_proc) + + log.debug("removing dir linux-5.4") + remote_mntpt_cmd(self.mount_a, "rm -rf linux-5.4/") + + self.assertEqual(len(info), len(inodes)) + + # Count files with inline_data_length == 0 and validate + zero_length_count = 0 + for finfo in info: + if int(finfo.inline_data_length) == 0: + zero_length_count += 1 + log.info(f'Found {zero_length_count} files with ' + 'inline_data_length == 0') + self.assertTrue(zero_length_count == len(files)) + + # Count files with inline_data_version == 18446744073709551615 + # and validate + uninlined_version_count = 0 + for finfo in info: + if int(finfo.inline_data_version) == self.CEPH_INLINE_NONE: + uninlined_version_count += 1 + log.info(f'Found {uninlined_version_count} files with ' + 'inline_data_version == CEPH_INLINE_NONE') + self.assertTrue(uninlined_version_count == len(files)) + + def test_data_uninlining(self): + # Enable inline_data + log.debug("setting inline_data:1") + self.fs.set_var('inline_data', '1', '--yes-i-really-really-mean-it') + + # Fetch tarball + log.debug("fetching tarball") + cmd = 'wget http://download.ceph.com/qa/linux-5.4.tar.gz' + remote_mntpt_cmd(self.mount_a, cmd) + + # multimds + # 0: without multimds + # 1: with multimds + for opt_multimds in [0, 1]: + # unmount + # 0: do not unmount + # 1: unmount before scrub + # 2: unmount after scrub + for opt_unmount in [0, 1, 2]: + # mount + # 0: no mount.open_background + # 1: mount.open_background + for opt_mount_openbg in [0, 1]: + # clear cache + # 0: do not clear cache + # 1: clear cache before scrub + # 2: clear cache after scrub + for opt_clear_cache in [0, 1, 2]: + # snapshots + # 0: without snapshots + # 1: with snapshots + for opt_snapshot in [0, 1]: + self.run_test_worker(opt_clear_cache, + opt_unmount, + opt_mount_openbg, + opt_multimds, + opt_snapshot) + + remote_mntpt_cmd(self.mount_a, "rm -f linux-5.4.tar.gz") diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 037b046304e..2ee3b6ac052 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -13,10 +13,18 @@ from io import StringIO from tasks.cephfs.cephfs_test_case import CephFSTestCase from tasks.cephfs.fuse_mount import FuseMount +from teuthology.contextutil import safe_while from teuthology.exceptions import CommandFailedError log = logging.getLogger(__name__) + +class RsizeDoesntMatch(Exception): + + def __init__(self, msg): + self.msg = msg + + class TestVolumesHelper(CephFSTestCase): """Helper class for testing FS volume, subvolume group and subvolume operations.""" TEST_FILE_NAME_PREFIX="subvolume_file" @@ -35,19 +43,26 @@ class TestVolumesHelper(CephFSTestCase): def _raw_cmd(self, *args): return self.get_ceph_cmd_stdout(args) - def __check_clone_state(self, state, clone, clone_group=None, timo=120): - check = 0 + def __check_clone_state(self, states, clone, clone_group=None, timo=120): + if isinstance(states, str): + states = (states, ) + args = ["clone", "status", self.volname, 
clone] if clone_group: args.append(clone_group) args = tuple(args) - while check < timo: - result = json.loads(self._fs_cmd(*args)) - if result["status"]["state"] == state: - break - check += 1 - time.sleep(1) - self.assertTrue(check < timo) + + msg = (f'Executed cmd "{args}" {timo} times; clone was never in ' + f'"{states}" state(s).') + + with safe_while(tries=timo, sleep=1, action=msg) as proceed: + while proceed(): + result = json.loads(self._fs_cmd(*args)) + current_state = result["status"]["state"] + + log.debug(f'current clone state = {current_state}') + if current_state in states: + return def _get_clone_status(self, clone, clone_group=None): args = ["clone", "status", self.volname, clone] @@ -57,6 +72,23 @@ class TestVolumesHelper(CephFSTestCase): result = json.loads(self._fs_cmd(*args)) return result + def _wait_for_clone_to_be_pending(self, clone, clone_group=None, + timo=120): + # check for "in-progress" state too along with "pending" state, because + # if former has occurred it means latter has occured before (which can + # happen for such a small time that it is easy to miss) and it won't + # occur again. + states = ('pending', 'in-progress') + self.__check_clone_state(states, clone, clone_group, timo) + + def _wait_for_clone_to_be_canceled(self, clone, clone_group=None, + timo=120): + # check for "cancelled" state too along with "complete" state, because + # it takes some time for a clone job to be cancelled and in that time + # a clone job might finish. + states = ('canceled', 'complete') + self.__check_clone_state(states, clone, clone_group, timo) + def _wait_for_clone_to_complete(self, clone, clone_group=None, timo=120): self.__check_clone_state("complete", clone, clone_group, timo) @@ -280,6 +312,8 @@ class TestVolumesHelper(CephFSTestCase): filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) self.mount_a.write_n_mb(os.path.join(io_path, filename), file_size) + return number_of_files * file_size * 1024 * 1024 + def _do_subvolume_io_mixed(self, subvolume, subvolume_group=None): subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group) @@ -2333,6 +2367,124 @@ class TestSubvolumes(TestVolumesHelper): # verify trash dir is clean. 
self._wait_for_trash_empty() + + def test_subvolume_create_with_earmark(self): + # create subvolume with earmark + subvolume = self._gen_subvol_name() + earmark = "nfs.test" + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--earmark", earmark) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolume) + self.assertNotEqual(subvolpath, None) + + # verify the earmark + get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume) + self.assertEqual(get_earmark.rstrip('\n'), earmark) + + def test_subvolume_set_and_get_earmark(self): + # create subvolume + subvolume = self._gen_subvol_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # set earmark + earmark = "smb" + self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) + + # get earmark + get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume) + self.assertEqual(get_earmark.rstrip('\n'), earmark) + + def test_subvolume_clear_earmark(self): + # create subvolume + subvolume = self._gen_subvol_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # set earmark + earmark = "smb" + self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) + + # remove earmark + self._fs_cmd("subvolume", "earmark", "rm", self.volname, subvolume) + + # get earmark + get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume) + self.assertEqual(get_earmark, "") + + def test_earmark_on_non_existing_subvolume(self): + subvolume = "non_existing_subvol" + earmark = "nfs.test" + commands = [ + ("set", earmark), + ("get", None), + ("rm", None), + ] + + for action, arg in commands: + try: + # Build the command arguments + cmd_args = ["subvolume", "earmark", action, self.volname, subvolume] + if arg is not None: + cmd_args.extend(["--earmark", arg]) + + # Execute the command with built arguments + self._fs_cmd(*cmd_args) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + + def test_get_remove_earmark_when_not_set(self): + # Create a subvolume without setting an earmark + subvolume = self._gen_subvol_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # Attempt to get an earmark when it's not set + get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume) + self.assertEqual(get_earmark, "") + + # Attempt to remove an earmark when it's not set + self._fs_cmd("subvolume", "earmark", "rm", self.volname, subvolume) + + def test_set_invalid_earmark(self): + # Create a subvolume + subvolume = self._gen_subvol_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # Attempt to set an invalid earmark + invalid_earmark = "invalid_format" + expected_message = ( + f"Invalid earmark specified: '{invalid_earmark}'. A valid earmark should " + "either be empty or start with 'nfs' or 'smb', followed by dot-separated " + "non-empty components." 
+ ) + try: + self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", invalid_earmark) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, expected_message) + + def test_earmark_on_deleted_subvolume_with_retained_snapshot(self): + subvolume = self._gen_subvol_name() + snapshot = self._gen_subvol_snap_name() + + # Create subvolume and snapshot + self._fs_cmd("subvolume", "create", self.volname, subvolume) + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Delete subvolume while retaining the snapshot + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # Define the expected error message + error_message = f'subvolume "{subvolume}" is removed and has only snapshots retained' + + # Test cases for setting, getting, and removing earmarks + for operation in ["get", "rm", "set"]: + try: + extra_arg = "smb" if operation == "set" else None + if operation == "set": + self._fs_cmd("subvolume", "earmark", operation, self.volname, subvolume, "--earmark", extra_arg) + else: + self._fs_cmd("subvolume", "earmark", operation, self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, error_message) def test_subvolume_expand(self): """ @@ -2406,6 +2558,14 @@ class TestSubvolumes(TestVolumesHelper): for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']: self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) + # set earmark + earmark = "smb" + self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) + + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + + self.assertEqual(subvol_info["earmark"], earmark) + # remove subvolumes self._fs_cmd("subvolume", "rm", self.volname, subvolume) @@ -5811,7 +5971,7 @@ class TestSubvolumeSnapshotClones(TestVolumesHelper): self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) # insert delay at the beginning of snapshot clone - self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 15) # disable "capped" clones self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', False) @@ -7653,6 +7813,778 @@ class TestSubvolumeSnapshotClones(TestVolumesHelper): self._wait_for_trash_empty() +# NOTE: these tests consumes considerable amount of CPU and RAM due generation +# random of files and due to multiple cloning jobs that are run simultaneously. +# +# NOTE: mgr/vol code generates progress bars for cloning jobs and these tests +# capture them through "ceph status --format json-pretty" and checks if they +# are as expected. If cloning happens too fast, these tests will fail to +# capture progress bars, at least in desired state. Thus, these tests are +# slightly racy by their very nature. +# +# Two measure can be taken to avoid this (and thereby inconsistent results in +# testing) - +# 1. Slow down cloning. This was done by adding a sleep after every file is +# copied. However, this method was rejected since a new config for this would +# have to be added. +# 2. Amount of data that will cloned is big enough so that cloning takes enough +# time for test code to capture the progress bar in desired state and finish +# running. This is method that has been currently employed. This consumes +# significantly more time, CPU and RAM in comparison. 
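A rough illustration of that capture (a sketch only; 'progress_events' is the field in "ceph status --format json" output, while filtering the clone bars by their message text is an assumption about how mgr/volumes words them):

    def get_clone_progress_events(self):
        # 'ceph status --format json' exposes a 'progress_events' map of
        # event id -> {"message": ..., "progress": <0.0..1.0>}; keep only the
        # entries whose message looks like a clone progress bar
        o = self.get_ceph_cmd_stdout('status --format json')
        pevs = json.loads(o).get('progress_events', {})
        return {ev_id: ev for ev_id, ev in pevs.items()
                if 'clone' in ev.get('message', '').lower()}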
+class TestCloneProgressReporter(TestVolumesHelper): + ''' + This class contains tests for features that show how much progress cloning + jobs have made. + ''' + + CLIENTS_REQUIRED = 1 + + def setUp(self): + super(TestCloneProgressReporter, self).setUp() + + # save this config value so that it can be set again at the end of test + # and therefore other tests that might depend on this won't be + # disturbed unnecessarily. + self.num_of_cloner_threads_def = self.get_ceph_cmd_stdout( + 'config get mgr mgr/volumes/max_concurrent_clones').strip() + + # set number of cloner threads to 4, tests in this class depend on this. + self.run_ceph_cmd('config set mgr mgr/volumes/max_concurrent_clones 4') + + def tearDown(self): + v = self.volname + o = self.get_ceph_cmd_stdout('fs volume ls') + if self.volname not in o: + super(TestCloneProgressReporter, self).tearDown() + return + + subvols = self.get_ceph_cmd_stdout(f'fs subvolume ls {v} --format ' + 'json') + subvols = json.loads(subvols) + for i in subvols: + sv = tuple(i.values())[0] + if 'clone' in sv: + self.run_ceph_cmd(f'fs subvolume rm --force {v} {sv}') + continue + + p = self.run_ceph_cmd(f'fs subvolume snapshot ls {v} {sv} ' + '--format json', stdout=StringIO()) + snaps = p.stdout.getvalue().strip() + snaps = json.loads(snaps) + for j in snaps: + ss = tuple(j.values())[0] + self.run_ceph_cmd('fs subvolume snapshot rm --force ' + f'--format json {v} {sv} {ss}') + + try: + self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.info( + 'ignoring this error, perhaps subvolume was deleted ' + 'during the test and snapshot deleted above is a ' + 'retained snapshot. when a retained snapshot (which is ' + 'snapshot retained despite of subvolume deletion) is ' + 'deleted, the subvolume directory is also deleted ' + 'along. and before retained snapshot deletion, the ' + 'subvolume is reported by "subvolume ls" command, which' + 'is what probably caused confusion here') + pass + else: + raise + + # verify trash dir is clean + self._wait_for_trash_empty() + + self.run_ceph_cmd('config set mgr mgr/volumes/max_concurrent_clones ' + f'{self.num_of_cloner_threads_def}') + + # this doesn't work as expected because cleanup is not done when a + # volume is deleted. + # + # delete volumes so that all async purge threads, async cloner + # threads, progress bars, etc. associated with it are removed from + # Ceph cluster. 
+ #self.run_ceph_cmd(f'fs volume rm {self.volname} --yes-i-really-mean-it') + + super(self.__class__, self).tearDown() + + # XXX: it is important to wait for the rbytes value to catch up to the actual + # size of the subvolume so that the progress bar shows a sensible amount of progress + def wait_till_rbytes_is_right(self, v_name, sv_name, exp_size, + grp_name=None, sleep=2, max_count=60): + getpath_cmd = f'fs subvolume getpath {v_name} {sv_name}' + if grp_name: + getpath_cmd += f' {grp_name}' + sv_path = self.get_ceph_cmd_stdout(getpath_cmd) + sv_path = sv_path[1:] + + for i in range(max_count): + r_size = self.mount_a.get_shell_stdout( + f'getfattr -n ceph.dir.rbytes {sv_path}').split('rbytes=')[1] + r_size = int(r_size.replace('"', '').replace('"', '')) + log.info(f'r_size = {r_size} exp_size = {exp_size}') + if exp_size == r_size: + break + + time.sleep(sleep) + else: + msg = ('size reported by rstat is not the expected size.\n' + f'expected size = {exp_size}\n' + f'size reported by rstat = {r_size}') + raise RsizeDoesntMatch(msg) + + def test_progress_is_printed_in_clone_status_output(self): + ''' + Test that the command "ceph fs clone status" prints progress stats + for the clone. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # "clone" must be part of the clone name for the sake of tearDown() + c = 'ss1clone1' + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 3, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + self._wait_for_clone_to_be_in_progress(c) + + with safe_while(tries=120, sleep=1) as proceed: + while proceed(): + o = self.get_ceph_cmd_stdout(f'fs clone status {v} {c}') + o = json.loads(o) + + try: + p = o['status']['progress_report']['percentage cloned'] + log.debug(f'percentage cloned = {p}') + except KeyError: + # if KeyError is caught, either progress_report is not present + # or the clone is complete + if 'progress_report' not in o['status']: + self.assertEqual(o['status']['state'], 'complete') + break + + self._wait_for_clone_to_complete(c) + + def filter_in_only_clone_pevs(self, progress_events): + ''' + Progress events dictionary in output of "ceph status --format json" + has the progress bars and message associated with each progress bar. + Sometimes during testing of clone progress bars, and sometimes + otherwise too, an extra progress bar is seen with message "Global + Recovery Event". This extra progress bar interferes with testing of + progress bars for cloning. + + This helper method goes through this dictionary and picks only + (filters in) clone events.
+ ''' + clone_pevs = {} + + for k, v in progress_events.items(): + if 'mgr-vol-ongoing-clones' in k or 'mgr-vol-total-clones' in k: + clone_pevs[k] = v + + return clone_pevs + + def get_pevs_from_ceph_status(self, clones=None, check=True): + o = self.get_ceph_cmd_stdout('status --format json-pretty') + o = json.loads(o) + + try: + pevs = o['progress_events'] # pevs = progress events + except KeyError as e: + try: + if check and clones: + self.__check_clone_state('completed', clone=clones, timo=1) + except: + msg = ('Didn\'t find expected entries in dictionary ' + '"progress_events" which is obtained from the ' + 'output of command "ceph status".\n' + f'Exception - {e}\npev -\n{pevs}') + raise Exception(msg) + + pevs = self.filter_in_only_clone_pevs(pevs) + + return pevs + + def test_clones_less_than_cloner_threads(self): + ''' + Test that one progress bar is printed in output of "ceph status" output + when number of clone jobs is less than number of cloner threads. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + + with safe_while(tries=10, sleep=1) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + + def test_clone_to_diff_group_and_less_than_cloner_threads(self): + ''' + Initiate cloning where clone subvolume and source subvolume are located + in different groups and then test that when this clone is in progress, + one progress bar is printed in output of command "ceph status" that + shows progress of this clone. 
+ ''' + v = self.volname + group = 'group1' + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + self.run_ceph_cmd(f'fs subvolumegroup create {v} {group}') + self.run_ceph_cmd(f'fs subvolume create {v} {sv} {group} --mode=777') + size = self._do_subvolume_io(sv, group, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss} {group}') + self.wait_till_rbytes_is_right(v, sv, size, group) + + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c} ' + f'--group-name {group}') + + with safe_while(tries=10, sleep=1) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + + def test_clone_after_subvol_is_removed(self): + ''' + Initiate cloning after source subvolume has been deleted but with + snapshots retained and then test that, when this clone is in progress, + one progress bar is printed in output of command "ceph status" that + shows progress of this clone. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + # XXX: without setting mds_snap_rstat to true rstats are not updated on + # a subvolume snapshot and therefore clone progress bar will not show + # any progress. + self.config_set('mds', 'mds_snap_rstat', 'true') + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots') + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + + with safe_while(tries=15, sleep=10) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + + def test_clones_equal_to_cloner_threads(self): + ''' + Test that one progress bar is printed in output of "ceph status" output + when number of clone jobs is equal to number of cloner threads. 
+ ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + c = self._gen_subvol_clone_name(4) + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + for i in c: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + + with safe_while(tries=10, sleep=1) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + time.sleep(1) + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + + def wait_for_both_progress_bars_to_appear(self, sleep=1, iters=20): + pevs = [] + msg = (f'Waited for {iters*sleep} seconds but couldn\'t 2 progress ' + 'bars in output of "ceph status" command.') + with safe_while(tries=iters, sleep=sleep, action=msg) as proceed: + while proceed(): + o = self.get_ceph_cmd_stdout('status --format json-pretty') + o = json.loads(o) + pevs = o['progress_events'] + pevs = self.filter_in_only_clone_pevs(pevs) + if len(pevs) == 2: + v = tuple(pevs.values()) + if 'ongoing+pending' in v[1]['message']: + self.assertIn('ongoing', v[0]['message']) + else: + self.assertIn('ongoing', v[1]['message']) + self.assertIn('ongoing+pending', v[0]['message']) + break + + def test_clones_more_than_cloner_threads(self): + ''' + Test that 2 progress bars are printed in output of "ceph status" + command when number of clone jobs is greater than number of cloner + threads. + + Also, test that one of these progress bars is for ongoing clones and + other progress bar for ongoing+pending clones. 
+ ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + c = self._gen_subvol_clone_name(7) + + self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false') + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 3, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + for i in c: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + + msg = ('messages for progress bars for snapshot cloning are not how ' + 'they were expected') + with safe_while(tries=20, sleep=1, action=msg) as proceed: + while proceed(): + pevs = self.get_pevs_from_ceph_status(c) + + if len(pevs) <= 1: + continue # let's wait for second progress bar to appear + elif len(pevs) > 2: + raise RuntimeError( + 'More than 2 progress bars were found in the output ' + 'of "ceph status" command.\nprogress events -' + f'\n{pevs}') + + msg = ('"progress_events" dict in "ceph -s" output must have ' + f'only two entries.\n{pevs}') + self.assertEqual(len(pevs), 2, msg) + pev1, pev2 = pevs.values() + if ('ongoing clones' in pev1['message'].lower() and + 'total ' in pev2['message'].lower()): + break + elif ('ongoing clones' in pev2['message'].lower() or + 'total ' in pev1['message'].lower()): + break + else: + raise RuntimeError(msg) + + # allowing clone jobs to finish will consume too much time, space and + # CPU and not cancelling these clone doesnt affect this test case. + self.cancel_clones_and_ignore_if_finished(c) + + def get_onpen_count(self, pev): + ''' + Return number of clones reported in the message of progress bar for + ongoing+pending clones. + ''' + i = pev['message'].find('ongoing+pending') + if i == -1: + return + count = pev['message'][:i] + count = count[:-1] # remomve trailing space + count = int(count) + return count + + def get_both_progress_fractions_and_onpen_count(self): + ''' + Go through output of "ceph status --format json-pretty" and return + progress made by both clones (that is progress fractions) and return + number of clones in reported in message of ongoing+pending progress + bar. + ''' + msg = 'Expected 2 progress bars but found ' # rest continued in loop + with safe_while(tries=20, sleep=1, action=msg) as proceed: + while proceed(): + o = self.get_ceph_cmd_stdout('status --format json-pretty') + o = json.loads(o) + pevs = o['progress_events'] + pevs = self.filter_in_only_clone_pevs(pevs) + if len(pevs.values()) == 2: + break + else: + msg += f'{len(pevs)} instead' + + log.info(f'pevs -\n{pevs}') + # on_p - progress fraction for ongoing clone jobs + # onpen_p - progress fraction for ongoing+pending clone jobs + pev1, pev2 = tuple(pevs.values()) + if 'ongoing+pending' in pev1['message']: + onpen_p = pev1['progress'] + onpen_count = self.get_onpen_count(pev1) + on_p = pev2['progress'] + else: + onpen_p = pev2['progress'] + onpen_count = self.get_onpen_count(pev2) + on_p = pev1['progress'] + + on_p = float(on_p) + onpen_p = float(onpen_p) + + return on_p, onpen_p, onpen_count + + # "ceph fs clone cancel" command takes considerable time to finish running. + # test cases where more than 4 clones are being cancelled, this error is + # seen, and can be safely ignored since it only implies that cloning has + # been finished. 
+ def cancel_clones_and_ignore_if_finished(self, clones): + if isinstance(clones, str): + clones = (clones, ) + + for c in clones: + cmdargs = f'fs clone cancel {self.volname} {c}' + proc = self.run_ceph_cmd(args=cmdargs, stderr=StringIO(), + check_status=False) + + stderr = proc.stderr.getvalue().strip().lower() + if proc.exitstatus == 0: + continue + elif proc.exitstatus == 22 and 'clone finished' in stderr: + continue + else: + cmdargs = './bin/ceph ' + cmdargs + raise CommandFailedError(cmdargs, proc.exitstatus) + + def cancel_clones(self, clones, check_status=True): + v = self.volname + if not isinstance(clones, (tuple, list)): + clones = (clones, ) + + for i in clones: + self.run_ceph_cmd(f'fs clone cancel {v} {i}', + check_status=check_status) + time.sleep(2) + + # check status is False since this method is meant to cleanup clones at + # the end of a test case and some clones might already be complete. + def cancel_clones_and_confirm(self, clones, check_status=False): + if not isinstance(clones, (tuple, list)): + clones = (clones, ) + + self.cancel_clones(clones, check_status) + + for i in clones: + self._wait_for_clone_to_be_canceled(i) + + def cancel_clones_and_assert(self, clones): + v = self.volname + if not isinstance(clones, (tuple, list)): + clones = (clones, ) + + self.cancel_clones(clones, True) + + for i in clones: + o = self.get_ceph_cmd_stdout(f'fs clone status {v} {i}') + try: + self.assertIn('canceled', o) + except AssertionError: + self.assertIn('complete', o) + + def test_progress_drops_when_new_jobs_are_added(self): + ''' + Test that progress indicated by progress bar for ongoing+pending clones + drops when more clone jobs are launched. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + c = self._gen_subvol_clone_name(20) + + self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false') + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 3, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + for i in c[:5]: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + + tuple_ = self.get_both_progress_fractions_and_onpen_count() + if isinstance(tuple_, (list, tuple)) and len(tuple_) == 3: + on_p, onpen_p, onpen_count = tuple_ + + # this should cause onpen progress bar to go back + for i in c[5:]: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + time.sleep(2) + + with safe_while(tries=30, sleep=0.5) as proceed: + while proceed(): + tuple_ = self.get_both_progress_fractions_and_onpen_count() + new_on_p, new_onpen_p, new_onpen_count = tuple_ + if new_onpen_p < onpen_p: + log.info('new_onpen_p is less than onpen_p.') + log.info(f'new_onpen_p = {new_onpen_p}; onpen_p = {onpen_p}') + break + log.info(f'on_p = {on_p} new_on_p = {new_on_p}') + log.info(f'onpen_p = {onpen_p} new_onpen_p = {new_onpen_p}') + log.info(f'onpen_count = {onpen_count} new_onpen_count = ' + f'{new_onpen_count}') + else: + self.cancel_clones_and_ignore_if_finished(c) + raise RuntimeError('Test failed: it was expected for ' + '"new_onpen_p < onpen_p" to be true.') + + # average progress for "ongoing + pending" clone jobs must + # reduce since a new job was added to penidng state + self.assertLess(new_onpen_p, onpen_p) + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. 
+ self.cancel_clones_and_ignore_if_finished(c) + + def _wait_for_clone_progress_bars_to_be_removed(self): + with safe_while(tries=10, sleep=0.5) as proceed: + while proceed(): + o = self.get_ceph_cmd_stdout('status --format json-pretty') + o = json.loads(o) + + pevs = o['progress_events'] # pevs = progress events + pevs = self.filter_in_only_clone_pevs(pevs) + if not pevs: + break + + def test_when_clones_cancelled_are_less_than_cloner_threads(self): + ''' + Test that the progress bar that is printed for 1 ongoing clone job is + removed from the output of "ceph status" command when a clone is + cancelled. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + + sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}') + sv_path = sv_path[1:] + + size = self._do_subvolume_io(sv, None, None, 3, 1024) + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + time.sleep(1) + self.cancel_clones_and_ignore_if_finished(c) + self._wait_for_clone_to_be_canceled(c) + self._wait_for_clone_progress_bars_to_be_removed() + + # test that cloning had begun but didn't finish. + try: + sv_path = sv_path.replace(sv, c) + o = self.mount_a.run_shell(f'ls -lh {sv_path}') + o = o.stdout.getvalue().strip() + # ensure that all files were not copied. 'ls -lh' will print 1 file + # per line with an extra line for summary, so this command must + # print less than 4 lines + self.assertLess(len(o.split('\n')), 4) + except CommandFailedError as cfe: + # if command failed due to errno 2 (no such file or dir), this + # means cloning hadn't begun yet. that too is fine + if cfe.exitstatus == 2: + pass + else: + raise + + def test_when_clones_cancelled_are_equal_to_cloner_threads(self): + ''' + Test that progress bars, that printed for 3 ongoing clone jobs, are + removed from the output of "ceph status" command when all 3 clone jobs + are cancelled. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + c = self._gen_subvol_clone_name(3) + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + + sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}') + sv_path = sv_path[1:] + + size = self._do_subvolume_io(sv, None, None, 3, 1024) + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + for i in c: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + time.sleep(1) + self.cancel_clones_and_ignore_if_finished(c) + for i in c: + self._wait_for_clone_to_be_canceled(i) + self._wait_for_clone_progress_bars_to_be_removed() + + try: + sv_path = sv_path.replace(sv, c[0]) + o = self.mount_a.run_shell(f'ls -lh {sv_path}') + o = o.stdout.getvalue().strip() + log.info(o) + # ensure that all files were not copied. 'ls -lh' will print 1 file + # per line with an extra line for summary, so this command must + # print less than 4 lines + self.assertLess(len(o.split('\n')), 4) + except CommandFailedError as cfe: + # if command failed due to errno 2 (no such file or dir), this + # means cloning hadn't begun yet. 
that too is fine + if cfe.exitstatus == errno.ENOENT: + pass + else: + raise + + def test_when_clones_cancelled_are_more_than_cloner_threads(self): + ''' + Test that both the progress bars, that are printed for all 7 clone + jobs, are removed from the output of "ceph status" command when all + these clones are cancelled. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + c = self._gen_subvol_clone_name(7) + + self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false') + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + + sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}') + sv_path = sv_path[1:] + + size = self._do_subvolume_io(sv, None, None, 3, 1024) + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + for i in c: + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}') + time.sleep(1) + self.cancel_clones_and_ignore_if_finished(c) + for i in c: + self._wait_for_clone_to_be_canceled(i) + self._wait_for_clone_progress_bars_to_be_removed() + + try: + sv_path = sv_path.replace(sv, c[0]) + o = self.mount_a.run_shell(f'ls -lh {sv_path}') + o = o.stdout.getvalue().strip() + log.info(o) + # ensure that all files were not copied. 'ls -lh' will print 1 file + # per line with an extra line for summary, so this command must + # print less than 4 lines + self.assertLess(len(o.split('\n')), 4) + except CommandFailedError as cfe: + # if command failed due to errno 2 (no such file or dir), this + # means cloning hadn't begun yet. that too is fine + if cfe.exitstatus == errno.ENOENT: + pass + else: + raise + + class TestMisc(TestVolumesHelper): """Miscellaneous tests related to FS volume, subvolume group, and subvolume operations.""" def test_connection_expiration(self): diff --git a/qa/tasks/check_counter.py b/qa/tasks/check_counter.py index 40818f3f475..1f63b6a0bd4 100644 --- a/qa/tasks/check_counter.py +++ b/qa/tasks/check_counter.py @@ -1,11 +1,14 @@ import logging import json +import errno from teuthology.task import Task from teuthology import misc from tasks import ceph_manager +from tasks.cephfs.filesystem import MDSCluster +from teuthology.exceptions import CommandFailedError log = logging.getLogger(__name__) @@ -61,6 +64,9 @@ class CheckCounter(Task): mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager')) active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"] + mds_cluster = MDSCluster(self.ctx) + status = mds_cluster.status() + for daemon_type, counters in targets.items(): # List of 'a', 'b', 'c'... 
daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type)) @@ -80,13 +86,31 @@ class CheckCounter(Task): else: log.debug("Getting stats from {0}".format(daemon_id)) - manager = self.ctx.managers[cluster_name] - proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) - response_data = proc.stdout.getvalue().strip() + if daemon_type == 'mds': + mds_info = status.get_mds(daemon_id) + if not mds_info: + continue + mds = f"mds.{mds_info['gid']}" + if mds_info['state'] != "up:active": + log.debug(f"skipping {mds}") + continue + log.debug(f"Getting stats from {mds}") + try: + proc = mon_manager.raw_cluster_cmd("tell", mds, "perf", "dump", + "--format=json-pretty") + response_data = proc.strip() + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.debug(f"Failed to do 'perf dump' on {mds}") + continue + else: + manager = self.ctx.managers[cluster_name] + proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"]) + response_data = proc.stdout.getvalue().strip() if response_data: perf_dump = json.loads(response_data) else: - log.warning("No admin socket response from {0}, skipping".format(daemon_id)) + log.warning("No response from {0}, skipping".format(daemon_id)) continue minval = '' diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py index 2ac92439de6..d955d232c2c 100644 --- a/qa/tasks/fwd_scrub.py +++ b/qa/tasks/fwd_scrub.py @@ -33,6 +33,8 @@ class ForwardScrubber(ThrasherGreenlet): def _run(self): try: self.do_scrub() + except ThrasherGreenlet.Stopped: + pass except Exception as e: self.set_thrasher_exception(e) self.logger.exception("exception:") diff --git a/qa/tasks/kafka.py b/qa/tasks/kafka.py index 5e6c208ca30..833f03babf6 100644 --- a/qa/tasks/kafka.py +++ b/qa/tasks/kafka.py @@ -4,6 +4,7 @@ Deploy and configure Kafka for Teuthology import contextlib import logging import time +import os from teuthology import misc as teuthology from teuthology import contextutil @@ -33,6 +34,13 @@ def install_kafka(ctx, config): assert isinstance(config, dict) log.info('Installing Kafka...') + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + for (client, _) in config.items(): (remote,) = ctx.cluster.only(client).remotes.keys() test_dir=teuthology.get_testdir(ctx) @@ -40,7 +48,8 @@ def install_kafka(ctx, config): kafka_file = kafka_prefix + current_version + '.tgz' - link1 = 'https://archive.apache.org/dist/kafka/' + current_version + '/' + kafka_file + link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + current_version + '/' + kafka_file ctx.cluster.only(client).run( args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1], ) diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py new file mode 100644 index 00000000000..3ca60ab84fc --- /dev/null +++ b/qa/tasks/kafka_failover.py @@ -0,0 +1,244 @@ +""" +Deploy and configure Kafka for Teuthology +""" +import contextlib +import logging +import time +import os + +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.orchestra import run + +log = 
logging.getLogger(__name__) + +def get_kafka_version(config): + for client, client_config in config.items(): + if 'kafka_version' in client_config: + kafka_version = client_config.get('kafka_version') + return kafka_version + +kafka_prefix = 'kafka_2.13-' + +def get_kafka_dir(ctx, config): + kafka_version = get_kafka_version(config) + current_version = kafka_prefix + kafka_version + return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx),ver=current_version) + + +@contextlib.contextmanager +def install_kafka(ctx, config): + """ + Downloading the kafka tar file. + """ + assert isinstance(config, dict) + log.info('Installing Kafka...') + + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + test_dir=teuthology.get_testdir(ctx) + current_version = get_kafka_version(config) + + kafka_file = kafka_prefix + current_version + '.tgz' + + link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + current_version + '/' + kafka_file + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file], + ) + + kafka_dir = get_kafka_dir(ctx, config) + # create config for second broker + second_broker_config_name = "server2.properties" + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/") + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'), + 'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir) + ], + ) + + # edit config + ctx.cluster.only(client).run( + args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name) + ] + ) + + try: + yield + finally: + 
log.info('Removing packaged dependencies of Kafka...') + test_dir=get_kafka_dir(ctx, config) + current_version = get_kafka_version(config) + for (client,_) in config.items(): + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', test_dir], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx),doc=kafka_file)], + ) + + +@contextlib.contextmanager +def run_kafka(ctx,config): + """ + This includes two parts: + 1. Starting Zookeeper service + 2. Starting Kafka service + """ + assert isinstance(config, dict) + log.info('Bringing up Zookeeper and Kafka services...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + kafka_dir = get_kafka_dir(ctx, config) + + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_java_log_dir = "{}/java_logs".format(second_broker_data) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './zookeeper-server-start.sh', + '{tir}/config/zookeeper.properties'.format(tir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './kafka-server-start.sh', + '{tir}/config/server.properties'.format(tir=get_kafka_dir(ctx, config)), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)), + './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + log.info('Stopping Zookeeper and Kafka Services...') + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-server-stop.sh', + '{tir}/config/kafka.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './zookeeper-server-stop.sh', + '{tir}/config/zookeeper.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run(args=['killall', '-9', 'java']) + + +@contextlib.contextmanager +def run_admin_cmds(ctx,config): + """ + Running Kafka Admin commands in order to check the working of producer anf consumer and creation of topic. 
+ """ + assert isinstance(config, dict) + log.info('Checking kafka server through producer/consumer commands...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-topics.sh', '--create', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + 'echo', "First", run.Raw('|'), + './kafka-console-producer.sh', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-console-consumer.sh', '--topic', 'quickstart-events', + '--from-beginning', + '--bootstrap-server', 'localhost:9092', + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + pass + + +@contextlib.contextmanager +def task(ctx,config): + """ + Following is the way how to run kafka:: + tasks: + - kafka: + client.0: + kafka_version: 2.6.0 + """ + assert config is None or isinstance(config, list) \ + or isinstance(config, dict), \ + "task kafka only supports a list or dictionary for configuration" + + all_clients = ['client.{id}'.format(id=id_) + for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] + if config is None: + config = all_clients + if isinstance(config, list): + config = dict.fromkeys(config) + + log.debug('Kafka config is %s', config) + + with contextutil.nested( + lambda: install_kafka(ctx=ctx, config=config), + lambda: run_kafka(ctx=ctx, config=config), + lambda: run_admin_cmds(ctx=ctx, config=config), + ): + yield + diff --git a/qa/tasks/mgr/dashboard/helper.py b/qa/tasks/mgr/dashboard/helper.py index d80e238a2a8..55355048a36 100644 --- a/qa/tasks/mgr/dashboard/helper.py +++ b/qa/tasks/mgr/dashboard/helper.py @@ -9,7 +9,8 @@ import re import string import time from collections import namedtuple -from typing import List +from functools import wraps +from typing import List, Optional, Tuple, Type, Union import requests from tasks.mgr.mgr_test_case import MgrTestCase @@ -219,13 +220,11 @@ class DashboardTestCase(MgrTestCase): # To avoid any issues with e.g. unlink bugs, we destroy and recreate # the filesystem rather than just doing a rm -rf of files - cls.mds_cluster.mds_stop() - cls.mds_cluster.mds_fail() cls.mds_cluster.delete_all_filesystems() + cls.mds_cluster.mds_restart() # to reset any run-time configs, etc. cls.fs = None # is now invalid! 
cls.fs = cls.mds_cluster.newfs(create=True) - cls.fs.mds_restart() # In case some test messed with auth caps, reset them # pylint: disable=not-an-iterable @@ -343,16 +342,16 @@ class DashboardTestCase(MgrTestCase): @classmethod def _view_cache_get(cls, url, retries=5): - retry = True - while retry and retries > 0: - retry = False + _retry = True + while _retry and retries > 0: + _retry = False res = cls._get(url, version=DEFAULT_API_VERSION) if isinstance(res, dict): res = [res] for view in res: assert 'value' in view if not view['value']: - retry = True + _retry = True retries -= 1 if retries == 0: raise Exception("{} view cache exceeded number of retries={}" @@ -722,3 +721,25 @@ def _validate_json(val, schema, path=[]): return _validate_json(val, JLeaf(schema), path) assert False, str(path) + + +def retry( + on_exception: Union[Type[Exception], Tuple[Type[Exception], ...]], + tries=3, + delay=0, + logger: Optional[logging.Logger] = None, +): + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + for i in range(tries): + try: + return func(*args, **kwargs) + except on_exception as e: + err = e + if logger: + logger.warn(f"Retried #{i+1}/{tries}: '{func.__name__}' raised '{e}'") + time.sleep(delay) + raise err + return wrapper + return decorator diff --git a/qa/tasks/mgr/dashboard/test_auth.py b/qa/tasks/mgr/dashboard/test_auth.py index a2266229bef..2b9240b635e 100644 --- a/qa/tasks/mgr/dashboard/test_auth.py +++ b/qa/tasks/mgr/dashboard/test_auth.py @@ -152,7 +152,8 @@ class AuthTest(DashboardTestCase): self._post("/api/auth/logout") self.assertStatus(200) self.assertJsonBody({ - "redirect_url": "#/login" + "redirect_url": "#/login", + "protocol": 'local' }) self._get("/api/host", version='1.1') self.assertStatus(401) @@ -167,7 +168,8 @@ class AuthTest(DashboardTestCase): self._post("/api/auth/logout", set_cookies=True) self.assertStatus(200) self.assertJsonBody({ - "redirect_url": "#/login" + "redirect_url": "#/login", + "protocol": 'local' }) self._get("/api/host", set_cookies=True, version='1.1') self.assertStatus(401) diff --git a/qa/tasks/mgr/dashboard/test_mgr_module.py b/qa/tasks/mgr/dashboard/test_mgr_module.py index 2b8b672f284..1dbdef23d34 100644 --- a/qa/tasks/mgr/dashboard/test_mgr_module.py +++ b/qa/tasks/mgr/dashboard/test_mgr_module.py @@ -4,9 +4,11 @@ from __future__ import absolute_import import logging import requests +from urllib3.exceptions import MaxRetryError from .helper import (DashboardTestCase, JLeaf, JList, JObj, - module_options_object_schema, module_options_schema) + module_options_object_schema, module_options_schema, + retry) logger = logging.getLogger(__name__) @@ -14,6 +16,7 @@ logger = logging.getLogger(__name__) class MgrModuleTestCase(DashboardTestCase): MGRS_REQUIRED = 1 + @retry(on_exception=RuntimeError, tries=2, delay=0.5, logger=logger) def wait_until_rest_api_accessible(self): """ Wait until the REST API is accessible. @@ -22,10 +25,11 @@ class MgrModuleTestCase(DashboardTestCase): def _check_connection(): try: # Try reaching an API endpoint successfully. 
+ logger.info('Trying to reach the REST API endpoint') self._get('/api/mgr/module') if self._resp.status_code == 200: return True - except requests.ConnectionError: + except (MaxRetryError, requests.ConnectionError): pass return False diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py index 71cf3d87194..be7afccf331 100644 --- a/qa/tasks/mgr/dashboard/test_osd.py +++ b/qa/tasks/mgr/dashboard/test_osd.py @@ -5,12 +5,13 @@ from __future__ import absolute_import import json from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple, - devices_schema) + devices_schema, log, retry) class OsdTest(DashboardTestCase): AUTH_ROLES = ['cluster-manager'] + _VERSION = '1.1' @classmethod def setUpClass(cls): @@ -24,7 +25,7 @@ class OsdTest(DashboardTestCase): @DashboardTestCase.RunAs('test', 'test', ['block-manager']) def test_access_permissions(self): - self._get('/api/osd') + self._get('/api/osd', version=self._VERSION) self.assertStatus(403) self._get('/api/osd/0') self.assertStatus(403) @@ -33,7 +34,7 @@ class OsdTest(DashboardTestCase): self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True)) def test_list(self): - data = self._get('/api/osd') + data = self._get('/api/osd', version=self._VERSION) self.assertStatus(200) self.assertGreaterEqual(len(data), 1) @@ -283,13 +284,18 @@ class OsdFlagsTest(DashboardTestCase): if osd['osd'] == osd_initial['osd']: self.assertGreater(len(osd['flags']), len(osd_initial['flags'])) - self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2']) - flags_removed = self._get('/api/osd/flags/individual') - self.assertStatus(200) - for osd in flags_removed: - if osd['osd'] in [0, 1, 2]: - self.assertNotIn('noout', osd['flags']) - self.assertNotIn('noin', osd['flags']) + ret = self._ceph_cmd_result(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2']) + self.assertEqual(ret, 0) + + @retry(on_exception=AssertionError, tries=2, delay=0.5, logger=log) + def check_osd_flags(): + flags_removed = self._get('/api/osd/flags/individual') + self.assertStatus(200) + for osd in flags_removed: + if osd['osd'] in [0, 1, 2]: + self.assertNotIn('noout', osd['flags']) + self.assertNotIn('noin', osd['flags']) + check_osd_flags() def test_add_indiv_flag(self): flags_update = {'noup': None, 'nodown': None, 'noin': None, 'noout': True} diff --git a/qa/tasks/mgr/dashboard/test_rbd.py b/qa/tasks/mgr/dashboard/test_rbd.py index a872645e33e..83b3bf520c2 100644 --- a/qa/tasks/mgr/dashboard/test_rbd.py +++ b/qa/tasks/mgr/dashboard/test_rbd.py @@ -869,7 +869,19 @@ class RbdTest(DashboardTestCase): self.assertEqual(clone_format_version, 2) self.assertStatus(200) + # if empty list is sent, then the config will remain as it is value = [] + res = [{'section': "global", 'value': "2"}] + self._post('/api/cluster_conf', { + 'name': config_name, + 'value': value + }) + self.wait_until_equal( + lambda: _get_config_by_name(config_name), + res, + timeout=60) + + value = [{'section': "global", 'value': ""}] self._post('/api/cluster_conf', { 'name': config_name, 'value': value diff --git a/qa/tasks/mgr/dashboard/test_rgw.py b/qa/tasks/mgr/dashboard/test_rgw.py index 5c7b0329675..a9071bc2a3a 100644 --- a/qa/tasks/mgr/dashboard/test_rgw.py +++ b/qa/tasks/mgr/dashboard/test_rgw.py @@ -785,7 +785,7 @@ class RgwUserSubuserTest(RgwTestCase): 'access': 'readwrite', 'key_type': 'swift' }) - self.assertStatus(200) + self.assertStatus(201) data = self.jsonBody() subuser = self.find_object_in_list('id', 
'teuth-test-user:tux', data) self.assertIsInstance(subuser, object) @@ -808,7 +808,7 @@ class RgwUserSubuserTest(RgwTestCase): 'access_key': 'yyy', 'secret_key': 'xxx' }) - self.assertStatus(200) + self.assertStatus(201) data = self.jsonBody() subuser = self.find_object_in_list('id', 'teuth-test-user:hugo', data) self.assertIsInstance(subuser, object) diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py index 74b1e9d850c..4a5506391f2 100644 --- a/qa/tasks/mgr/mgr_test_case.py +++ b/qa/tasks/mgr/mgr_test_case.py @@ -1,5 +1,6 @@ import json import logging +import socket from unittest import SkipTest @@ -108,7 +109,7 @@ class MgrTestCase(CephTestCase): # Unload all non-default plugins loaded = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd( "mgr", "module", "ls", "--format=json-pretty"))['enabled_modules'] - unload_modules = set(loaded) - {"cephadm", "restful"} + unload_modules = set(loaded) - {"cephadm"} for m in unload_modules: cls.mgr_cluster.mon_manager.raw_cluster_cmd( @@ -137,7 +138,7 @@ class MgrTestCase(CephTestCase): raise SkipTest( "Only have {0} manager daemons, {1} are required".format( len(cls.mgr_cluster.mgr_ids), cls.MGRS_REQUIRED)) - + # We expect laggy OSDs in this testing environment so turn off this warning. # See https://tracker.ceph.com/issues/61907 cls.mgr_cluster.mon_manager.raw_cluster_cmd('config', 'set', 'mds', @@ -229,15 +230,22 @@ class MgrTestCase(CephTestCase): """ # Start handing out ports well above Ceph's range. assign_port = min_port + ip_addr = cls.mgr_cluster.get_mgr_map()['active_addr'].split(':')[0] for mgr_id in cls.mgr_cluster.mgr_ids: cls.mgr_cluster.mgr_stop(mgr_id) cls.mgr_cluster.mgr_fail(mgr_id) + for mgr_id in cls.mgr_cluster.mgr_ids: - log.debug("Using port {0} for {1} on mgr.{2}".format( - assign_port, module_name, mgr_id - )) + # Find a port that isn't in use + while True: + if not cls.is_port_in_use(ip_addr, assign_port): + break + log.debug(f"Port {assign_port} in use, trying next") + assign_port += 1 + + log.debug(f"Using port {assign_port} for {module_name} on mgr.{mgr_id}") cls.mgr_cluster.set_module_localized_conf(module_name, mgr_id, config_name, str(assign_port), @@ -255,3 +263,8 @@ class MgrTestCase(CephTestCase): mgr_map['active_name'], mgr_map['active_gid'])) return done cls.wait_until_true(is_available, timeout=30) + + @classmethod + def is_port_in_use(cls, ip_addr: str, port: int) -> bool: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex((ip_addr, port)) == 0 diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py index 7ac2960371c..c41a95c71f7 100644 --- a/qa/tasks/mgr/test_module_selftest.py +++ b/qa/tasks/mgr/test_module_selftest.py @@ -36,13 +36,6 @@ class TestModuleSelftest(MgrTestCase): self.mgr_cluster.mon_manager.raw_cluster_cmd( "mgr", "self-test", "module", module_name) - def test_zabbix(self): - # Set these mandatory config fields so that the zabbix module - # won't trigger health/log errors on load/serve. 
- self.mgr_cluster.set_module_conf("zabbix", "zabbix_host", "localhost") - self.mgr_cluster.set_module_conf("zabbix", "identifier", "foo") - self._selftest_plugin("zabbix") - def test_prometheus(self): self._assign_ports("prometheus", "server_port", min_port=8100) self._selftest_plugin("prometheus") diff --git a/qa/tasks/mon_connection_score.py b/qa/tasks/mon_connection_score.py new file mode 100644 index 00000000000..3d1fdb2a736 --- /dev/null +++ b/qa/tasks/mon_connection_score.py @@ -0,0 +1,95 @@ +from tasks.ceph_test_case import CephTestCase +import json +import logging +log = logging.getLogger(__name__) + + +class TestStretchClusterNew(CephTestCase): + + CLUSTER = "ceph" + MONS = { + "a": { + "rank": 0, + }, + "b": { + "rank": 1, + }, + "c": { + "rank": 2, + } + } + WRITE_PERIOD = 10 + RECOVERY_PERIOD = WRITE_PERIOD * 6 + SUCCESS_HOLD_TIME = 10 + + def setUp(self): + """ + Set up the cluster for the test. + """ + super(TestStretchClusterNew, self).setUp() + + def tearDown(self): + """ + Clean up the cluter after the test. + """ + super(TestStretchClusterNew, self).tearDown() + + def _check_connection_score(self): + """ + Check the connection score of all the mons. + """ + for mon, _ in self.MONS.items(): + # get the connection score + cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd( + 'daemon', 'mon.{}'.format(mon), + 'connection', 'scores', 'dump') + # parse the connection score + cscore = json.loads(cscore) + # check if the current mon rank is correct + if cscore["rank"] != self.MONS[mon]["rank"]: + log.error( + "Rank mismatch {} != {}".format( + cscore["rank"], self.MONS[mon]["rank"] + ) + ) + return False + # check if current mon have all the peer reports and ourself + if len(cscore['reports']) != len(self.MONS): + log.error( + "Reports count mismatch {}".format(cscore['reports']) + ) + return False + + for report in cscore["reports"]: + report_rank = [] + for peer in report["peer_scores"]: + # check if the peer is alive + if not peer["peer_alive"]: + log.error("Peer {} is not alive".format(peer)) + return False + report_rank.append(peer["peer_rank"]) + + # check if current mon has all the ranks and no duplicates + expected_ranks = [ + rank + for data in self.MONS.values() + for rank in data.values() + ] + if report_rank.sort() != expected_ranks.sort(): + log.error("Rank mismatch in report {}".format(report)) + return False + + log.info("Connection score is clean!") + return True + + def test_connection_score(self): + # check if all mons are in quorum + self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3) + # check if all connection scores reflect this + self.wait_until_true_and_hold( + lambda: self._check_connection_score(), + # Wait for 4 minutes for the connection score to recover + timeout=self.RECOVERY_PERIOD * 4, + # Hold the clean connection score for 60 seconds + success_hold_time=self.SUCCESS_HOLD_TIME * 6 + ) diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py index 34aa1f9cc9e..84b0b6c521b 100644 --- a/qa/tasks/mon_thrash.py +++ b/qa/tasks/mon_thrash.py @@ -161,7 +161,7 @@ class MonitorThrasher(Thrasher): """ Stop the thrashing process. 
""" - self.stopping = True + self.stopping.set() def join(self): """ diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py index b4697a6f797..f1eae3c89c4 100644 --- a/qa/tasks/notification_tests.py +++ b/qa/tasks/notification_tests.py @@ -220,7 +220,7 @@ def run_tests(ctx, config): for client, client_config in config.items(): (remote,) = ctx.cluster.only(client).remotes.keys() - attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] + attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] if 'extra_attr' in client_config: attr = client_config.get('extra_attr') diff --git a/qa/tasks/nvme_loop.py b/qa/tasks/nvme_loop.py index 5b29c11f007..fdec467a16d 100644 --- a/qa/tasks/nvme_loop.py +++ b/qa/tasks/nvme_loop.py @@ -67,9 +67,10 @@ def task(ctx, config): with contextutil.safe_while(sleep=1, tries=15) as proceed: while proceed(): + remote.run(args=['lsblk'], stdout=StringIO()) p = remote.run(args=['sudo', 'nvme', 'list', '-o', 'json'], stdout=StringIO()) new_devs = [] - # `nvme list -o json` will return the following output: + # `nvme list -o json` will return one of the following output: '''{ "Devices" : [ { @@ -90,12 +91,112 @@ def task(ctx, config): } ] }''' + '''{ + "Devices":[ + { + "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4", + "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c", + "Subsystems":[ + { + "Subsystem":"nvme-subsys0", + "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT623300LN400BGN INTEL SSDPEDMD400G4", + "Controllers":[ + { + "Controller":"nvme0", + "Cntlid":"0", + "SerialNumber":"CVFT623300LN400BGN", + "ModelNumber":"INTEL SSDPEDMD400G4", + "Firmware":"8DV101H0", + "Transport":"pcie", + "Address":"0000:02:00.0", + "Slot":"2", + "Namespaces":[ + { + "NameSpace":"nvme0n1", + "Generic":"ng0n1", + "NSID":1, + "UsedBytes":400088457216, + "MaximumLBA":781422768, + "PhysicalSize":400088457216, + "SectorSize":512 + } + ], + "Paths":[ + ] + } + ], + "Namespaces":[ + ] + } + ] + } + ] + } + ''' + '''{ + "Devices":[ + { + "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4", + "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c", + "Subsystems":[ + { + "Subsystem":"nvme-subsys0", + "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT534400C2400BGN INTEL SSDPEDMD400G4", + "Controllers":[ + { + "Controller":"nvme0", + "Cntlid":"0", + "SerialNumber":"CVFT534400C2400BGN", + "ModelNumber":"INTEL SSDPEDMD400G4", + "Firmware":"8DV101H0", + "Transport":"pcie", + "Address":"0000:02:00.0", + "Slot":"2", + "Namespaces":[ + { + "NameSpace":"nvme0n1", + "Generic":"ng0n1", + "NSID":1, + "UsedBytes":400088457216, + "MaximumLBA":781422768, + "PhysicalSize":400088457216, + "SectorSize":512 + } + ], + "Paths":[ + ] + } + ], + "Namespaces":[ + ] + } + ] + } + ] + } + ''' nvme_list = json.loads(p.stdout.getvalue()) for device in nvme_list['Devices']: - dev = device['DevicePath'] - vendor = device['ModelNumber'] - if dev.startswith('/dev/') and vendor == 'Linux': - new_devs.append(dev) + try: + # first try format 1 / older format + dev = device['DevicePath'] + vendor = device['ModelNumber'] + if dev.startswith('/dev/') and vendor == 'Linux': + new_devs.append(dev) + bluestore_zap(remote, dev) + except KeyError: + for subsystem in device['Subsystems']: + # format 2 + if 'Namespaces' in 
subsystem and subsystem['Namespaces']: + dev = '/dev/' + subsystem['Namespaces'][0]['NameSpace'] + # try format 3 last + else: + dev = '/dev/' + subsystem['Controllers'][0]['Namespaces'][0]['NameSpace'] + # vendor is the same for format 2 and 3 + vendor = subsystem['Controllers'][0]['ModelNumber'] + if vendor == 'Linux': + new_devs.append(dev) + bluestore_zap(remote, dev) log.info(f'new_devs {new_devs}') assert len(new_devs) <= len(devs) if len(new_devs) == len(devs): @@ -128,3 +229,13 @@ def task(ctx, config): data=old_scratch_by_remote[remote], sudo=True ) + +def bluestore_zap(remote, device: str) -> None: + for offset in [0, 1073741824, 10737418240]: + remote.run(args=['sudo', 'dd', + 'if=/dev/zero', f'of={device}', + f'seek={offset}', 'bs=1', + 'count=4096'], stdout=StringIO()) + remote.run(args=['sudo', 'hexdump', '-n22', + '-C', f'-s{offset}', f'{device}'], + stdout=StringIO())
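As an aside on the format handling above: the three "nvme list -o json" output shapes quoted in the comment can be reduced to a small, self-contained parser along the following lines. The key names ("Devices", "DevicePath", "ModelNumber", "Subsystems", "Controllers", "Namespaces", "NameSpace") are taken from those samples; the function name and structure are an illustrative sketch, not the code the task runs.

import json

def linux_loop_devices(nvme_list_output: str) -> list:
    # Sketch: pick out /dev paths of Linux (nvme-loop) namespaces from the
    # output of "nvme list -o json", covering both the older flat format and
    # the newer Subsystems-based formats.
    devs = []
    for device in json.loads(nvme_list_output).get('Devices', []):
        if 'DevicePath' in device:
            # format 1: one flat entry per namespace
            if device['DevicePath'].startswith('/dev/') and device.get('ModelNumber') == 'Linux':
                devs.append(device['DevicePath'])
            continue
        for subsystem in device.get('Subsystems', []):
            # formats 2 and 3: namespaces sit either directly under the
            # subsystem or under its first controller
            if subsystem.get('Namespaces'):
                ns = subsystem['Namespaces'][0]['NameSpace']
            else:
                ns = subsystem['Controllers'][0]['Namespaces'][0]['NameSpace']
            if subsystem['Controllers'][0].get('ModelNumber') == 'Linux':
                devs.append('/dev/' + ns)
    return devs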
\ No newline at end of file diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py index b56bcae0d0b..691a6f7dd86 100644 --- a/qa/tasks/nvmeof.py +++ b/qa/tasks/nvmeof.py @@ -17,14 +17,14 @@ from tasks.thrasher import Thrasher log = logging.getLogger(__name__) conf_file = '/etc/ceph/nvmeof.env' - +gw_yaml_file = '/etc/ceph/nvmeof-gw.yaml' class Nvmeof(Task): """ Setup nvmeof gateway on client and then share gateway config to target host. - nvmeof: - client: client.0 + installer: host.a // or 'nvmeof.nvmeof.a' version: default rbd: pool_name: mypool @@ -32,21 +32,18 @@ class Nvmeof(Task): gateway_config: namespaces_count: 10 cli_version: latest + create_mtls_secrets: False """ def setup(self): super(Nvmeof, self).setup() try: - self.client = self.config['client'] + host = self.config['installer'] except KeyError: - raise ConfigError('nvmeof requires a client to connect with') - - self.cluster_name, type_, self.client_id = misc.split_role(self.client) - if type_ != 'client': - msg = 'client role ({0}) must be a client'.format(self.client) - raise ConfigError(msg) - self.remote = get_remote_for_role(self.ctx, self.client) + raise ConfigError('nvmeof requires a installer host to deploy service') + self.cluster_name, _, _ = misc.split_role(host) + self.remote = get_remote_for_role(self.ctx, host) def begin(self): super(Nvmeof, self).begin() @@ -64,6 +61,8 @@ class Nvmeof(Task): gateway_config = self.config.get('gateway_config', {}) self.cli_image = gateway_config.get('cli_image', 'quay.io/ceph/nvmeof-cli:latest') + self.groups_count = gateway_config.get('groups_count', 1) + self.groups_prefix = gateway_config.get('groups_prefix', 'mygroup') self.nqn_prefix = gateway_config.get('subsystem_nqn_prefix', 'nqn.2016-06.io.spdk:cnode') self.subsystems_count = gateway_config.get('subsystems_count', 1) self.namespaces_count = gateway_config.get('namespaces_count', 1) # namepsaces per subsystem @@ -71,6 +70,7 @@ class Nvmeof(Task): self.serial = gateway_config.get('serial', 'SPDK00000000000001') self.port = gateway_config.get('port', '4420') self.srport = gateway_config.get('srport', '5500') + self.create_mtls_secrets = gateway_config.get('create_mtls_secrets', False) def deploy_nvmeof(self): """ @@ -114,23 +114,31 @@ class Nvmeof(Task): 'rbd', 'pool', 'init', poolname ]) - log.info(f'[nvmeof]: ceph orch apply nvmeof {poolname}') - _shell(self.ctx, self.cluster_name, self.remote, [ - 'ceph', 'orch', 'apply', 'nvmeof', poolname, - '--placement', str(len(nodes)) + ';' + ';'.join(nodes) - ]) + group_to_nodes = defaultdict(list) + for index, node in enumerate(nodes): + group_name = self.groups_prefix + str(index % int(self.groups_count)) + group_to_nodes[group_name] += [node] + for group_name in group_to_nodes: + gp_nodes = group_to_nodes[group_name] + log.info(f'[nvmeof]: ceph orch apply nvmeof {poolname} {group_name}') + _shell(self.ctx, self.cluster_name, self.remote, [ + 'ceph', 'orch', 'apply', 'nvmeof', poolname, group_name, + '--placement', ';'.join(gp_nodes) + ]) total_images = int(self.namespaces_count) * int(self.subsystems_count) log.info(f'[nvmeof]: creating {total_images} images') + rbd_create_cmd = [] for i in range(1, total_images + 1): imagename = self.image_name_prefix + str(i) - log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}') - _shell(self.ctx, self.cluster_name, self.remote, [ - 'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}' - ]) + rbd_create_cmd += ['rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}', 
run.Raw(';')] + _shell(self.ctx, self.cluster_name, self.remote, rbd_create_cmd) for role, i in daemons.items(): remote, id_ = i + _shell(self.ctx, self.cluster_name, remote, [ + 'ceph', 'orch', 'ls', 'nvmeof', '--export', run.Raw('>'), gw_yaml_file + ]) self.ctx.daemons.register_daemon( remote, 'nvmeof', id_, cluster=self.cluster_name, @@ -140,7 +148,38 @@ class Nvmeof(Task): started=True, ) log.info("[nvmeof]: executed deploy_nvmeof successfully!") - + + def write_mtls_config(self, gateway_ips): + log.info("[nvmeof]: writing mtls config...") + allowed_ips = "" + for ip in gateway_ips: + allowed_ips += ("IP:" + ip + ",") + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/server.key", + "-out", "/etc/ceph/server.crt", "-days", "3650", "-subj", "/CN=my.server", "-addext", f"subjectAltName={allowed_ips[:-1]}" + ] + ) + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/client.key", + "-out", "/etc/ceph/client.crt", "-days", "3650", "-subj", "/CN=client1" + ] + ) + secrets_files = {"/etc/ceph/server.key": None, + "/etc/ceph/server.crt": None, + "/etc/ceph/client.key": None, + "/etc/ceph/client.crt": None, + } + for file in secrets_files.keys(): + secrets_files[file] = self.remote.read_file(path=file, sudo=True) + + for remote in self.ctx.cluster.remotes.keys(): + for remote_file in secrets_files.keys(): + data = secrets_files[remote_file] + remote.sudo_write_file(path=remote_file, data=data, mode='0644') + log.info("[nvmeof]: written mtls config!") + def set_gateway_cfg(self): log.info('[nvmeof]: running set_gateway_cfg...') ip_address = self.remote.ip_address @@ -167,6 +206,8 @@ class Nvmeof(Task): data=conf_data, sudo=True ) + if self.create_mtls_secrets: + self.write_mtls_config(gateway_ips) log.info("[nvmeof]: executed set_gateway_cfg successfully!") @@ -209,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet): daemon_max_thrash_times: For now, NVMeoF daemons have limitation that each daemon can - be thrashed only 3 times in span of 30 mins. This option + be thrashed only 5 times in span of 30 mins. This option allows to set the amount of times it could be thrashed in a period - of time. (default: 3) + of time. (default: 5) daemon_max_thrash_period: This option goes with the above option. 
It sets the period of time over which each daemons can be thrashed for daemon_max_thrash_times @@ -264,17 +305,17 @@ class NvmeofThrasher(Thrasher, Greenlet): self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1)) # Limits on thrashing each daemon - self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3)) + self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 5)) self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60)) self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30)) - self.min_revive_delay = int(self.config.get('min_revive_delay', 100)) + self.min_revive_delay = int(self.config.get('min_revive_delay', 60)) self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30)) def _get_devices(self, remote): GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \ - "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'" + "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'" devices = remote.sh(GET_DEVICE_CMD).split() return devices @@ -305,6 +346,7 @@ class NvmeofThrasher(Thrasher, Greenlet): run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof', run.Raw('&&'), 'ceph', 'health', 'detail', run.Raw('&&'), 'ceph', '-s', + run.Raw('&&'), 'sudo', 'nvme', 'list', ] for dev in self.devices: check_cmd += [ @@ -335,6 +377,37 @@ class NvmeofThrasher(Thrasher, Greenlet): self.log('switch_task: done waiting for the other thrasher') other_thrasher.switch_thrasher.clear() + def kill_daemon(self, daemon): + kill_methods = [ + "ceph_daemon_stop", "systemctl_stop", + "daemon_remove", + ] + chosen_method = self.rng.choice(kill_methods) + d_name = '%s.%s' % (daemon.type_, daemon.id_) + if chosen_method == "ceph_daemon_stop": + daemon.remote.run(args=[ + "ceph", "orch", "daemon", "stop", + d_name + ], check_status=False) + elif chosen_method == "systemctl_stop": + daemon.stop() + elif chosen_method == "daemon_remove": + daemon.remote.run(args=[ + "ceph", "orch", "daemon", "rm", + d_name + ], check_status=False) + return chosen_method + + def revive_daemon(self, daemon, killed_method): + if killed_method == "ceph_daemon_stop": + name = '%s.%s' % (daemon.type_, daemon.id_) + daemon.remote.run(args=[ + "ceph", "orch", "daemon", "restart", + name + ]) + elif killed_method == "systemctl_stop": + daemon.restart() + def do_thrash(self): self.log('start thrashing') self.log(f'seed: {self.random_seed}, , '\ @@ -346,15 +419,13 @@ class NvmeofThrasher(Thrasher, Greenlet): summary = [] while not self.stopping.is_set(): - killed_daemons = [] + killed_daemons = defaultdict(list) - weight = 1.0 / len(self.daemons) - count = 0 + thrash_daemon_num = self.rng.randint(1, self.max_thrash_daemons) + selected_daemons = self.rng.sample(self.daemons, thrash_daemon_num) for daemon in self.daemons: - skip = self.rng.uniform(0.0, 1.0) - if weight <= skip: - self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format( - label=daemon.id_, skip=skip, weight=weight)) + if daemon not in selected_daemons: + self.log(f'skipping daemon {daemon.id_} ...') continue # For now, nvmeof daemons can only be thrashed 3 times in last 30mins. 
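A note on the do_thrash() change in this hunk: instead of rolling a per-daemon probability (the old 1.0/len(daemons) weight), each iteration now draws its whole victim set up front with rng.randint() and rng.sample(), and each victim is then killed by one of the randomly chosen methods in kill_daemon(). A rough standalone illustration of just the selection step, with stand-in daemon names, a hypothetical pick_victims() helper, and an unseeded random.Random in place of the task's seeded rng:

    import random

    def pick_victims(daemons, max_thrash, seed=None):
        """Pick between 1 and max_thrash distinct daemons to thrash this iteration."""
        rng = random.Random(seed)
        count = rng.randint(1, max_thrash)   # randint() is inclusive on both ends
        return rng.sample(daemons, count)    # sample() never returns the same daemon twice

    # toy usage: at most all-but-one of four gateways get thrashed at once
    print(pick_victims(['nvmeof.a', 'nvmeof.b', 'nvmeof.c', 'nvmeof.d'], max_thrash=3))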
@@ -372,18 +443,16 @@ class NvmeofThrasher(Thrasher, Greenlet): continue self.log('kill {label}'.format(label=daemon.id_)) - daemon.stop() + kill_method = self.kill_daemon(daemon) - killed_daemons.append(daemon) + killed_daemons[kill_method].append(daemon) daemons_thrash_history[daemon.id_] += [datetime.now()] - # only thrash max_thrash_daemons amount of daemons - count += 1 - if count >= self.max_thrash_daemons: - break - if killed_daemons: - summary += ["killed: " + ", ".join([d.id_ for d in killed_daemons])] + iteration_summary = "thrashed- " + for kill_method in killed_daemons: + iteration_summary += (", ".join([d.id_ for d in killed_daemons[kill_method]]) + f" (by {kill_method}); ") + summary += [iteration_summary] # delay before reviving revive_delay = self.min_revive_delay if self.randomize: @@ -391,15 +460,17 @@ class NvmeofThrasher(Thrasher, Greenlet): self.log(f'waiting for {revive_delay} secs before reviving') time.sleep(revive_delay) # blocking wait - self.log('done waiting before reviving') + self.log(f'done waiting before reviving - iteration #{len(summary)}: {iteration_summary}') self.do_checks() self.switch_task() # revive after thrashing - for daemon in killed_daemons: - self.log('reviving {label}'.format(label=daemon.id_)) - daemon.restart() + for kill_method in killed_daemons: + for daemon in killed_daemons[kill_method]: + self.log('reviving {label}'.format(label=daemon.id_)) + # daemon.restart() + self.revive_daemon(daemon, kill_method) # delay before thrashing thrash_delay = self.min_thrash_delay @@ -408,7 +479,7 @@ class NvmeofThrasher(Thrasher, Greenlet): if thrash_delay > 0.0: self.log(f'waiting for {thrash_delay} secs before thrashing') time.sleep(thrash_delay) # blocking - self.log('done waiting before thrashing') + self.log('done waiting before thrashing - everything should be up now') self.do_checks() self.switch_task() diff --git a/qa/tasks/qemu.py b/qa/tasks/qemu.py index 760e4b82b73..e7ce73e45d0 100644 --- a/qa/tasks/qemu.py +++ b/qa/tasks/qemu.py @@ -29,7 +29,8 @@ DEFAULT_MEM = 4096 # in megabytes def normalize_disks(config): # normalize the 'disks' parameter into a list of dictionaries - for client, client_config in config.items(): + for role, client_config in config.items(): + _, typ, id_ = teuthology.split_role(role) clone = client_config.get('clone', False) image_url = client_config.get('image_url', DEFAULT_IMAGE_URL) device_type = client_config.get('type', 'filesystem') @@ -39,8 +40,8 @@ def normalize_disks(config): disks = client_config.get('disks', DEFAULT_NUM_DISKS) if not isinstance(disks, list): - disks = [{'image_name': '{client}.{num}'.format(client=client, - num=i)} + disks = [{'image_name': '{typ}.{id_}.{num}'.format(typ=typ, id_=id_, + num=i)} for i in range(int(disks))] client_config['disks'] = disks @@ -90,7 +91,7 @@ def normalize_disks(config): disks.append(clone) def create_images(ctx, config, managers): - for client, client_config in config.items(): + for role, client_config in config.items(): disks = client_config['disks'] for disk in disks: if disk.get('action') != 'create' or ( @@ -101,7 +102,7 @@ def create_images(ctx, config, managers): if disk['encryption_format'] != 'none': image_size += ENCRYPTION_HEADER_SIZE create_config = { - client: { + role: { 'image_name': disk['image_name'], 'image_format': 2, 'image_size': image_size, @@ -114,14 +115,14 @@ def create_images(ctx, config, managers): ) def create_clones(ctx, config, managers): - for client, client_config in config.items(): + for role, client_config in config.items(): disks = 
client_config['disks'] for disk in disks: if disk['action'] != 'clone': continue create_config = { - client: { + role: { 'image_name': disk['image_name'], 'parent_name': disk['parent_name'], 'encryption_format': disk['encryption_format'], @@ -133,7 +134,7 @@ def create_clones(ctx, config, managers): ) def create_encrypted_devices(ctx, config, managers): - for client, client_config in config.items(): + for role, client_config in config.items(): disks = client_config['disks'] for disk in disks: if (disk['encryption_format'] == 'none' and @@ -141,7 +142,7 @@ def create_encrypted_devices(ctx, config, managers): 'device_letter' not in disk: continue - dev_config = {client: disk} + dev_config = {role: disk} managers.append( lambda dev_config=dev_config: rbd.dev_create(ctx=ctx, config=dev_config) @@ -153,9 +154,9 @@ def create_dirs(ctx, config): Handle directory creation and cleanup """ testdir = teuthology.get_testdir(ctx) - for client, client_config in config.items(): + for role, client_config in config.items(): assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() + (remote,) = ctx.cluster.only(role).remotes.keys() remote.run( args=[ 'install', '-d', '-m0755', '--', @@ -166,9 +167,9 @@ def create_dirs(ctx, config): try: yield finally: - for client, client_config in config.items(): + for role, client_config in config.items(): assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() + (remote,) = ctx.cluster.only(role).remotes.keys() remote.run( args=[ 'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true', @@ -181,20 +182,20 @@ def install_block_rbd_driver(ctx, config): Make sure qemu rbd block driver (block-rbd.so) is installed """ packages = {} - for client, _ in config.items(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role, _ in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() if remote.os.package_type == 'rpm': - packages[client] = ['qemu-kvm-block-rbd'] + packages[role] = ['qemu-kvm-block-rbd'] else: - packages[client] = ['qemu-block-extra', 'qemu-utils'] - for pkg in packages[client]: + packages[role] = ['qemu-block-extra', 'qemu-utils'] + for pkg in packages[role]: install_package(pkg, remote) try: yield finally: - for client, _ in config.items(): - (remote,) = ctx.cluster.only(client).remotes.keys() - for pkg in packages[client]: + for role, _ in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + for pkg in packages[role]: remove_package(pkg, remote) @contextlib.contextmanager @@ -210,23 +211,23 @@ def generate_iso(ctx, config): git_url = teuth_config.get_ceph_qa_suite_git_url() log.info('Pulling tests from %s ref %s', git_url, refspec) - for client, client_config in config.items(): + for role, client_config in config.items(): assert 'test' in client_config, 'You must specify a test to run' test = client_config['test'] - (remote,) = ctx.cluster.only(client).remotes.keys() + (remote,) = ctx.cluster.only(role).remotes.keys() - clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client) + clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=role) remote.run(args=refspec.clone(git_url, clone_dir)) src_dir = os.path.dirname(__file__) - userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client) - metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client) + userdata_path = os.path.join(testdir, 'qemu', 'userdata.' 
+ role) + metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + role) with open(os.path.join(src_dir, 'userdata_setup.yaml')) as f: test_setup = ''.join(f.readlines()) # configuring the commands to setup the nfs mount - mnt_dir = "/export/{client}".format(client=client) + mnt_dir = "/export/{role}".format(role=role) test_setup = test_setup.format( mnt_dir=mnt_dir ) @@ -285,9 +286,10 @@ def generate_iso(ctx, config): with open(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f: remote.write_file(metadata_path, f) - test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client) + test_file = '{tdir}/qemu/{role}.test.sh'.format(tdir=testdir, role=role) + cluster, _, _ = teuthology.split_role(role) - log.info('fetching test %s for %s', test, client) + log.info('fetching test %s for %s', test, role) remote.run( args=[ 'cp', '--', os.path.join(clone_dir, test), test_file, @@ -299,28 +301,28 @@ def generate_iso(ctx, config): args=[ 'genisoimage', '-quiet', '-input-charset', 'utf-8', '-volid', 'cidata', '-joliet', '-rock', - '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), + '-o', '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role), '-graft-points', 'user-data={userdata}'.format(userdata=userdata_path), 'meta-data={metadata}'.format(metadata=metadata_path), - 'ceph.conf=/etc/ceph/ceph.conf', - 'ceph.keyring=/etc/ceph/ceph.keyring', + 'ceph.conf=/etc/ceph/{cluster}.conf'.format(cluster=cluster), + 'ceph.keyring=/etc/ceph/{cluster}.keyring'.format(cluster=cluster), 'test.sh={file}'.format(file=test_file), ], ) try: yield finally: - for client in config.keys(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role in config.keys(): + (remote,) = ctx.cluster.only(role).remotes.keys() remote.run( args=[ 'rm', '-rf', - '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - os.path.join(testdir, 'qemu', 'userdata.' + client), - os.path.join(testdir, 'qemu', 'metadata.' + client), - '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client), - '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client), + '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role), + os.path.join(testdir, 'qemu', 'userdata.' + role), + os.path.join(testdir, 'qemu', 'metadata.' 
+ role), + '{tdir}/qemu/{role}.test.sh'.format(tdir=testdir, role=role), + '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=role), ], ) @@ -331,10 +333,11 @@ def download_image(ctx, config): testdir = teuthology.get_testdir(ctx) client_base_files = {} - for client, client_config in config.items(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role, client_config in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() - client_base_files[client] = [] + cluster, _, _ = teuthology.split_role(role) + client_base_files[role] = [] disks = client_config['disks'] for disk in disks: if disk['action'] != 'create' or 'image_url' not in disk: @@ -342,7 +345,7 @@ def download_image(ctx, config): base_file = '{tdir}/qemu/base.{name}.qcow2'.format(tdir=testdir, name=disk['image_name']) - client_base_files[client].append(base_file) + client_base_files[role].append(base_file) remote.run( args=[ @@ -354,15 +357,16 @@ def download_image(ctx, config): remote.run( args=[ 'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw', - base_file, 'rbd:rbd/{image_name}'.format(image_name=disk['image_name']) + base_file,'rbd:rbd/{image_name}:conf=/etc/ceph/{cluster}.conf'.format( + image_name=disk['image_name'], cluster=cluster) ] ) else: - dev_config = {client: {'image_name': disk['image_name'], - 'encryption_format': disk['encryption_format']}} + dev_config = {role: {'image_name': disk['image_name'], + 'encryption_format': disk['encryption_format']}} raw_file = '{tdir}/qemu/base.{name}.raw'.format( tdir=testdir, name=disk['image_name']) - client_base_files[client].append(raw_file) + client_base_files[role].append(raw_file) remote.run( args=[ 'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw', @@ -373,11 +377,12 @@ def download_image(ctx, config): remote.run( args=[ 'dd', 'if={name}'.format(name=raw_file), - 'of={name}'.format(name=dev_config[client]['device_path']), + 'of={name}'.format(name=dev_config[role]['device_path']), 'bs=4M', 'conv=fdatasync' ] ) + cluster, _, _ = teuthology.split_role(role) for disk in disks: if disk['action'] == 'clone' or \ disk['encryption_format'] != 'none' or \ @@ -386,7 +391,7 @@ def download_image(ctx, config): remote.run( args=[ - 'rbd', 'resize', + 'rbd', '--cluster', cluster, 'resize', '--size={image_size}M'.format(image_size=disk['image_size']), disk['image_name'], run.Raw('||'), 'true' ] @@ -396,8 +401,8 @@ def download_image(ctx, config): yield finally: log.debug('cleaning up base image files') - for client, base_files in client_base_files.items(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role, base_files in client_base_files.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() for base_file in base_files: remote.run( args=[ @@ -406,14 +411,14 @@ def download_image(ctx, config): ) -def _setup_nfs_mount(remote, client, service_name, mount_dir): +def _setup_nfs_mount(remote, role, service_name, mount_dir): """ Sets up an nfs mount on the remote that the guest can use to store logs. This nfs mount is also used to touch a file at the end of the test to indicate if the test was successful or not. 
""" - export_dir = "/export/{client}".format(client=client) + export_dir = "/export/{role}".format(role=role) log.info("Creating the nfs export directory...") remote.run(args=[ 'sudo', 'mkdir', '-p', export_dir, @@ -442,13 +447,13 @@ def _setup_nfs_mount(remote, client, service_name, mount_dir): remote.run(args=['sudo', 'systemctl', 'restart', service_name]) -def _teardown_nfs_mount(remote, client, service_name): +def _teardown_nfs_mount(remote, role, service_name): """ Tears down the nfs mount on the remote used for logging and reporting the status of the tests being ran in the guest. """ log.info("Tearing down the nfs mount for {remote}".format(remote=remote)) - export_dir = "/export/{client}".format(client=client) + export_dir = "/export/{role}".format(role=role) log.info("Stopping NFS...") if remote.os.package_type == "deb": remote.run(args=[ @@ -483,9 +488,9 @@ def run_qemu(ctx, config): """Setup kvm environment and start qemu""" procs = [] testdir = teuthology.get_testdir(ctx) - for client, client_config in config.items(): - (remote,) = ctx.cluster.only(client).remotes.keys() - log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client) + for role, client_config in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + log_dir = '{tdir}/archive/qemu/{role}'.format(tdir=testdir, role=role) remote.run( args=[ 'mkdir', log_dir, run.Raw('&&'), @@ -502,7 +507,7 @@ def run_qemu(ctx, config): # make an nfs mount to use for logging and to # allow to test to tell teuthology the tests outcome - _setup_nfs_mount(remote, client, nfs_service_name, log_dir) + _setup_nfs_mount(remote, role, nfs_service_name, log_dir) # Hack to make sure /dev/kvm permissions are set correctly # See http://tracker.ceph.com/issues/17977 and @@ -524,13 +529,13 @@ def run_qemu(ctx, config): '-smp', str(client_config.get('cpus', DEFAULT_CPUS)), '-m', str(client_config.get('memory', DEFAULT_MEM)), # cd holding metadata for cloud-init - '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), + '-cdrom', '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role), ] cachemode = 'none' - ceph_config = ctx.ceph['ceph'].conf.get('global', {}) - ceph_config.update(ctx.ceph['ceph'].conf.get('client', {})) - ceph_config.update(ctx.ceph['ceph'].conf.get(client, {})) + cluster, _, id_ = teuthology.split_role(role) + ceph_config = ctx.ceph[cluster].conf.get('global', {}) + ceph_config.update(ctx.ceph[cluster].conf.get('client', {})) if ceph_config.get('rbd cache', True): if ceph_config.get('rbd cache max dirty', 1) > 0: cachemode = 'writeback' @@ -545,10 +550,8 @@ def run_qemu(ctx, config): if disk['encryption_format'] == 'none' and \ disk.get('parent_encryption_format', 'none') == 'none': interface = 'virtio' - disk_spec = 'rbd:rbd/{img}:id={id}'.format( - img=disk['image_name'], - id=client[len('client.'):] - ) + disk_spec = 'rbd:rbd/{img}:conf=/etc/ceph/{cluster}.conf:id={id}'.format( + img=disk['image_name'], cluster=cluster, id=id_) else: # encrypted disks use ide as a temporary workaround for # a bug in qemu when using virtio over nbd @@ -570,7 +573,7 @@ def run_qemu(ctx, config): procs.append( remote.run( args=args, - logger=log.getChild(client), + logger=log.getChild(role), stdin=run.PIPE, wait=False, ) @@ -588,12 +591,12 @@ def run_qemu(ctx, config): time.sleep(time_wait) log.debug('checking that qemu tests succeeded...') - for client in config.keys(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role in config.keys(): + (remote,) = 
ctx.cluster.only(role).remotes.keys() # ensure we have permissions to all the logs - log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, - client=client) + log_dir = '{tdir}/archive/qemu/{role}'.format(tdir=testdir, + role=role) remote.run( args=[ 'sudo', 'chmod', 'a+rw', '-R', log_dir @@ -601,20 +604,20 @@ def run_qemu(ctx, config): ) # teardown nfs mount - _teardown_nfs_mount(remote, client, nfs_service_name) + _teardown_nfs_mount(remote, role, nfs_service_name) # check for test status remote.run( args=[ 'test', '-f', - '{tdir}/archive/qemu/{client}/success'.format( + '{tdir}/archive/qemu/{role}/success'.format( tdir=testdir, - client=client + role=role ), ], ) log.info("Deleting exported directory...") - for client in config.keys(): - (remote,) = ctx.cluster.only(client).remotes.keys() + for role in config.keys(): + (remote,) = ctx.cluster.only(role).remotes.keys() remote.run(args=[ 'sudo', 'rm', '-r', '/export' ]) @@ -693,6 +696,14 @@ def task(ctx, config): test data type: text/plain filename: /tmp/data + + This task supports roles that include a ceph cluster, e.g.:: + + tasks: + - ceph: + - qemu: + backup.client.0: [foo] + client.1: [bar] # cluster is implicitly 'ceph' """ assert isinstance(config, dict), \ "task qemu only supports a dictionary for configuration" diff --git a/qa/tasks/rabbitmq.py b/qa/tasks/rabbitmq.py index 944233d9775..e9e39cfdf4a 100644 --- a/qa/tasks/rabbitmq.py +++ b/qa/tasks/rabbitmq.py @@ -70,22 +70,25 @@ def run_rabbitmq(ctx, config): (remote,) = ctx.cluster.only(client).remotes.keys() ctx.cluster.only(client).run(args=[ - 'sudo', 'systemctl', 'enable', 'rabbitmq-server.service' + 'echo', 'loopback_users.guest = false', run.Raw('|'), 'sudo', 'tee', '-a', '/etc/rabbitmq/rabbitmq.conf' ], ) ctx.cluster.only(client).run(args=[ - 'sudo', '/sbin/service', 'rabbitmq-server', 'start' + 'sudo', 'systemctl', 'enable', 'rabbitmq-server' + ], + ) + + ctx.cluster.only(client).run(args=[ + 'sudo', 'systemctl', 'start', 'rabbitmq-server' ], ) - ''' # To check whether rabbitmq-server is running or not ctx.cluster.only(client).run(args=[ - 'sudo', '/sbin/service', 'rabbitmq-server', 'status' + 'sudo', 'systemctl', 'status', 'rabbitmq-server' ], ) - ''' try: yield @@ -96,7 +99,7 @@ def run_rabbitmq(ctx, config): (remote,) = ctx.cluster.only(client).remotes.keys() ctx.cluster.only(client).run(args=[ - 'sudo', '/sbin/service', 'rabbitmq-server', 'stop' + 'sudo', 'systemctl', 'stop', 'rabbitmq-server' ], ) diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py index d8eac5d886f..96bcc770511 100644 --- a/qa/tasks/rados.py +++ b/qa/tasks/rados.py @@ -36,6 +36,8 @@ def task(ctx, config): write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED. This mean data don't access in the near future. Let osd backend don't keep data in cache. 
+ pct_update_delay: delay before primary propogates pct on write pause, + defaults to 5s if balance_reads is set For example:: @@ -139,6 +141,7 @@ def task(ctx, config): object_size = int(config.get('object_size', 4000000)) op_weights = config.get('op_weights', {}) testdir = teuthology.get_testdir(ctx) + pct_update_delay = None args = [ 'adjust-ulimits', 'ceph-coverage', @@ -166,6 +169,7 @@ def task(ctx, config): args.extend(['--pool-snaps']) if config.get('balance_reads', False): args.extend(['--balance-reads']) + pct_update_delay = config.get('pct_update_delay', 5); if config.get('localize_reads', False): args.extend(['--localize-reads']) if config.get('max_attr_len', None): @@ -274,6 +278,10 @@ def task(ctx, config): if config.get('fast_read', False): manager.raw_cluster_cmd( 'osd', 'pool', 'set', pool, 'fast_read', 'true') + if pct_update_delay: + manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool, + 'pct_update_delay', str(pct_update_delay)); min_size = config.get('min_size', None); if min_size is not None: manager.raw_cluster_cmd( diff --git a/qa/tasks/radosgw_admin.py b/qa/tasks/radosgw_admin.py index 3b98702acca..fb82378761b 100644 --- a/qa/tasks/radosgw_admin.py +++ b/qa/tasks/radosgw_admin.py @@ -16,6 +16,7 @@ import logging import time import datetime import sys +import errno from io import StringIO from queue import Queue @@ -725,6 +726,40 @@ def task(ctx, config): (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--tenant', tenant_name, '--uid', 'tenanteduser'], check_status=True) + account_id = 'RGW12312312312312312' + account_name = 'testacct' + rgwadmin(ctx, client, [ + 'account', 'create', + '--account-id', account_id, + '--account-name', account_name, + ], check_status=True) + rgwadmin(ctx, client, [ + 'user', 'create', + '--account-id', account_id, + '--uid', 'testacctuser', + '--display-name', 'accountuser', + '--gen-access-key', + '--gen-secret', + ], check_status=True) + + # TESTCASE 'bucket link', 'bucket', 'account user', 'fails' + (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--bucket', bucket_name, '--uid', 'testacctuser']) + assert err == errno.EINVAL + + rgwadmin(ctx, client, ['user', 'rm', '--uid', 'testacctuser'], check_status=True) + + # TESTCASE 'bucket link', 'bucket', 'account', 'succeeds' + rgwadmin(ctx, client, + ['bucket', 'link', '--bucket', bucket_name, '--account-id', account_id], + check_status=True) + + # relink the bucket to the first user and delete the account + rgwadmin(ctx, client, + ['bucket', 'link', '--bucket', bucket_name, '--uid', user1], + check_status=True) + rgwadmin(ctx, client, ['account', 'rm', '--account-id', account_id], + check_status=True) + # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' # upload an object diff --git a/qa/tasks/rbd.py b/qa/tasks/rbd.py index b0ffaba8386..026b695fb00 100644 --- a/qa/tasks/rbd.py +++ b/qa/tasks/rbd.py @@ -65,6 +65,7 @@ def create_image(ctx, config): size = properties.get('image_size', 10240) fmt = properties.get('image_format', 1) encryption_format = properties.get('encryption_format', 'none') + cluster, _, _ = teuthology.split_role(role) (remote,) = ctx.cluster.only(role).remotes.keys() log.info('Creating image {name} with size {size}'.format(name=name, size=size)) @@ -73,6 +74,7 @@ def create_image(ctx, config): 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'rbd', + '--cluster', cluster, '-p', 'rbd', 'create', '--size', str(size), @@ -99,6 +101,7 @@ def create_image(ctx, config): 'ceph-coverage', 
'{tdir}/archive/coverage'.format(tdir=testdir), 'rbd', + '--cluster', cluster, 'encryption', 'format', name, @@ -117,6 +120,7 @@ def create_image(ctx, config): if properties is None: properties = {} name = properties.get('image_name', default_image_name(role)) + cluster, _, _ = teuthology.split_role(role) (remote,) = ctx.cluster.only(role).remotes.keys() remote.run( args=[ @@ -124,6 +128,7 @@ def create_image(ctx, config): 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'rbd', + '--cluster', cluster, '-p', 'rbd', 'rm', name, @@ -160,6 +165,7 @@ def clone_image(ctx, config): properties = {} name = properties.get('image_name', default_image_name(role)) + cluster, _, _ = teuthology.split_role(role) parent_name = properties.get('parent_name') assert parent_name is not None, \ "parent_name is required" @@ -195,7 +201,7 @@ def clone_image(ctx, config): 'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', '-p', 'rbd' + 'rbd', '--cluster', cluster, '-p', 'rbd' ] args.extend(cmd) remote.run(args=args) @@ -209,6 +215,7 @@ def clone_image(ctx, config): if properties is None: properties = {} name = properties.get('image_name', default_image_name(role)) + cluster, _, _ = teuthology.split_role(role) parent_name = properties.get('parent_name') parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name) @@ -221,7 +228,7 @@ def clone_image(ctx, config): 'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', '-p', 'rbd' + 'rbd', '--cluster', cluster, '-p', 'rbd' ] args.extend(cmd) remote.run(args=args) @@ -305,6 +312,7 @@ def dev_create(ctx, config): if properties is None: properties = {} name = properties.get('image_name', default_image_name(role)) + cluster, _, _ = teuthology.split_role(role) parent_encryption_format = properties.get('parent_encryption_format', 'none') encryption_format = properties.get('encryption_format', @@ -365,6 +373,7 @@ def dev_create(ctx, config): 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'rbd', + '--cluster', cluster, '--id', role.rsplit('.')[-1], '-p', 'rbd', 'map', @@ -609,7 +618,8 @@ def xfstests(ctx, config): running_xfstests = {} for role, properties in runs: - assert role.startswith('client.'), \ + cluster, typ, _ = teuthology.split_role(role) + assert typ == "client", \ "task xfstests can only run on client nodes" for host, roles_for_host in ctx.cluster.remotes.items(): if role in roles_for_host: diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py index f5a6f5a2615..f93ca017fa2 100644 --- a/qa/tasks/rgw_multisite.py +++ b/qa/tasks/rgw_multisite.py @@ -139,7 +139,10 @@ class RGWMultisite(Task): if cluster != cluster1: # already created on master cluster log.info('pulling realm configuration to %s', cluster.name) - realm.pull(cluster, master_zone.gateways[0], creds) + + is_default = self.config['realm'].get('is_default', False) + args = ['--default'] if is_default else [] + realm.pull(cluster, master_zone.gateways[0], creds, args) # use the first zone's cluster to create the zonegroup if not zonegroup: @@ -358,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config): if endpoints: # replace client names with their gateway endpoints config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) + if not config.get('api_name'): # otherwise it will be set to an empty string + config['api_name'] = config['name'] zonegroup = multisite.ZoneGroup(config['name'], period) # `zonegroup set` needs --default on command line, and 
'is_master' in json args = is_default_arg(config) diff --git a/qa/tasks/rgw_multisite_tests.py b/qa/tasks/rgw_multisite_tests.py index 822cbcf7910..e0a38deadd2 100644 --- a/qa/tasks/rgw_multisite_tests.py +++ b/qa/tasks/rgw_multisite_tests.py @@ -72,7 +72,9 @@ class RGWMultisiteTests(Task): # create test account/user log.info('creating test user..') user = multisite.User('rgw-multisite-test-user', account='RGW11111111111111111') - master_zone.cluster.admin(['account', 'create', '--account-id', user.account]) + arg = ['--account-id', user.account] + arg += master_zone.zone_args() + master_zone.cluster.admin(['account', 'create'] + arg) user.create(master_zone, ['--display-name', 'TestUser', '--gen-access-key', '--gen-secret']) diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/qa/tasks/s3a_hadoop.py b/qa/tasks/s3a_hadoop.py index 7b77359fcf2..4518a6f397c 100644 --- a/qa/tasks/s3a_hadoop.py +++ b/qa/tasks/s3a_hadoop.py @@ -1,5 +1,6 @@ import contextlib import logging +import os from teuthology import misc from teuthology.orchestra import run @@ -40,7 +41,7 @@ def task(ctx, config): # get versions maven_major = config.get('maven-major', 'maven-3') - maven_version = config.get('maven-version', '3.6.3') + maven_version = config.get('maven-version', '3.9.9') hadoop_ver = config.get('hadoop-version', '2.9.2') bucket_name = config.get('bucket-name', 's3atest') access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F') @@ -48,11 +49,19 @@ def task(ctx, config): 'secret-key', 'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb') + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + # set versions for cloning the repo apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format( maven_version=maven_version) - maven_link = 'http://archive.apache.org/dist/maven/' + \ - '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven + maven_link = '{apache_mirror_url_front}/maven/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + \ + apache_maven hadoop_git = 'https://github.com/apache/hadoop' hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver) if hadoop_ver == 'trunk': @@ -204,6 +213,7 @@ def run_s3atest(client, maven_version, testdir, test_options): run.Raw('&&'), run.Raw(rm_test), run.Raw('&&'), + run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'), 
run.Raw(run_test), run.Raw(test_options) ] diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index cd0cd9d146d..85ab97d23cd 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -57,6 +57,17 @@ def download(ctx, config): 'git', 'reset', '--hard', sha1, ], ) + if client_config.get('boto3_extensions'): + ctx.cluster.only(client).run( + args=['mkdir', + '-p', + '/home/ubuntu/.aws/models/s3/2006-03-01/'] + ) + (remote,) = ctx.cluster.only(client).remotes.keys() + remote_file = '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json' + local_file = '{qadir}/../examples/rgw/boto3/service-2.sdk-extras.json'.format(qadir=ctx.config.get('suite_path')) + remote.put_file(local_file, remote_file) + try: yield finally: @@ -70,6 +81,17 @@ def download(ctx, config): '{tdir}/s3-tests-{client}'.format(tdir=testdir, client=client), ], ) + if client_config.get('boto3_extensions'): + ctx.cluster.only(client).run( + args=[ + 'rm', '-rf', '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json', + ], + ) + ctx.cluster.only(client).run( + args=[ + 'cd', '/home/ubuntu/', run.Raw('&&'), 'rmdir', '-p', '.aws/models/s3/2006-03-01/', + ], + ) def _config_user(s3tests_conf, section, user, email): @@ -89,6 +111,8 @@ def _config_user(s3tests_conf, section, user, email): s3tests_conf[section].setdefault('totp_seed', base64.b32encode(os.urandom(40)).decode()) s3tests_conf[section].setdefault('totp_seconds', '5') + if section == 's3 tenant': + s3tests_conf[section].setdefault('tenant', 'testx') @contextlib.contextmanager @@ -442,8 +466,10 @@ def run_tests(ctx, config): attrs += ['not fails_with_subdomain'] if not client_config.get('with-sse-s3'): attrs += ['not sse_s3'] - + attrs += client_config.get('extra_attrs', []) + if 'bucket_logging' not in attrs: + attrs += ['not bucket_logging'] if 'unit_test_scan' in client_config and client_config['unit_test_scan']: xmlfile_id = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S--") + str(uuid.uuid4()) xmlpath= f'{testdir}/archive/s3test-{xmlfile_id}.xml' diff --git a/qa/tasks/s3tests_java.py b/qa/tasks/s3tests_java.py index 3e20e10d06c..a58aa6cf0b4 100644 --- a/qa/tasks/s3tests_java.py +++ b/qa/tasks/s3tests_java.py @@ -284,6 +284,7 @@ class S3tests_java(Task): args = ['cd', '{tdir}/s3-tests-java'.format(tdir=testdir), run.Raw('&&'), + run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'), '/opt/gradle/gradle/bin/gradle', 'clean', 'test', '--rerun-tasks', '--no-build-cache', ] diff --git a/qa/tasks/stretch_mode_disable_enable.py b/qa/tasks/stretch_mode_disable_enable.py new file mode 100644 index 00000000000..a84a85bb307 --- /dev/null +++ b/qa/tasks/stretch_mode_disable_enable.py @@ -0,0 +1,547 @@ +import logging +from tasks.mgr.mgr_test_case import MgrTestCase + +log = logging.getLogger(__name__) + +class TestStretchMode(MgrTestCase): + """ + Test the stretch mode feature of Ceph + """ + POOL = 'stretch_pool' + CLUSTER = "ceph" + WRITE_PERIOD = 10 + RECOVERY_PERIOD = WRITE_PERIOD * 6 + SUCCESS_HOLD_TIME = 7 + STRETCH_CRUSH_RULE = 'stretch_rule' + STRETCH_CRUSH_RULE_ID = None + STRETCH_BUCKET_TYPE = 'datacenter' + TIEBREAKER_MON_NAME = 'e' + DEFAULT_POOL_TYPE = 'replicated' + DEFAULT_POOL_CRUSH_RULE = 'replicated_rule' + DEFAULT_POOL_SIZE = 3 + DEFAULT_POOL_MIN_SIZE = 2 + DEFAULT_POOL_CRUSH_RULE_ID = None + # This dictionary maps the datacenter to the osd ids and hosts + DC_OSDS = { + 'dc1': { + "host01": [0, 1], + "host02": [2, 3], + }, + 'dc2': { + "host03": [4, 5], + "host04": [6, 7], + }, + } + 
DC_MONS = { + 'dc1': { + "host01": ['a'], + "host02": ['b'], + }, + 'dc2': { + "host03": ['c'], + "host04": ['d'], + }, + 'dc3': { + "host05": ['e'], + } + } + def _osd_count(self): + """ + Get the number of OSDs in the cluster. + """ + osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json() + return len(osd_map['osds']) + + def setUp(self): + """ + Setup the cluster and + ensure we have a clean condition before the test. + """ + # Ensure we have at least 6 OSDs + super(TestStretchMode, self).setUp() + self.DEFAULT_POOL_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.DEFAULT_POOL_CRUSH_RULE) + self.STRETCH_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.STRETCH_CRUSH_RULE) + if self._osd_count() < 4: + self.skipTest("Not enough OSDS!") + + # Remove any filesystems so that we can remove their pools + if self.mds_cluster: + self.mds_cluster.mds_stop() + self.mds_cluster.mds_fail() + self.mds_cluster.delete_all_filesystems() + + # Remove all other pools + for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']: + try: + self.mgr_cluster.mon_manager.remove_pool(pool['pool_name']) + except: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'delete', + pool['pool_name'], + pool['pool_name'], + '--yes-i-really-really-mean-it') + + def _setup_pool( + self, + pool_name=POOL, + pg_num=16, + pool_type=DEFAULT_POOL_TYPE, + crush_rule=DEFAULT_POOL_CRUSH_RULE, + size=None, + min_size=None + ): + """ + Create a pool, set its size and pool if specified. + """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', pool_name, str(pg_num), pool_type, crush_rule) + + if size is not None: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool_name, 'size', str(size)) + + if min_size is not None: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', pool_name, 'min_size', str(min_size)) + + def _write_some_data(self, t): + """ + Write some data to the pool to simulate a workload. + """ + args = [ + "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"] + self.mgr_cluster.admin_remote.run(args=args, wait=True) + + def _get_all_mons_from_all_dc(self): + """ + Get all mons from all datacenters. + """ + return [mon for dc in self.DC_MONS.values() for mons in dc.values() for mon in mons] + + def _bring_back_mon(self, mon): + """ + Bring back the mon. + """ + try: + self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart() + except Exception: + log.error("Failed to bring back mon.{}".format(str(mon))) + pass + + def _get_host(self, osd): + """ + Get the host of the osd. + """ + for dc, nodes in self.DC_OSDS.items(): + for node, osds in nodes.items(): + if osd in osds: + return node + return None + + def _move_osd_back_to_host(self, osd): + """ + Move the osd back to the host. + """ + host = self._get_host(osd) + assert host is not None, "The host of osd {} is not found.".format(osd) + log.debug("Moving osd.%d back to %s", osd, host) + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'crush', 'move', 'osd.{}'.format(str(osd)), + 'host={}'.format(host) + ) + + def tearDown(self): + """ + Clean up the cluster after the test. 
+ """ + # Remove the pool + if self.POOL in self.mgr_cluster.mon_manager.pools: + self.mgr_cluster.mon_manager.remove_pool(self.POOL) + + osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json() + for osd in osd_map['osds']: + # mark all the osds in + if osd['weight'] == 0.0: + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'in', str(osd['osd'])) + # Bring back all the osds and move it back to the host. + if osd['up'] == 0: + self.mgr_cluster.mon_manager.revive_osd(osd['osd']) + self._move_osd_back_to_host(osd['osd']) + + # Bring back all the mons + mons = self._get_all_mons_from_all_dc() + for mon in mons: + self._bring_back_mon(mon) + super(TestStretchMode, self).tearDown() + + def _kill_osd(self, osd): + """ + Kill the osd. + """ + try: + self.ctx.daemons.get_daemon('osd', osd, self.CLUSTER).stop() + except Exception: + log.error("Failed to stop osd.{}".format(str(osd))) + pass + + def _get_osds_data(self, want_osds): + """ + Get the osd data + """ + all_osds_data = \ + self.mgr_cluster.mon_manager.get_osd_dump_json()['osds'] + return [ + osd_data for osd_data in all_osds_data + if int(osd_data['osd']) in want_osds + ] + + def _get_osds_by_dc(self, dc): + """ + Get osds by datacenter. + """ + ret = [] + for host, osds in self.DC_OSDS[dc].items(): + ret.extend(osds) + return ret + + def _fail_over_all_osds_in_dc(self, dc): + """ + Fail over all osds in specified <datacenter> + """ + if not isinstance(dc, str): + raise ValueError("dc must be a string") + if dc not in self.DC_OSDS: + raise ValueError( + "dc must be one of the following: %s" % self.DC_OSDS.keys() + ) + log.debug("Failing over all osds in %s", dc) + osds = self._get_osds_by_dc(dc) + # fail over all the OSDs in the DC + log.debug("OSDs to failed over: %s", osds) + for osd_id in osds: + self._kill_osd(osd_id) + # wait until all the osds are down + self.wait_until_true( + lambda: all([int(osd['up']) == 0 + for osd in self._get_osds_data(osds)]), + timeout=self.RECOVERY_PERIOD + ) + + def _check_mons_out_of_quorum(self, want_mons): + """ + Check if the mons are not in quorum. + """ + quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names() + return all([mon not in quorum_names for mon in want_mons]) + + def _kill_mon(self, mon): + """ + Kill the mon. + """ + try: + self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).stop() + except Exception: + log.error("Failed to stop mon.{}".format(str(mon))) + pass + + def _get_mons_by_dc(self, dc): + """ + Get mons by datacenter. + """ + ret = [] + for host, mons in self.DC_MONS[dc].items(): + ret.extend(mons) + return ret + + def _fail_over_all_mons_in_dc(self, dc): + """ + Fail over all mons in the specified <datacenter> + """ + if not isinstance(dc, str): + raise ValueError("dc must be a string") + if dc not in self.DC_MONS: + raise ValueError("dc must be one of the following: %s" % + ", ".join(self.DC_MONS.keys())) + log.debug("Failing over all mons %s", dc) + mons = self._get_mons_by_dc(dc) + log.debug("Mons to be failed over: %s", mons) + for mon in mons: + self._kill_mon(mon) + # wait until all the mons are out of quorum + self.wait_until_true( + lambda: self._check_mons_out_of_quorum(mons), + timeout=self.RECOVERY_PERIOD + ) + + def _stretch_mode_enabled_correctly(self): + """ + Evaluate whether the stretch mode is enabled correctly. + by checking the OSDMap and MonMap. 
+ """ + # Checking the OSDMap + osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json() + for pool in osdmap['pools']: + # expects crush_rule to be stretch_rule + self.assertEqual( + self.STRETCH_CRUSH_RULE_ID, + pool['crush_rule'] + ) + # expects pool size to be 4 + self.assertEqual( + 4, + pool['size'] + ) + # expects pool min_size to be 2 + self.assertEqual( + 2, + pool['min_size'] + ) + # expects pool is_stretch_pool flag to be true + self.assertEqual( + True, + pool['is_stretch_pool'] + ) + # expects peering_crush_bucket_count = 2 (always this value for stretch mode) + self.assertEqual( + 2, + pool['peering_crush_bucket_count'] + ) + # expects peering_crush_bucket_target = 2 (always this value for stretch mode) + self.assertEqual( + 2, + pool['peering_crush_bucket_target'] + ) + # expects peering_crush_bucket_barrier = 8 (crush type of datacenter is 8) + self.assertEqual( + 8, + pool['peering_crush_bucket_barrier'] + ) + # expects stretch_mode_enabled to be True + self.assertEqual( + True, + osdmap['stretch_mode']['stretch_mode_enabled'] + ) + # expects stretch_mode_bucket_count to be 2 + self.assertEqual( + 2, + osdmap['stretch_mode']['stretch_bucket_count'] + ) + # expects degraded_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['degraded_stretch_mode'] + ) + # expects recovering_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['recovering_stretch_mode'] + ) + # expects stretch_mode_bucket to be 8 (datacenter crush type = 8) + self.assertEqual( + 8, + osdmap['stretch_mode']['stretch_mode_bucket'] + ) + # Checking the MonMap + monmap = self.mgr_cluster.mon_manager.get_mon_dump_json() + # expects stretch_mode to be True + self.assertEqual( + True, + monmap['stretch_mode'] + ) + # expects disallowed_leaders to be tiebreaker_mon + self.assertEqual( + self.TIEBREAKER_MON_NAME, + monmap['disallowed_leaders'] + ) + # expects tiebreaker_mon to be tiebreaker_mon + self.assertEqual( + self.TIEBREAKER_MON_NAME, + monmap['tiebreaker_mon'] + ) + + def _stretch_mode_disabled_correctly(self): + """ + Evaluate whether the stretch mode is disabled correctly. + by checking the OSDMap and MonMap. 
+ """ + # Checking the OSDMap + osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json() + for pool in osdmap['pools']: + # expects crush_rule to be default + self.assertEqual( + self.DEFAULT_POOL_CRUSH_RULE_ID, + pool['crush_rule'] + ) + # expects pool size to be default + self.assertEqual( + self.DEFAULT_POOL_SIZE, + pool['size'] + ) + # expects pool min_size to be default + self.assertEqual( + self.DEFAULT_POOL_MIN_SIZE, + pool['min_size'] + ) + # expects pool is_stretch_pool flag to be false + self.assertEqual( + False, + pool['is_stretch_pool'] + ) + # expects peering_crush_bucket_count = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_count'] + ) + # expects peering_crush_bucket_target = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_target'] + ) + # expects peering_crush_bucket_barrier = 0 + self.assertEqual( + 0, + pool['peering_crush_bucket_barrier'] + ) + # expects stretch_mode_enabled to be False + self.assertEqual( + False, + osdmap['stretch_mode']['stretch_mode_enabled'] + ) + # expects stretch_mode_bucket to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['stretch_bucket_count'] + ) + # expects degraded_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['degraded_stretch_mode'] + ) + # expects recovering_stretch_mode to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['recovering_stretch_mode'] + ) + # expects stretch_mode_bucket to be 0 + self.assertEqual( + 0, + osdmap['stretch_mode']['stretch_mode_bucket'] + ) + # Checking the MonMap + monmap = self.mgr_cluster.mon_manager.get_mon_dump_json() + # expects stretch_mode to be False + self.assertEqual( + False, + monmap['stretch_mode'] + ) + # expects disallowed_leaders to be empty + self.assertEqual( + "", + monmap['disallowed_leaders'] + ) + # expects tiebreaker_mon to be empty + self.assertEqual( + "", + monmap['tiebreaker_mon'] + ) + + def test_disable_stretch_mode(self): + """ + Test disabling stretch mode with the following scenario: + 1. Healthy Stretch Mode + 2. 
Degraded Stretch Mode + """ + # Create a pool + self._setup_pool(self.POOL, 16, 'replicated', self.STRETCH_CRUSH_RULE, 4, 2) + # Write some data to the pool + self._write_some_data(self.WRITE_PERIOD) + # disable stretch mode without --yes-i-really-mean-it (expects -EPERM 1) + self.assertEqual( + 1, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode' + )) + # Disable stretch mode with non-existent crush rule (expects -EINVAL 22) + self.assertEqual( + 22, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + 'non_existent_rule', + '--yes-i-really-mean-it' + )) + # Disable stretch mode with the current stretch rule (expect -EINVAL 22) + self.assertEqual( + 22, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + self.STRETCH_CRUSH_RULE, + '--yes-i-really-mean-it', + + )) + # Disable stretch mode without crush rule (expect success 0) + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + '--yes-i-really-mean-it' + )) + # Check if stretch mode is disabled correctly + self._stretch_mode_disabled_correctly() + # all PGs are active + clean + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # write some data to the pool + self._write_some_data(self.WRITE_PERIOD) + # Enable stretch mode + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'enable_stretch_mode', + self.TIEBREAKER_MON_NAME, + self.STRETCH_CRUSH_RULE, + self.STRETCH_BUCKET_TYPE + )) + self._stretch_mode_enabled_correctly() + # all PGs are active + clean + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # write some data to the pool + # self._write_some_data(self.WRITE_PERIOD) + # Bring down dc1 + self._fail_over_all_osds_in_dc('dc1') + self._fail_over_all_mons_in_dc('dc1') + # should be in degraded stretch mode + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.is_degraded_stretch_mode(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) + # Disable stretch mode with valid crush rule (expect success 0) + self.assertEqual( + 0, + self.mgr_cluster.mon_manager.raw_cluster_cmd_result( + 'mon', + 'disable_stretch_mode', + self.DEFAULT_POOL_CRUSH_RULE, + '--yes-i-really-mean-it' + )) + # Check if stretch mode is disabled correctly + self._stretch_mode_disabled_correctly() + # all PGs are active + self.wait_until_true_and_hold( + lambda: self.mgr_cluster.mon_manager.pg_all_active(), + timeout=self.RECOVERY_PERIOD, + success_hold_time=self.SUCCESS_HOLD_TIME + ) diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml index b70583a75e1..dbde1ced0db 100644 --- a/qa/tasks/thrashosds-health.yaml +++ b/qa/tasks/thrashosds-health.yaml @@ -30,3 +30,4 @@ overrides: - out of quorum - noscrub - nodeep-scrub + - is down diff --git a/qa/tasks/tox.py b/qa/tasks/tox.py index 61c5b7411b4..4e4dee966d5 100644 --- a/qa/tasks/tox.py +++ b/qa/tasks/tox.py @@ -35,7 +35,7 @@ def task(ctx, config): ctx.cluster.only(client).run(args=[ 'source', '{tvdir}/bin/activate'.format(tvdir=tvdir), run.Raw('&&'), - 'pip', 'install', 'tox==3.15.0' + 'pip', 'install', 'tox' ]) # export the path Keystone and Tempest diff --git a/qa/tasks/vstart_runner.py 
b/qa/tasks/vstart_runner.py index ca929ba05b4..2ed21431330 100644 --- a/qa/tasks/vstart_runner.py +++ b/qa/tasks/vstart_runner.py @@ -233,6 +233,11 @@ class LocalRemoteProcess(object): else: self.stderr.write(err) + def _handle_subprocess_output(self, output, stream): + if isinstance(stream, StringIO): + return rm_nonascii_chars(output) + return output + def wait(self, timeout=None): # Null subproc.stdin so communicate() does not try flushing/closing it # again. @@ -250,7 +255,8 @@ class LocalRemoteProcess(object): return out, err = self.subproc.communicate(timeout=timeout) - out, err = rm_nonascii_chars(out), rm_nonascii_chars(err) + out = self._handle_subprocess_output(out, self.stdout) + err = self._handle_subprocess_output(err, self.stderr) self._write_stdout(out) self._write_stderr(err) diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py index f6e55c48cd6..4fd82eaea9d 100644 --- a/qa/tasks/workunit.py +++ b/qa/tasks/workunit.py @@ -441,8 +441,10 @@ def _run_tests(ctx, refspec, role, tests, env, basedir, remote.run(logger=log.getChild(role), args=args, timeout=(60*60)) finally: log.info('Stopping %s on %s...', tests, role) + # N.B. unlike before, don't cleanup path under variable "scratch_tmp" + # here! If the mount is broken then rm will hang. For context, see + # commit d4b8f94cf8d95ebb277b550fc6ebc3468052a39c. args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir] - # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang. remote.run( logger=log.getChild(role), args=args, diff --git a/qa/workunits/cephadm/test_iscsi_setup.sh b/qa/workunits/cephadm/test_iscsi_setup.sh new file mode 100755 index 00000000000..88f379918bc --- /dev/null +++ b/qa/workunits/cephadm/test_iscsi_setup.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# very basic set up of iscsi gw and client +# to make sure things are working + +set -ex + +if ! grep -q rhel /etc/*-release; then + echo "The script only supports CentOS." + exit 1 +fi + +# teuthology tends to put the cephadm binary built for our testing +# branch in /home/ubuntu/cephtest/. If it's there, lets just move it +# so we don't need to reference the full path. +if ! command -v cephadm && ls /home/ubuntu/cephtest/cephadm; then + sudo cp /home/ubuntu/cephtest/cephadm /usr/sbin/ +fi + +# make sure we haven't already created luns +! sudo ls /dev/disk/by-path | grep iscsi + +sudo dnf install jq -y + +ISCSI_CONT_ID=$(sudo podman ps -qa --filter='name=iscsi' | head -n 1) +ISCSI_DAEMON_NAME=$(sudo cephadm ls --no-detail | jq -r '.[] | select(.name | startswith("iscsi")) | .name') +ISCSI_DAEMON_ID=$(cut -d '.' -f2- <<< "$ISCSI_DAEMON_NAME") +HOSTNAME=$(sudo cephadm shell -- ceph orch ps --daemon-id "$ISCSI_DAEMON_ID" -f json | jq -r '.[] | .hostname') +NODE_IP=$(sudo cephadm shell -- ceph orch host ls --format json | jq --arg HOSTNAME "$HOSTNAME" -r '.[] | select(.hostname == $HOSTNAME) | .addr') +# The result of this python line is what iscsi will expect for the first gateway name +FQDN=$(python3 -c 'import socket; print(socket.getfqdn())') +# I am running this twice on purpose. I don't know why but in my testing the first time this would +# run it would return a different result then all subsequent runs (and take significantly longer to run). +# The result from the first run would cause gateway creation to fail when the return value is used +# later on. It was likely specific to my env, but it doesn't hurt to run it twice anyway. This +# was the case whether I ran it through cephadm shell or directly on the host machine. 
+FQDN=$(python3 -c 'import socket; print(socket.getfqdn())') +ISCSI_POOL=$(sudo cephadm shell -- ceph orch ls iscsi --format json | jq -r '.[] | .spec | .pool') +ISCSI_USER="adminadmin" +ISCSI_PASSWORD="adminadminadmin" + +# gateway setup +container_gwcli() { + sudo podman exec -it ${ISCSI_CONT_ID} gwcli "$@" +} + +container_gwcli /iscsi-targets create iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw +# I've seen this give a nonzero error code with an error message even when +# creating the gateway successfully, so this command is allowed to fail +# If it actually failed to make the gateway, some of the follow up commands will fail +container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/gateways create ${FQDN} ${NODE_IP} || true +container_gwcli /disks create pool=${ISCSI_POOL} image=disk_1 size=2G +container_gwcli /disks create pool=${ISCSI_POOL} image=disk_2 size=2G +container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts create iqn.1994-05.com.redhat:client1 +container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 auth username=${ISCSI_USER} password=${ISCSI_PASSWORD} +container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 disk add ${ISCSI_POOL}/disk_1 +container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 disk add ${ISCSI_POOL}/disk_2 + +# set up multipath and some iscsi config options +sudo dnf install -y iscsi-initiator-utils device-mapper-multipath + +# this next line is purposely being done without "-a" on the tee command to +# overwrite the current initiatorname.iscsi file if it is there +echo "GenerateName=no" | sudo tee /etc/iscsi/initiatorname.iscsi +echo "InitiatorName=iqn.1994-05.com.redhat:client1" | sudo tee -a /etc/iscsi/initiatorname.iscsi + +echo "node.session.auth.authmethod = CHAP" | sudo tee -a /etc/iscsi/iscsid.conf +echo "node.session.auth.username = ${ISCSI_USER}" | sudo tee -a /etc/iscsi/iscsid.conf +echo "node.session.auth.password = ${ISCSI_PASSWORD}" | sudo tee -a /etc/iscsi/iscsid.conf + +sudo tee -a /etc/multipath.conf > /dev/null << EOF +devices { + device { + vendor "LIO-ORG" + product "TCMU device" + hardware_handler "1 alua" + path_grouping_policy "failover" + path_selector "queue-length 0" + failback 60 + path_checker tur + prio alua + prio_args exclusive_pref_bit + fast_io_fail_tmo 25 + no_path_retry queue + } +} +EOF +sudo systemctl restart multipathd +sudo systemctl restart iscsid + +# client setup +sudo iscsiadm -m discovery -t st -p ${NODE_IP} +sudo iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw -l +sudo iscsiadm -m session --rescan + +sleep 5 + +# make sure we can now see luns +sudo ls /dev/disk/by-path | grep iscsi diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index cdfff17d837..ad5950367e9 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -63,7 +63,7 @@ function retry_eagain() for count in $(seq 1 $max) ; do status=0 "$@" > $tmpfile 2>&1 || status=$? - if test $status = 0 || + if test $status = 0 || ! grep --quiet EAGAIN $tmpfile ; then break fi @@ -108,7 +108,7 @@ function check_response() exit 1 fi - if ! grep --quiet -- "$expected_string" $TMPFILE ; then + if ! 
grep --quiet -- "$expected_string" $TMPFILE ; then echo "Didn't find $expected_string in output" >&2 cat $TMPFILE >&2 exit 1 @@ -696,7 +696,7 @@ function test_auth_profiles() ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-ro ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-rw - + # add a new role-definer with the existing role-definer ceph -n client.xx-profile-rd -k client.xx.keyring \ auth add client.xx-profile-rd2 mon 'allow profile role-definer' @@ -730,7 +730,7 @@ function test_mon_caps() ceph-authtool -n client.bug --cap mon '' $TEMP_DIR/ceph.client.bug.keyring ceph auth add client.bug -i $TEMP_DIR/ceph.client.bug.keyring rados lspools --no-mon-config --keyring $TEMP_DIR/ceph.client.bug.keyring -n client.bug >& $TMPFILE || true - check_response "Permission denied" + check_response "Permission denied" } function test_mon_misc() @@ -780,7 +780,6 @@ function test_mon_misc() ceph mgr dump ceph mgr dump | jq -e '.active_clients[0].name' ceph mgr module ls - ceph mgr module enable restful expect_false ceph mgr module enable foodne ceph mgr module enable foodne --force ceph mgr module disable foodne @@ -1650,7 +1649,7 @@ function test_mon_osd() dump_json=$(ceph osd dump --format=json | \ jq -cM '.osds[] | select(.osd == 0)') [[ "${info_json}" == "${dump_json}" ]] - + info_plain="$(ceph osd info)" dump_plain="$(ceph osd dump | grep '^osd')" [[ "${info_plain}" == "${dump_plain}" ]] @@ -2244,7 +2243,7 @@ function test_mon_pg() # tell osd version # ceph tell osd.0 version - expect_false ceph tell osd.9999 version + expect_false ceph tell osd.9999 version expect_false ceph tell osd.foo version # back to pg stuff @@ -2336,7 +2335,7 @@ function test_mon_osd_pool_set() ceph osd pool get $TEST_POOL_GETSET deep_scrub_interval | expect_false grep '.' ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.' - ceph osd pool set $TEST_POOL_GETSET recovery_priority 5 + ceph osd pool set $TEST_POOL_GETSET recovery_priority 5 ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: 5' ceph osd pool set $TEST_POOL_GETSET recovery_priority -5 ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: -5' @@ -2346,13 +2345,13 @@ function test_mon_osd_pool_set() expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 11 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.' - ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5 + ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | grep 'recovery_op_priority: 5' ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 0 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.' ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.' - ceph osd pool set $TEST_POOL_GETSET scrub_priority 5 + ceph osd pool set $TEST_POOL_GETSET scrub_priority 5 ceph osd pool get $TEST_POOL_GETSET scrub_priority | grep 'scrub_priority: 5' ceph osd pool set $TEST_POOL_GETSET scrub_priority 0 ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.' 
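Taken together, the expect_false cases above suggest recovery_priority is only accepted within roughly [-10, 10]. A quick manual probe of that boundary (the pool name is a placeholder and the exact bounds are inferred from the test, not stated in it) would look like:

    ceph osd pool set somepool recovery_priority 10    # expected to succeed
    ceph osd pool set somepool recovery_priority 11    # expected to be rejected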
@@ -2386,10 +2385,10 @@ function test_mon_osd_pool_set() ceph osd pool set $TEST_POOL_GETSET size 2 wait_for_clean ceph osd pool set $TEST_POOL_GETSET min_size 2 - + expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 0 ceph osd pool set $TEST_POOL_GETSET hashpspool 0 --yes-i-really-mean-it - + expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 1 ceph osd pool set $TEST_POOL_GETSET hashpspool 1 --yes-i-really-mean-it @@ -2587,7 +2586,7 @@ function test_mon_osd_misc() ceph osd map 2>$TMPFILE; check_response 'pool' $? 22 # expect error about unused argument foo - ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22 + ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22 # expect "not in range" for invalid overload percentage ceph osd reweight-by-utilization 80 2>$TMPFILE; check_response 'higher than 100' $? 22 diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh new file mode 100755 index 00000000000..88552aa50bd --- /dev/null +++ b/qa/workunits/client/test_oc_disabled.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -ex + +ceph_test_client --client_oc=false diff --git a/qa/workunits/dencoder/test_readable.py b/qa/workunits/dencoder/test_readable.py index f032f7a9bbe..6eba0a4eb3f 100755 --- a/qa/workunits/dencoder/test_readable.py +++ b/qa/workunits/dencoder/test_readable.py @@ -61,7 +61,7 @@ def process_type(file_path, type): cmd_determ = [CEPH_DENCODER, "type", type, "is_deterministic"] determ_res = subprocess.run(cmd_determ, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Check if the command failed - if determ_res.returncode != 0: + if determ_res.returncode != 0 and determ_res.returncode != 1: error_message = determ_res.stderr.decode().strip() debug_print(f"Error running command: {error_message}") return 1 @@ -222,7 +222,7 @@ def check_backward_compat(): version_name = version.name _backward_compat[version_name] = {} type_dir = archive_dir / version_name / "forward_incompat" - if type_dir.exists() and type_dir.is_dir(): + if type_dir.exists(): for type_entry in type_dir.iterdir(): if type_entry.is_dir(): type_name = type_entry.name @@ -243,7 +243,8 @@ def check_backward_compat(): def process_batch(batch): results = [] - with concurrent.futures.ThreadPoolExecutor() as executor: + max_workers = 15 + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: futures = [ executor.submit( test_object_wrapper, batch_type, vdir, arversion, current_ver @@ -259,7 +260,8 @@ def process_batch(batch): # Create a generator that processes batches asynchronously def async_process_batches(task_batches): - with concurrent.futures.ProcessPoolExecutor() as executor: + max_workers = 10 + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: futures = [executor.submit(process_batch, batch) for batch in task_batches] for future in concurrent.futures.as_completed(futures): yield future.result() diff --git a/qa/workunits/erasure-code/bench.sh b/qa/workunits/erasure-code/bench.sh index fc75830dfd0..87e997c3500 100755 --- a/qa/workunits/erasure-code/bench.sh +++ b/qa/workunits/erasure-code/bench.sh @@ -17,7 +17,8 @@ # # Test that it works from sources with: # -# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ +# TOTAL_SIZE=$((4 * 1024 * 1024)) SIZE=4096 \ +# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=build/lib \ # qa/workunits/erasure-code/bench.sh fplot jerasure | # tee qa/workunits/erasure-code/bench.js @@ -34,10 +35,14 @@ # firefox 
qa/workunits/erasure-code/bench.html # # Once it is confirmed to work, it can be run with a more significant -# volume of data so that the measures are more reliable: +# volume of data so that the measures are more reliable. Ideally the size +# of the buffers (SIZE) should be larger than the L3 cache to avoid cache hits. +# The following example uses an 80MB (80 * 1024 * 1024) buffer. +# A larger buffer with fewer iterations (iterations = TOTAL SIZE / SIZE) should result in +# more time spent encoding/decoding and less time allocating/aligning buffers: # -# TOTAL_SIZE=$((4 * 1024 * 1024 * 1024)) \ -# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \ +# TOTAL_SIZE=$((100 * 80 * 1024 * 1024)) SIZE=$((80 * 1024 * 1024)) \ +# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \ # PLUGIN_DIRECTORY=build/lib \ # qa/workunits/erasure-code/bench.sh fplot jerasure | # tee qa/workunits/erasure-code/bench.js @@ -51,8 +56,8 @@ export PATH=/sbin:$PATH : ${PLUGIN_DIRECTORY:=/usr/lib/ceph/erasure-code} : ${PLUGINS:=isa jerasure} : ${TECHNIQUES:=vandermonde cauchy liberation reed_sol_r6_op blaum_roth liber8tion} -: ${TOTAL_SIZE:=$((1024 * 1024))} -: ${SIZE:=4096} +: ${TOTAL_SIZE:=$((100 * 80 * 1024 * 1024))} #TOTAL_SIZE / SIZE = number of encode or decode iterations to run +: ${SIZE:=$((80 * 1024 * 1024))} #size of buffer to encode/decode : ${PARAMETERS:=--parameter jerasure-per-chunk-alignment=true} declare -rA isa_techniques=( diff --git a/qa/workunits/fs/misc/fallocate.sh b/qa/workunits/fs/misc/fallocate.sh new file mode 100755 index 00000000000..253e6cb7a37 --- /dev/null +++ b/qa/workunits/fs/misc/fallocate.sh @@ -0,0 +1,17 @@ +#!/bin/sh -x + +# fallocate with mode 0 should fail with EOPNOTSUPP +set -e +mkdir -p testdir +cd testdir + +expect_failure() { + if "$@"; then return 1; else return 0; fi +} + +expect_failure fallocate -l 1M preallocated.txt +rm -f preallocated.txt + +cd .. 
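For the erasure-code bench.sh defaults introduced above, the relationship spelled out in its comments (iterations = TOTAL_SIZE / SIZE, with SIZE chosen larger than the L3 cache) works out to 100 encode/decode passes per run; a worked sketch using the script's own defaults:

    TOTAL_SIZE=$((100 * 80 * 1024 * 1024))   # 8000 MiB of data in total
    SIZE=$((80 * 1024 * 1024))               # 80 MiB buffer per encode/decode call
    echo $((TOTAL_SIZE / SIZE))              # 100 iterations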
+rmdir testdir +echo OK diff --git a/qa/workunits/fs/snaps/snaptest-double-null.sh b/qa/workunits/fs/snaps/snaptest-double-null.sh index cdf32e4f0ef..833c0fd696b 100755 --- a/qa/workunits/fs/snaps/snaptest-double-null.sh +++ b/qa/workunits/fs/snaps/snaptest-double-null.sh @@ -11,6 +11,7 @@ mkdir a cat > a/foo & mkdir a/.snap/one mkdir a/.snap/two +wait chmod 777 a/foo sync # this might crash the mds ps diff --git a/qa/workunits/fs/snaps/snaptest-git-ceph.sh b/qa/workunits/fs/snaps/snaptest-git-ceph.sh index 2b38720c9a5..6079ba8945b 100755 --- a/qa/workunits/fs/snaps/snaptest-git-ceph.sh +++ b/qa/workunits/fs/snaps/snaptest-git-ceph.sh @@ -4,7 +4,14 @@ set -e # increase the cache size sudo git config --global http.sslVerify false -sudo git config --global http.postBuffer 1048576000 +sudo git config --global http.postBuffer 1024MB # default is 1MB +sudo git config --global http.maxRequestBuffer 100M # default is 10MB +sudo git config --global core.compression 0 + +# enable the debug logs for git clone +export GIT_TRACE_PACKET=1 +export GIT_TRACE=1 +export GIT_CURL_VERBOSE=1 # try it again if the clone is slow and the second time retried=false @@ -19,6 +26,11 @@ timeout 1800 git clone https://git.ceph.com/ceph.git trap - EXIT cd ceph +# disable the debug logs for git clone +export GIT_TRACE_PACKET=0 +export GIT_TRACE=0 +export GIT_CURL_VERBOSE=0 + versions=`seq 1 90` for v in $versions diff --git a/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh new file mode 100755 index 00000000000..827fb0a0b13 --- /dev/null +++ b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh @@ -0,0 +1,72 @@ +#!/bin/bash -ex + +# A bash script for setting up stretch mode with 5 monitors and 8 OSDs. + +NUM_OSDS_UP=$(ceph osd df | grep "up" | wc -l) + +if [ $NUM_OSDS_UP -lt 8 ]; then + echo "test requires at least 8 OSDs up and running" + exit 1 +fi + +# ensure election strategy is set to "connectivity" +# See https://tracker.ceph.com/issues/69107 +ceph mon set election_strategy connectivity + +for dc in dc1 dc2 + do + ceph osd crush add-bucket $dc datacenter + ceph osd crush move $dc root=default + done + +ceph osd crush add-bucket host01 host +ceph osd crush add-bucket host02 host +ceph osd crush add-bucket host03 host +ceph osd crush add-bucket host04 host + +ceph osd crush move host01 datacenter=dc1 +ceph osd crush move host02 datacenter=dc1 +ceph osd crush move host03 datacenter=dc2 +ceph osd crush move host04 datacenter=dc2 + +ceph osd crush move osd.0 host=host01 +ceph osd crush move osd.1 host=host01 +ceph osd crush move osd.2 host=host02 +ceph osd crush move osd.3 host=host02 +ceph osd crush move osd.4 host=host03 +ceph osd crush move osd.5 host=host03 +ceph osd crush move osd.6 host=host04 +ceph osd crush move osd.7 host=host04 + +# set location for monitors +ceph mon set_location a datacenter=dc1 host=host01 +ceph mon set_location b datacenter=dc1 host=host02 +ceph mon set_location c datacenter=dc2 host=host03 +ceph mon set_location d datacenter=dc2 host=host04 + +# set location for tiebreaker monitor +ceph mon set_location e datacenter=dc3 host=host05 + +# remove the current host from crush map +hostname=$(hostname -s) +ceph osd crush remove $hostname +# create a new crush rule with stretch rule +ceph osd getcrushmap > crushmap +crushtool --decompile crushmap > crushmap.txt +sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt +cat >> crushmap_modified.txt << EOF +rule stretch_rule { + id 2 + type replicated + step take default + step choose 
firstn 2 type datacenter + step chooseleaf firstn 2 type host + step emit +} +# end crush map +EOF + +crushtool --compile crushmap_modified.txt -o crushmap.bin +ceph osd setcrushmap -i crushmap.bin + +ceph mon enable_stretch_mode e stretch_rule datacenter diff --git a/qa/workunits/rbd/nvmeof_basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index dc6fd1669da..9e7a1f5134e 100755 --- a/qa/workunits/rbd/nvmeof_basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -38,8 +38,10 @@ disconnect_all() { connect_all() { sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600 sleep 5 - output=$(sudo nvme list --output-format=json) - if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then + expected_devices_count=$1 + actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l) + if [ "$actual_devices" -ne "$expected_devices_count" ]; then + sudo nvme list --output-format=json return 1 fi } @@ -72,11 +74,13 @@ test_run connect test_run list_subsys 1 test_run disconnect_all test_run list_subsys 0 -test_run connect_all +devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT )) +test_run connect_all $devices_count gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) test_run list_subsys $multipath_count + echo "-------------Test Summary-------------" echo "[nvmeof] All nvmeof basic tests passed!" diff --git a/qa/workunits/rbd/nvmeof_fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 57d355a6318..f7f783afc67 100755 --- a/qa/workunits/rbd/nvmeof_fio_test.sh +++ b/qa/workunits/nvmeof/fio_test.sh @@ -5,6 +5,7 @@ sudo yum -y install sysstat namespace_range_start= namespace_range_end= +random_devices_count= rbd_iostat=false while [[ $# -gt 0 ]]; do @@ -17,6 +18,10 @@ while [[ $# -gt 0 ]]; do namespace_range_end=$2 shift 2 ;; + --random_devices) + random_devices_count=$2 + shift 2 + ;; --rbd_iostat) rbd_iostat=true shift @@ -29,7 +34,7 @@ done fio_file=$(mktemp -t nvmeof-fio-XXXX) all_drives_list=$(sudo nvme list --output-format=json | - jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath') + jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace') # When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), # then fio runs on namespaces only in the defined range (which is 1 to 3 here). @@ -37,6 +42,8 @@ all_drives_list=$(sudo nvme list --output-format=json | # run on first 3 namespaces here. 
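The device and multipath expectations in the nvmeof basic tests above are simple products. With illustrative values (not taken from the test environment) of 3 subsystems, 4 namespaces each and 2 gateways, the checks expect:

    NVMEOF_SUBSYSTEMS_COUNT=3; NVMEOF_NAMESPACES_COUNT=4; gateways_count=2
    echo $((NVMEOF_NAMESPACES_COUNT * NVMEOF_SUBSYSTEMS_COUNT))   # 12 namespaces visible to connect_all
    echo $((gateways_count * NVMEOF_SUBSYSTEMS_COUNT))            # 6 subsystem paths reported by list_subsys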
if [ "$namespace_range_start" ] || [ "$namespace_range_end" ]; then selected_drives=$(echo "${all_drives_list[@]}" | sed -n "${namespace_range_start},${namespace_range_end}p") +elif [ "$random_devices_count" ]; then + selected_drives=$(echo "${all_drives_list[@]}" | shuf -n $random_devices_count) else selected_drives="${all_drives_list[@]}" fi diff --git a/qa/workunits/nvmeof/mtls_test.sh b/qa/workunits/nvmeof/mtls_test.sh new file mode 100755 index 00000000000..e13ca530e8d --- /dev/null +++ b/qa/workunits/nvmeof/mtls_test.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -ex +source /etc/ceph/nvmeof.env + +# install yq +wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /tmp/yq && chmod +x /tmp/yq + +subjectAltName=$(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | sed 's/,/,IP:/g') + +# create mtls spec files +ceph orch ls nvmeof --export > /tmp/gw-conf-original.yaml +sudo /tmp/yq ".spec.enable_auth=true | \ + .spec.root_ca_cert=\"mountcert\" | \ + .spec.client_cert = load_str(\"/etc/ceph/client.crt\") | \ + .spec.client_key = load_str(\"/etc/ceph/client.key\") | \ + .spec.server_cert = load_str(\"/etc/ceph/server.crt\") | \ + .spec.server_key = load_str(\"/etc/ceph/server.key\")" /tmp/gw-conf-original.yaml > /tmp/gw-conf-with-mtls.yaml +cp /tmp/gw-conf-original.yaml /tmp/gw-conf-without-mtls.yaml +sudo /tmp/yq '.spec.enable_auth=false' -i /tmp/gw-conf-without-mtls.yaml + +wait_for_service() { + MAX_RETRIES=30 + for ((RETRY_COUNT=1; RETRY_COUNT<=MAX_RETRIES; RETRY_COUNT++)); do + + if ceph orch ls --refresh | grep -q "nvmeof"; then + echo "Found nvmeof in the output!" + break + fi + if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then + echo "Reached maximum retries ($MAX_RETRIES). Exiting." + break + fi + sleep 5 + done + ceph orch ps + ceph orch ls --refresh +} + +# deploy mtls +cat /tmp/gw-conf-with-mtls.yaml +ceph orch apply -i /tmp/gw-conf-with-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -v /etc/ceph/server.crt:/server.crt:z -v /etc/ceph/client.crt:/client.crt:z \ + -v /etc/ceph/client.key:/client.key:z \ + -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --client-key /client.key --client-cert /client.crt --server-cert /server.crt --format json subsystem list +done + + +# remove mtls +cat /tmp/gw-conf-without-mtls.yaml +ceph orch apply -i /tmp/gw-conf-without-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --format json subsystem list +done + diff --git a/qa/workunits/nvmeof/namespace_test.sh b/qa/workunits/nvmeof/namespace_test.sh new file mode 100755 index 00000000000..ef331fd085b --- /dev/null +++ b/qa/workunits/nvmeof/namespace_test.sh @@ -0,0 +1,71 @@ +#!/bin/bash -xe + +# It's assumed in this test that each subsystem has equal number +# of namespaces (i.e. NVMEOF_NAMESPACES_COUNT ns per subsystem). +# This script then adds NEW_NAMESPACES_COUNT amount of namespaces +# to each subsystem and then deletes those new namespaces. 
+ +source /etc/ceph/nvmeof.env + +RBD_POOL="${RBD_POOL:-mypool}" +NEW_IMAGE_SIZE="${RBD_IMAGE_SIZE:-8192}" # 1024*8 +NEW_NAMESPACES_COUNT="${NEW_NAMESPACES_COUNT:-3}" + +gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) +new_images_count=$(( $NVMEOF_SUBSYSTEMS_COUNT * $NEW_NAMESPACES_COUNT)) + + +assert_namespaces_count() { + expected_count_per_subsys=$1 + actual_count=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list | + grep namespace_count | grep $expected_count_per_subsys | wc -l) + if [ "$actual_count" -ne "$NVMEOF_SUBSYSTEMS_COUNT" ]; then + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list + echo "Expected count of namepaces not found, expected (per subsystem): $expected_count_per_subsys" + return 1 + fi +} + + +# add rbd images +for i in $(seq 1 $new_images_count); do + image_name="test${i}" + rbd create $RBD_POOL/$image_name --size $NEW_IMAGE_SIZE +done + +# add new namespaces +image_index=1 +for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do + subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" + for ns in $(seq 1 $NEW_NAMESPACES_COUNT); do + image="test${image_index}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace add --subsystem $subsystem_nqn --rbd-pool $RBD_POOL --rbd-image $image --load-balancing-group $(($image_index % $gateways_count + 1)) + ((image_index++)) + done +done + +# list namespaces +for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do + subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn +done + +# verify namespaces added +expected_count_per_subsys=$(( $NEW_NAMESPACES_COUNT + $NVMEOF_NAMESPACES_COUNT )) +assert_namespaces_count $expected_count_per_subsys + +# delete namespaces +for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do + subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" + NSIDs=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json namespace list --subsystem $subsystem_nqn | + jq -r '.namespaces[] | select(.rbd_image_name | startswith("test")) | .nsid') + + for nsid in $NSIDs; do + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace del --subsystem $subsystem_nqn --nsid $nsid + done +done + +# verify namespaces deleted +expected_count_per_subsys=$NVMEOF_NAMESPACES_COUNT +assert_namespaces_count $expected_count_per_subsys + diff --git a/qa/workunits/nvmeof/scalability_test.sh b/qa/workunits/nvmeof/scalability_test.sh new file mode 100755 index 00000000000..8ede4b7eda2 --- /dev/null +++ b/qa/workunits/nvmeof/scalability_test.sh @@ -0,0 +1,66 @@ +#!/bin/bash -xe + + +GATEWAYS=$1 # exmaple "nvmeof.a,nvmeof.b" +DELAY="${SCALING_DELAYS:-50}" +POOL="${RBD_POOL:-mypool}" +GROUP="${NVMEOF_GROUP:-mygroup0}" +source /etc/ceph/nvmeof.env + +if [ -z "$GATEWAYS" ]; then + echo "At least one gateway needs to be defined for scalability test" + exit 1 +fi + +status_checks() { + expected_count=$1 + + output=$(ceph nvme-gw show $POOL $GROUP) + nvme_show=$(echo $output | grep -o '"AVAILABLE"' | wc -l) + if [ "$nvme_show" -ne "$expected_count" ]; then + 
return 1 + fi + + orch_ls=$(ceph orch ls) + if ! echo "$orch_ls" | grep -q "$expected_count/$expected_count"; then + return 1 + fi + + output=$(ceph orch ps --service-name nvmeof.$POOL.$GROUP) + orch_ps=$(echo $output | grep -o 'running' | wc -l) + if [ "$orch_ps" -ne "$expected_count" ]; then + return 1 + fi + + ceph_status=$(ceph -s) + if ! echo "$ceph_status" | grep -q "HEALTH_OK"; then + return 1 + fi +} + +total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) +scaled_down_gateways_count=$(( total_gateways_count - $(echo "$GATEWAYS" | tr -cd ',' | wc -c) - 1 )) + + +echo "[nvmeof.scale] Setting up config to remove gateways ${GATEWAYS}" +ceph orch ls --service-name nvmeof.$POOL.$GROUP --export > /tmp/nvmeof-gw.yaml +ceph orch ls nvmeof --export > /tmp/nvmeof-gw.yaml +cat /tmp/nvmeof-gw.yaml + +pattern=$(echo $GATEWAYS | sed 's/,/\\|/g') +sed "/$pattern/d" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml +cat /tmp/nvmeof-gw-new.yaml + +echo "[nvmeof.scale] Starting scale testing by removing ${GATEWAYS}" +status_checks $total_gateways_count +ceph orch apply -i /tmp/nvmeof-gw-new.yaml # downscale +ceph orch redeploy nvmeof.$POOL.$GROUP +sleep $DELAY +status_checks $scaled_down_gateways_count +echo "[nvmeof.scale] Downscale complete - removed gateways (${GATEWAYS}); now scaling back up" +ceph orch apply -i /tmp/nvmeof-gw.yaml #upscale +ceph orch redeploy nvmeof.$POOL.$GROUP +sleep $DELAY +status_checks $total_gateways_count + +echo "[nvmeof.scale] Scale testing passed for ${GATEWAYS}" diff --git a/qa/workunits/rbd/nvmeof_setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh index fb72e1d6402..b573647b1e3 100755 --- a/qa/workunits/rbd/nvmeof_setup_subsystem.sh +++ b/qa/workunits/nvmeof/setup_subsystem.sh @@ -26,14 +26,21 @@ list_subsystems () { done } +list_namespaces () { + for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do + subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn + done +} + +echo "[nvmeof] Starting subsystem setup..." 
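For the scalability_test.sh above, the scaled-down gateway count is derived by counting commas in the GATEWAYS argument. With an illustrative deployment of 4 gateways and GATEWAYS="nvmeof.a,nvmeof.b" (the example given in the script), the expected counts are:

    GATEWAYS="nvmeof.a,nvmeof.b"
    total_gateways_count=4
    removed=$(( $(echo "$GATEWAYS" | tr -cd ',' | wc -c) + 1 ))
    echo $(( total_gateways_count - removed ))   # 2 gateways should stay AVAILABLE/running after the downscale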
+ # add all subsystems for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append done -list_subsystems - # add all gateway listeners for i in "${!gateway_ips[@]}" do @@ -65,11 +72,5 @@ done list_subsystems -# list namespaces -for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do - subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn -done - echo "[nvmeof] Subsystem setup done" diff --git a/qa/workunits/rados/test_rados_tool.sh b/qa/workunits/rados/test_rados_tool.sh index b822aa2b823..9febc4a4524 100755 --- a/qa/workunits/rados/test_rados_tool.sh +++ b/qa/workunits/rados/test_rados_tool.sh @@ -329,10 +329,10 @@ test_xattr() { expect_false $RADOS_TOOL -p $POOL setxattr $OBJ 2>/dev/null expect_false $RADOS_TOOL -p $POOL setxattr $OBJ foo fooval extraarg 2>/dev/null $RADOS_TOOL -p $POOL setxattr $OBJ foo fooval - $RADOS_TOOL -p $POOL getxattr $OBJ foo > $V2 + $RADOS_TOOL -p $POOL getxattr $OBJ foo > $V2 | tr -d '\n' > $V2 cmp $V1 $V2 cat $V1 | $RADOS_TOOL -p $POOL setxattr $OBJ bar - $RADOS_TOOL -p $POOL getxattr $OBJ bar > $V2 + $RADOS_TOOL -p $POOL getxattr $OBJ bar > $V2 | tr -d '\n' > $V2 cmp $V1 $V2 $RADOS_TOOL -p $POOL listxattr $OBJ > $V1 grep -q foo $V1 diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index ca9ffde8113..0ceb9ff54cf 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash set -ex -. $(dirname $0)/../../standalone/ceph-helpers.sh - export RBD_FORCE_ALLOW_V1=1 # make sure rbd pool is EMPTY.. this is a test script!! 
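One note on the test_rados_tool.sh hunk above: in "getxattr $OBJ foo > $V2 | tr -d '\n' > $V2" the first redirection already captures stdout, so tr reads nothing from the pipe and both processes truncate $V2. The presumable intent (an assumption, not part of the recorded commit) is to strip a trailing newline before the comparison:

    $RADOS_TOOL -p $POOL getxattr $OBJ foo | tr -d '\n' > $V2
    cmp $V1 $V2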
@@ -916,6 +914,11 @@ test_namespace() { rbd group create rbd/test1/group1 rbd group image add rbd/test1/group1 rbd/test1/image1 + rbd group image add --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image2 + rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image1 + rbd group image rm rbd/test1/group1 rbd/test1/image2 rbd group rm rbd/test1/group1 rbd trash move rbd/test1/image1 @@ -935,7 +938,7 @@ get_migration_state() { local image=$1 rbd --format xml status $image | - $XMLSTARLET sel -t -v '//status/migration/state' + xmlstarlet sel -t -v '//status/migration/state' } test_migration() { @@ -1175,14 +1178,14 @@ test_trash_purge_schedule() { for i in `seq 12`; do test "$(rbd trash purge schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd' && break + xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd' && break sleep 10 done rbd trash purge schedule status test "$(rbd trash purge schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd' + xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd' test "$(rbd trash purge schedule status -p rbd --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd' + xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd' rbd trash purge schedule add 2d 00:17 rbd trash purge schedule ls | grep 'every 2d starting at 00:17' @@ -1191,36 +1194,36 @@ test_trash_purge_schedule() { rbd trash purge schedule ls -p rbd2 -R | grep 'every 2d starting at 00:17' rbd trash purge schedule ls -p rbd2/ns1 -R | grep 'every 2d starting at 00:17' test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml | - $XMLSTARLET sel -t -v '//schedules/schedule/pool')" = "-" + xmlstarlet sel -t -v '//schedules/schedule/pool')" = "-" test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml | - $XMLSTARLET sel -t -v '//schedules/schedule/namespace')" = "-" + xmlstarlet sel -t -v '//schedules/schedule/namespace')" = "-" test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml | - $XMLSTARLET sel -t -v '//schedules/schedule/items/item/start_time')" = "00:17:00" + xmlstarlet sel -t -v '//schedules/schedule/items/item/start_time')" = "00:17:00" for i in `seq 12`; do rbd trash purge schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool' | grep 'rbd2' && break + xmlstarlet sel -t -v '//scheduled/item/pool' | grep 'rbd2' && break sleep 10 done rbd trash purge schedule status rbd trash purge schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool' | grep 'rbd2' + xmlstarlet sel -t -v '//scheduled/item/pool' | grep 'rbd2' echo $(rbd trash purge schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool') | grep 'rbd rbd2 rbd2' + xmlstarlet sel -t -v '//scheduled/item/pool') | grep 'rbd rbd2 rbd2' test "$(rbd trash purge schedule status -p rbd --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd' + xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd' test "$(echo $(rbd trash purge schedule status -p rbd2 --format xml | - $XMLSTARLET sel -t -v '//scheduled/item/pool'))" = 'rbd2 rbd2' + xmlstarlet sel -t -v '//scheduled/item/pool'))" = 'rbd2 rbd2' test "$(echo $(rbd trash purge schedule ls -R --format xml | - $XMLSTARLET sel -t -v '//schedules/schedule/items'))" = "2d00:17:00 1d01:30:00" + xmlstarlet sel -t -v '//schedules/schedule/items'))" = "2d00:17:00 1d01:30:00" rbd trash purge 
schedule add 1d rbd trash purge schedule ls | grep 'every 2d starting at 00:17' rbd trash purge schedule ls | grep 'every 1d' rbd trash purge schedule ls -R --format xml | - $XMLSTARLET sel -t -v '//schedules/schedule/items' | grep '2d00:17' + xmlstarlet sel -t -v '//schedules/schedule/items' | grep '2d00:17' rbd trash purge schedule rm 1d rbd trash purge schedule ls | grep 'every 2d starting at 00:17' @@ -1362,13 +1365,13 @@ test_mirror_snapshot_schedule() { rbd mirror snapshot schedule status test "$(rbd mirror snapshot schedule status --format xml | - $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' + xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' test "$(rbd mirror snapshot schedule status -p rbd2 --format xml | - $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' + xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --format xml | - $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' + xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --image test1 --format xml | - $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' + xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1' rbd mirror image demote rbd2/ns1/test1 for i in `seq 12`; do diff --git a/qa/workunits/rbd/cli_migration.sh b/qa/workunits/rbd/cli_migration.sh index be8e031fd1b..3af19420957 100755 --- a/qa/workunits/rbd/cli_migration.sh +++ b/qa/workunits/rbd/cli_migration.sh @@ -1,17 +1,20 @@ #!/usr/bin/env bash set -ex -. $(dirname $0)/../../standalone/ceph-helpers.sh - TEMPDIR= IMAGE1=image1 IMAGE2=image2 IMAGE3=image3 -IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3}" +NAMESPACE1=namespace1 +NAMESPACE2=namespace2 +NAMESPACES="${NAMESPACE1} ${NAMESPACE2}" +IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3} rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}" cleanup() { + kill_nbd_server cleanup_tempdir remove_images + remove_namespaces } setup_tempdir() { @@ -22,10 +25,17 @@ cleanup_tempdir() { rm -rf ${TEMPDIR} } +expect_false() { + if "$@"; then return 1; else return 0; fi +} + create_base_image() { local image=$1 - rbd create --size 1G ${image} + # size is not a multiple of object size to trigger an edge case in + # list-snaps + rbd create --size 1025M ${image} + rbd bench --io-type write --io-pattern rand --io-size=4K --io-total 256M ${image} rbd snap create ${image}@1 rbd bench --io-type write --io-pattern rand --io-size=4K --io-total 64M ${image} @@ -36,8 +46,11 @@ create_base_image() { export_raw_image() { local image=$1 - rm -rf "${TEMPDIR}/${image}" - rbd export ${image} "${TEMPDIR}/${image}" + # Replace slashes (/) with underscores (_) for namespace images + local export_image="${image//\//_}" + + rm -rf "${TEMPDIR}/${export_image}" + rbd export "${image}" "${TEMPDIR}/${export_image}" } export_base_image() { @@ -63,6 +76,17 @@ remove_images() { done } +remove_namespaces() { + for namespace in ${NAMESPACES} + do + rbd namespace remove rbd/${namespace} || true + done +} + +kill_nbd_server() { + pkill -9 qemu-nbd || true +} + show_diff() { local file1=$1 @@ -80,6 +104,11 @@ compare_images() { local ret=0 export_raw_image ${dst_image} + + # Replace slashes (/) with underscores (_) for namespace images + src_image="${src_image//\//_}" + dst_image="${dst_image//\//_}" + if ! 
cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" then show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" @@ -89,18 +118,26 @@ compare_images() { } test_import_native_format() { - local base_image=$1 - local dest_image=$2 + local base_image_spec=$1 + local dest_image_spec=$2 + + # if base image is from namespace + local base_namespace="" + local base_image=${base_image_spec} + if [[ "${base_image_spec}" == rbd/*/* ]]; then + base_namespace=$(basename "$(dirname "${base_image_spec}")") + base_image=$(basename "${base_image_spec}") + fi - rbd migration prepare --import-only "rbd/${base_image}@2" ${dest_image} - rbd migration abort ${dest_image} + rbd migration prepare --import-only "${base_image_spec}@2" ${dest_image_spec} + rbd migration abort ${dest_image_spec} local pool_id=$(ceph osd pool ls detail --format xml | xmlstarlet sel -t -v "//pools/pool[pool_name='rbd']/pool_id") cat > ${TEMPDIR}/spec.json <<EOF { "type": "native", "pool_id": ${pool_id}, - "pool_namespace": "", + "pool_namespace": "${base_namespace}", "image_name": "${base_image}", "snap_name": "2" } @@ -108,37 +145,85 @@ EOF cat ${TEMPDIR}/spec.json rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - rbd migration abort ${dest_image} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} - rbd migration execute ${dest_image} - - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} + rbd migration execute ${dest_image_spec} + + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" + + rbd migration abort ${dest_image_spec} + + # no snap name or snap id + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\"}" \ + ${dest_image_spec} + + # invalid source spec JSON + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \ + ${dest_image_spec} + + # non-existing snap name + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \ + ${dest_image_spec} + + # invalid snap name + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": 123456}" \ + ${dest_image_spec} + + # non-existing snap id passed as int + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \ + ${dest_image_spec} + + # non-existing snap id passed as string + expect_false rbd 
migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \ + ${dest_image_spec} + + # invalid snap id + expect_false rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \ + ${dest_image_spec} + + # snap id passed as int + local snap_id=$(rbd snap ls ${base_image_spec} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id") + rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} - rbd migration abort ${dest_image} + # snap id passed as string + rbd migration prepare --import-only \ + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": "${pool_id}", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration execute ${dest_image} - rbd migration commit ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + ${dest_image_spec} + rbd migration execute ${dest_image_spec} + rbd migration commit ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - remove_image "${dest_image}" + remove_image "${dest_image_spec}" } test_import_qcow_format() { @@ -279,12 +364,12 @@ EOF cat ${TEMPDIR}/spec.json cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \ - --source-spec-path - ${dest_image} + --source-spec-path - ${dest_image} compare_images ${base_image} ${dest_image} rbd migration abort ${dest_image} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} rbd migration execute ${dest_image} rbd migration commit ${dest_image} @@ -340,6 +425,177 @@ EOF remove_image "${dest_image}" } +test_import_nbd_stream_qcow2() { + local base_image=$1 + local dest_image=$2 + + qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork \ + ${TEMPDIR}/${base_image}.qcow2 + + cat > ${TEMPDIR}/spec.json <<EOF +{ + "type": "raw", + "stream": { + "type": "nbd", + "uri": "nbd://localhost" + } +} +EOF + cat ${TEMPDIR}/spec.json + + cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \ + --source-spec-path - ${dest_image} + 
compare_images ${base_image} ${dest_image} + rbd migration abort ${dest_image} + + rbd migration prepare --import-only \ + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + compare_images ${base_image} ${dest_image} + rbd migration execute ${dest_image} + compare_images ${base_image} ${dest_image} + rbd migration commit ${dest_image} + compare_images ${base_image} ${dest_image} + remove_image "${dest_image}" + + # shortest possible URI + rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://"}}' \ + ${dest_image} + rbd migration abort ${dest_image} + + # non-existing export name + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd:///myexport"}}' \ + ${dest_image} + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://localhost/myexport"}}' \ + ${dest_image} + + kill_nbd_server + qemu-nbd --export-name myexport -f qcow2 --read-only --shared 10 --persistent --fork \ + ${TEMPDIR}/${base_image}.qcow2 + + rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd:///myexport"}}' \ + ${dest_image} + rbd migration abort ${dest_image} + + rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://localhost/myexport"}}' \ + ${dest_image} + rbd migration abort ${dest_image} + + kill_nbd_server + + # server not running + expect_false rbd migration prepare --import-only \ + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://"}}' \ + ${dest_image} + + # no URI + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd"}}' \ + ${dest_image} + + # invalid URI + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": 123456}}' \ + ${dest_image} + + # libnbd - nbd_get_errno() returns an error + # nbd_connect_uri: unknown URI scheme: NULL: Invalid argument (errno = 22) + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": ""}}' \ + ${dest_image} + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "foo.example.com"}}' \ + ${dest_image} + + # libnbd - nbd_get_errno() returns 0, EIO fallback + # nbd_connect_uri: getaddrinfo: foo.example.com:10809: Name or service not known (errno = 0) + expect_false rbd migration prepare --import-only \ + --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://foo.example.com"}}' \ + ${dest_image} +} + +test_import_nbd_stream_raw() { + local base_image=$1 + local dest_image=$2 + + qemu-nbd -f raw --read-only --shared 10 --persistent --fork \ + --socket ${TEMPDIR}/qemu-nbd-${base_image} ${TEMPDIR}/${base_image} + qemu-nbd -f raw --read-only --shared 10 --persistent --fork \ + --socket ${TEMPDIR}/qemu-nbd-${base_image}@1 ${TEMPDIR}/${base_image}@1 + qemu-nbd -f raw --read-only --shared 10 --persistent --fork \ + --socket ${TEMPDIR}/qemu-nbd-${base_image}@2 ${TEMPDIR}/${base_image}@2 + + cat > ${TEMPDIR}/spec.json <<EOF +{ + "type": "raw", + "stream": { + "type": "nbd", + "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}" + }, + "snapshots": [{ + "type": "raw", + "name": "snap1", + 
"stream": { + "type": "nbd", + "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}@1" + } + }, { + "type": "raw", + "name": "snap2", + "stream": { + "type": "nbd", + "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}@2" + } + }] +} +EOF + cat ${TEMPDIR}/spec.json + + rbd migration prepare --import-only \ + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + + rbd snap create ${dest_image}@head + rbd bench --io-type write --io-pattern rand --io-size 32K --io-total 4M ${dest_image} + + compare_images "${base_image}@1" "${dest_image}@snap1" + compare_images "${base_image}@2" "${dest_image}@snap2" + compare_images "${base_image}" "${dest_image}@head" + + rbd migration abort ${dest_image} + + cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \ + --source-spec-path - ${dest_image} + + rbd snap create ${dest_image}@head + rbd bench --io-type write --io-pattern rand --io-size 64K --io-total 8M ${dest_image} + + compare_images "${base_image}@1" "${dest_image}@snap1" + compare_images "${base_image}@2" "${dest_image}@snap2" + compare_images "${base_image}" "${dest_image}@head" + + rbd migration execute ${dest_image} + + compare_images "${base_image}@1" "${dest_image}@snap1" + compare_images "${base_image}@2" "${dest_image}@snap2" + compare_images "${base_image}" "${dest_image}@head" + + rbd migration commit ${dest_image} + + compare_images "${base_image}@1" "${dest_image}@snap1" + compare_images "${base_image}@2" "${dest_image}@snap2" + compare_images "${base_image}" "${dest_image}@head" + + remove_image "${dest_image}" + + kill_nbd_server +} + # make sure rbd pool is EMPTY.. this is a test script!! rbd ls 2>&1 | wc -l | grep -v '^0$' && echo "nonempty rbd pool, aborting! run this script on an empty test cluster only." && exit 1 @@ -351,7 +607,25 @@ export_base_image ${IMAGE1} test_import_native_format ${IMAGE1} ${IMAGE2} test_import_qcow_format ${IMAGE1} ${IMAGE2} + test_import_qcow2_format ${IMAGE2} ${IMAGE3} +test_import_nbd_stream_qcow2 ${IMAGE2} ${IMAGE3} + test_import_raw_format ${IMAGE1} ${IMAGE2} +test_import_nbd_stream_raw ${IMAGE1} ${IMAGE2} + +rbd namespace create rbd/${NAMESPACE1} +rbd namespace create rbd/${NAMESPACE2} +create_base_image rbd/${NAMESPACE1}/${IMAGE1} +export_base_image rbd/${NAMESPACE1}/${IMAGE1} + +# Migration from namespace to namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} + +# Migration from namespace to non-namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} ${IMAGE2} + +# Migration from non-namespace to namespace +test_import_native_format ${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} echo OK diff --git a/qa/workunits/rbd/journal.sh b/qa/workunits/rbd/journal.sh index ba89e75c926..7652a274243 100755 --- a/qa/workunits/rbd/journal.sh +++ b/qa/workunits/rbd/journal.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash set -e -. 
$(dirname $0)/../../standalone/ceph-helpers.sh - function list_tests() { echo "AVAILABLE TESTS" @@ -45,7 +43,7 @@ test_rbd_journal() rbd create --image-feature exclusive-lock --image-feature journaling \ --size 128 ${image} local journal=$(rbd info ${image} --format=xml 2>/dev/null | - $XMLSTARLET sel -t -v "//image/journal") + xmlstarlet sel -t -v "//image/journal") test -n "${journal}" rbd journal info ${journal} rbd journal info --journal ${journal} @@ -54,14 +52,14 @@ test_rbd_journal() rbd feature disable ${image} journaling rbd info ${image} --format=xml 2>/dev/null | - expect_false $XMLSTARLET sel -t -v "//image/journal" + expect_false xmlstarlet sel -t -v "//image/journal" expect_false rbd journal info ${journal} expect_false rbd journal info --image ${image} rbd feature enable ${image} journaling local journal1=$(rbd info ${image} --format=xml 2>/dev/null | - $XMLSTARLET sel -t -v "//image/journal") + xmlstarlet sel -t -v "//image/journal") test "${journal}" = "${journal1}" rbd journal info ${journal} @@ -89,7 +87,7 @@ test_rbd_journal() rbd create --image-feature exclusive-lock --image-feature journaling \ --size 128 ${image1} journal1=$(rbd info ${image1} --format=xml 2>/dev/null | - $XMLSTARLET sel -t -v "//image/journal") + xmlstarlet sel -t -v "//image/journal") save_commit_position ${journal1} rbd journal import --dest ${image1} $TMPDIR/journal.export @@ -130,7 +128,7 @@ rbd_assert_eq() { local expected_val=$4 local val=$(rbd --format xml ${cmd} --image ${image} | - $XMLSTARLET sel -t -v "${param}") + xmlstarlet sel -t -v "${param}") test "${val}" = "${expected_val}" } diff --git a/qa/workunits/rbd/luks-encryption.sh b/qa/workunits/rbd/luks-encryption.sh index 97cb5a0fe87..b6305cb46c6 100755 --- a/qa/workunits/rbd/luks-encryption.sh +++ b/qa/workunits/rbd/luks-encryption.sh @@ -2,7 +2,7 @@ set -ex CEPH_ID=${CEPH_ID:-admin} -TMP_FILES="/tmp/passphrase /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2" +TMP_FILES="/tmp/passphrase /tmp/passphrase1 /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2" _sudo() { @@ -278,8 +278,7 @@ function test_migration_clone() { rbd migration prepare testimg1 testimg2 # test reading - # FIXME: https://tracker.ceph.com/issues/63184 - LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) cmp $LIBRBD_DEV /tmp/cmpdata # trigger copyup for an unwritten area @@ -297,8 +296,7 @@ function test_migration_clone() { _sudo rbd device unmap -t nbd $LIBRBD_DEV # test reading on a fresh mapping - # FIXME: https://tracker.ceph.com/issues/63184 - LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) cmp $LIBRBD_DEV /tmp/cmpdata _sudo rbd device unmap -t nbd $LIBRBD_DEV @@ -320,6 +318,85 @@ function test_migration_clone() { rbd rm testimg } +function test_migration_open_clone_chain() { + rbd create --size 32M testimg + rbd encryption format testimg luks1 /tmp/passphrase + rbd snap create testimg@snap + rbd snap protect testimg@snap + + rbd 
clone testimg@snap testimg1 + rbd encryption format testimg1 luks2 /tmp/passphrase1 + rbd snap create testimg1@snap + rbd snap protect testimg1@snap + + rbd clone testimg1@snap testimg2 + rbd encryption format testimg2 luks1 /tmp/passphrase2 + + # 1. X <-- X <-- X + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + + # 2. X <-- X <-- migrating + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + + # 3. X <-- migrating <-- X + rbd migration prepare testimg1 testimg1 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg1 + + # 4. migrating <-- X <-- X + rbd migration prepare testimg testimg + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg + + # 5. migrating <-- migrating <-- X + rbd migration prepare testimg testimg + rbd migration prepare testimg1 testimg1 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg1 + rbd migration abort testimg + + # 6. migrating <-- X <-- migrating + rbd migration prepare testimg testimg + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + rbd migration abort testimg + + # 7. X <-- migrating <-- migrating + rbd migration prepare testimg1 testimg1 + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + rbd migration abort testimg1 + + # 8. 
migrating <-- migrating <-- migrating + rbd migration prepare testimg testimg + rbd migration prepare testimg1 testimg1 + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + + rbd migration abort testimg2 + rbd rm testimg2 + rbd migration abort testimg1 + rbd snap unprotect testimg1@snap + rbd snap rm testimg1@snap + rbd rm testimg1 + rbd migration abort testimg + rbd snap unprotect testimg@snap + rbd snap rm testimg@snap + rbd rm testimg +} + function get_nbd_device_paths { rbd device list -t nbd | tail -n +2 | egrep "\s+rbd\s+testimg" | awk '{print $5;}' } @@ -343,6 +420,7 @@ function clean_up { rbd snap unprotect testimg1@snap || true rbd snap remove testimg1@snap || true rbd remove testimg1 || true + rbd migration abort testimg || true rbd snap remove testimg@snap2 || true rbd snap remove testimg@snap1 || true rbd snap unprotect testimg@snap || true @@ -371,6 +449,7 @@ dd if=/dev/urandom of=/tmp/testdata2 bs=4M count=4 # create passphrase files printf "pass\0word\n" > /tmp/passphrase +printf " passwo\nrd 1,1" > /tmp/passphrase1 printf "\t password2 " > /tmp/passphrase2 # create an image @@ -401,4 +480,6 @@ test_migration_clone luks1 rbd create --size 48M testimg test_migration_clone luks2 +test_migration_open_clone_chain + echo OK diff --git a/qa/workunits/rbd/rbd-ggate.sh b/qa/workunits/rbd/rbd-ggate.sh index 1bf89da382c..d1dd00e4e2d 100755 --- a/qa/workunits/rbd/rbd-ggate.sh +++ b/qa/workunits/rbd/rbd-ggate.sh @@ -7,15 +7,6 @@ SIZE=64 DATA= DEV= -if which xmlstarlet > /dev/null 2>&1; then - XMLSTARLET=xmlstarlet -elif which xml > /dev/null 2>&1; then - XMLSTARLET=xml -else - echo "Missing xmlstarlet binary!" - exit 1 -fi - if [ `uname -K` -ge 1200078 ] ; then RBD_GGATE_RESIZE_SUPPORTED=1 fi @@ -148,16 +139,16 @@ _sudo sync echo trim test provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -eq "${provisioned}" ] _sudo newfs -E ${DEV} _sudo sync provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -lt "${provisioned}" ] echo resize test diff --git a/qa/workunits/rbd/rbd-nbd.sh b/qa/workunits/rbd/rbd-nbd.sh index 98b3aff1370..1f9acd14492 100755 --- a/qa/workunits/rbd/rbd-nbd.sh +++ b/qa/workunits/rbd/rbd-nbd.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash set -ex -. 
$(dirname $0)/../../standalone/ceph-helpers.sh - POOL=rbd ANOTHER_POOL=new_default_pool$$ NS=ns @@ -105,7 +103,7 @@ function get_pid() local pool=$1 local ns=$2 - PID=$(rbd device --device-type nbd --format xml list | $XMLSTARLET sel -t -v \ + PID=$(rbd device --device-type nbd --format xml list | xmlstarlet sel -t -v \ "//devices/device[pool='${pool}'][namespace='${ns}'][image='${IMAGE}'][device='${DEV}']/id") test -n "${PID}" || return 1 ps -p ${PID} -C rbd-nbd @@ -172,17 +170,17 @@ unmap_device ${DEV} ${PID} DEV=`_sudo rbd device --device-type nbd --options notrim map ${POOL}/${IMAGE}` get_pid ${POOL} provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -eq "${provisioned}" ] # should fail discard as at time of mapping notrim was used expect_false _sudo blkdiscard ${DEV} sync provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -eq "${provisioned}" ] unmap_device ${DEV} ${PID} @@ -190,17 +188,17 @@ unmap_device ${DEV} ${PID} DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` get_pid ${POOL} provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -eq "${provisioned}" ] # should honor discard as at time of mapping trim was considered by default _sudo blkdiscard ${DEV} sync provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .` used=`rbd -p ${POOL} --format xml du ${IMAGE} | - $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .` + xmlstarlet sel -t -m "//stats/images/image/used_size" -v .` [ "${used}" -lt "${provisioned}" ] unmap_device ${DEV} ${PID} diff --git a/qa/workunits/rbd/rbd_groups.sh b/qa/workunits/rbd/rbd_groups.sh index 450095dabfc..ee3cb506740 100755 --- a/qa/workunits/rbd/rbd_groups.sh +++ b/qa/workunits/rbd/rbd_groups.sh @@ -210,15 +210,32 @@ check_snapshot_info() local snap_name=$2 local image_count=$3 - local snap_info=$(rbd group snap info $group_name@$snap_name --format=json) - local actual_snap_name=$(jq -r ".name" <<< "$snap_info") + local snap_info_json=$( + rbd group snap info $group_name@$snap_name --format=json) + local actual_snap_name=$(jq -r ".name" <<< "$snap_info_json") test "$actual_snap_name" = "$snap_name" || return 1 - local snap_state=$(jq -r ".state" <<< "$snap_info") + local snap_state=$(jq -r ".state" <<< "$snap_info_json") test "$snap_state" = "complete" || return 1 - local actual_image_count=$(jq '.images | length' <<< "$snap_info") - test "$actual_image_count" = "$image_count" + local actual_image_count=$(jq '.images | length' <<< 
"$snap_info_json") + test "$actual_image_count" = "$image_count" || return 1 + + local image_snap_name=$(jq -r '.image_snap_name' <<< "$snap_info_json") + local snap_info=$(rbd group snap info $group_name@$snap_name) + local snap_state=$(grep -w 'state:' <<< "$snap_info" | tr -d '\t') + test "$snap_state" = "state: complete" || return 1 + local image_snap_field=$(grep -w 'image snap:' <<< "$snap_info") + local images_field=$(grep -w 'images:' <<< "$snap_info") + if ((image_count != 0)); then + test -n "$image_snap_name" || return 1 + test -n "$image_snap_field" || return 1 + test -n "$images_field" || return 1 + else + test -z "$image_snap_name" || return 1 + test -z "$image_snap_field" || return 1 + test -z "$images_field" || return 1 + fi } echo "TEST: create remove consistency group" diff --git a/qa/workunits/rbd/rbd_mirror.sh b/qa/workunits/rbd/rbd_mirror.sh index 1cda355039e..90d5204b92f 100755 --- a/qa/workunits/rbd/rbd_mirror.sh +++ b/qa/workunits/rbd/rbd_mirror.sh @@ -37,12 +37,12 @@ set_image_meta ${CLUSTER2} ${POOL} ${image} "key1" "value1" set_image_meta ${CLUSTER2} ${POOL} ${image} "key2" "value2" wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'down+unknown' fi -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} compare_image_meta ${CLUSTER1} ${POOL} ${image} "key1" "value1" compare_image_meta ${CLUSTER1} ${POOL} ${image} "key2" "value2" @@ -53,19 +53,19 @@ create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${image1} ${RBD_MIRROR_MODE} write_image ${CLUSTER2} ${POOL} ${image1} 100 start_mirrors ${CLUSTER1} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image1} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image1} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image1} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1} if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image1} 'down+unknown' fi -compare_images ${POOL} ${image1} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image1} testlog "TEST: test the first image is replaying after restart" write_image ${CLUSTER2} ${POOL} ${image} 100 wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then testlog "TEST: stop/start/restart mirror via admin socket" @@ -173,7 +173,7 @@ wait_for_image_in_omap ${CLUSTER2} ${POOL} create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${image} ${RBD_MIRROR_MODE} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' testlog "TEST: failover and failback" @@ -187,10 +187,10 
@@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} # failover (unmodified) demote_image ${CLUSTER2} ${POOL} ${image} @@ -207,10 +207,10 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} # failover demote_image ${CLUSTER2} ${POOL} ${image} @@ -220,10 +220,10 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} write_image ${CLUSTER1} ${POOL} ${image} 100 -wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} # failback demote_image ${CLUSTER1} ${POOL} ${image} @@ -233,10 +233,10 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} testlog "TEST: failover / failback loop" for i in `seq 1 20`; do @@ -246,7 +246,7 @@ for i in `seq 1 20`; do wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying' demote_image ${CLUSTER1} ${POOL} ${image} @@ -255,7 +255,7 @@ for i in `seq 1 20`; do wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + 
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' done @@ -271,7 +271,7 @@ create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${force_promote_image} ${RBD_ write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100 wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${force_promote_image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${force_promote_image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${force_promote_image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${force_promote_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${force_promote_image} wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${force_promote_image} 'up+stopped' promote_image ${CLUSTER1} ${POOL} ${force_promote_image} '--force' @@ -302,14 +302,14 @@ else enable_mirror ${CLUSTER2} ${PARENT_POOL} ${parent_image} ${RBD_MIRROR_MODE} fi wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} ${parent_image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${parent_image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} ${parent_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} ${parent_image} -compare_images ${PARENT_POOL} ${parent_image} +compare_images ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} ${parent_image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${clone_image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${clone_image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${clone_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${clone_image} -compare_images ${POOL} ${clone_image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${clone_image} remove_image_retry ${CLUSTER2} ${POOL} ${clone_image} testlog " - clone v1" @@ -383,11 +383,11 @@ create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap1' write_image ${CLUSTER2} ${POOL} ${dp_image} 100 create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap2' write_image ${CLUSTER2} ${POOL} ${dp_image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${dp_image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${dp_image} -compare_images ${POOL} ${dp_image}@snap1 -compare_images ${POOL} ${dp_image}@snap2 -compare_images ${POOL} ${dp_image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}@snap1 +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}@snap2 +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image} remove_image_retry ${CLUSTER2} ${POOL} ${dp_image} testlog "TEST: disable mirroring / delete non-primary image" @@ -436,8 +436,8 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then wait_for_image_present ${CLUSTER1} ${POOL} ${i} 'present' wait_for_snap_present ${CLUSTER1} ${POOL} ${i} 'snap2' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${i} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${i} - compare_images ${POOL} ${i} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${i} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${i} done testlog "TEST: remove mirroring pool" @@ -454,9 +454,9 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then create_image ${CLUSTER2} ${POOL} ${rdp_image} 128 --data-pool ${pool} write_image ${CLUSTER2} ${pool} ${image} 
100 write_image ${CLUSTER2} ${POOL} ${rdp_image} 100 - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${pool} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${pool} ${pool} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${pool} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${rdp_image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${rdp_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${rdp_image} for cluster in ${CLUSTER1} ${CLUSTER2}; do CEPH_ARGS='' ceph --cluster ${cluster} osd pool rm ${pool} ${pool} --yes-i-really-really-mean-it @@ -519,12 +519,12 @@ wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS1} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS2} ${image} write_image ${CLUSTER2} ${POOL}/${NS1} ${image} 100 write_image ${CLUSTER2} ${POOL}/${NS2} ${image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${POOL}/${NS2} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS1} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS2} ${image} -compare_images ${POOL}/${NS1} ${image} -compare_images ${POOL}/${NS2} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${POOL}/${NS2} ${image} testlog " - disable mirroring / delete image" remove_image_retry ${CLUSTER2} ${POOL}/${NS1} ${image} @@ -533,6 +533,40 @@ wait_for_image_present ${CLUSTER1} ${POOL}/${NS1} ${image} 'deleted' wait_for_image_present ${CLUSTER1} ${POOL}/${NS2} ${image} 'deleted' remove_image_retry ${CLUSTER2} ${POOL}/${NS2} ${image} +testlog "TEST: mirror to a different remote namespace" +testlog " - replay" +NS3=ns3 +NS4=ns4 +rbd --cluster ${CLUSTER1} namespace create ${POOL}/${NS3} +rbd --cluster ${CLUSTER2} namespace create ${POOL}/${NS4} +rbd --cluster ${CLUSTER1} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE} --remote-namespace ${NS4} +rbd --cluster ${CLUSTER2} mirror pool enable ${POOL}/${NS4} ${MIRROR_POOL_MODE} --remote-namespace ${NS3} +create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS4} ${image} ${RBD_MIRROR_MODE} +wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS3} ${image} +write_image ${CLUSTER2} ${POOL}/${NS4} ${image} 100 +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image} +wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS3} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image} + +testlog " - disable mirroring and re-enable without remote-namespace" +remove_image_retry ${CLUSTER2} ${POOL}/${NS4} ${image} +wait_for_image_present ${CLUSTER1} ${POOL}/${NS3} ${image} 'deleted' +rbd --cluster ${CLUSTER1} mirror pool disable ${POOL}/${NS3} +rbd --cluster ${CLUSTER2} mirror pool disable ${POOL}/${NS4} +rbd --cluster ${CLUSTER2} namespace create ${POOL}/${NS3} +rbd --cluster ${CLUSTER2} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE} +rbd --cluster ${CLUSTER1} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE} +create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS3} ${image} ${RBD_MIRROR_MODE} +wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS3} ${image} +write_image ${CLUSTER2} ${POOL}/${NS3} ${image} 100 
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS3} ${image} +wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS3} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS3} ${image} +remove_image_retry ${CLUSTER2} ${POOL}/${NS3} ${image} +wait_for_image_present ${CLUSTER1} ${POOL}/${NS3} ${image} 'deleted' +rbd --cluster ${CLUSTER1} mirror pool disable ${POOL}/${NS3} +rbd --cluster ${CLUSTER2} mirror pool disable ${POOL}/${NS3} + testlog " - data pool" dp_image=test_data_pool create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS1} ${dp_image} ${RBD_MIRROR_MODE} 128 --data-pool ${PARENT_POOL} @@ -542,9 +576,9 @@ wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS1} ${dp_image} data_pool=$(get_image_data_pool ${CLUSTER1} ${POOL}/${NS1} ${dp_image}) test "${data_pool}" = "${PARENT_POOL}" write_image ${CLUSTER2} ${POOL}/${NS1} ${dp_image} 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${dp_image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${dp_image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS1} ${dp_image} -compare_images ${POOL}/${NS1} ${dp_image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${dp_image} remove_image_retry ${CLUSTER2} ${POOL}/${NS1} ${dp_image} testlog "TEST: simple image resync" @@ -553,7 +587,7 @@ wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then testlog "TEST: image resync while replayer is stopped" @@ -566,7 +600,7 @@ if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} - compare_images ${POOL} ${image} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} fi testlog "TEST: request image resync while daemon is offline" @@ -577,7 +611,7 @@ wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} -compare_images ${POOL} ${image} +compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} remove_image_retry ${CLUSTER2} ${POOL} ${image} if [ "${RBD_MIRROR_MODE}" = "journal" ]; then @@ -588,7 +622,7 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then testlog " - replay stopped after disconnect" wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" disconnect_image ${CLUSTER2} ${POOL} ${image} test -z "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" @@ -600,9 +634,9 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete 
${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" - compare_images ${POOL} ${image} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} testlog " - disconnected after max_concurrent_object_sets reached" if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then @@ -628,25 +662,25 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" - compare_images ${POOL} ${image} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} testlog " - rbd_mirroring_resync_after_disconnect config option" set_image_meta ${CLUSTER2} ${POOL} ${image} \ conf_rbd_mirroring_resync_after_disconnect true - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} image_id=$(get_image_id ${CLUSTER1} ${POOL} ${image}) disconnect_image ${CLUSTER2} ${POOL} ${image} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id} wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present' wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" - compare_images ${POOL} ${image} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} set_image_meta ${CLUSTER2} ${POOL} ${image} \ conf_rbd_mirroring_resync_after_disconnect false - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} disconnect_image ${CLUSTER2} ${POOL} ${image} test -z "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})" wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh index 412e84c88a6..3ddb0aa219b 100755 --- a/qa/workunits/rbd/rbd_mirror_bootstrap.sh +++ b/qa/workunits/rbd/rbd_mirror_bootstrap.sh @@ -38,7 +38,7 @@ create_image_and_enable_mirror ${CLUSTER1} ${POOL} image1 wait_for_image_replay_started ${CLUSTER2} ${POOL} image1 write_image ${CLUSTER1} ${POOL} image1 100 -wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} image1 +wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} image1 wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${POOL} image1 testlog "TEST: verify rx-tx direction" @@ -54,12 +54,12 @@ enable_mirror ${CLUSTER2} ${PARENT_POOL} image2 wait_for_image_replay_started ${CLUSTER2} ${PARENT_POOL} image1 write_image ${CLUSTER1} ${PARENT_POOL} image1 100 -wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1 +wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} ${PARENT_POOL} image1 wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${PARENT_POOL} image1 wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2 write_image ${CLUSTER2} ${PARENT_POOL} image2 100 -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2 
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} image2 wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} image2 testlog "TEST: pool replayer and callout cleanup when peer is updated" diff --git a/qa/workunits/rbd/rbd_mirror_ha.sh b/qa/workunits/rbd/rbd_mirror_ha.sh index 1e43712a631..e5a086b82ab 100755 --- a/qa/workunits/rbd/rbd_mirror_ha.sh +++ b/qa/workunits/rbd/rbd_mirror_ha.sh @@ -71,7 +71,7 @@ test_replay() wait_for_image_replay_started ${CLUSTER1}:${LEADER} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 wait_for_replay_complete ${CLUSTER1}:${LEADER} ${CLUSTER2} ${POOL} \ - ${image} + ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' \ 'primary_position' \ "${MIRROR_USER_ID_PREFIX}${LEADER} on $(hostname -s)" @@ -79,7 +79,7 @@ test_replay() wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} \ 'down+unknown' fi - compare_images ${POOL} ${image} + compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} done } diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh index abb1d17c8df..1b1436db74d 100755 --- a/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -72,15 +72,6 @@ # ../qa/workunits/rbd/rbd_mirror_helpers.sh cleanup # -if type xmlstarlet > /dev/null 2>&1; then - XMLSTARLET=xmlstarlet -elif type xml > /dev/null 2>&1; then - XMLSTARLET=xml -else - echo "Missing xmlstarlet binary!" - exit 1 -fi - RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-2} CLUSTER1=cluster1 @@ -752,17 +743,18 @@ wait_for_journal_replay_complete() { local local_cluster=$1 local cluster=$2 - local pool=$3 - local image=$4 + local local_pool=$3 + local remote_pool=$4 + local image=$5 local s master_pos mirror_pos last_mirror_pos local master_tag master_entry mirror_tag mirror_entry while true; do for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16 32 32; do sleep ${s} - flush "${local_cluster}" "${pool}" "${image}" - master_pos=$(get_master_journal_position "${cluster}" "${pool}" "${image}") - mirror_pos=$(get_mirror_journal_position "${cluster}" "${pool}" "${image}") + flush "${local_cluster}" "${local_pool}" "${image}" + master_pos=$(get_master_journal_position "${cluster}" "${remote_pool}" "${image}") + mirror_pos=$(get_mirror_journal_position "${cluster}" "${remote_pool}" "${image}") test -n "${master_pos}" -a "${master_pos}" = "${mirror_pos}" && return 0 test "${mirror_pos}" != "${last_mirror_pos}" && break done @@ -805,21 +797,22 @@ wait_for_snapshot_sync_complete() { local local_cluster=$1 local cluster=$2 - local pool=$3 - local image=$4 + local local_pool=$3 + local remote_pool=$4 + local image=$5 - local status_log=${TEMPDIR}/$(mkfname ${cluster}-${pool}-${image}.status) - local local_status_log=${TEMPDIR}/$(mkfname ${local_cluster}-${pool}-${image}.status) + local status_log=${TEMPDIR}/$(mkfname ${cluster}-${remote_pool}-${image}.status) + local local_status_log=${TEMPDIR}/$(mkfname ${local_cluster}-${local_pool}-${image}.status) - mirror_image_snapshot "${cluster}" "${pool}" "${image}" - get_newest_mirror_snapshot "${cluster}" "${pool}" "${image}" "${status_log}" + mirror_image_snapshot "${cluster}" "${remote_pool}" "${image}" + get_newest_mirror_snapshot "${cluster}" "${remote_pool}" "${image}" "${status_log}" local snapshot_id=$(xmlstarlet sel -t -v "//snapshot/id" < ${status_log}) while true; do for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16 32 32; do sleep ${s} - get_newest_mirror_snapshot "${local_cluster}" "${pool}" 
"${image}" "${local_status_log}" + get_newest_mirror_snapshot "${local_cluster}" "${local_pool}" "${image}" "${local_status_log}" local primary_snapshot_id=$(xmlstarlet sel -t -v "//snapshot/namespace/primary_snap_id" < ${local_status_log}) test "${snapshot_id}" = "${primary_snapshot_id}" && return 0 @@ -834,13 +827,14 @@ wait_for_replay_complete() { local local_cluster=$1 local cluster=$2 - local pool=$3 - local image=$4 + local local_pool=$3 + local remote_pool=$4 + local image=$5 if [ "${RBD_MIRROR_MODE}" = "journal" ]; then - wait_for_journal_replay_complete ${local_cluster} ${cluster} ${pool} ${image} + wait_for_journal_replay_complete ${local_cluster} ${cluster} ${local_pool} ${remote_pool} ${image} elif [ "${RBD_MIRROR_MODE}" = "snapshot" ]; then - wait_for_snapshot_sync_complete ${local_cluster} ${cluster} ${pool} ${image} + wait_for_snapshot_sync_complete ${local_cluster} ${cluster} ${local_pool} ${remote_pool} ${image} else return 1 fi @@ -894,9 +888,9 @@ test_mirror_pool_status_verbose() --verbose --format xml) local last_update state - last_update=$($XMLSTARLET sel -t -v \ + last_update=$(xmlstarlet sel -t -v \ "//images/image[name='${image}']/last_update" <<< "$status") - state=$($XMLSTARLET sel -t -v \ + state=$(xmlstarlet sel -t -v \ "//images/image[name='${image}']/state" <<< "$status") echo "${state}" | grep "${state_pattern}" || @@ -1307,16 +1301,19 @@ show_diff() compare_images() { - local pool=$1 - local image=$2 local ret=0 + local local_cluster=$1 + local cluster=$2 + local local_pool=$3 + local remote_pool=$4 + local image=$5 - local rmt_export=${TEMPDIR}/$(mkfname ${CLUSTER2}-${pool}-${image}.export) - local loc_export=${TEMPDIR}/$(mkfname ${CLUSTER1}-${pool}-${image}.export) + local rmt_export=${TEMPDIR}/$(mkfname ${cluster}-${remote_pool}-${image}.export) + local loc_export=${TEMPDIR}/$(mkfname ${local_cluster}-${local_pool}-${image}.export) rm -f ${rmt_export} ${loc_export} - rbd --cluster ${CLUSTER2} export ${pool}/${image} ${rmt_export} - rbd --cluster ${CLUSTER1} export ${pool}/${image} ${loc_export} + rbd --cluster ${cluster} export ${remote_pool}/${image} ${rmt_export} + rbd --cluster ${local_cluster} export ${local_pool}/${image} ${loc_export} if ! 
cmp ${rmt_export} ${loc_export} then show_diff ${rmt_export} ${loc_export} @@ -1337,7 +1334,7 @@ compare_image_snapshots() for snap_name in $(rbd --cluster ${CLUSTER1} --format xml \ snap list ${pool}/${image} | \ - $XMLSTARLET sel -t -v "//snapshot/name" | \ + xmlstarlet sel -t -v "//snapshot/name" | \ grep -E -v "^\.rbd-mirror\."); do rm -f ${rmt_export} ${loc_export} rbd --cluster ${CLUSTER2} export ${pool}/${image}@${snap_name} ${rmt_export} diff --git a/qa/workunits/rbd/rbd_mirror_stress.sh b/qa/workunits/rbd/rbd_mirror_stress.sh index baf0c9f1a8f..b0a85e8a48a 100755 --- a/qa/workunits/rbd/rbd_mirror_stress.sh +++ b/qa/workunits/rbd/rbd_mirror_stress.sh @@ -111,7 +111,7 @@ do snap_name="snap${i}" create_snap ${CLUSTER2} ${POOL} ${image} ${snap_name} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${snap_name} if [ -n "${clean_snap_name}" ]; then @@ -124,7 +124,7 @@ do done wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} -wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} +wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${clean_snap_name} for i in `seq 1 10` @@ -173,7 +173,7 @@ do image="image_${i}" create_snap ${CLUSTER2} ${POOL} ${image} ${snap_name} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} - wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image} wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${snap_name} compare_image_snaps ${POOL} ${image} ${snap_name} done diff --git a/qa/workunits/rbd/test_admin_socket.sh b/qa/workunits/rbd/test_admin_socket.sh index 6b960787b5e..110fdd48ea7 100755 --- a/qa/workunits/rbd/test_admin_socket.sh +++ b/qa/workunits/rbd/test_admin_socket.sh @@ -5,8 +5,6 @@ TMPDIR=/tmp/rbd_test_admin_socket$$ mkdir $TMPDIR trap "rm -fr $TMPDIR" 0 -. $(dirname $0)/../../standalone/ceph-helpers.sh - function expect_false() { set -x @@ -40,12 +38,12 @@ function rbd_get_perfcounter() local name name=$(ceph --format xml --admin-daemon $(rbd_watch_asok ${image}) \ - perf schema | $XMLSTARLET el -d3 | + perf schema | xmlstarlet el -d3 | grep "/librbd-.*-${image}/${counter}\$") test -n "${name}" || return 1 ceph --format xml --admin-daemon $(rbd_watch_asok ${image}) perf dump | - $XMLSTARLET sel -t -m "${name}" -v . + xmlstarlet sel -t -m "${name}" -v . } function rbd_check_perfcounter() diff --git a/qa/workunits/rest/test-restful.sh b/qa/workunits/rest/test-restful.sh deleted file mode 100755 index fde0d107a0b..00000000000 --- a/qa/workunits/rest/test-restful.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -ex - -mydir=`dirname $0` - -secret=`ceph config-key get mgr/restful/keys/admin` -url=$(ceph mgr dump|jq -r .services.restful|sed -e 's/\/$//') -echo "url $url secret $secret" -$mydir/test_mgr_rest_api.py $url $secret - -echo $0 OK diff --git a/qa/workunits/rgw/s3_utilities.pm b/qa/workunits/rgw/s3_utilities.pm index 3c3fae900e8..5a91db9d1fd 100644 --- a/qa/workunits/rgw/s3_utilities.pm +++ b/qa/workunits/rgw/s3_utilities.pm @@ -21,7 +21,7 @@ sub get_timestamp { if ($min < 10) { $min = "0$min"; } if ($sec < 10) { $sec = "0$sec"; } $year=$year+1900; - return $year . '_' . $mon . '_' . $mday . '_' . $hour . '_' . $min . '_' . $sec; + return $year . '-' . $mon . '-' . 
$mday . '-' . $hour . '-' . $min . '-' . $sec; } # Function to check if radosgw is already running @@ -195,11 +195,12 @@ sub run_s3 host => $hostname, secure => 0, retry => 1, + dns_bucket_names => 0, } ); } -our $bucketname = 'buck_'.get_timestamp(); +our $bucketname = 'buck-'.get_timestamp(); # create a new bucket (the test bucket) our $bucket = $s3->add_bucket( { bucket => $bucketname } ) or die $s3->err. "bucket $bucketname create failed\n". $s3->errstr; diff --git a/qa/workunits/rgw/test_rgw_bucket_check.py b/qa/workunits/rgw/test_rgw_bucket_check.py index bfa6d65d6e7..33936df2401 100755 --- a/qa/workunits/rgw/test_rgw_bucket_check.py +++ b/qa/workunits/rgw/test_rgw_bucket_check.py @@ -173,6 +173,7 @@ def main(): exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}') out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys') json_out = json.loads(out) + log.info(f'"bucket check unlinked" returned {json_out}, expecting {unlinked_keys}') assert len(json_out) == len(unlinked_keys) bucket.object_versions.all().delete() out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}') diff --git a/qa/workunits/rgw/test_rgw_reshard.py b/qa/workunits/rgw/test_rgw_reshard.py index e22050fc27f..18ffb102250 100755 --- a/qa/workunits/rgw/test_rgw_reshard.py +++ b/qa/workunits/rgw/test_rgw_reshard.py @@ -76,6 +76,16 @@ def get_bucket_num_shards(bucket_name, bucket_id): num_shards = json_op['data']['bucket_info']['num_shards'] return num_shards +def get_bucket_reshard_status(bucket_name): + """ + function to get bucket reshard status + """ + cmd = exec_cmd("radosgw-admin bucket stats --bucket {}".format(bucket_name)) + json_op = json.loads(cmd) + #print(json.dumps(json_op, indent = 4, sort_keys=True)) + reshard_status = json_op['reshard_status'] + return reshard_status + def run_bucket_reshard_cmd(bucket_name, num_shards, **kwargs): cmd = 'radosgw-admin bucket reshard --bucket {} --num-shards {}'.format(bucket_name, num_shards) cmd += ' --rgw-reshard-bucket-lock-duration 30' # reduce to minimum @@ -104,7 +114,7 @@ def test_bucket_reshard(conn, name, **fault): # try reshard with fault injection _, ret = run_bucket_reshard_cmd(name, num_shards_expected, check_retcode=False, **fault) - if fault.get('error_code') == errno.ECANCELED: + if fault.get('error_code') == errno.ECANCELED or fault.get('error_code') == errno.EOPNOTSUPP: assert(ret == 0) # expect ECANCELED to retry and succeed else: assert(ret != 0 and ret != errno.EBUSY) @@ -139,6 +149,11 @@ def test_bucket_reshard(conn, name, **fault): bucket.delete_objects(Delete={'Objects':[{'Key':o.key} for o in objs]}) bucket.delete() +def calc_reshardlog_count(json_op): + cnt = 0 + for shard in json_op: + cnt += len(shard['shard_entries']) + return cnt def main(): """ @@ -210,6 +225,13 @@ def main(): log.error("Resharding failed on bucket {}. 
Expected number of shards are not created\n".format(BUCKET_NAME)) # TESTCASE 'manual bucket resharding','inject error','fail','check bucket accessibility', 'retry reshard' + log.debug('TEST: reshard bucket with EIO injected at init_index\n') + test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index') + log.debug('TEST: reshard bucket with EOPNOTSUPP injected at init_index\n') + test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index', error_code=errno.EOPNOTSUPP) + log.debug('TEST: reshard bucket with abort at init_index\n') + test_bucket_reshard(connection, 'abort-at-init-indext', abort_at='init_index') + log.debug('TEST: reshard bucket with EIO injected at set_target_layout\n') test_bucket_reshard(connection, 'error-at-set-target-layout', error_at='set_target_layout') log.debug('TEST: reshard bucket with ECANCELED injected at set_target_layout\n') @@ -217,6 +239,13 @@ def main(): log.debug('TEST: reshard bucket with abort at set_target_layout\n') test_bucket_reshard(connection, 'abort-at-set-target-layout', abort_at='set_target_layout') + log.debug('TEST: reshard bucket with EIO injected at trim_reshard_log_entries\n') + test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries') + log.debug('TEST: reshard bucket with EOPNOTSUPP injected at trim_reshard_log_entries\n') + test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries', error_code=errno.EOPNOTSUPP) + log.debug('TEST: reshard bucket with abort at trim_reshard_log_entries\n') + test_bucket_reshard(connection, 'abort-at-trim-reshard-log-entries', abort_at='trim_reshard_log_entries') + log.debug('TEST: reshard bucket with EIO injected at block_writes\n') test_bucket_reshard(connection, 'error-at-block-writes', error_at='block_writes') log.debug('TEST: reshard bucket with abort at block_writes\n') @@ -234,6 +263,80 @@ def main(): log.debug('TEST: reshard bucket with abort at do_reshard\n') test_bucket_reshard(connection, 'abort-at-do-reshard', abort_at='do_reshard') + log.debug('TEST: reshard bucket with EIO injected at logrecord_writes\n') + test_bucket_reshard(connection, 'error-at-logrecord-writes', error_at='logrecord_writes') + log.debug('TEST: reshard bucket with abort at logrecord_writes\n') + test_bucket_reshard(connection, 'abort-at-logrecord-writes', abort_at='logrecord_writes') + + log.debug('TEST: reshard bucket with EIO injected at change_reshard_state\n') + test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state') + log.debug('TEST: reshard bucket with ECANCELED injected at change_reshard_state\n') + test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state', error_code=errno.ECANCELED) + log.debug('TEST: reshard bucket with abort at change_reshard_state\n') + test_bucket_reshard(connection, 'abort-at-change-reshard-state', abort_at='change_reshard_state') + + # TESTCASE 'logrecord could be stopped after reshard failed' + log.debug(' test: logrecord could be stopped after reshard failed') + num_shards = get_bucket_stats(BUCKET_NAME).num_shards + assert "None" == get_bucket_reshard_status(BUCKET_NAME) + _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='change_reshard_state') + assert(ret != 0 and ret != errno.EBUSY) + assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME) + + bucket.put_object(Key='put_during_logrecord', Body=b"some_data") + cmd = 
exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 1 + + # end up with logrecord status, the logrecord will be purged + time.sleep(30) + assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME) + bucket.put_object(Key='put_during_logrecord1', Body=b"some_data1") + cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 0 + assert "None" == get_bucket_reshard_status(BUCKET_NAME) + + # TESTCASE 'duplicated entries should be purged before reshard' + log.debug(' test: duplicated entries should be purged before reshard') + num_shards = get_bucket_stats(BUCKET_NAME).num_shards + _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard') + assert(ret != 0 and ret != errno.EBUSY) + assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME) + + bucket.put_object(Key='put_during_logrecord2', Body=b"some_data2") + cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 1 + + # begin to reshard again, the duplicated entries will be purged + time.sleep(30) + _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='logrecord_writes') + assert(ret != 0 and ret != errno.EBUSY) + cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 0 + + # TESTCASE 'duplicated entries can be purged manually' + log.debug(' test: duplicated entries can be purged manually') + time.sleep(30) + num_shards = get_bucket_stats(BUCKET_NAME).num_shards + _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard') + assert(ret != 0 and ret != errno.EBUSY) + assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME) + + bucket.put_object(Key='put_during_logrecord3', Body=b"some_data3") + cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 1 + + time.sleep(30) + exec_cmd('radosgw-admin reshardlog purge --bucket %s' % BUCKET_NAME) + cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME) + json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80 + assert calc_reshardlog_count(json_op) == 0 + log.debug('check reshard logrecord successfully') + # TESTCASE 'versioning reshard-','bucket', reshard','versioning reshard','succeeds' log.debug(' test: reshard versioned bucket') num_shards_expected = get_bucket_stats(VER_BUCKET_NAME).num_shards + 1 @@ -287,6 +390,8 @@ def main(): time.sleep(1) ver_bucket.put_object(Key='put_during_reshard', Body=b"some_data") log.debug('put object successful') + # waiter for delay reshard to finish + time.sleep(5) # TESTCASE 'check that bucket stats are correct after reshard with unlinked entries' log.debug('TEST: check that bucket stats are correct after reshard with unlinked entries\n') |
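The test_migration_open_clone_chain function above opens a three-image encrypted clone chain (testimg <- testimg1 <- testimg2) while all three images sit in migration-prepared state. The key step is the NBD map, which passes one encryption passphrase file per layer, child first. A minimal sketch of that step, assuming the same /tmp passphrase files created earlier in the script and plain sudo in place of the script's _sudo helper:

    # map the deepest clone; passphrase files are listed child, parent, grandparent
    DEV=$(sudo rbd -p rbd map testimg2 -t nbd \
        -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
    sudo rbd device unmap -t nbd ${DEV}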
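Several workunits (rbd-ggate.sh, rbd-nbd.sh, test_admin_socket.sh, rbd_mirror_helpers.sh) drop the xmlstarlet/xml binary probe and call xmlstarlet directly. The recurring trim-test pattern they share, sketched with placeholder ${POOL} and ${IMAGE} variables:

    provisioned=$(rbd -p ${POOL} --format xml du ${IMAGE} |
        xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .)
    used=$(rbd -p ${POOL} --format xml du ${IMAGE} |
        xmlstarlet sel -t -m "//stats/images/image/used_size" -v .)
    # after a discard/trim the used size should fall below the provisioned size
    [ "${used}" -lt "${provisioned}" ]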
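check_snapshot_info in rbd_groups.sh now cross-checks both the JSON and the plain-text output of rbd group snap info. A condensed sketch of the JSON half, assuming jq is available and ${group} and ${snap} name an existing group snapshot:

    snap_info_json=$(rbd group snap info ${group}@${snap} --format=json)
    test "$(jq -r '.name' <<< "${snap_info_json}")" = "${snap}"
    test "$(jq -r '.state' <<< "${snap_info_json}")" = "complete"
    # a snapshot of a non-empty group carries per-image entries and an image snap name
    jq '.images | length' <<< "${snap_info_json}"
    jq -r '.image_snap_name' <<< "${snap_info_json}"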
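The rbd_mirror_helpers.sh changes widen wait_for_replay_complete and compare_images from a single pool argument to an explicit local/remote pool (or pool/namespace) pair, which is what lets rbd_mirror.sh mirror between namespaces that carry different names on each cluster. A sketch of the new calling convention, reusing the CLUSTER1/CLUSTER2/POOL/NS3/NS4/MIRROR_POOL_MODE variables already defined by the tests above:

    # enable mirroring between differently named namespaces
    rbd --cluster ${CLUSTER1} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE} --remote-namespace ${NS4}
    rbd --cluster ${CLUSTER2} mirror pool enable ${POOL}/${NS4} ${MIRROR_POOL_MODE} --remote-namespace ${NS3}
    # helpers now take: <local cluster> <remote cluster> <local pool> <remote pool> <image>
    wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image}
    compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image}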
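The new test_rgw_reshard.py cases drive the reshard log through radosgw-admin. The commands involved, sketched as a shell session; the bucket name is a placeholder, and the jq expression is an assumed equivalent of calc_reshardlog_count above (summing shard_entries across all shards):

    BUCKET=mybucket    # placeholder bucket name
    # reshard status is reported via bucket stats ("None", "InLogrecord", ...)
    radosgw-admin bucket stats --bucket ${BUCKET} | jq -r '.reshard_status'
    # list pending reshard log entries (one entry list per shard) and count them
    radosgw-admin reshardlog list --bucket ${BUCKET} |
        jq '[.[].shard_entries | length] | add // 0'
    # drop duplicated/stale entries left over from an interrupted reshard
    radosgw-admin reshardlog purge --bucket ${BUCKET}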