Diffstat (limited to 'qa')
-rw-r--r--qa/Makefile2
-rw-r--r--qa/README5
-rw-r--r--qa/btrfs/.gitignore3
-rw-r--r--qa/btrfs/Makefile11
-rw-r--r--qa/btrfs/clone_range.c35
-rw-r--r--qa/btrfs/create_async_snap.c34
-rw-r--r--qa/btrfs/test_async_snap.c83
-rw-r--r--qa/btrfs/test_rmdir_async_snap.c62
-rw-r--r--qa/cephfs/begin/3-kernel.yaml23
-rw-r--r--qa/cephfs/begin/3-modules.yaml19
-rw-r--r--qa/cephfs/conf/mgr.yaml4
-rw-r--r--qa/cephfs/conf/mon.yaml1
-rw-r--r--qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml9
-rw-r--r--qa/cephfs/overrides/ignorelist_health.yaml8
-rw-r--r--qa/cephfs/overrides/pg_health.yaml2
-rw-r--r--qa/config/crimson_bluestore.yaml25
-rw-r--r--qa/config/crimson_qa_overrides.yaml1
-rw-r--r--qa/config/crimson_seastore.yaml20
-rw-r--r--qa/config/seastore.yaml6
-rw-r--r--qa/crontab/teuthology-cronjobs10
-rw-r--r--qa/distros/container-hosts/centos_9.stream.yaml3
-rw-r--r--qa/distros/container-hosts/centos_9.stream_runc.yaml3
-rw-r--r--qa/objectstore_debug/% (renamed from qa/suites/crimson-rados-experimental/seastore/basic/%)0
-rw-r--r--qa/objectstore_debug/bluestore-options/write$/write_random.yaml5
-rw-r--r--qa/objectstore_debug/bluestore-options/write$/write_v1.yaml5
-rw-r--r--qa/objectstore_debug/bluestore-options/write$/write_v2.yaml5
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-bitmap.yaml (renamed from qa/objectstore_debug/bluestore-bitmap.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml (renamed from qa/objectstore_debug/bluestore-comp-lz4.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml (renamed from qa/objectstore_debug/bluestore-comp-snappy.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-comp-zlib.yaml (renamed from qa/objectstore_debug/bluestore-comp-zlib.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-comp-zstd.yaml (renamed from qa/objectstore_debug/bluestore-comp-zstd.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-hybrid.yaml (renamed from qa/objectstore_debug/bluestore-hybrid.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-low-osd-mem-target.yaml (renamed from qa/objectstore_debug/bluestore-low-osd-mem-target.yaml)0
-rw-r--r--qa/objectstore_debug/bluestore/bluestore-stupid.yaml (renamed from qa/objectstore_debug/bluestore-stupid.yaml)0
-rw-r--r--qa/rbd/krbd_discard_granularity.t48
-rw-r--r--qa/rgw/s3tests-branch.yaml4
-rwxr-xr-xqa/standalone/ceph-helpers.sh74
-rwxr-xr-xqa/standalone/crush/crush-classes.sh2
-rwxr-xr-xqa/standalone/mon/mon-cluster-log.sh16
-rwxr-xr-xqa/standalone/osd-backfill/osd-backfill-space.sh9
-rwxr-xr-xqa/standalone/osd/osd-bluefs-volume-ops.sh2
-rwxr-xr-xqa/standalone/osd/osd-rep-recov-eio.sh14
-rwxr-xr-xqa/standalone/scrub/osd-recovery-scrub.sh145
-rwxr-xr-xqa/standalone/scrub/osd-scrub-repair.sh256
-rwxr-xr-xqa/standalone/scrub/osd-scrub-test.sh259
-rw-r--r--qa/standalone/scrub/scrub-helpers.sh111
-rw-r--r--qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml1
-rw-r--r--qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml15
-rw-r--r--qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml1
l---------qa/suites/crimson-rados-experimental/.qa2
l---------qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml1
-rw-r--r--qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml14
-rw-r--r--qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml18
l---------qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml1
-rw-r--r--qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml28
-rw-r--r--qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml18
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/% (renamed from qa/suites/rados/rest/%)0
l---------qa/suites/crimson-rados-experimental/thrash/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/.qa)0
l---------qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/.qa)0
l---------qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled1
l---------qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml1
l---------qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml (renamed from qa/suites/fs/thrash/workloads/overrides/+)0
l---------qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml (renamed from qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$0
l---------qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml0
l---------qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled1
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled6
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled5
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled5
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/clusters/+0
l---------qa/suites/crimson-rados-experimental/thrash/clusters/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml (renamed from qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml)9
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled4
l---------qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro1
l---------qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml (renamed from qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml)0
l---------qa/suites/crimson-rados-experimental/thrash/deploy/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml11
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled16
l---------qa/suites/crimson-rados-experimental/thrash/objectstore/.qa (renamed from qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa)0
l---------qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml1
l---------qa/suites/crimson-rados-experimental/thrash/thrashers/.qa (renamed from qa/suites/rados/rest/.qa)0
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml34
l---------qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml1
l---------qa/suites/crimson-rados-experimental/thrash/workloads/.qa1
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml13
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml20
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml49
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml24
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml24
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml24
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml (renamed from qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml)14
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml15
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml15
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml14
-rw-r--r--qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml8
l---------qa/suites/crimson-rados/basic/objectstore/bluestore.yaml2
l---------qa/suites/crimson-rados/basic/objectstore/seastore.yaml2
-rw-r--r--qa/suites/crimson-rados/basic/tasks/rados_python.yaml2
-rw-r--r--qa/suites/crimson-rados/perf/deploy/ceph.yaml1
l---------qa/suites/crimson-rados/perf/objectstore/bluestore.yaml2
l---------qa/suites/crimson-rados/perf/objectstore/seastore.yaml2
l---------qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml2
l---------qa/suites/crimson-rados/rbd/objectstore/seastore.yaml2
l---------qa/suites/crimson-rados/singleton/objectstore1
l---------qa/suites/crimson-rados/singleton/objectstore/.qa1
l---------qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml1
l---------qa/suites/crimson-rados/singleton/objectstore/seastore.yaml1
l---------qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml (renamed from qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled)0
l---------qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml2
l---------qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled1
l---------qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml1
l---------qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml2
l---------qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml2
l---------qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml1
l---------qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml1
-rw-r--r--qa/suites/fs/functional/tasks/test_snap_schedule/%0
-rw-r--r--qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$0
-rw-r--r--qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v1.yaml (renamed from qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml)0
-rw-r--r--qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v2.yaml (renamed from qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml)0
-rw-r--r--qa/suites/fs/functional/tasks/test_snap_schedule/snap-schedule.yaml (renamed from qa/suites/fs/functional/tasks/snap-schedule.yaml)1
-rw-r--r--qa/suites/fs/functional/tasks/uninlining.yaml26
-rw-r--r--qa/suites/fs/libcephfs/tasks/client.yaml1
-rw-r--r--qa/suites/fs/multifs/tasks/failover.yaml1
-rw-r--r--qa/suites/fs/nfs/tasks/nfs.yaml7
-rw-r--r--qa/suites/fs/thrash/workloads/overrides/%0
l---------qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa1
-rw-r--r--qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml (renamed from qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml)0
-rw-r--r--qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml (renamed from qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml)0
l---------qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml1
l---------qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml1
l---------qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml1
-rw-r--r--qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml1
-rw-r--r--qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml3
l---------qa/suites/fs/upgrade/nofs/kernel.yaml1
l---------qa/suites/fs/upgrade/upgraded_client/kernel.yaml1
-rw-r--r--qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml5
l---------qa/suites/fs/workload/begin/3-kernel.yaml1
l---------qa/suites/fs/workload/begin/3-modules.yaml1
-rw-r--r--qa/suites/fs/workload/tasks/3-snaps/yes.yaml5
-rw-r--r--qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml1
-rw-r--r--qa/suites/nvmeof/basic/base/install.yaml3
-rw-r--r--qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml (renamed from qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml)21
-rw-r--r--qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml25
-rw-r--r--qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml36
-rw-r--r--qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml39
-rw-r--r--qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml41
-rw-r--r--qa/suites/nvmeof/thrash/clusters/4-gateways-1-initiator.yaml (renamed from qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml)12
-rw-r--r--qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml37
-rw-r--r--qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml24
-rw-r--r--qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml24
-rw-r--r--qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml6
-rw-r--r--qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml7
-rw-r--r--qa/suites/nvmeof/thrash/workloads/fio.yaml8
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml1
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml91
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml135
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml138
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml145
-rw-r--r--qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml1
-rw-r--r--qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml4
-rw-r--r--qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml4
-rw-r--r--qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml1
-rw-r--r--qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml77
-rw-r--r--qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml6
-rw-r--r--qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml1
-rw-r--r--qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml1
-rw-r--r--qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml8
-rw-r--r--qa/suites/rados/rest/mgr-restful.yaml31
l---------qa/suites/rados/rest/supported-random-distro$1
l---------qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml2
l---------qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml2
l---------qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml2
-rw-r--r--qa/suites/rados/singleton/all/mon-connection-score.yaml40
-rw-r--r--qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml57
l---------qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml2
-rw-r--r--qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml9
-rw-r--r--qa/suites/rados/valgrind-leaks/1-start.yaml1
-rw-r--r--qa/suites/rados/verify/validater/valgrind.yaml2
l---------qa/suites/rbd/iscsi/0-single-container-host.yaml1
-rw-r--r--qa/suites/rbd/iscsi/base/install.yaml6
l---------qa/suites/rbd/iscsi/supported-container-hosts$1
-rw-r--r--qa/suites/rbd/migration-external/%0
l---------qa/suites/rbd/migration-external/.qa1
l---------qa/suites/rbd/migration-external/1-base/.qa1
-rw-r--r--qa/suites/rbd/migration-external/1-base/install.yaml8
l---------qa/suites/rbd/migration-external/2-clusters/.qa1
-rw-r--r--qa/suites/rbd/migration-external/2-clusters/2-node.yaml15
l---------qa/suites/rbd/migration-external/3-objectstore1
l---------qa/suites/rbd/migration-external/4-supported-random-distro$1
l---------qa/suites/rbd/migration-external/5-data-pool/.qa1
-rw-r--r--qa/suites/rbd/migration-external/5-data-pool/ec.yaml29
-rw-r--r--qa/suites/rbd/migration-external/5-data-pool/none.yaml0
-rw-r--r--qa/suites/rbd/migration-external/5-data-pool/replicated.yaml14
l---------qa/suites/rbd/migration-external/6-prepare/.qa1
-rw-r--r--qa/suites/rbd/migration-external/6-prepare/native-clone.yaml29
-rw-r--r--qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml18
l---------qa/suites/rbd/migration-external/7-io-workloads/.qa1
-rw-r--r--qa/suites/rbd/migration-external/7-io-workloads/qemu_xfstests.yaml14
l---------qa/suites/rbd/migration-external/8-migrate-workloads/.qa1
-rw-r--r--qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml14
l---------qa/suites/rbd/migration-external/conf1
-rw-r--r--qa/suites/rbd/migration/6-prepare/qcow2-https.yaml8
-rw-r--r--qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml12
-rw-r--r--qa/suites/rbd/migration/6-prepare/raw-nbd.yaml13
-rw-r--r--qa/suites/rbd/migration/9-cleanup/cleanup.yaml1
-rw-r--r--qa/suites/rgw/bucket-logging/%0
l---------qa/suites/rgw/bucket-logging/.qa1
-rw-r--r--qa/suites/rgw/bucket-logging/0-install.yaml13
l---------qa/suites/rgw/bucket-logging/beast.yaml1
l---------qa/suites/rgw/bucket-logging/fixed-1.yaml1
l---------qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml1
-rw-r--r--qa/suites/rgw/bucket-logging/overrides.yaml10
l---------qa/suites/rgw/bucket-logging/s3tests-branch.yaml1
l---------qa/suites/rgw/bucket-logging/supported-distros1
-rw-r--r--qa/suites/rgw/bucket-logging/tasks/+0
-rw-r--r--qa/suites/rgw/bucket-logging/tasks/s3tests.yaml6
-rw-r--r--qa/suites/rgw/crypt/2-kms/barbican.yaml4
-rw-r--r--qa/suites/rgw/multifs/0-install.yaml5
-rw-r--r--qa/suites/rgw/multifs/tasks/+0
-rw-r--r--qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml8
-rw-r--r--qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml8
-rw-r--r--qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml4
-rw-r--r--qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml4
-rw-r--r--qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml8
-rw-r--r--qa/suites/rgw/multisite/realms/three-zones.yaml.disabled2
-rw-r--r--qa/suites/rgw/multisite/realms/two-zonegroup.yaml (renamed from qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled)4
-rw-r--r--qa/suites/rgw/multisite/realms/two-zones.yaml2
-rw-r--r--qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml2
-rw-r--r--qa/suites/rgw/notifications/tasks/kafka_failover/+0
-rw-r--r--qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml20
l---------qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros1
-rw-r--r--qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml8
l---------qa/suites/rgw/sts/auth-order/.qa (renamed from qa/suites/fs/functional/subvol_versions/.qa)0
-rw-r--r--qa/suites/rgw/sts/auth-order/local-sts.yaml5
-rw-r--r--qa/suites/rgw/sts/auth-order/sts-local.yaml5
-rw-r--r--qa/suites/rgw/tempest/0-install.yaml2
-rw-r--r--qa/suites/rgw/tempest/tasks/s3/%0
l---------qa/suites/rgw/tempest/tasks/s3/.qa1
l---------qa/suites/rgw/tempest/tasks/s3/auth-order/.qa1
-rw-r--r--qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml5
-rw-r--r--qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml5
-rw-r--r--qa/suites/rgw/tempest/tasks/s3/s3tests.yaml (renamed from qa/suites/rgw/tempest/tasks/s3tests.yaml)0
-rw-r--r--qa/suites/rgw/verify/overrides.yaml1
-rw-r--r--qa/suites/rgw/verify/tasks/cls.yaml5
-rw-r--r--qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml (renamed from qa/suites/rgw/verify/tasks/s3tests-java.yaml)0
l---------qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml2
-rw-r--r--qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml15
-rw-r--r--qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml14
-rw-r--r--qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml9
-rw-r--r--qa/suites/upgrade/quincy-x/parallel/0-start.yaml11
-rw-r--r--qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml5
-rw-r--r--qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml2
-rw-r--r--qa/suites/upgrade/quincy-x/stress-split/1-start.yaml11
-rw-r--r--qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml2
-rw-r--r--qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml2
-rw-r--r--qa/suites/upgrade/reef-x/parallel/0-start.yaml20
-rw-r--r--qa/suites/upgrade/reef-x/parallel/1-tasks.yaml6
-rw-r--r--qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml19
-rw-r--r--qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml2
-rw-r--r--qa/suites/upgrade/reef-x/stress-split/1-start.yaml22
-rw-r--r--qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml2
-rw-r--r--qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml2
-rw-r--r--qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml19
-rw-r--r--qa/tasks/barbican.py13
-rw-r--r--qa/tasks/cbt.py131
-rw-r--r--qa/tasks/ceph.py25
-rw-r--r--qa/tasks/ceph_iscsi_client.py11
-rw-r--r--qa/tasks/ceph_manager.py94
-rw-r--r--qa/tasks/ceph_test_case.py11
-rw-r--r--qa/tasks/cephadm.py50
-rw-r--r--qa/tasks/cephfs/cephfs_test_case.py8
-rw-r--r--qa/tasks/cephfs/filesystem.py9
-rw-r--r--qa/tasks/cephfs/mount.py4
-rw-r--r--qa/tasks/cephfs/test_admin.py268
-rw-r--r--qa/tasks/cephfs/test_backtrace.py26
-rw-r--r--qa/tasks/cephfs/test_exports.py186
-rw-r--r--qa/tasks/cephfs/test_failover.py55
-rw-r--r--qa/tasks/cephfs/test_fscrypt.py4
-rw-r--r--qa/tasks/cephfs/test_mirroring.py194
-rw-r--r--qa/tasks/cephfs/test_misc.py14
-rw-r--r--qa/tasks/cephfs/test_nfs.py193
-rw-r--r--qa/tasks/cephfs/test_quota.py5
-rw-r--r--qa/tasks/cephfs/test_snap_schedules.py50
-rw-r--r--qa/tasks/cephfs/test_snapshots.py26
-rw-r--r--qa/tasks/cephfs/test_uninlining.py332
-rw-r--r--qa/tasks/cephfs/test_volumes.py952
-rw-r--r--qa/tasks/check_counter.py32
-rw-r--r--qa/tasks/fwd_scrub.py2
-rw-r--r--qa/tasks/kafka.py11
-rw-r--r--qa/tasks/kafka_failover.py244
-rw-r--r--qa/tasks/mgr/dashboard/helper.py37
-rw-r--r--qa/tasks/mgr/dashboard/test_auth.py6
-rw-r--r--qa/tasks/mgr/dashboard/test_mgr_module.py8
-rw-r--r--qa/tasks/mgr/dashboard/test_osd.py26
-rw-r--r--qa/tasks/mgr/dashboard/test_rbd.py12
-rw-r--r--qa/tasks/mgr/dashboard/test_rgw.py4
-rw-r--r--qa/tasks/mgr/mgr_test_case.py23
-rw-r--r--qa/tasks/mgr/test_module_selftest.py7
-rw-r--r--qa/tasks/mon_connection_score.py95
-rw-r--r--qa/tasks/mon_thrash.py2
-rw-r--r--qa/tasks/notification_tests.py2
-rw-r--r--qa/tasks/nvme_loop.py121
-rw-r--r--qa/tasks/nvmeof.py161
-rw-r--r--qa/tasks/qemu.py165
-rw-r--r--qa/tasks/rabbitmq.py15
-rw-r--r--qa/tasks/rados.py8
-rw-r--r--qa/tasks/radosgw_admin.py35
-rw-r--r--qa/tasks/rbd.py16
-rw-r--r--qa/tasks/rgw_multisite.py7
-rw-r--r--qa/tasks/rgw_multisite_tests.py4
-rw-r--r--qa/tasks/rook.py6
-rw-r--r--qa/tasks/s3a_hadoop.py16
-rw-r--r--qa/tasks/s3tests.py28
-rw-r--r--qa/tasks/s3tests_java.py1
-rw-r--r--qa/tasks/stretch_mode_disable_enable.py547
-rw-r--r--qa/tasks/thrashosds-health.yaml1
-rw-r--r--qa/tasks/tox.py2
-rw-r--r--qa/tasks/vstart_runner.py8
-rw-r--r--qa/tasks/workunit.py4
-rwxr-xr-xqa/workunits/cephadm/test_iscsi_setup.sh99
-rwxr-xr-xqa/workunits/cephtool/test.sh25
-rwxr-xr-xqa/workunits/client/test_oc_disabled.sh5
-rwxr-xr-xqa/workunits/dencoder/test_readable.py10
-rwxr-xr-xqa/workunits/erasure-code/bench.sh17
-rwxr-xr-xqa/workunits/fs/misc/fallocate.sh17
-rwxr-xr-xqa/workunits/fs/snaps/snaptest-double-null.sh1
-rwxr-xr-xqa/workunits/fs/snaps/snaptest-git-ceph.sh14
-rwxr-xr-xqa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh72
-rwxr-xr-xqa/workunits/nvmeof/basic_tests.sh (renamed from qa/workunits/rbd/nvmeof_basic_tests.sh)10
-rwxr-xr-xqa/workunits/nvmeof/fio_test.sh (renamed from qa/workunits/rbd/nvmeof_fio_test.sh)9
-rwxr-xr-xqa/workunits/nvmeof/mtls_test.sh76
-rwxr-xr-xqa/workunits/nvmeof/namespace_test.sh71
-rwxr-xr-xqa/workunits/nvmeof/scalability_test.sh66
-rwxr-xr-xqa/workunits/nvmeof/setup_subsystem.sh (renamed from qa/workunits/rbd/nvmeof_setup_subsystem.sh)19
-rwxr-xr-xqa/workunits/rados/test_rados_tool.sh4
-rwxr-xr-xqa/workunits/rbd/cli_generic.sh43
-rwxr-xr-xqa/workunits/rbd/cli_migration.sh340
-rwxr-xr-xqa/workunits/rbd/journal.sh12
-rwxr-xr-xqa/workunits/rbd/luks-encryption.sh91
-rwxr-xr-xqa/workunits/rbd/rbd-ggate.sh17
-rwxr-xr-xqa/workunits/rbd/rbd-nbd.sh20
-rwxr-xr-xqa/workunits/rbd/rbd_groups.sh27
-rwxr-xr-xqa/workunits/rbd/rbd_mirror.sh130
-rwxr-xr-xqa/workunits/rbd/rbd_mirror_bootstrap.sh6
-rwxr-xr-xqa/workunits/rbd/rbd_mirror_ha.sh4
-rwxr-xr-xqa/workunits/rbd/rbd_mirror_helpers.sh65
-rwxr-xr-xqa/workunits/rbd/rbd_mirror_stress.sh6
-rwxr-xr-xqa/workunits/rbd/test_admin_socket.sh6
-rwxr-xr-xqa/workunits/rest/test-restful.sh10
-rw-r--r--qa/workunits/rgw/s3_utilities.pm5
-rwxr-xr-xqa/workunits/rgw/test_rgw_bucket_check.py1
-rwxr-xr-xqa/workunits/rgw/test_rgw_reshard.py107
354 files changed, 7696 insertions, 1402 deletions
diff --git a/qa/Makefile b/qa/Makefile
index ad655b7e743..05dc834adbd 100644
--- a/qa/Makefile
+++ b/qa/Makefile
@@ -1,4 +1,4 @@
-DIRS= workunits btrfs
+DIRS= workunits
all:
for d in $(DIRS) ; do ( cd $$d ; $(MAKE) all ) ; done
diff --git a/qa/README b/qa/README
index f9b8988c6f9..a6a95c479bc 100644
--- a/qa/README
+++ b/qa/README
@@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or
ubuntu, chosen randomly.
The teuthology code can be found in https://github.com/ceph/teuthology.git
+
+Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git
+CBT will not support cosbench beyond release tag v0.3; therefore, no qa suite should use cosbench.
+cosbench support has been removed from qa/tasks/cbt.py.
+
diff --git a/qa/btrfs/.gitignore b/qa/btrfs/.gitignore
deleted file mode 100644
index 530c1b5b4ed..00000000000
--- a/qa/btrfs/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-/clone_range
-/test_async_snap
-/create_async_snap
diff --git a/qa/btrfs/Makefile b/qa/btrfs/Makefile
deleted file mode 100644
index be95ecfd3cd..00000000000
--- a/qa/btrfs/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-CFLAGS = -Wall -Wextra -D_GNU_SOURCE
-
-TARGETS = clone_range test_async_snap create_async_snap
-
-.c:
- $(CC) $(CFLAGS) $@.c -o $@
-
-all: $(TARGETS)
-
-clean:
- rm $(TARGETS)
diff --git a/qa/btrfs/clone_range.c b/qa/btrfs/clone_range.c
deleted file mode 100644
index 0a88e160131..00000000000
--- a/qa/btrfs/clone_range.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <fcntl.h>
-#include <stdlib.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#include <linux/types.h>
-#include "../../src/os/btrfs_ioctl.h"
-#include <stdio.h>
-#include <errno.h>
-
-int main(int argc, char **argv)
-{
- struct btrfs_ioctl_clone_range_args ca;
- int dfd;
- int r;
-
- if (argc < 6) {
- printf("usage: %s <srcfn> <srcoffset> <srclen> <destfn> <destoffset>\n", argv[0]);
- exit(1);
- }
-
- ca.src_fd = open(argv[1], O_RDONLY);
- ca.src_offset = atoi(argv[2]);
- ca.src_length = atoi(argv[3]);
- dfd = open(argv[4], O_WRONLY|O_CREAT);
- ca.dest_offset = atoi(argv[5]);
-
- r = ioctl(dfd, BTRFS_IOC_CLONE_RANGE, &ca);
- printf("clone_range %s %lld %lld~%lld to %s %d %lld = %d %s\n",
- argv[1], ca.src_fd,
- ca.src_offset, ca.src_length,
- argv[4], dfd,
- ca.dest_offset, r, strerror(errno));
- return r;
-}
diff --git a/qa/btrfs/create_async_snap.c b/qa/btrfs/create_async_snap.c
deleted file mode 100644
index 2ef22af7b45..00000000000
--- a/qa/btrfs/create_async_snap.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-#include "../../src/os/btrfs_ioctl.h"
-
-struct btrfs_ioctl_vol_args_v2 va;
-
-int main(int argc, char **argv)
-{
- int fd;
- int r;
-
- if (argc != 3) {
- printf("usage: %s <source subvol> <name>\n", argv[0]);
- return 1;
- }
- printf("creating snap ./%s from %s\n", argv[2], argv[1]);
- fd = open(".", O_RDONLY);
- va.fd = open(argv[1], O_RDONLY);
- va.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- strcpy(va.name, argv[2]);
- r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va);
- printf("result %d\n", r ? -errno:0);
- return r;
-}
diff --git a/qa/btrfs/test_async_snap.c b/qa/btrfs/test_async_snap.c
deleted file mode 100644
index 211be95a61c..00000000000
--- a/qa/btrfs/test_async_snap.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-#include "../../src/os/btrfs_ioctl.h"
-
-struct btrfs_ioctl_vol_args_v2 va;
-struct btrfs_ioctl_vol_args vold;
-int max = 4;
-
-void check_return(int r)
-{
- if (r < 0) {
- printf("********* failed with %d %s ********\n", errno, strerror(errno));
- exit(1);
- }
-}
-
-int main(int argc, char **argv)
-{
- int num = 1000;
-
- if (argc > 1)
- num = atoi(argv[1]);
- printf("will do %d iterations\n", num);
-
- int cwd = open(".", O_RDONLY);
- printf("cwd = %d\n", cwd);
- while (num-- > 0) {
- if (rand() % 10 == 0) {
- __u64 transid;
- int r;
- printf("sync starting\n");
- r = ioctl(cwd, BTRFS_IOC_START_SYNC, &transid);
- check_return(r);
- printf("sync started, transid %lld, waiting\n", transid);
- r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &transid);
- check_return(r);
- printf("sync finished\n");
- }
-
- int i = rand() % max;
- struct stat st;
- va.fd = cwd;
- sprintf(va.name, "test.%d", i);
- va.transid = 0;
- int r = stat(va.name, &st);
- if (r < 0) {
- if (rand() % 3 == 0) {
- printf("snap create (sync) %s\n", va.name);
- va.flags = 0;
- r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va);
- check_return(r);
- } else {
- printf("snap create (async) %s\n", va.name);
- va.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- r = ioctl(cwd, BTRFS_IOC_SNAP_CREATE_V2, &va);
- check_return(r);
- printf("snap created, transid %lld\n", va.transid);
- if (rand() % 2 == 0) {
- printf("waiting for async snap create\n");
- r = ioctl(cwd, BTRFS_IOC_WAIT_SYNC, &va.transid);
- check_return(r);
- }
- }
- } else {
- printf("snap remove %s\n", va.name);
- vold.fd = va.fd;
- strcpy(vold.name, va.name);
- r = ioctl(cwd, BTRFS_IOC_SNAP_DESTROY, &vold);
- check_return(r);
- }
- }
- return 0;
-}
diff --git a/qa/btrfs/test_rmdir_async_snap.c b/qa/btrfs/test_rmdir_async_snap.c
deleted file mode 100644
index 5dafaacaaeb..00000000000
--- a/qa/btrfs/test_rmdir_async_snap.c
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-#include "../../src/os/btrfs_ioctl.h"
-
-struct btrfs_ioctl_vol_args_v2 va;
-struct btrfs_ioctl_vol_args vold;
-
-int main(int argc, char **argv)
-{
- int num = 1000;
- int i, r, fd;
- char buf[30];
-
- if (argc > 1)
- num = atoi(argv[1]);
- printf("will do %d iterations\n", num);
-
- fd = open(".", O_RDONLY);
- vold.fd = 0;
- strcpy(vold.name, "current");
- r = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&vold);
- printf("create current ioctl got %d\n", r ? errno:0);
- if (r)
- return 1;
-
- for (i=0; i<num; i++) {
- sprintf(buf, "current/dir.%d", i);
- r = mkdir(buf, 0755);
- printf("mkdir got %d\n", r ? errno:0);
- if (r)
- return 1;
- }
-
- va.fd = open("current", O_RDONLY);
- va.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- for (i=0; i<num; i++) {
- system("/bin/cp /boot/vmlinuz-3.2.0-ceph-00142-g9e98323 current/foo");
- sprintf(buf, "current/dir.%d", i);
- r = rmdir(buf);
- printf("rmdir got %d\n", r ? errno:0);
- if (r)
- return 1;
-
- if (i % 10) continue;
- sprintf(va.name, "snap.%d", i);
- r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_V2, (unsigned long long)&va);
- printf("ioctl got %d\n", r ? errno:0);
- if (r)
- return 1;
- }
- return 0;
-}
diff --git a/qa/cephfs/begin/3-kernel.yaml b/qa/cephfs/begin/3-kernel.yaml
new file mode 100644
index 00000000000..e94a0d87dc8
--- /dev/null
+++ b/qa/cephfs/begin/3-kernel.yaml
@@ -0,0 +1,23 @@
+# When the --kernel option is given to teuthology-suite, the kernel is set for
+# all nodes (also, the kernel is "distro" when the --kernel option is not set).
+# We don't generally want to use a custom kernel for all tests, so unset it.
+# The k-testing.yaml will set it, if given, for only the client nodes.
+#
+# Allow overriding this by using a branch ending in "-all".
+
+teuthology:
+ postmerge:
+ - |
+ local branch = yaml.kernel.branch
+ if branch and not yaml.kernel.branch:find "-all$" then
+ log.debug("removing default kernel specification: %s", yaml.kernel)
+ py_attrgetter(yaml.kernel).pop('branch', nil)
+ py_attrgetter(yaml.kernel).pop('deb', nil)
+ py_attrgetter(yaml.kernel).pop('flavor', nil)
+ py_attrgetter(yaml.kernel).pop('kdb', nil)
+ py_attrgetter(yaml.kernel).pop('koji', nil)
+ py_attrgetter(yaml.kernel).pop('koji_task', nil)
+ py_attrgetter(yaml.kernel).pop('rpm', nil)
+ py_attrgetter(yaml.kernel).pop('sha1', nil)
+ py_attrgetter(yaml.kernel).pop('tag', nil)
+ end
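A quick sketch of how this fragment behaves at scheduling time (the suite, machine type, and a kernel branch ending in "-all" are illustrative assumptions; other scheduling options are omitted):

    # the requested kernel is stripped from the job config for all nodes;
    # k-testing.yaml (later in this patch) re-applies it to the client role only
    teuthology-suite --ceph main --suite fs --kernel testing -m smithi

    # a kernel branch whose name ends in "-all" is kept for every node
    teuthology-suite --ceph main --suite fs --kernel testing-all -m smithi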
diff --git a/qa/cephfs/begin/3-modules.yaml b/qa/cephfs/begin/3-modules.yaml
deleted file mode 100644
index 25947342569..00000000000
--- a/qa/cephfs/begin/3-modules.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Enable mgr modules now before any CephFS mounts are created by the mgr. This
-# avoids the potential race of the mgr mounting CephFS and then getting failed
-# over by the monitors before the monitors have a chance to note the new client
-# session from the mgr beacon. In that case, the monitors will not blocklist
-# that client mount automatically so the MDS will eventually do the eviction
-# (and create a cluster log warning which we want to avoid).
-#
-# Note: ideally the mgr would gently stop mgr modules before respawning so that
-# the client mounts can be unmounted but this caused issues historically with
-# modules like the dashboard so an abrupt restart was chosen instead.
-
-mgrmodules:
- sequential:
- - print: "Enabling mgr modules"
- # other fragments append to this
-
-tasks:
- - sequential:
- - mgrmodules
diff --git a/qa/cephfs/conf/mgr.yaml b/qa/cephfs/conf/mgr.yaml
index fb6e9b09fa1..2b053f8bdcf 100644
--- a/qa/cephfs/conf/mgr.yaml
+++ b/qa/cephfs/conf/mgr.yaml
@@ -1,7 +1,9 @@
overrides:
ceph:
- conf:
+ cluster-conf:
mgr:
+ client mount timeout: 30
debug client: 20
debug mgr: 20
debug ms: 1
+ mon warn on pool no app: false
diff --git a/qa/cephfs/conf/mon.yaml b/qa/cephfs/conf/mon.yaml
index e33437ae404..9bc2eb852b3 100644
--- a/qa/cephfs/conf/mon.yaml
+++ b/qa/cephfs/conf/mon.yaml
@@ -3,7 +3,6 @@ overrides:
cluster-conf:
mon:
mon op complaint time: 120
- mon warn on pool no app: false
# cephadm can take up to 5 minutes to bring up remaining mons
# This needs to be set before cluster-conf configs are applied.
conf:
diff --git a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml
index 2ee219125e7..048cd5ce8b9 100644
--- a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml
+++ b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml
@@ -1,3 +1,12 @@
+teuthology:
+ premerge: |
+ log.debug("base kernel %s", base_config.kernel)
+ local kernel = base_config.kernel
+ if kernel.branch ~= "distro" then
+ log.debug("overriding testing kernel with %s", kernel)
+ yaml_fragment.kernel.client = kernel
+ end
+
kernel:
client:
branch: testing
diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml
index a3f3a010d43..5ac25a8f790 100644
--- a/qa/cephfs/overrides/ignorelist_health.yaml
+++ b/qa/cephfs/overrides/ignorelist_health.yaml
@@ -2,13 +2,17 @@ overrides:
ceph:
log-ignorelist:
- FS_DEGRADED
+ - fs.*is degraded
+ - filesystem is degraded
- FS_INLINE_DATA_DEPRECATED
- FS_WITH_FAILED_MDS
- MDS_ALL_DOWN
+ - filesystem is offline
- MDS_DAMAGE
- MDS_DEGRADED
- MDS_FAILED
- MDS_INSUFFICIENT_STANDBY
+ - insufficient standby MDS daemons available
- MDS_UP_LESS_THAN_MAX
- online, but wants
- filesystem is online with fewer MDS than max_mds
@@ -17,3 +21,7 @@ overrides:
- overall HEALTH_
- Replacing daemon
- deprecated feature inline_data
+ - BLUESTORE_SLOW_OP_ALERT
+ - slow operation indications in BlueStore
+ - experiencing slow operations in BlueStore
+ - MGR_MODULE_ERROR
diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml
index 1740134a2e0..07ca62e01fb 100644
--- a/qa/cephfs/overrides/pg_health.yaml
+++ b/qa/cephfs/overrides/pg_health.yaml
@@ -9,3 +9,5 @@ overrides:
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml
new file mode 100644
index 00000000000..d5ba487b9bf
--- /dev/null
+++ b/qa/config/crimson_bluestore.yaml
@@ -0,0 +1,25 @@
+overrides:
+ ceph:
+ fs: xfs
+ conf:
+ osd:
+ # crimson's osd objectstore option
+ crimson osd objectstore: bluestore
+ debug alienstore: 20
+ bluestore block size: 96636764160
+ debug bluestore: 20
+ debug bluefs: 20
+ debug rocksdb: 10
+ bluestore compression mode: aggressive
+ bluestore fsck on mount: true
+ bluestore compression algorithm: snappy
+ # lower the full ratios since we can fill up a 100gb osd so quickly
+ mon osd full ratio: .9
+ mon osd backfillfull_ratio: .85
+ mon osd nearfull ratio: .8
+ osd failsafe full ratio: .95
+ bluestore rocksdb cf: false
+ log to stderr: true
+ err to stderr: true
+ log flush on exit: true
+ log to file: false
diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml
index fa8f49a4986..a10c59d77cc 100644
--- a/qa/config/crimson_qa_overrides.yaml
+++ b/qa/config/crimson_qa_overrides.yaml
@@ -9,6 +9,7 @@ overrides:
osd pool default crimson: true
osd:
crimson osd obc lru size: 10
+ debug ms: 20
flavor: crimson
workunit:
env:
diff --git a/qa/config/crimson_seastore.yaml b/qa/config/crimson_seastore.yaml
new file mode 100644
index 00000000000..d1919456ab1
--- /dev/null
+++ b/qa/config/crimson_seastore.yaml
@@ -0,0 +1,20 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ # crimson's osd objectstore option
+ crimson osd objectstore: seastore
+ debug seastore: 20
+ debug seastore onode: 20
+ debug seastore odata: 20
+ debug seastore omap: 20
+ debug seastore tm: 20
+ debug seastore t: 20
+ debug seastore cleaner: 20
+ debug seastore epm: 20
+ debug seastore lba: 20
+ debug seastore fixedkv tree: 20
+ debug seastore cache: 20
+ debug seastore journal: 20
+ debug seastore device: 20
+ debug seastore backref: 20
diff --git a/qa/config/seastore.yaml b/qa/config/seastore.yaml
deleted file mode 100644
index 713d9322584..00000000000
--- a/qa/config/seastore.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-overrides:
- ceph:
- fs: xfs
- conf:
- osd:
- osd objectstore: seastore
diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs
index ea328eb22c7..c558a1382ef 100644
--- a/qa/crontab/teuthology-cronjobs
+++ b/qa/crontab/teuthology-cronjobs
@@ -52,16 +52,11 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce
00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority
08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority
16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority
-24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority
## ********** windows tests on main branch - weekly
# 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e $CEPH_QA_EMAIL
-## ********** crimson tests on main branch - weekly
-# 01 01 * * 0 CEPH_BRANCH=main; MACHINE_NAME=smithi; SUITE_NAME=crimson-rados; KERNEL=distro; $CW $SCHEDULE 100000 $CEPH_BRANCH $MACHINE_NAME $SUITE_NAME $CEPH_QA_EMAIL $KERNEL
-
-
## ********** teuthology/nop on main branch - daily
@daily $CW $SS 1 --ceph main --suite teuthology/nop -p 1 --force-priority
@@ -78,9 +73,10 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce
32 20 * * 4 $CW $SS 4 --ceph main --suite powercycle -p 950
40 20 * * 5 $CW $SS 1 --ceph main --suite rgw -p 950
48 20 * * 6 $CW $SS 4 --ceph main --suite krbd -p 950 --kernel testing
+56 20 * * 6 $CW $SS 1 --ceph main --suite crimson-rados -p 101 --force-priority --flavor crimson
-## squid branch runs - twice weekly
+## squid branch runs - twice weekly (crimson-rados is run weekly)
## suites rados and rbd use --subset arg and must be call with schedule_subset.sh
## see script in https://github.com/ceph/ceph/tree/main/qa/machine_types
@@ -93,6 +89,7 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce
32 21 * * 4,1 $CW $SS 4 --ceph squid --suite powercycle -p 100 --force-priority
40 21 * * 5,2 $CW $SS 1 --ceph squid --suite rgw -p 100 --force-priority
48 21 * * 6,3 $CW $SS 4 --ceph squid --suite krbd -p 100 --force-priority --kernel testing
+56 21 * * 6 $CW $SS 1 --ceph squid --suite crimson-rados -p 100 --force-priority --flavor crimson
## reef branch runs - weekly
## suites rados and rbd use --subset arg and must be call with schedule_subset.sh
@@ -124,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce
16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820
24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820
32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820
-40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820
### upgrade runs for reef release
###### on smithi
diff --git a/qa/distros/container-hosts/centos_9.stream.yaml b/qa/distros/container-hosts/centos_9.stream.yaml
index 425cb144b1d..d2eafe6f0a9 100644
--- a/qa/distros/container-hosts/centos_9.stream.yaml
+++ b/qa/distros/container-hosts/centos_9.stream.yaml
@@ -9,4 +9,7 @@ overrides:
tasks:
- pexec:
all:
+ # in order to work around a possible nvme-cli <-> libnvme linking issue
+ # See https://tracker.ceph.com/issues/67684
+ - sudo dnf remove nvme-cli -y
- sudo dnf install nvmetcli nvme-cli -y
diff --git a/qa/distros/container-hosts/centos_9.stream_runc.yaml b/qa/distros/container-hosts/centos_9.stream_runc.yaml
index 0f3f21d8ad4..d147851ec98 100644
--- a/qa/distros/container-hosts/centos_9.stream_runc.yaml
+++ b/qa/distros/container-hosts/centos_9.stream_runc.yaml
@@ -8,6 +8,9 @@ overrides:
tasks:
- pexec:
all:
+ # in order to work around a possible nvme-cli <-> libnvme linking issue
+ # See https://tracker.ceph.com/issues/67684
+ - sudo dnf remove nvme-cli -y
- sudo dnf install runc nvmetcli nvme-cli -y
- sudo sed -i 's/^#runtime = "crun"/runtime = "runc"/g' /usr/share/containers/containers.conf
- sudo sed -i 's/runtime = "crun"/#runtime = "crun"/g' /usr/share/containers/containers.conf
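For clarity, a sketch of what the two sed edits above do to /usr/share/containers/containers.conf (assuming the stock file ships the runtime setting commented out, which may vary by image):

    # first sed:  '#runtime = "crun"'  ->  'runtime = "runc"'
    # second sed: any remaining active 'runtime = "crun"' line is commented out
    grep -E '^#?runtime' /usr/share/containers/containers.conf   # verify the active runtime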
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/objectstore_debug/%
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/crimson-rados-experimental/seastore/basic/%
+++ b/qa/objectstore_debug/%
diff --git a/qa/objectstore_debug/bluestore-options/write$/write_random.yaml b/qa/objectstore_debug/bluestore-options/write$/write_random.yaml
new file mode 100644
index 00000000000..d14f561c72a
--- /dev/null
+++ b/qa/objectstore_debug/bluestore-options/write$/write_random.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ bluestore write v2 random: true
diff --git a/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml b/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml
new file mode 100644
index 00000000000..4b20e8e52ca
--- /dev/null
+++ b/qa/objectstore_debug/bluestore-options/write$/write_v1.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ bluestore write v2: false
diff --git a/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml b/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml
new file mode 100644
index 00000000000..238973b1165
--- /dev/null
+++ b/qa/objectstore_debug/bluestore-options/write$/write_v2.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ bluestore write v2: true
diff --git a/qa/objectstore_debug/bluestore-bitmap.yaml b/qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
index b18e04bee32..b18e04bee32 100644
--- a/qa/objectstore_debug/bluestore-bitmap.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-bitmap.yaml
diff --git a/qa/objectstore_debug/bluestore-comp-lz4.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml
index 46f993e686c..46f993e686c 100644
--- a/qa/objectstore_debug/bluestore-comp-lz4.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml
diff --git a/qa/objectstore_debug/bluestore-comp-snappy.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml
index b5d58414e3f..b5d58414e3f 100644
--- a/qa/objectstore_debug/bluestore-comp-snappy.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml
diff --git a/qa/objectstore_debug/bluestore-comp-zlib.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-zlib.yaml
index b47ebbb7c62..b47ebbb7c62 100644
--- a/qa/objectstore_debug/bluestore-comp-zlib.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-comp-zlib.yaml
diff --git a/qa/objectstore_debug/bluestore-comp-zstd.yaml b/qa/objectstore_debug/bluestore/bluestore-comp-zstd.yaml
index e2f5e4e5ba6..e2f5e4e5ba6 100644
--- a/qa/objectstore_debug/bluestore-comp-zstd.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-comp-zstd.yaml
diff --git a/qa/objectstore_debug/bluestore-hybrid.yaml b/qa/objectstore_debug/bluestore/bluestore-hybrid.yaml
index 68b9bc4279f..68b9bc4279f 100644
--- a/qa/objectstore_debug/bluestore-hybrid.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-hybrid.yaml
diff --git a/qa/objectstore_debug/bluestore-low-osd-mem-target.yaml b/qa/objectstore_debug/bluestore/bluestore-low-osd-mem-target.yaml
index b2a49790bc3..b2a49790bc3 100644
--- a/qa/objectstore_debug/bluestore-low-osd-mem-target.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-low-osd-mem-target.yaml
diff --git a/qa/objectstore_debug/bluestore-stupid.yaml b/qa/objectstore_debug/bluestore/bluestore-stupid.yaml
index ca811f131a7..ca811f131a7 100644
--- a/qa/objectstore_debug/bluestore-stupid.yaml
+++ b/qa/objectstore_debug/bluestore/bluestore-stupid.yaml
diff --git a/qa/rbd/krbd_discard_granularity.t b/qa/rbd/krbd_discard_granularity.t
index 844643baedb..8001786b0ab 100644
--- a/qa/rbd/krbd_discard_granularity.t
+++ b/qa/rbd/krbd_discard_granularity.t
@@ -1,11 +1,13 @@
+Default object size:
+
$ rbd create --size 20M img
$ DEV=$(sudo rbd map img)
$ blockdev --getiomin $DEV
65536
$ blockdev --getioopt $DEV
- 65536
+ 4194304
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
65536
$ sudo rbd unmap $DEV
@@ -14,7 +16,7 @@
$ blockdev --getiomin $DEV
512
$ blockdev --getioopt $DEV
- 512
+ 4194304
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
512
$ sudo rbd unmap $DEV
@@ -38,3 +40,45 @@
$ sudo rbd unmap $DEV
$ rbd rm --no-progress img
+
+Custom object size:
+
+ $ rbd create --size 20M --object-size 1M img
+
+ $ DEV=$(sudo rbd map img)
+ $ blockdev --getiomin $DEV
+ 65536
+ $ blockdev --getioopt $DEV
+ 1048576
+ $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
+ 65536
+ $ sudo rbd unmap $DEV
+
+ $ DEV=$(sudo rbd map -o alloc_size=512 img)
+ $ blockdev --getiomin $DEV
+ 512
+ $ blockdev --getioopt $DEV
+ 1048576
+ $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
+ 512
+ $ sudo rbd unmap $DEV
+
+ $ DEV=$(sudo rbd map -o alloc_size=1048576 img)
+ $ blockdev --getiomin $DEV
+ 1048576
+ $ blockdev --getioopt $DEV
+ 1048576
+ $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
+ 1048576
+ $ sudo rbd unmap $DEV
+
+ $ DEV=$(sudo rbd map -o alloc_size=2097152 img)
+ $ blockdev --getiomin $DEV
+ 1048576
+ $ blockdev --getioopt $DEV
+ 1048576
+ $ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
+ 1048576
+ $ sudo rbd unmap $DEV
+
+ $ rbd rm --no-progress img
diff --git a/qa/rgw/s3tests-branch.yaml b/qa/rgw/s3tests-branch.yaml
index ef6819c87e0..8710ce35893 100644
--- a/qa/rgw/s3tests-branch.yaml
+++ b/qa/rgw/s3tests-branch.yaml
@@ -1,4 +1,4 @@
overrides:
s3tests:
- force-branch: ceph-master
- # git_remote: https://github.com/ceph/
+ force-branch: ceph-master
+ # git_remote: https://github.com/ceph/
diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh
index f9c6924ce04..72d70ca7ad5 100755
--- a/qa/standalone/ceph-helpers.sh
+++ b/qa/standalone/ceph-helpers.sh
@@ -25,15 +25,6 @@ TMPDIR=${TMPDIR:-/tmp}
CEPH_BUILD_VIRTUALENV=${TMPDIR}
TESTDIR=${TESTDIR:-${TMPDIR}}
-if type xmlstarlet > /dev/null 2>&1; then
- XMLSTARLET=xmlstarlet
-elif type xml > /dev/null 2>&1; then
- XMLSTARLET=xml
-else
- echo "Missing xmlstarlet binary!"
- exit 1
-fi
-
if [ `uname` = FreeBSD ]; then
SED=gsed
AWK=gawk
@@ -1572,6 +1563,20 @@ function test_is_clean() {
#######################################################################
+##
+# Predicate checking if the named PG is in state "active+clean"
+#
+# @return 0 if the PG is active & clean, 1 otherwise
+#
+function is_pg_clean() {
+ local pgid=$1
+ local pg_state
+ pg_state=$(ceph pg $pgid query 2>/dev/null | jq -r ".state ")
+ [[ "$pg_state" == "active+clean"* ]]
+}
+
+#######################################################################
+
calc() { $AWK "BEGIN{print $*}"; }
##
@@ -1688,6 +1693,33 @@ function test_wait_for_clean() {
}
##
+# Wait until the named PG becomes clean or until a timeout of
+# $WAIT_FOR_CLEAN_TIMEOUT seconds.
+#
+# @return 0 if the PG is clean, 1 otherwise
+#
+function wait_for_pg_clean() {
+ local pg_id=$1
+ local -a delays=($(get_timeout_delays $WAIT_FOR_CLEAN_TIMEOUT 1 3))
+ local -i loop=0
+
+ flush_pg_stats || return 1
+
+ while true ; do
+ echo "#---------- $pgid loop $loop"
+ is_pg_clean $pg_id && break
+ if (( $loop >= ${#delays[*]} )) ; then
+ ceph report
+ echo "PG $pg_id is not clean after $loop iterations"
+ return 1
+ fi
+ sleep ${delays[$loop]}
+ loop+=1
+ done
+ return 0
+}
+
+##
# Wait until the cluster becomes peered or if it does not make progress
# for $WAIT_FOR_CLEAN_TIMEOUT seconds.
# Progress is measured either via the **get_is_making_recovery_progress**
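A minimal usage sketch for the two helpers added above (pool name and PG id are illustrative; in these standalone tests the first pool's PG is usually 1.0):

    create_pool foo 1 1 || return 1
    wait_for_clean || return 1            # cluster-wide wait, as before
    wait_for_pg_clean 1.0 || return 1     # per-PG wait, same WAIT_FOR_CLEAN_TIMEOUT backoff
    is_pg_clean 1.0 && echo "PG 1.0 is active+clean"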
@@ -1869,7 +1901,7 @@ function test_repair() {
wait_for_clean || return 1
repair 1.0 || return 1
kill_daemons $dir KILL osd || return 1
- ! TIMEOUT=1 repair 1.0 || return 1
+ ! TIMEOUT=2 repair 1.0 || return 1
teardown $dir || return 1
}
#######################################################################
@@ -1889,6 +1921,8 @@ function test_repair() {
#
function pg_scrub() {
local pgid=$1
+ # do not issue the scrub command unless the PG is clean
+ wait_for_pg_clean $pgid || return 1
local last_scrub=$(get_last_scrub_stamp $pgid)
ceph pg scrub $pgid
wait_for_scrub $pgid "$last_scrub"
@@ -1896,6 +1930,8 @@ function pg_scrub() {
function pg_deep_scrub() {
local pgid=$1
+ # do not issue the scrub command unless the PG is clean
+ wait_for_pg_clean $pgid || return 1
local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
ceph pg deep-scrub $pgid
wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
@@ -1912,7 +1948,7 @@ function test_pg_scrub() {
wait_for_clean || return 1
pg_scrub 1.0 || return 1
kill_daemons $dir KILL osd || return 1
- ! TIMEOUT=1 pg_scrub 1.0 || return 1
+ ! TIMEOUT=2 pg_scrub 1.0 || return 1
teardown $dir || return 1
}
@@ -1931,15 +1967,19 @@ function test_pg_scrub() {
#
function pg_schedule_scrub() {
local pgid=$1
+ # do not issue the scrub command unless the PG is clean
+ wait_for_pg_clean $pgid || return 1
local last_scrub=$(get_last_scrub_stamp $pgid)
- ceph pg scrub $pgid
+ ceph tell $pgid schedule-scrub
wait_for_scrub $pgid "$last_scrub"
}
function pg_schedule_deep_scrub() {
local pgid=$1
+ # do not issue the scrub command unless the PG is clean
+ wait_for_pg_clean $pgid || return 1
local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
- ceph pg deep-scrub $pgid
+ ceph tell $pgid schedule-deep-scrub
wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
}
@@ -1948,13 +1988,11 @@ function test_pg_schedule_scrub() {
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
pg_schedule_scrub 1.0 || return 1
- kill_daemons $dir KILL osd || return 1
- ! TIMEOUT=1 pg_scrub 1.0 || return 1
teardown $dir || return 1
}
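The schedule variants now go through 'ceph tell' instead of the operator 'ceph pg scrub' path; interactively, the commands these helpers issue look like this (PG id illustrative):

    ceph pg scrub 1.0                     # operator-initiated scrub (still what pg_scrub issues)
    ceph tell 1.0 schedule-scrub          # ask the primary to schedule a regular scrub
    ceph tell 1.0 schedule-deep-scrub     # deep variant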
@@ -2050,7 +2088,7 @@ function test_wait_for_scrub() {
wait_for_scrub $pgid "$last_scrub" || return 1
kill_daemons $dir KILL osd || return 1
last_scrub=$(get_last_scrub_stamp $pgid)
- ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1
+ ! TIMEOUT=2 wait_for_scrub $pgid "$last_scrub" || return 1
teardown $dir || return 1
}
@@ -2341,7 +2379,7 @@ function run_tests() {
shopt -s -o xtrace
PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
- export .:$PATH # make sure program from sources are preferred
+ export PATH=./bin:.:$PATH # make sure program from sources are preferred
export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one
export CEPH_ARGS
diff --git a/qa/standalone/crush/crush-classes.sh b/qa/standalone/crush/crush-classes.sh
index 558aabe6d93..a0662c3f1ee 100755
--- a/qa/standalone/crush/crush-classes.sh
+++ b/qa/standalone/crush/crush-classes.sh
@@ -52,7 +52,7 @@ function get_osds_up() {
local objectname=$2
local osds=$(ceph --format xml osd map $poolname $objectname 2>/dev/null | \
- $XMLSTARLET sel -t -m "//up/osd" -v . -o ' ')
+ xmlstarlet sel -t -m "//up/osd" -v . -o ' ')
# get rid of the trailing space
echo $osds
}
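For reference, what the xmlstarlet selection above extracts (pool and object names are illustrative):

    ceph --format xml osd map rbd someobj 2>/dev/null | \
        xmlstarlet sel -t -m "//up/osd" -v . -o ' '
    # prints the object's up set as space-separated OSD ids, e.g. "1 0 2 "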
diff --git a/qa/standalone/mon/mon-cluster-log.sh b/qa/standalone/mon/mon-cluster-log.sh
index 863a97c7cab..7b9adda0af6 100755
--- a/qa/standalone/mon/mon-cluster-log.sh
+++ b/qa/standalone/mon/mon-cluster-log.sh
@@ -62,7 +62,7 @@ function TEST_cluster_log_level() {
ceph config set mon.a mon_cluster_log_level info
ceph osd down 0
TIMEOUT=20 wait_for_osd up 0 || return 1
- grep -q "cluster [[]INF[]] osd.0.*boot" $dir/log
+ TIMEOUT=60 wait_for_string $dir/log "cluster [[]INF[]] osd.0.*boot"
return_code=$?
if [ $return_code -ne 0 ]; then
echo "Failed : Could not find INF log in the cluster log file"
@@ -145,9 +145,17 @@ function TEST_journald_cluster_log_level() {
ceph osd down 0
TIMEOUT=20 wait_for_osd up 0 || return 1
search_str="osd.0.*boot"
- sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log
- grep -q "$search_str" $dir/journal.log
- return_code=$?
+ return_code=1
+ RETRY_DURATION=60
+ for ((i=0; i < $RETRY_DURATION; i++)); do
+ sudo journalctl _COMM=ceph-mon CEPH_CHANNEL=cluster PRIORITY=6 --output=json-pretty --since "60 seconds ago" |jq '.MESSAGE' > $dir/journal.log
+ if ! grep "$search_str" $dir/journal.log; then
+ sleep 1
+ else
+ return_code=0
+ break
+ fi
+ done
if [ $return_code -ne 0 ]; then
echo "Failed : Could not find INF log in the journalctl log file"
ERRORS=$(($ERRORS + 1))
diff --git a/qa/standalone/osd-backfill/osd-backfill-space.sh b/qa/standalone/osd-backfill/osd-backfill-space.sh
index 6a5c69412f4..84b9703bbfc 100755
--- a/qa/standalone/osd-backfill/osd-backfill-space.sh
+++ b/qa/standalone/osd-backfill/osd-backfill-space.sh
@@ -609,9 +609,16 @@ function TEST_backfill_grow() {
wait_for_clean || return 1
+    # Capture the timestamp once the cluster is clean and recovery has finished
+ current_timestamp=$(date +"%Y-%m-%dT%H:%M:%S")
+
delete_pool $poolname
kill_daemons $dir || return 1
- ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
+
+    # Ignore num_bytes mismatch messages logged before the timestamp captured above
+ if ! awk -v ts="$current_timestamp" '$0 >= ts && /num_bytes mismatch/' $dir/osd.*.log > /dev/null; then
+ return 1
+ fi
}
# Create a 5 shard EC pool on 6 OSD cluster
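The awk filter above relies on OSD log lines beginning with an ISO-8601 timestamp, which compares correctly as a plain string, so '$0 >= ts' keeps only lines logged at or after the captured time. A standalone sketch (log file name illustrative):

    ts=$(date +"%Y-%m-%dT%H:%M:%S")
    # ... run the workload ...
    awk -v ts="$ts" '$0 >= ts && /num_bytes mismatch/' osd.0.log   # mismatches after ts only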
diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh
index aedfbc9b5cb..f7424de8ce1 100755
--- a/qa/standalone/osd/osd-bluefs-volume-ops.sh
+++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh
@@ -72,7 +72,7 @@ function TEST_bluestore() {
truncate $dir/0/block -s 4294967296 # 4GB
ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1
- truncate $dir/1/block -s 4311744512 # 4GB + 16MB
+ truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 10737418240
ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1
truncate $dir/2/block -s 4295099392 # 4GB + 129KB
ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh
index 6fea441b3a9..a34f4a47189 100755
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -219,6 +219,18 @@ function TEST_rados_repair_warning() {
ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
set +o pipefail
+ ceph health unmute OSD_TOO_MANY_REPAIRS
+ ceph tell osd.$primary clear_shards_repaired
+ sleep 10
+
+ set -o pipefail
+ # Should clear this
+ ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+ set +o pipefail
+
+ ceph tell osd.$primary clear_shards_repaired $OBJS
+ sleep 10
+
for i in $(seq 1 $OBJS)
do
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -235,7 +247,7 @@ function TEST_rados_repair_warning() {
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
- # Give mon a chance to notice additional OSD and unmute
+ # Give mon a chance to notice additional OSD and reset num_shards_repaired
# The default tick time is 5 seconds
CHECKTIME=10
LOOPS=0
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 3d3121fe8d8..7b77a60f35b 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -99,11 +99,11 @@ function TEST_recovery_scrub_1() {
kill_daemons $dir #|| return 1
declare -a err_strings
- err_strings[0]="recovery in progress. Only high priority scrubs allowed."
+ err_strings[0]="recovery in progress.*scrubs"
for osd in $(seq 0 $(expr $OSDS - 1))
do
- grep "recovery in progress. Only high priority scrubs allowed." $dir/osd.${osd}.log
+ grep "recovery in progress.*scrubs" $dir/osd.${osd}.log
done
for err_string in "${err_strings[@]}"
do
@@ -163,7 +163,7 @@ function wait_for_scrub_mod() {
fi
sleep 1
# are we still the primary?
- local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
if [ $orig_primary != $current_primary ]; then
echo $orig_primary no longer primary for $pgid
return 0
@@ -187,9 +187,14 @@ function wait_for_scrub_mod() {
#
function pg_scrub_mod() {
local pgid=$1
+ # Wait for the PG to reach the 'clean' state: operator scrub commands are
+ # rejected *and not remembered* if the PG is not clean.
+ wait_for_pg_clean $pgid || return 1
+
local last_scrub=$(get_last_scrub_stamp $pgid)
# locate the primary
- local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' `
local recovery=false
ceph pg scrub $pgid
#ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
@@ -229,138 +234,6 @@ function wait_background_check() {
return $return_code
}
-# osd_scrub_during_recovery=true make sure scrub happens
-function TEST_recovery_scrub_2() {
- local dir=$1
- local poolname=test
-
- TESTDATA="testdata.$$"
- OSDS=8
- PGS=32
- OBJECTS=40
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
- run_mgr $dir x || return 1
- local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 "
- ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
- ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
- ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 \
- $ceph_osd_args || return 1
- done
-
- # Create a pool with $PGS pgs
- create_pool $poolname $PGS $PGS
- wait_for_clean || return 1
- poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
- dd if=/dev/urandom of=$TESTDATA bs=1M count=50
- for i in $(seq 1 $OBJECTS)
- do
- rados -p $poolname put obj${i} $TESTDATA
- done
- rm -f $TESTDATA
-
- ceph osd pool set $poolname size 3
-
- ceph pg dump pgs
-
- # note that the following will be needed if the mclock scheduler is specified
- #ceph tell osd.* config get osd_mclock_override_recovery_settings
-
- # the '_max_active' is expected to be 0
- ceph tell osd.1 config get osd_recovery_max_active
- # both next parameters are expected to be >=3
- ceph tell osd.1 config get osd_recovery_max_active_hdd
- ceph tell osd.1 config get osd_recovery_max_active_ssd
-
- # Wait for recovery to start
- count=0
- while(true)
- do
- #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
- if test $(ceph --format json pg dump pgs |
- jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
- then
- break
- fi
- sleep 2
- if test "$count" -eq "10"
- then
- echo "Not enough recovery started simultaneously"
- return 1
- fi
- count=$(expr $count + 1)
- done
- ceph pg dump pgs
-
- pids=""
- recov_scrub_count=0
- for pg in $(seq 0 $(expr $PGS - 1))
- do
- run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
- done
- wait_background_check pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- ERRORS=0
- if test $recov_scrub_count -eq 0
- then
- echo "No scrubs occurred while PG recovering"
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
- pid=$(cat $pidfile)
- if ! kill -0 $pid
- then
- echo "OSD crash occurred"
- #tail -100 $dir/osd.0.log
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- # Work around for http://tracker.ceph.com/issues/38195
- kill_daemons $dir #|| return 1
-
- declare -a err_strings
- err_strings[0]="not scheduling scrubs due to active recovery"
-
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- grep "not scheduling scrubs" $dir/osd.${osd}.log
- done
- for err_string in "${err_strings[@]}"
- do
- found=false
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
- then
- found=true
- fi
- done
- if [ "$found" = "true" ]; then
- echo "Found log message not expected '$err_string'"
- ERRORS=$(expr $ERRORS + 1)
- fi
- done
-
- teardown $dir || return 1
-
- if [ $ERRORS != "0" ];
- then
- echo "TEST FAILED WITH $ERRORS ERRORS"
- return 1
- fi
-
- echo "TEST PASSED"
- return 0
-}
-
main osd-recovery-scrub "$@"
# Local Variables:
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index b717026e191..6dd5b10ae8f 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() {
['pool_name']="testpool"
['extras']=" --osd_scrub_auto_repair=true"
)
- local extr_dbg=3
standard_scrub_cluster $dir cluster_conf
local poolid=${cluster_conf['pool_id']}
local poolname=${cluster_conf['pool_name']}
@@ -5754,11 +5753,13 @@ function TEST_corrupt_scrub_erasure_overwrites() {
#
# Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
+# Update 2024: this functionality was removed from the code. The test will be skipped.
#
function TEST_periodic_scrub_replicated() {
local dir=$1
local poolname=psr_pool
local objname=POBJ
+ return 0
run_mon $dir a --osd_pool_default_size=2 || return 1
run_mgr $dir x || return 1
@@ -5795,12 +5796,13 @@ function TEST_periodic_scrub_replicated() {
flush_pg_stats
local last_scrub=$(get_last_scrub_stamp $pg)
- # Fake a schedule scrub
+ # Fake a scheduled deep scrub
ceph tell $pg schedule-scrub || return 1
# Wait for schedule regular scrub
wait_for_scrub $pg "$last_scrub"
# It needed to be upgraded
+ # update 2024: the "upgrade" functionality has been removed
grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1
# Bad object still known
@@ -5831,7 +5833,7 @@ function TEST_periodic_scrub_replicated() {
flush_pg_stats
# Request a regular scrub and it will be done
- pg_schedule_scrub $pg
+ pg_scrub $pg
grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
# deep-scrub error is no longer present
@@ -6249,6 +6251,254 @@ function TEST_request_scrub_priority() {
grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1
}
+#
+# Testing the "split scrub store" feature: shallow scrubs do not
+# purge deep errors from the store.
+#
+# Corrupt one copy of a replicated pool, creating both shallow and deep errors.
+# Then shallow-scrub the pool and verify that the deep errors are still present.
+#
+function TEST_dual_store_replicated_cluster() {
+ local dir=$1
+ local poolname=csr_pool
+ local total_objs=19
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 "
+ for osd in $(seq 0 1)
+ do
+ run_osd $dir $osd $ceph_osd_args || return 1
+ done
+
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ create_pool foo 1 || return 1
+ create_pool $poolname 1 1 || return 1
+ wait_for_clean || return 1
+
+ ceph osd pool set $poolname noscrub 1
+ ceph osd pool set $poolname nodeep-scrub 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+ add_something $dir $poolname $objname || return 1
+
+ rados --pool $poolname setomapheader $objname hdr-$objname || return 1
+ rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1
+ done
+
+ # Overwrite ROBJ19 with a 1 MB + 1 KB object (larger than the osd-max-object-size set below)
+ dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025
+ rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1
+ rm -f $dir/new.ROBJ19
+
+ local pg=$(get_pg $poolname ROBJ0)
+ local primary=$(get_primary $poolname ROBJ0)
+
+ # Compute an old omap digest and save oi
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ pg_deep_scrub $pg
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+
+ # Alternate corruption between osd.0 and osd.1
+ local osd=$(expr $i % 2)
+
+ case $i in
+ 1)
+ # Size (deep scrub data_digest too)
+ local payload=UVWXYZZZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 2)
+ # digest (deep scrub only)
+ local payload=UVWXYZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 3)
+ # missing
+ objectstore_tool $dir $osd $objname remove || return 1
+ ;;
+
+ 4)
+ # Modify omap value (deep scrub only)
+ objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1
+ ;;
+
+ 5)
+ # Delete omap key (deep scrub only)
+ objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1
+ ;;
+
+ 6)
+ # Add extra omap key (deep scrub only)
+ echo extra > $dir/extra-val
+ objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1
+ rm $dir/extra-val
+ ;;
+
+ 7)
+ # Modify omap header (deep scrub only)
+ echo -n newheader > $dir/hdr
+ objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1
+ rm $dir/hdr
+ ;;
+
+ 8)
+ rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
+ rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
+
+ # Break xattrs
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
+ objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
+ echo -n val3-$objname > $dir/newval
+ objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
+ rm $dir/bad-val $dir/newval
+ ;;
+
+ 9)
+ objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi
+ echo -n D > $dir/change
+ rados --pool $poolname put $objname $dir/change
+ objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi
+ rm $dir/robj9-oi $dir/change
+ ;;
+
+ # ROBJ10 must be handled after digests are re-computed by a deep scrub below
+ # ROBJ11 must be handled with config change before deep scrub
+ # ROBJ12 must be handled with config change before scrubs
+ # ROBJ13 must be handled before scrubs
+
+ 14)
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1
+ objectstore_tool $dir 1 $objname rm-attr _ || return 1
+ rm $dir/bad-val
+ ;;
+
+ 15)
+ objectstore_tool $dir $osd $objname rm-attr _ || return 1
+ ;;
+
+ 16)
+ objectstore_tool $dir 0 $objname rm-attr snapset || return 1
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1
+ ;;
+
+ 17)
+ # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ17
+ echo $payload > $dir/new.ROBJ17
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1
+ ;;
+
+ 18)
+ # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ18
+ echo $payload > $dir/new.ROBJ18
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1
+ # Make one replica have a different object info, so a full repair must happen too
+ objectstore_tool $dir $osd $objname corrupt-info || return 1
+ ;;
+
+ 19)
+ # Set osd-max-object-size smaller than this object's size
+
+ esac
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ ceph tell osd.\* injectargs -- --osd-max-object-size=1048576
+
+ inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+ inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+ inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+
+ # first sequence: the final shallow scrub should not override any of the deep errors
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_1b_s.json
+
+ pg_deep_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_2s.json
+
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_3s.json
+
+ diff -u $dir/dp_results.json $dir/sh2_results.json || return 1
+
+ # inject a read error, which is a special case: the scrub encountering the read error
+ # would override the previously collected shard info.
+ inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+
+ pg_deep_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json
+ # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s.json
+
+ pg_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \
+ $dir/sh2Part2_w13_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json
+
+ # the shallow scrub results should differ from the results of the deep
+ # scrub preceding it, but the difference should be limited to ROBJ13
+ diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1
+ diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1
+
+ ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it
+ return 0
+}
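+# Illustrative sketch only (not part of the test): the ROBJ13 exclusion above
+# uses jq's del()/select() on a path expression, e.g.:
+#   echo '{"inconsistents":[{"object":{"name":"ROBJ13"}},{"object":{"name":"ROBJ1"}}]}' |
+#     jq 'del(.inconsistents[] | select(.object.name == "ROBJ13")) | .inconsistents'
+#   # -> [ { "object": { "name": "ROBJ1" } } ]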
+
main osd-scrub-repair "$@"
diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh
index ec0066d955f..385479258f2 100755
--- a/qa/standalone/scrub/osd-scrub-test.sh
+++ b/qa/standalone/scrub/osd-scrub-test.sh
@@ -57,7 +57,7 @@ function TEST_scrub_test() {
TESTDATA="testdata.$$"
run_mon $dir a --osd_pool_default_size=3 || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
@@ -160,7 +160,7 @@ function TEST_interval_changes() {
# This min scrub interval results in 30 seconds backoff time
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd --osd_scrub_min_interval=$min_interval --osd_scrub_max_interval=$max_interval --osd_scrub_interval_randomize_ratio=0 || return 1
@@ -205,7 +205,9 @@ function TEST_interval_changes() {
perf_counters $dir $OSDS
}
-function TEST_scrub_extended_sleep() {
+# RRR 6aug24: this test cannot work as expected following the changes in the
+# scrub-type-to-overrides matrix. Disabled for now.
+function NO_scrub_extended_sleep() {
local dir=$1
local poolname=test
local OSDS=3
@@ -224,7 +226,7 @@ function TEST_scrub_extended_sleep() {
DAY_END=$(expr $DAY + 3)
run_mon $dir a --osd_pool_default_size=3 || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
@@ -312,7 +314,7 @@ function _scrub_abort() {
fi
run_mon $dir a --osd_pool_default_size=3 || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
# Set scheduler to "wpq" until there's a reliable way to query scrub
@@ -424,7 +426,7 @@ function TEST_scrub_permit_time() {
TESTDATA="testdata.$$"
run_mon $dir a --osd_pool_default_size=3 || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
local scrub_begin_hour=$(date -d '2 hour ago' +"%H" | sed 's/^0//')
local scrub_end_hour=$(date -d '1 hour ago' +"%H" | sed 's/^0//')
for osd in $(seq 0 $(expr $OSDS - 1))
@@ -531,7 +533,7 @@ function TEST_dump_scrub_schedule() {
TESTDATA="testdata.$$"
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
# Set scheduler to "wpq" until there's a reliable way to query scrub states
# with "--osd-scrub-sleep" set to 0. The "mclock_scheduler" overrides the
@@ -542,6 +544,9 @@ function TEST_dump_scrub_schedule() {
--osd_op_queue=wpq \
--osd_stats_update_period_not_scrubbing=1 \
--osd_stats_update_period_scrubbing=1 \
+ --osd_scrub_retry_after_noscrub=1 \
+ --osd_scrub_retry_pg_state=2 \
+ --osd_scrub_retry_delay=2 \
--osd_scrub_sleep=0.2"
for osd in $(seq 0 $(expr $OSDS - 1))
@@ -598,17 +603,16 @@ function TEST_dump_scrub_schedule() {
declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" )
wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1
- sleep 2
-
#
# step 2: set noscrub and request a "periodic scrub". Watch for the change in the 'is the scrub
# scheduled for the future' value
#
- ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
- ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
ceph osd set noscrub || return 1
sleep 2
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1
+ ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1
+ sleep 8
saved_last_stamp=${sched_data['query_last_stamp']}
ceph tell $pgid schedule-scrub
@@ -638,7 +642,8 @@ function TEST_dump_scrub_schedule() {
# missed it.
declare -A cond_active_dmp=( ['dmp_state_has_scrubbing']="true" ['query_active']="false" )
sched_data=()
- wait_any_cond $pgid 10 $saved_last_stamp cond_active_dmp "WaitingActive " sched_data || return 1
+ wait_any_cond $pgid 10 $saved_last_stamp cond_active_dmp "WaitingActive " sched_data
+ sleep 4
perf_counters $dir $OSDS
}
@@ -653,7 +658,7 @@ function TEST_pg_dump_objects_scrubbed() {
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
@@ -680,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() {
teardown $dir || return 1
}
+function wait_initial_scrubs() {
+ local -n pg_to_prim_dict=$1
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ # set a long schedule for the periodic scrubs. Wait for the
+ # initial 'no previous scrub is known' scrubs to finish for all PGs.
+ ceph tell osd.* config set osd_scrub_min_interval 7200
+ ceph tell osd.* config set osd_deep_scrub_interval 14400
+ ceph tell osd.* config set osd_max_scrubs 32
+ ceph tell osd.* config set osd_scrub_sleep 0
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max 10
+ ceph tell osd.* config set osd_scrub_chunk_max 10
+
+ for pg in "${!pg_to_prim_dict[@]}"; do
+ (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg"
+ ceph tell $pg scrub || return 1
+ done
+
+ sleep 1
+ (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+
+ tout=20
+ while [ $tout -gt 0 ] ; do
+ sleep 0.5
+ (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})'
+ not_done=$(ceph pg dump pgs --format=json-pretty | \
+ jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l )
+ # note: an empty result still prints one line ("[]"), which we ignore
+ if [ "$not_done" -le 1 ]; then
+ break
+ fi
+ not_done=$(( (not_done - 2) / 4 ))
+ echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)"
+ tout=$((tout - 1))
+ done
+ (( tout == 0 )) && return 1
+ return 0
+}
+
+
+# Whenever a PG is being scrubbed at regular (periodic) urgency and is queued
+# for its replicas' reservations:
+# if the operator requests a scrub of the same PG, the operator's request
+# should trigger an abort of the ongoing scrub.
+#
+# The test process:
+# - a periodic scrub of a PG is initiated. That scrub is configured to be very slow.
+# - a second PG, sharing some of its replicas, is instructed to scrub. That one
+#   should get stuck in replica reservation. We verify that.
+# - the operator then requests a scrub of that second PG. The original (pending)
+#   scrub should be aborted. We check that:
+#   - the new, operator-requested scrub is scheduled, and
+#   - the replicas' reservations are released.
+function TEST_abort_periodic_for_operator() {
+ local dir=$1
+ local -A cluster_conf=(
+ ['osds_num']="5"
+ ['pgs_in_pool']="16"
+ ['pool_name']="test"
+ )
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1
+ local poolid=${cluster_conf['pool_id']}
+ local poolname=${cluster_conf['pool_name']}
+ echo "Pool: $poolname : $poolid"
+
+ #turn off '-x' (but remember previous state)
+ local saved_echo_flag=${-//[^x]/}
+ set +x
+
+ # fill the pool with some data
+ TESTDATA="testdata.$$"
+ dd if=/dev/urandom of=$TESTDATA bs=320 count=1
+ for i in $( seq 1 256 )
+ do
+ rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null
+ done
+ rm -f $TESTDATA
+ if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+
+ # create the dictionary of the PGs in the pool
+ declare -A pg_pr
+ declare -A pg_ac
+ declare -A pg_po
+ build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"
+ (( extr_dbg >= 2 )) && echo "PGs table:"
+ for pg in "${!pg_pr[@]}"; do
+ (( extr_dbg >= 2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}"
+ done
+
+ wait_initial_scrubs pg_pr || return 1
+
+ # limit all OSDs to one scrub at a time
+ ceph tell osd.* config set osd_max_scrubs 1
+ ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1
+
+ # configure for slow scrubs
+ ceph tell osd.* config set osd_scrub_sleep 3
+ ceph tell osd.* config set osd_shallow_scrub_chunk_max 2
+ ceph tell osd.* config set osd_scrub_chunk_max 2
+ (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty
+
+ # the first PG to work with:
+ local pg1="1.0"
+ # and another one, that shares its primary, and at least one more active set member
+ local pg2=""
+ for pg in "${!pg_pr[@]}"; do
+ if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then
+ local -i common=0
+ count_common_active $pg $pg1 pg_ac common
+ if [[ $common -gt 1 ]]; then
+ pg2=$pg
+ break
+ fi
+ fi
+ done
+ if [[ -z "$pg2" ]]; then
+ # \todo handle the case when no such PG is found
+ echo "No PG found with the same primary as $pg1"
+ return 1
+ fi
+
+ # the common primary is allowed two concurrent scrubs
+ ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2
+ echo "The two PGs to manipulate are $pg1 and $pg2"
+
+ set_query_debug "$pg1"
+ # wait till the information published by pg1 is updated to show it as
+ # not being scrubbed
+ local is_act
+ for i in $( seq 1 3 )
+ do
+ is_act=$(ceph pg "$pg1" query | jq '.scrubber.active')
+ if [[ "$is_act" = "false" ]]; then
+ break
+ fi
+ echo "Still waiting for pg $pg1 to finish scrubbing"
+ sleep 0.7
+ done
+ ceph pg dump pgs
+ if [[ "$is_act" != "false" ]]; then
+ ceph pg "$pg1" query
+ echo "PG $pg1 appears to be still scrubbing"
+ return 1
+ fi
+ sleep 0.5
+
+ echo "Initiating a periodic scrub of $pg1"
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ ceph tell $pg1 schedule-deep-scrub || return 1
+ sleep 1
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+ for i in $( seq 1 14 )
+ do
+ sleep 0.5
+ stt=$(ceph pg "$pg1" query | jq '.scrubber')
+ is_active=$(echo $stt | jq '.active')
+ is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas')
+ if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then
+ break
+ fi
+ echo "Still waiting for pg $pg1 to start scrubbing: $stt"
+ done
+ if [[ "$is_active" != "true" || "$is_reserving_replicas" != "false" ]]; then
+ ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ echo "The scrub is not active or is reserving replicas"
+ return 1
+ fi
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+
+ # $pg1 is scrubbing and has reserved its replicas - some of which are shared
+ # with $pg2. As osd_max_scrubs was set to 1, that should prevent $pg2 from
+ # reserving its replicas.
+
+ (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty
+
+ # now - the second scrub - which should be blocked on reserving replicas
+ set_query_debug "$pg2"
+ ceph tell "$pg2" schedule-deep-scrub
+ sleep 0.5
+ (( extr_dbg >= 2 )) && echo "===================================================================================="
+ (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber'
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+ sleep 1
+ (( extr_dbg >= 2 )) && echo "===================================================================================="
+ (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber'
+ (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber'
+
+ # make sure pg2 scrub is stuck in the reserving state
+ local stt2=$(ceph pg "$pg2" query | jq '.scrubber')
+ local pg2_is_reserving
+ pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas')
+ if [[ "$pg2_is_reserving" != "true" ]]; then
+ echo "The scheduled scrub for $pg2 should have been stuck"
+ ceph pg dump pgs
+ return 1
+ fi
+
+ # now - issue an operator-initiated scrub on pg2.
+ # The periodic scrub should be aborted, and the operator-initiated scrub should start.
+ echo "Instructing $pg2 to perform a high-priority scrub"
+ ceph tell "$pg2" scrub
+ for i in $( seq 1 10 )
+ do
+ sleep 0.5
+ stt2=$(ceph pg "$pg2" query | jq '.scrubber')
+ pg2_is_active=$(echo $stt2 | jq '.active')
+ pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas')
+ if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then
+ break
+ fi
+ echo "Still waiting: $stt2"
+ done
+
+ if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then
+ echo "The high-priority scrub for $pg2 is not active or is reserving replicas"
+ return 1
+ fi
+ echo "Done"
+}
+
+
+
main osd-scrub-test "$@"
# Local Variables:
diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh
index b0922892a4a..dd37b643e08 100644
--- a/qa/standalone/scrub/scrub-helpers.sh
+++ b/qa/standalone/scrub/scrub-helpers.sh
@@ -240,8 +240,8 @@ function standard_scrub_cluster() {
local saved_echo_flag=${-//[^x]/}
set +x
- run_mon $dir a --osd_pool_default_size=$OSDS || return 1
- run_mgr $dir x || return 1
+ run_mon $dir a --osd_pool_default_size=3 || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
--osd_scrub_interval_randomize_ratio=0 \
@@ -249,6 +249,12 @@ function standard_scrub_cluster() {
--osd_pool_default_pg_autoscale_mode=off \
--osd_pg_stat_report_interval_max_seconds=1 \
--osd_pg_stat_report_interval_max_epochs=1 \
+ --osd_stats_update_period_not_scrubbing=3 \
+ --osd_stats_update_period_scrubbing=1 \
+ --osd_scrub_retry_after_noscrub=5 \
+ --osd_scrub_retry_pg_state=5 \
+ --osd_scrub_retry_delay=3 \
+ --osd_pool_default_size=3 \
$extra_pars"
for osd in $(seq 0 $(expr $OSDS - 1))
@@ -294,6 +300,107 @@ function standard_scrub_wpq_cluster() {
}
+# Parse the output of a 'pg dump pgs_brief' command and build a set of dictionaries:
+# - pg_primary_dict: a dictionary of pgid -> acting_primary
+# - pg_acting_dict: a dictionary of pgid -> acting set
+# - pg_pool_dict: a dictionary of pgid -> pool
+# If the input file is '-', the function will fetch the dump directly from the ceph cluster.
+function build_pg_dicts {
+ local dir=$1
+ local -n pg_primary_dict=$2
+ local -n pg_acting_dict=$3
+ local -n pg_pool_dict=$4
+ local infile=$5
+
+ local extr_dbg=0 # note: 3 and above leave some temp files around
+
+ #turn off '-x' (but remember previous state)
+ local saved_echo_flag=${-//[^x]/}
+ set +x
+
+ # if the infile name is '-', fetch the dump directly from the ceph cluster
+ if [[ $infile == "-" ]]; then
+ local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty"
+ local ceph_cmd_out
+ ceph_cmd_out=$(eval $ceph_cmd)
+ local -r ceph_cmd_rc=$?
+ if [[ $ceph_cmd_rc -ne 0 ]]; then
+ echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc"
+ fi
+ (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2
+ l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' `
+ else
+ l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile `
+ fi
+ (( extr_dbg >= 2 )) && echo "L0: $l0"
+
+ mapfile -t l1 < <(echo "$l0" | jq -c '.[]')
+ (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}"
+
+ for item in "${l1[@]}"; do
+ pgid=$(echo "$item" | jq -r '.pgid')
+ acting=$(echo "$item" | jq -r '.acting | @sh')
+ pg_acting_dict["$pgid"]=$acting
+ acting_primary=$(echo "$item" | jq -r '.acting_primary')
+ pg_primary_dict["$pgid"]=$acting_primary
+ pool=$(echo "$item" | jq -r '.pool')
+ pg_pool_dict["$pgid"]=$pool
+ done
+
+ if [[ -n "$saved_echo_flag" ]]; then set -x; fi
+}
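+# Example usage (illustrative; mirrors the call in TEST_abort_periodic_for_operator):
+#   declare -A pg_pr pg_ac pg_po
+#   build_pg_dicts "$dir" pg_pr pg_ac pg_po "-"
+#   for pg in "${!pg_pr[@]}"; do
+#     echo "$pg: primary=${pg_pr[$pg]} acting=( ${pg_ac[$pg]} ) pool=${pg_po[$pg]}"
+#   done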
+
+
+# a function that counts the number of common active-set elements between two PGs
+# 1 - the first PG
+# 2 - the second PG
+# 3 - the dictionary of active sets
+# 4 - [out] the number of common elements
+function count_common_active {
+ local pg1=$1
+ local pg2=$2
+ local -n pg_acting_dict=$3
+ local -n res=$4
+
+ local -a a1=(${pg_acting_dict[$pg1]})
+ local -a a2=(${pg_acting_dict[$pg2]})
+
+ local -i cnt=0
+ for i in "${a1[@]}"; do
+ for j in "${a2[@]}"; do
+ if [[ $i -eq $j ]]; then
+ cnt=$((cnt+1))
+ fi
+ done
+ done
+
+ res=$cnt
+}
+
+
+# given a PG, find another one with a disjoint active set
+# - but allow a possible common Primary
+# 1 - the PG
+# 2 - the dictionary of active sets
+# 3 - the dictionary of primaries
+# 4 - [out] - the PG with a disjoint active set
+function find_disjoint_but_primary {
+ local pg=$1
+ local -n ac_dict=$2
+ local -n p_dict=$3
+ local -n res=$4
+
+ for cand in "${!ac_dict[@]}"; do
+ if [[ "$cand" != "$pg" ]]; then
+ local -i common=0
+ count_common_active "$pg" "$cand" ac_dict common
+ if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" ) ]]; then
+ res=$cand
+ return
+ fi
+ fi
+ done
+}
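+# Example usage (illustrative sketch; 'other_pg' is a hypothetical variable name):
+#   local other_pg=""
+#   find_disjoint_but_primary "$pg1" pg_ac pg_pr other_pg
+#   [[ -n "$other_pg" ]] && echo "PG with a disjoint acting set: $other_pg"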
+
+
+
# A debug flag is set for the PG specified, causing the 'pg query' command to display
# an additional 'scrub sessions counter' field.
#
diff --git a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml
index 7e7ede3e334..5be06bc6732 100644
--- a/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml
+++ b/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml
@@ -21,7 +21,6 @@ overrides:
ceph_repository: dev
ceph_mgr_modules:
- status
- - restful
cephfs_pools:
- name: "cephfs_data"
pg_num: "64"
diff --git a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml b/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml
deleted file mode 100644
index 8e389134b92..00000000000
--- a/qa/suites/ceph-ansible/smoke/basic/4-tasks/rest.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-tasks:
-- exec:
- mgr.x:
- - systemctl stop ceph-mgr.target
- - sleep 5
- - ceph -s
-- exec:
- mon.a:
- - ceph restful create-key admin
- - ceph restful create-self-signed-cert
- - ceph restful restart
-- workunit:
- clients:
- client.0:
- - rest/test-restful.sh
diff --git a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml
index 309f5060045..53e2b7fdbc8 100644
--- a/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml
+++ b/qa/suites/cephmetrics/2-ceph/ceph_ansible.yaml
@@ -20,7 +20,6 @@ overrides:
ceph_repository: dev
ceph_mgr_modules:
- status
- - restful
cephfs_pools:
- name: "cephfs_data"
pg_num: "64"
diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa
index fea2489fdf6..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/.qa
+++ b/qa/suites/crimson-rados-experimental/.qa
@@ -1 +1 @@
-../.qa \ No newline at end of file
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml
deleted file mode 120000
index bd9854e7029..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/supported/centos_latest.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml
deleted file mode 100644
index d8e5898b99f..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-overrides:
- ceph-deploy:
- conf:
- global:
- osd pool default size: 2
- osd crush chooseleaf type: 0
- osd pool default pg num: 128
- osd pool default pgp num: 128
- ceph:
- conf:
- osd:
- osd shutdown pgref assert: true
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0]
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml
deleted file mode 100644
index c22f08eecf8..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-overrides:
- install:
- ceph:
- flavor: crimson
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- debug monc: 20
- mon:
- mon min osdmap epochs: 50
- paxos service trim min: 10
- # prune full osdmaps regularly
- mon osdmap full prune min: 15
- mon osdmap full prune interval: 2
- mon osdmap full prune txsize: 2
- flavor: crimson
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml
deleted file mode 120000
index 6a70c381709..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/config/seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml
deleted file mode 100644
index ad8c921425b..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-overrides:
- ceph:
- log-ignorelist:
- - reached quota
- - but it is still running
- - overall HEALTH_
- - \(POOL_FULL\)
- - \(SMALLER_PGP_NUM\)
- - \(CACHE_POOL_NO_HIT_SET\)
- - \(CACHE_POOL_NEAR_FULL\)
- - \(POOL_APP_NOT_ENABLED\)
- - \(PG_AVAILABILITY\)
- - \(PG_DEGRADED\)
- conf:
- client:
- debug ms: 1
- mon:
- mon warn on pool no app: false
- osd:
- osd class load list: "*"
- osd class default list: "*"
- osd blocked scrub grace period: 3600
-tasks:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
- - rados/test_pool_quota.sh
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml
deleted file mode 100644
index 25efcdac83d..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-overrides:
- ceph:
- crush_tunables: optimal
- conf:
- mon:
- mon osd initial require min compat client: luminous
- osd:
- osd_discard_disconnected_ops: false
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- max_attr_len: 8192
- op_weights:
- read: 45
- write: 45
- delete: 10
diff --git a/qa/suites/rados/rest/% b/qa/suites/crimson-rados-experimental/thrash/%
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/rados/rest/%
+++ b/qa/suites/crimson-rados-experimental/thrash/%
diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/.qa
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled
new file mode 120000
index 00000000000..5393a75548a
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled
@@ -0,0 +1 @@
+.qa/overrides/2-size-2-min-size.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml
new file mode 120000
index 00000000000..5ff70eadf75
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml
@@ -0,0 +1 @@
+.qa/overrides/3-size-2-min-size.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa
diff --git a/qa/suites/fs/thrash/workloads/overrides/+ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/fs/thrash/workloads/overrides/+
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml
diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml
index abd86d7d986..abd86d7d986 120000
--- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled
new file mode 120000
index 00000000000..47afd70202d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled
@@ -0,0 +1 @@
+.qa/overrides/more-active-recovery.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled
new file mode 100644
index 00000000000..0bbc72db754
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled
@@ -0,0 +1,6 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_async_recovery_min_cost: 1
+ osd_object_clean_region_max_num_intervals: 1000
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled
new file mode 100644
index 00000000000..4aed086bcc3
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_async_recovery_min_cost: 1
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled
new file mode 100644
index 00000000000..88f15f2f691
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_object_clean_region_max_num_intervals: 1000
diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml
index 9774de6887b..79641f695ab 100644
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml
@@ -6,6 +6,15 @@ overrides:
conf:
osd:
osd shutdown pgref assert: true
+ crimson alien thread cpu cores: 6-7
+ osd.0:
+ crimson seastar cpu cores: 0-2
+ osd.1:
+ crimson seastar cpu cores: 3-5
+ osd.2:
+ crimson seastar cpu cores: 0-2
+ osd.3:
+ crimson seastar cpu cores: 3-5
global:
ms cluster mode: crc
ms service mode: crc
diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled
new file mode 100644
index 00000000000..e559d9126e8
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled
@@ -0,0 +1,4 @@
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro
new file mode 120000
index 00000000000..a5b729b9efa
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro
@@ -0,0 +1 @@
+.qa/distros/crimson-supported-all-distro/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml
index 2bf67af1b18..2bf67af1b18 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml
new file mode 100644
index 00000000000..ecad09cfe3a
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml
@@ -0,0 +1,11 @@
+overrides:
+ install:
+ ceph:
+ flavor: crimson
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ debug monc: 20
+ flavor: crimson
diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled
new file mode 100644
index 00000000000..0c2062240ee
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled
@@ -0,0 +1,16 @@
+# no need to verify os + flavor + sha1
+verify_ceph_hash: false
+tasks:
+- cephadm:
+ conf:
+ mgr:
+ debug ms: 1
+ debug mgr: 20
+ debug osd: 10
+- cephadm.shell:
+ mon.a:
+ - ceph orch status
+ - ceph orch ps
+ - ceph orch ls
+ - ceph orch host ls
+ - ceph orch device ls
diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml
new file mode 120000
index 00000000000..61e26e7acf8
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml
@@ -0,0 +1 @@
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/rados/rest/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/rados/rest/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml
new file mode 100644
index 00000000000..aa44b6101ff
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml
@@ -0,0 +1,34 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - but it is still running
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 3
+ osd snap trim sleep: 2
+ osd delete sleep: 1
+ mon:
+ mon min osdmap epochs: 50
+ paxos service trim min: 10
+ # prune full osdmaps regularly
+ mon osdmap full prune min: 15
+ mon osdmap full prune interval: 2
+ mon osdmap full prune txsize: 2
+tasks:
+- thrashosds:
+ timeout: 2400
+ dump_ops_enable: false
+ sighup_delay: 0
+ min_in: 3
+ noscrub_toggle_delay: 0
+ chance_thrash_pg_upmap: 0
+ reweight_osd: 0
+ thrash_primary_affinity: false
+ ceph_objectstore_tool: false
+ chance_inject_pause_short: 0
+ chance_thrash_cluster_full: 0
+ chance_reset_purged_snaps_last: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml
new file mode 120000
index 00000000000..9124eb1aa29
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml
@@ -0,0 +1 @@
+.qa/tasks/thrashosds-health.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml
new file mode 100644
index 00000000000..8c9764ade84
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml
@@ -0,0 +1,13 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ admin socket: /var/run/ceph/ceph-$name.asok
+tasks:
+- radosbench:
+ clients: [client.0]
+ time: 150
+- admin_socket:
+ client.0:
+ objecter_requests:
+ test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml
new file mode 100644
index 00000000000..d35e8421ab4
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml
@@ -0,0 +1,20 @@
+overrides:
+ conf:
+ osd:
+ osd deep scrub update digest min age: 0
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml
new file mode 100644
index 00000000000..902c4b56a1e
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml
@@ -0,0 +1,49 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml
new file mode 100644
index 00000000000..071f55e3928
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml
new file mode 100644
index 00000000000..afe04229898
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ balance_reads: true
+ max_attr_len: 8192
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+ setattr: 25
+ rmattr: 25
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml
new file mode 100644
index 00000000000..445b582ea42
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ localize_reads: true
+ max_attr_len: 8192
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+ setattr: 25
+ rmattr: 25
diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml
index af0ac39310e..e7e8070fd76 100644
--- a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-balanced.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml
@@ -1,3 +1,6 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
tasks:
- rados:
clients: [client.0]
@@ -6,16 +9,15 @@ tasks:
max_in_flight: 64
objects: 1024
size: 16384
- ec_pool: true
- balanced_reads: true
+ max_attr_len: 8192
op_weights:
read: 100
- write: 0
- append: 100
+ write: 100
delete: 50
snap_create: 50
snap_remove: 50
- rollback: 50
- copy_from: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
setattr: 25
rmattr: 25
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml
new file mode 100644
index 00000000000..1161c3cc253
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml
@@ -0,0 +1,15 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ balance_reads: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml
new file mode 100644
index 00000000000..80af0def0e4
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml
@@ -0,0 +1,15 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ localize_reads: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml
new file mode 100644
index 00000000000..0694ffcd0d6
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml
@@ -0,0 +1,14 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml
new file mode 100644
index 00000000000..606dcae6922
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml
@@ -0,0 +1,8 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_fadvise_dontneed: true
+ op_weights:
+ write: 100
diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml
index e84f396e4b2..481e393be4a 120000
--- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml
+++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml
@@ -1 +1 @@
-.qa/config/bluestore.yaml \ No newline at end of file
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml
index 6a70c381709..61e26e7acf8 120000
--- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml
+++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml
@@ -1 +1 @@
-.qa/config/seastore.yaml \ No newline at end of file
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml
index 06d475e2165..1302e14f21a 100644
--- a/qa/suites/crimson-rados/basic/tasks/rados_python.yaml
+++ b/qa/suites/crimson-rados/basic/tasks/rados_python.yaml
@@ -17,4 +17,4 @@ tasks:
timeout: 1h
clients:
client.0:
- - rados/test_python.sh -m 'not (tier or ec or bench)'
+ - rados/test_python.sh -m 'not (wait or tier or ec)'
diff --git a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
index 0f6021975a4..50d170f5022 100644
--- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml
+++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
@@ -10,3 +10,4 @@ tasks:
osd:
debug monc: 20
flavor: crimson
+- ssh_keys:
diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml
index e84f396e4b2..481e393be4a 120000
--- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml
+++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml
@@ -1 +1 @@
-.qa/config/bluestore.yaml \ No newline at end of file
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml
index 6a70c381709..61e26e7acf8 120000
--- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml
+++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml
@@ -1 +1 @@
-.qa/config/seastore.yaml \ No newline at end of file
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml
index e84f396e4b2..481e393be4a 120000
--- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml
+++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml
@@ -1 +1 @@
-.qa/config/bluestore.yaml \ No newline at end of file
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml
index 6a70c381709..61e26e7acf8 120000
--- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml
+++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml
@@ -1 +1 @@
-.qa/config/seastore.yaml \ No newline at end of file
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore
deleted file mode 120000
index dbccf5ad928..00000000000
--- a/qa/suites/crimson-rados/singleton/objectstore
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/objectstore \ No newline at end of file
diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/crimson-rados/singleton/objectstore/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml
new file mode 120000
index 00000000000..481e393be4a
--- /dev/null
+++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml
@@ -0,0 +1 @@
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml
new file mode 120000
index 00000000000..61e26e7acf8
--- /dev/null
+++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml
@@ -0,0 +1 @@
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml
index abd86d7d986..abd86d7d986 120000
--- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled
+++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml
diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml
index e84f396e4b2..481e393be4a 120000
--- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml
+++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml
@@ -1 +1 @@
-.qa/config/bluestore.yaml \ No newline at end of file
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled
new file mode 120000
index 00000000000..61e26e7acf8
--- /dev/null
+++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled
@@ -0,0 +1 @@
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml
new file mode 120000
index 00000000000..abd86d7d986
--- /dev/null
+++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml
@@ -0,0 +1 @@
+.qa/overrides/short_pg_log.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml
index e84f396e4b2..481e393be4a 120000
--- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml
+++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml
@@ -1 +1 @@
-.qa/config/bluestore.yaml \ No newline at end of file
+.qa/config/crimson_bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml
index 6a70c381709..61e26e7acf8 120000
--- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml
+++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml
@@ -1 +1 @@
-.qa/config/seastore.yaml \ No newline at end of file
+.qa/config/crimson_seastore.yaml \ No newline at end of file
diff --git a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml b/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml
deleted file mode 120000
index 09cfdb59eda..00000000000
--- a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v1.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml \ No newline at end of file
diff --git a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml b/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml
deleted file mode 120000
index 5a4de14e7e0..00000000000
--- a/qa/suites/fs/functional/subvol_versions/create_subvol_version_v2.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml \ No newline at end of file
diff --git a/qa/suites/fs/functional/tasks/test_snap_schedule/% b/qa/suites/fs/functional/tasks/test_snap_schedule/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/fs/functional/tasks/test_snap_schedule/%
diff --git a/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/$
diff --git a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v1.yaml
index 120b2bf04be..120b2bf04be 100644
--- a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v1.yaml
+++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v1.yaml
diff --git a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v2.yaml
index c8bcf95c056..c8bcf95c056 100644
--- a/qa/cephfs/overrides/subvol_versions/create_subvol_version_v2.yaml
+++ b/qa/suites/fs/functional/tasks/test_snap_schedule/overrides/v2.yaml
diff --git a/qa/suites/fs/functional/tasks/snap-schedule.yaml b/qa/suites/fs/functional/tasks/test_snap_schedule/snap-schedule.yaml
index 26922abeda4..7d7f62f16a8 100644
--- a/qa/suites/fs/functional/tasks/snap-schedule.yaml
+++ b/qa/suites/fs/functional/tasks/test_snap_schedule/snap-schedule.yaml
@@ -15,6 +15,7 @@ overrides:
- is full \(reached quota
- POOL_FULL
- POOL_BACKFILLFULL
+ - cluster \[WRN\] evicting unresponsive client
tasks:
- cephfs_test_runner:
diff --git a/qa/suites/fs/functional/tasks/uninlining.yaml b/qa/suites/fs/functional/tasks/uninlining.yaml
new file mode 100644
index 00000000000..1c5da558b2a
--- /dev/null
+++ b/qa/suites/fs/functional/tasks/uninlining.yaml
@@ -0,0 +1,26 @@
+overrides:
+ ceph:
+ conf:
+ mgr:
+ debug mgr: 20
+ debug ms: 1
+ debug finisher: 20
+ debug client: 20
+ mds:
+ # to force replication without waiting for hit ratio to ramp up
+ # this helps with quicker testing against replicas
+ mds_bal_replicate_threshold: 1
+ log-whitelist:
+ - OSD full dropping all updates
+ - OSD near full
+ - pausewr flag
+ - failsafe engaged, dropping updates
+ - failsafe disengaged, no longer dropping
+ - is full \(reached quota
+ - POOL_FULL
+ - POOL_BACKFILLFULL
+
+tasks:
+ - cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_uninlining
diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml
index da841373220..42ca9336c8e 100644
--- a/qa/suites/fs/libcephfs/tasks/client.yaml
+++ b/qa/suites/fs/libcephfs/tasks/client.yaml
@@ -12,3 +12,4 @@ tasks:
clients:
client.0:
- client/test.sh
+ - client/test_oc_disabled.sh
diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml
index 55dde639c23..b7a0338566c 100644
--- a/qa/suites/fs/multifs/tasks/failover.yaml
+++ b/qa/suites/fs/multifs/tasks/failover.yaml
@@ -8,6 +8,7 @@ overrides:
- \(MDS_DAMAGE\)
- \(FS_DEGRADED\)
- \(MDS_CACHE_OVERSIZED\)
+ - \(MDS_ESTIMATED_REPLAY_TIME\)
ceph-fuse:
disabled: true
tasks:
diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml
index aa966bff214..2dd668c9f88 100644
--- a/qa/suites/fs/nfs/tasks/nfs.yaml
+++ b/qa/suites/fs/nfs/tasks/nfs.yaml
@@ -1,3 +1,10 @@
+overrides:
+ install:
+ extra_system_packages:
+ rpm:
+ - fio
+ deb:
+ - fio
tasks:
- cephfs_test_runner:
modules:
diff --git a/qa/suites/fs/thrash/workloads/overrides/% b/qa/suites/fs/thrash/workloads/overrides/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/fs/thrash/workloads/overrides/%
diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml
index 91b45367934..91b45367934 100644
--- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/no.yaml
+++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/no.yaml
diff --git a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml
index bd202f988c8..bd202f988c8 100644
--- a/qa/suites/fs/thrash/workloads/overrides/prefetch_dirfrags/yes.yaml
+++ b/qa/suites/fs/thrash/workloads/overrides/prefetch_oft_dirfrags/yes.yaml
diff --git a/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
index 713adb9628a..96e4353e99c 100644
--- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
+++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
@@ -2,3 +2,4 @@ overrides:
ceph:
log-ignorelist:
- OSD_DOWN
+ - osd.*is down
diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml
index fd8e5c9221e..4a5f54dc8c3 100644
--- a/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml
+++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/tasks/0-from/squid.yaml
@@ -11,8 +11,7 @@ tasks:
- cephadm:
image: quay.ceph.io/ceph-ci/ceph:squid
roleless: true
- cephadm_branch: squid
- cephadm_git_url: https://github.com/ceph/ceph
+ compiled_cephadm_branch: squid
conf:
osd:
#set config option for which cls modules are allowed to be loaded / used
diff --git a/qa/suites/fs/upgrade/nofs/kernel.yaml b/qa/suites/fs/upgrade/nofs/kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/upgrade/nofs/kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/upgrade/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml b/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml
new file mode 100644
index 00000000000..db0ec6db8b9
--- /dev/null
+++ b/qa/suites/fs/volumes/tasks/volumes/test/clone-progress.yaml
@@ -0,0 +1,5 @@
+tasks:
+ - cephfs_test_runner:
+ fail_on_skip: false
+ modules:
+ - tasks.cephfs.test_volumes.TestCloneProgressReporter
diff --git a/qa/suites/fs/workload/begin/3-kernel.yaml b/qa/suites/fs/workload/begin/3-kernel.yaml
new file mode 120000
index 00000000000..a7f7b735665
--- /dev/null
+++ b/qa/suites/fs/workload/begin/3-kernel.yaml
@@ -0,0 +1 @@
+.qa/cephfs/begin/3-kernel.yaml \ No newline at end of file
diff --git a/qa/suites/fs/workload/begin/3-modules.yaml b/qa/suites/fs/workload/begin/3-modules.yaml
deleted file mode 120000
index 1eba706a59d..00000000000
--- a/qa/suites/fs/workload/begin/3-modules.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/begin/3-modules.yaml \ No newline at end of file
diff --git a/qa/suites/fs/workload/tasks/3-snaps/yes.yaml b/qa/suites/fs/workload/tasks/3-snaps/yes.yaml
index dee81778942..51bbe2a3dbf 100644
--- a/qa/suites/fs/workload/tasks/3-snaps/yes.yaml
+++ b/qa/suites/fs/workload/tasks/3-snaps/yes.yaml
@@ -1,8 +1,3 @@
-mgrmodules:
- sequential:
- - exec:
- mon.a:
- - ceph mgr module enable snap_schedule
overrides:
ceph:
mgr-modules:
diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml
index 602d3416263..aa327b0cdf5 100644
--- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml
+++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml
@@ -5,6 +5,7 @@ overrides:
- "mds.dir_split"
tasks:
- workunit:
+ timeout: 5h
clients:
all:
- kernel_untar_build.sh
diff --git a/qa/suites/nvmeof/basic/base/install.yaml b/qa/suites/nvmeof/basic/base/install.yaml
index 64b754e4270..88974f0e638 100644
--- a/qa/suites/nvmeof/basic/base/install.yaml
+++ b/qa/suites/nvmeof/basic/base/install.yaml
@@ -3,8 +3,7 @@ tasks:
- install:
extra_packages:
- nvme-cli
-- cephadm:
- watchdog_setup:
+- cephadm:
- cephadm.shell:
host.a:
# get state before nvmeof deployment
diff --git a/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml
index 56e6cc0992a..7f20f9f04a8 100644
--- a/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml
+++ b/qa/suites/nvmeof/basic/clusters/4-gateways-2-initiator.yaml
@@ -1,21 +1,26 @@
roles:
+- - client.0
+- - client.1
- - host.a
- mon.a
- mgr.x
- osd.0
- - osd.1
- - client.0
+ - client.2
- ceph.nvmeof.nvmeof.a
- - host.b
- mon.b
+ - osd.1
+ - client.3
+ - ceph.nvmeof.nvmeof.b
+- - host.c
- mon.c
- osd.2
+ - client.4
+ - ceph.nvmeof.nvmeof.c
+- - host.d
- osd.3
- - osd.4
- - client.1
- - ceph.nvmeof.nvmeof.b
-- - client.2
-- - client.3
+ - client.5
+ - ceph.nvmeof.nvmeof.d
overrides:
ceph:
@@ -23,3 +28,5 @@ overrides:
mon:
# cephadm can take up to 5 minutes to bring up remaining mons
mon down mkfs grace: 300
+ log-ignorelist:
+ - NVMEOF_SINGLE_GATEWAY
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
index 1532c944452..0416ae2ea4e 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
@@ -1,23 +1,24 @@
+# runs on default nvmeof image (i.e. DEFAULT_NVMEOF_IMAGE)
tasks:
- nvmeof:
- client: client.0
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ installer: host.a
+ gw_image: default # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
- service: nvmeof.mypool
+ service: nvmeof.mypool.mygroup0
- workunit:
no_coverage_and_limits: true
clients:
- client.2:
- - rbd/nvmeof_setup_subsystem.sh
+ client.0:
+ - nvmeof/setup_subsystem.sh
env:
RBD_POOL: mypool
RBD_IMAGE_PREFIX: myimage
@@ -26,12 +27,12 @@ tasks:
no_coverage_and_limits: true
timeout: 30m
clients:
- client.2:
- - rbd/nvmeof_basic_tests.sh
- - rbd/nvmeof_fio_test.sh --start_ns 1 --end_ns 30 --rbd_iostat
- client.3:
- - rbd/nvmeof_basic_tests.sh
- - rbd/nvmeof_fio_test.sh --start_ns 31 --end_ns 60
+ client.0:
+ - nvmeof/basic_tests.sh
+ - nvmeof/fio_test.sh --start_ns 1 --end_ns 30 --rbd_iostat
+ client.1:
+ - nvmeof/basic_tests.sh
+ - nvmeof/fio_test.sh --start_ns 31 --end_ns 60
env:
RBD_POOL: mypool
IOSTAT_INTERVAL: '10'
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml
new file mode 100644
index 00000000000..8eb4f6dc63c
--- /dev/null
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml
@@ -0,0 +1,36 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 3
+ namespaces_count: 20
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+ create_mtls_secrets: true
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ timeout: 30m
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ - nvmeof/fio_test.sh --rbd_iostat
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
+ IOSTAT_INTERVAL: '10'
+ RUNTIME: '60'
+
+- workunit:
+ no_coverage_and_limits: true
+ timeout: 30m
+ clients:
+ client.0:
+ - nvmeof/mtls_test.sh
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
new file mode 100644
index 00000000000..dfe31380bb6
--- /dev/null
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
@@ -0,0 +1,39 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 3
+ namespaces_count: 20
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
+
+- workunit:
+ no_coverage_and_limits: true
+ timeout: 30m
+ clients:
+ client.0:
+ - nvmeof/fio_test.sh --rbd_iostat
+ client.1:
+ - nvmeof/basic_tests.sh
+ - nvmeof/namespace_test.sh
+ env:
+ RBD_POOL: mypool
+ IOSTAT_INTERVAL: '10'
+ RUNTIME: '120'
+ NEW_NAMESPACES_COUNT: '5'
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
new file mode 100644
index 00000000000..d66b6fc8093
--- /dev/null
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
@@ -0,0 +1,41 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 3
+ namespaces_count: 20
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ timeout: 30m
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ - nvmeof/fio_test.sh --rbd_iostat
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
+ IOSTAT_INTERVAL: '10'
+ RUNTIME: '60'
+
+- workunit:
+ no_coverage_and_limits: true
+ timeout: 30m
+ clients:
+ client.3:
+ - nvmeof/scalability_test.sh nvmeof.a,nvmeof.b
+ - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d
+ - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c
+ env:
+ SCALING_DELAYS: '50'
+ RBD_POOL: mypool
+ NVMEOF_GROUP: mygroup0
diff --git a/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml b/qa/suites/nvmeof/thrash/clusters/4-gateways-1-initiator.yaml
index afe0ed726fe..37c727ed37c 100644
--- a/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml
+++ b/qa/suites/nvmeof/thrash/clusters/4-gateways-1-initiator.yaml
@@ -1,26 +1,30 @@
roles:
+- - client.0 # initiator
- - host.a
- mon.a
- mgr.x
- osd.0
- osd.1
- - client.0
+ - client.1
- ceph.nvmeof.nvmeof.a
- - host.b
- mon.b
- osd.2
- osd.3
- osd.4
- - client.1
+ - client.2
- ceph.nvmeof.nvmeof.b
- - host.c
- mon.c
- osd.5
- osd.6
- osd.7
- - client.2
+ - client.3
- ceph.nvmeof.nvmeof.c
-- - client.3 # initiator
+- - host.d
+ - client.4
+ - ceph.nvmeof.nvmeof.d
+
overrides:
ceph:
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml
new file mode 100644
index 00000000000..83d54cdf5c3
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml
@@ -0,0 +1,37 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 10
+ namespaces_count: 90 # each subsystem
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- cephadm.exec:
+ host.a:
+ - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml
+ - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml
+ - "sed -i '/ pool: mypool/a\\ spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml"
+ - cat /tmp/nvmeof-no-huge-page.yaml
+ - ceph orch ls --refresh
+ - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml
+ - ceph orch redeploy nvmeof.mypool.mygroup0
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml
new file mode 100644
index 00000000000..0f7ac011a60
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml
@@ -0,0 +1,24 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 120
+ namespaces_count: 8 # each subsystem
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
deleted file mode 100644
index 3e5262f95df..00000000000
--- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-tasks:
-- nvmeof:
- client: client.0
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
- rbd:
- pool_name: mypool
- image_name_prefix: myimage
- gateway_config:
- subsystems_count: 3
- namespaces_count: 20 # each subsystem
- cli_image: quay.io/ceph/nvmeof-cli:1.2
-
-- cephadm.wait_for_service:
- service: nvmeof.mypool
-
-- workunit:
- no_coverage_and_limits: true
- clients:
- client.3:
- - rbd/nvmeof_setup_subsystem.sh
- - rbd/nvmeof_basic_tests.sh
- env:
- RBD_POOL: mypool
- RBD_IMAGE_PREFIX: myimage
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
index 4306de99e4d..46037784d31 100644
--- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
@@ -8,12 +8,16 @@ overrides:
- out of quorum
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
+ - NVMEOF_SINGLE_GATEWAY
+ - NVMEOF_GATEWAY_DOWN
+ - are in unavailable state
+ - is unavailable
- is in error state
- failed cephadm daemon
tasks:
- nvmeof.thrash:
- checker_host: 'client.3'
+ checker_host: 'client.0'
switch_thrashers: True
- mon_thrash:
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
index 0271e410f7c..b58dc14d87b 100644
--- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
@@ -3,9 +3,14 @@ overrides:
log-ignorelist:
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
+ - NVMEOF_SINGLE_GATEWAY
+ - NVMEOF_GATEWAY_DOWN
+ - are in unavailable state
+ - is unavailable
- is in error state
- failed cephadm daemon
tasks:
- nvmeof.thrash:
- checker_host: 'client.3'
+ checker_host: 'client.0'
+ randomize: False
diff --git a/qa/suites/nvmeof/thrash/workloads/fio.yaml b/qa/suites/nvmeof/thrash/workloads/fio.yaml
index fa7153d2ed9..f9a0d0ebde5 100644
--- a/qa/suites/nvmeof/thrash/workloads/fio.yaml
+++ b/qa/suites/nvmeof/thrash/workloads/fio.yaml
@@ -1,11 +1,11 @@
tasks:
- workunit:
no_coverage_and_limits: true
- timeout: 30m
+ timeout: 60m
clients:
- client.3:
- - rbd/nvmeof_fio_test.sh --rbd_iostat
+ client.0:
+ - nvmeof/fio_test.sh --random_devices 200
env:
RBD_POOL: mypool
IOSTAT_INTERVAL: '10'
- RUNTIME: '600'
+ RUNTIME: '1800'
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml
index e57b7763661..18f3ed374ea 100644
--- a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_basic.yaml
@@ -39,6 +39,7 @@ tasks:
ceph smb cluster create modusr1 user
--define-user-pass=user1%t3stP4ss1
--define-user-pass=user2%t3stP4ss2
+ --placement=count:1
- cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1
- cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2
# Wait for the smb service to start
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml
new file mode 100644
index 00000000000..3bbf30ea427
--- /dev/null
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml
@@ -0,0 +1,91 @@
+roles:
+# Test is for basic smb deployment & functionality. one node cluster is OK
+- - host.a
+ - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - client.0
+- - host.b
+ - mon.b
+ - osd.2
+ - osd.3
+- - host.c
+ - mon.c
+ - osd.4
+ - osd.5
+# Reserve a host for acting as a domain controller and smb client
+- - host.d
+ - cephadm.exclude
+overrides:
+ ceph:
+ log-only-match:
+ - CEPHADM_
+tasks:
+- cephadm.configure_samba_client_container:
+ role: host.d
+- vip:
+ count: 1
+- cephadm:
+
+- cephadm.shell:
+ host.a:
+ - ceph fs volume create cephfs
+- cephadm.wait_for_service:
+ service: mds.cephfs
+
+- cephadm.shell:
+ host.a:
+ # add subvolgroup & subvolumes for test
+ - cmd: ceph fs subvolumegroup create cephfs smb
+ - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777
+ - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777
+ # set up smb cluster and shares
+ - cmd: ceph mgr module enable smb
+ - cmd: sleep 30
+ - cmd: >
+ ceph smb cluster create modusr1 user
+ --define-user-pass=user1%t3stP4ss1
+ --placement=count:3
+ --clustering=default
+ --public_addrs={{VIP0}}/{{VIPPREFIXLEN}}
+ - cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1
+ - cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2
+# Wait for the smb service to start
+- cephadm.wait_for_service:
+ service: smb.modusr1
+
+# Check if shares exist
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+# verify CTDB is healthy, cluster well formed
+- cephadm.exec:
+ host.a:
+ - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.modusr1\")))[-1].name' > /tmp/svcname"
+ - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status"
+ - cat /tmp/ctdb_status
+ - grep 'pnn:0 .*OK' /tmp/ctdb_status
+ - grep 'pnn:1 .*OK' /tmp/ctdb_status
+ - grep 'pnn:2 .*OK' /tmp/ctdb_status
+ - grep 'Number of nodes:3' /tmp/ctdb_status
+ - rm -rf /tmp/svcname /tmp/ctdb_status
+
+# Test the assigned VIP
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share2 -c ls"
+
+- cephadm.shell:
+ host.a:
+ - cmd: ceph smb share rm modusr1 share2
+ - cmd: ceph smb share rm modusr1 share1
+ - cmd: ceph smb cluster rm modusr1
+# Wait for the smb service to be removed
+- cephadm.wait_for_service_not_present:
+ service: smb.modusr1
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml
new file mode 100644
index 00000000000..b9b0ec0d6f1
--- /dev/null
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_basic.yaml
@@ -0,0 +1,135 @@
+roles:
+# Test is for basic smb deployment & functionality. one node cluster is OK
+- - host.a
+ - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - client.0
+- - host.b
+ - mon.b
+ - osd.2
+ - osd.3
+- - host.c
+ - mon.c
+ - osd.4
+ - osd.5
+# Reserve a host for acting as an smb client
+- - host.d
+ - cephadm.exclude
+overrides:
+ ceph:
+ log-only-match:
+ - CEPHADM_
+tasks:
+- cephadm.configure_samba_client_container:
+ role: host.d
+- cephadm:
+
+- cephadm.shell:
+ host.a:
+ - ceph fs volume create cephfs
+- cephadm.wait_for_service:
+ service: mds.cephfs
+
+- cephadm.shell:
+ host.a:
+ # add subvolgroup & subvolumes for test
+ - cmd: ceph fs subvolumegroup create cephfs smb
+ - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777
+ - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777
+ # set up smb cluster and shares
+ - cmd: ceph mgr module enable smb
+ # TODO: replace sleep with poll of mgr state?
+ - cmd: sleep 30
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: uctdb1
+ auth_mode: user
+ user_group_settings:
+ - {source_type: resource, ref: ug1}
+ placement:
+ count: 3
+ - resource_type: ceph.smb.usersgroups
+ users_groups_id: ug1
+ values:
+ users:
+ - {name: user1, password: t3stP4ss1}
+ - {name: user2, password: t3stP4ss2}
+ groups: []
+ - resource_type: ceph.smb.share
+ cluster_id: uctdb1
+ share_id: share1
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv1
+ path: /
+ - resource_type: ceph.smb.share
+ cluster_id: uctdb1
+ share_id: share2
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv2
+ path: /
+ # --- End Embedded YAML
+# Wait for the smb service to start
+- cephadm.wait_for_service:
+ service: smb.uctdb1
+# Since this is a true cluster there should be a clustermeta in rados
+- cephadm.shell:
+ host.a:
+ - cmd: rados --pool=.smb -N uctdb1 get cluster.meta.json /dev/stdout
+
+# Check if shares exist
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user2%t3stP4ss2 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+# verify CTDB is healthy, cluster well formed
+- cephadm.exec:
+ host.a:
+ - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.uctdb1\")))[-1].name' > /tmp/svcname"
+ - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status"
+ - cat /tmp/ctdb_status
+ - grep 'pnn:0 .*OK' /tmp/ctdb_status
+ - grep 'pnn:1 .*OK' /tmp/ctdb_status
+ - grep 'pnn:2 .*OK' /tmp/ctdb_status
+ - grep 'Number of nodes:3' /tmp/ctdb_status
+ - rm -rf /tmp/svcname /tmp/ctdb_status
+
+# Test a different host in the cluster
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.c'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user2%t3stP4ss2 //{{'host.c'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+- cephadm.shell:
+ host.a:
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: uctdb1
+ intent: removed
+ - resource_type: ceph.smb.usersgroups
+ users_groups_id: ug1
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: uctdb1
+ share_id: share1
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: uctdb1
+ share_id: share2
+ intent: removed
+ # --- End Embedded YAML
+# Wait for the smb service to be removed
+- cephadm.wait_for_service_not_present:
+ service: smb.uctdb1
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml
new file mode 100644
index 00000000000..b74593058e2
--- /dev/null
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_dom.yaml
@@ -0,0 +1,138 @@
+roles:
+# Test is for basic smb deployment & functionality. one node cluster is OK
+- - host.a
+ - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - client.0
+- - host.b
+ - mon.b
+ - osd.2
+ - osd.3
+- - host.c
+ - mon.c
+ - osd.4
+ - osd.5
+# Reserve a host for acting as a domain controller and smb client
+- - host.d
+ - cephadm.exclude
+overrides:
+ ceph:
+ log-only-match:
+ - CEPHADM_
+tasks:
+- cephadm.deploy_samba_ad_dc:
+ role: host.d
+- cephadm:
+
+- cephadm.shell:
+ host.a:
+ - ceph fs volume create cephfs
+- cephadm.wait_for_service:
+ service: mds.cephfs
+
+- cephadm.shell:
+ host.a:
+ # add subvolgroup & subvolumes for test
+ - cmd: ceph fs subvolumegroup create cephfs smb
+ - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777
+ - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777
+ # set up smb cluster and shares
+ - cmd: ceph mgr module enable smb
+ # TODO: replace sleep with poll of mgr state?
+ - cmd: sleep 30
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: adctdb1
+ auth_mode: active-directory
+ domain_settings:
+ realm: DOMAIN1.SINK.TEST
+ join_sources:
+ - source_type: resource
+ ref: join1-admin
+ custom_dns:
+ - "{{ctx.samba_ad_dc_ip}}"
+ placement:
+ count: 3
+ - resource_type: ceph.smb.join.auth
+ auth_id: join1-admin
+ auth:
+ username: Administrator
+ password: Passw0rd
+ - resource_type: ceph.smb.share
+ cluster_id: adctdb1
+ share_id: share1
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv1
+ path: /
+ - resource_type: ceph.smb.share
+ cluster_id: adctdb1
+ share_id: share2
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv2
+ path: /
+ # --- End Embedded YAML
+# Wait for the smb service to start
+- cephadm.wait_for_service:
+ service: smb.adctdb1
+# Since this is a true cluster there should be a clustermeta in rados
+- cephadm.shell:
+ host.a:
+ - cmd: rados --pool=.smb -N adctdb1 get cluster.meta.json /dev/stdout
+
+# Check if shares exist
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+# verify CTDB is healthy, cluster well formed
+- cephadm.exec:
+ host.a:
+ - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.adctdb1\")))[-1].name' > /tmp/svcname"
+ - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status"
+ - cat /tmp/ctdb_status
+ - grep 'pnn:0 .*OK' /tmp/ctdb_status
+ - grep 'pnn:1 .*OK' /tmp/ctdb_status
+ - grep 'pnn:2 .*OK' /tmp/ctdb_status
+ - grep 'Number of nodes:3' /tmp/ctdb_status
+ - rm -rf /tmp/svcname /tmp/ctdb_status
+
+# Test a different host in the cluster
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.c'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.c'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+- cephadm.shell:
+ host.a:
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: adctdb1
+ intent: removed
+ - resource_type: ceph.smb.join.auth
+ auth_id: join1-admin
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: adctdb1
+ share_id: share1
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: adctdb1
+ share_id: share2
+ intent: removed
+ # --- End Embedded YAML
+# Wait for the smb service to be removed
+- cephadm.wait_for_service_not_present:
+ service: smb.adctdb1
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml
new file mode 100644
index 00000000000..0aa55a53a3d
--- /dev/null
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_ctdb_res_ips.yaml
@@ -0,0 +1,145 @@
+roles:
+# Test is for basic smb deployment & functionality. one node cluster is OK
+- - host.a
+ - mon.a
+ - mgr.x
+ - osd.0
+ - osd.1
+ - client.0
+- - host.b
+ - mon.b
+ - osd.2
+ - osd.3
+- - host.c
+ - mon.c
+ - osd.4
+ - osd.5
+# Reserve a host for acting as a domain controller and smb client
+- - host.d
+ - cephadm.exclude
+overrides:
+ ceph:
+ log-only-match:
+ - CEPHADM_
+tasks:
+- cephadm.deploy_samba_ad_dc:
+ role: host.d
+- vip:
+ count: 2
+- cephadm:
+
+- cephadm.shell:
+ host.a:
+ - ceph fs volume create cephfs
+- cephadm.wait_for_service:
+ service: mds.cephfs
+
+- cephadm.shell:
+ host.a:
+ # add subvolgroup & subvolumes for test
+ - cmd: ceph fs subvolumegroup create cephfs smb
+ - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777
+ - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777
+ # set up smb cluster and shares
+ - cmd: ceph mgr module enable smb
+ # TODO: replace sleep with poll of mgr state?
+ - cmd: sleep 30
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: adipctdb
+ auth_mode: active-directory
+ domain_settings:
+ realm: DOMAIN1.SINK.TEST
+ join_sources:
+ - source_type: resource
+ ref: join1-admin
+ custom_dns:
+ - "{{ctx.samba_ad_dc_ip}}"
+ public_addrs:
+ - address: {{VIP0}}/{{VIPPREFIXLEN}}
+ - address: {{VIP1}}/{{VIPPREFIXLEN}}
+ placement:
+ count: 3
+ - resource_type: ceph.smb.join.auth
+ auth_id: join1-admin
+ auth:
+ username: Administrator
+ password: Passw0rd
+ - resource_type: ceph.smb.share
+ cluster_id: adipctdb
+ share_id: share1
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv1
+ path: /
+ - resource_type: ceph.smb.share
+ cluster_id: adipctdb
+ share_id: share2
+ cephfs:
+ volume: cephfs
+ subvolumegroup: smb
+ subvolume: sv2
+ path: /
+ # --- End Embedded YAML
+# Wait for the smb service to start
+- cephadm.wait_for_service:
+ service: smb.adipctdb
+# Since this is a true cluster there should be a clustermeta in rados
+- cephadm.shell:
+ host.a:
+ - cmd: rados --pool=.smb -N adipctdb get cluster.meta.json /dev/stdout
+
+# Check if shares exist
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls"
+
+# verify CTDB is healthy, cluster well formed
+- cephadm.exec:
+ host.a:
+ - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.adipctdb\")))[-1].name' > /tmp/svcname"
+ - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status"
+ - cat /tmp/ctdb_status
+ - grep 'pnn:0 .*OK' /tmp/ctdb_status
+ - grep 'pnn:1 .*OK' /tmp/ctdb_status
+ - grep 'pnn:2 .*OK' /tmp/ctdb_status
+ - grep 'Number of nodes:3' /tmp/ctdb_status
+ - rm -rf /tmp/svcname /tmp/ctdb_status
+
+# Test the two assigned VIPs
+- cephadm.exec:
+ host.d:
+ - sleep 30
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP0}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP1}}/share1 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP0}}/share2 -c ls"
+ - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U DOMAIN1\\\\ckent%1115Rose. //{{VIP1}}/share2 -c ls"
+
+- cephadm.shell:
+ host.a:
+ - cmd: ceph smb apply -i -
+ stdin: |
+ # --- Begin Embedded YAML
+ - resource_type: ceph.smb.cluster
+ cluster_id: adipctdb
+ intent: removed
+ - resource_type: ceph.smb.join.auth
+ auth_id: join1-admin
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: adipctdb
+ share_id: share1
+ intent: removed
+ - resource_type: ceph.smb.share
+ cluster_id: adipctdb
+ share_id: share2
+ intent: removed
+ # --- End Embedded YAML
+# Wait for the smb service to be removed
+- cephadm.wait_for_service_not_present:
+ service: smb.adipctdb
diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml
index ce08d40bb58..f07c298c9fc 100644
--- a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml
+++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_domain.yaml
@@ -40,6 +40,7 @@ tasks:
--domain-realm=domain1.sink.test
--domain-join-user-pass=Administrator%Passw0rd
--custom-dns={{ctx.samba_ad_dc_ip}}
+ --placement=count:1
- cmd: ceph smb share create modtest1 share1 cephfs / --subvolume=smb/sv1
- cmd: ceph smb share create modtest1 share2 cephfs / --subvolume=smb/sv2
# Wait for the smb service to start
diff --git a/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml b/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml
index 4c5e267408b..8509fcc14e3 100644
--- a/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml
+++ b/qa/suites/orch/cephadm/smoke-roleless/2-services/nvmeof.yaml
@@ -3,6 +3,6 @@ tasks:
host.a:
- ceph osd pool create foo
- rbd pool init foo
- - ceph orch apply nvmeof foo
+ - ceph orch apply nvmeof foo default
- cephadm.wait_for_service:
- service: nvmeof.foo
+ service: nvmeof.foo.default
diff --git a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
index 0080d3bf730..c6bec082843 100644
--- a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
+++ b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
@@ -131,8 +131,10 @@ tasks:
- ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo
- while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done
- ceph orch ps
+ - ceph versions
# verify all rgw daemons on same version and version hash matches what we are upgrading to
- - ceph versions | jq -e '.rgw | length == 1'
+ # `ceph versions` might not get updated immediately for rgw so retry this
+ - time timeout 60 bash -c "until ceph versions | jq -e '.rgw | length == 1'; do sleep 2; done"
- ceph versions | jq -e '.rgw | keys' | grep $sha1
- ceph orch upgrade status
- ceph health detail
diff --git a/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml b/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml
index 74acebd7037..8c56e41756a 100644
--- a/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml
+++ b/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/test_iscsi_container.yaml
@@ -25,3 +25,4 @@ tasks:
client.0:
- cephadm/test_iscsi_pids_limit.sh
- cephadm/test_iscsi_etc_hosts.sh
+ - cephadm/test_iscsi_setup.sh
diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
new file mode 100644
index 00000000000..5207fd415b7
--- /dev/null
+++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
@@ -0,0 +1,77 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - CEPHADM_FAILED_DAEMON
+ log-only-match:
+ - CEPHADM_
+roles:
+- - host.a
+ - mon.a
+ - mgr.a
+ - osd.0
+- - host.b
+ - mon.b
+ - mgr.b
+ - osd.1
+- - host.c
+ - mon.c
+ - osd.2
+tasks:
+- install:
+- cephadm:
+- cephadm.shell:
+ host.c:
+ - |
+ set -ex
+ # Deploy monitoring stack
+ ceph orch apply node-exporter
+ ceph orch apply grafana
+ ceph orch apply alertmanager
+ ceph orch apply prometheus
+ sleep 240
+ # generate SSL certificate
+ openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*"
+ # Generate a mgmt.spec template
+ cat << EOT > /tmp/mgmt.spec
+ service_type: mgmt-gateway
+ service_id: foo
+ placement:
+ hosts:
+ - ${HOSTNAME}
+ spec:
+ ssl_protocols:
+ - TLSv1.2
+ - TLSv1.3
+ ssl_ciphers:
+ - AES128-SHA
+ - AES256-SHA
+ enable_health_check_endpoint: True
+ EOT
+ # Add generated certificates to spec file
+ echo " ssl_certificate: |" >> /tmp/mgmt.spec
+ while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
+ echo " ssl_certificate_key: |" >> /tmp/mgmt.spec
+ while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec
+ # Apply spec
+ ceph orch apply -i /tmp/mgmt.spec
+- cephadm.wait_for_service:
+ service: mgmt-gateway
+- cephadm.shell:
+ host.a:
+ - |
+ set -ex
+ # retrieve mgmt hostname and ip
+ MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
+ MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
+ # check mgmt-gateway health
+ curl -k -s https://${MGMT_GTW_IP}/health
+ curl -k -s https://${MGMT_GTW_IP}:29443/health
+ # wait for background services to be reconfigured following mgmt-gateway installation
+ sleep 180
+ # check grafana endpoints are responsive and database health is okay
+ curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
+ # check prometheus endpoints are responsive
+ curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
+ # check alertmanager endpoints are responsive
+ curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
+
diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
index 89733dabead..515293ea83a 100644
--- a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
+++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
@@ -61,6 +61,6 @@ tasks:
curl -s http://${PROM_IP}:9095/api/v1/alerts
curl -s http://${PROM_IP}:9095/api/v1/alerts | jq -e '.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"'
# check alertmanager endpoints are responsive and mon down alert is active
- curl -s http://${ALERTM_IP}:9093/api/v1/status
- curl -s http://${ALERTM_IP}:9093/api/v1/alerts
- curl -s http://${ALERTM_IP}:9093/api/v1/alerts | jq -e '.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"'
+ curl -s http://${ALERTM_IP}:9093/api/v2/status
+ curl -s http://${ALERTM_IP}:9093/api/v2/alerts
+ curl -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e '.[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"'
diff --git a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml
index 1eb4a184dca..e2a2ca03cc9 100644
--- a/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml
+++ b/qa/suites/rados/mgr/tasks/4-units/module_selftest.yaml
@@ -6,7 +6,6 @@ overrides:
- objects misplaced
- Synthetic exception in serve
- influxdb python module not found
- - \(MGR_ZABBIX_
- foo bar
- Failed to open Telegraf
- evicting unresponsive client
diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml
index 372bf2561fa..8b3c4c11ac6 100644
--- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml
+++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml
@@ -15,6 +15,7 @@ overrides:
# causing tests to fail due to health warns, even if
# the tests themselves are successful.
- \(OSDMAP_FLAGS\)
+ - \(PG_DEGRADED\)
tasks:
- workunit:
clients:
diff --git a/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml
new file mode 100644
index 00000000000..7cd47898544
--- /dev/null
+++ b/qa/suites/rados/objectstore/backends/ceph_test_bluefs.yaml
@@ -0,0 +1,8 @@
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, client.0]
+tasks:
+- install:
+- exec:
+ client.0:
+ - mkdir $TESTDIR/ceph_test_bluefs && cd $TESTDIR/ceph_test_bluefs && ceph_test_bluefs --log-file $TESTDIR/archive/ceph_test_bluefs.log --debug-bluefs 5/20 --gtest_catch_exceptions=0
+ - rm -rf $TESTDIR/ceph_test_bluefs
diff --git a/qa/suites/rados/rest/mgr-restful.yaml b/qa/suites/rados/rest/mgr-restful.yaml
deleted file mode 100644
index 4901f401d30..00000000000
--- a/qa/suites/rados/rest/mgr-restful.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-openstack:
-- volumes: # attached to each instance
- count: 3
- size: 10 # GB
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, mds.a, client.a]
-tasks:
-- install:
-- ceph:
- log-ignorelist:
- - overall HEALTH_
- - \(MGR_DOWN\)
- - \(PG_
- - \(OSD_
- - \(OBJECT_
- - \(OSDMAP_FLAGS\)
- - \(POOL_APP_NOT_ENABLED\)
-- exec:
- mon.a:
- - ceph restful create-key admin
- - ceph restful create-self-signed-cert
- - ceph restful restart
-- workunit:
- clients:
- client.a:
- - rest/test-restful.sh
-- exec:
- mon.a:
- - ceph restful delete-key admin
- - ceph restful list-keys | jq ".admin" | grep null
-
diff --git a/qa/suites/rados/rest/supported-random-distro$ b/qa/suites/rados/rest/supported-random-distro$
deleted file mode 120000
index 7cef21eeffd..00000000000
--- a/qa/suites/rados/rest/supported-random-distro$
+++ /dev/null
@@ -1 +0,0 @@
-../basic/supported-random-distro$ \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
index 66cf2bc7593..58e253bf6f4 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-bitmap.yaml \ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
index da2e2598c33..d694c94945f 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-lz4.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-lz4.yaml \ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-comp-lz4.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
index f75b0e1b48e..d7defabaa3c 120000
--- a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp-snappy.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-comp-snappy.yaml \ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-comp-snappy.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton/all/mon-connection-score.yaml b/qa/suites/rados/singleton/all/mon-connection-score.yaml
new file mode 100644
index 00000000000..f9e0ba3452d
--- /dev/null
+++ b/qa/suites/rados/singleton/all/mon-connection-score.yaml
@@ -0,0 +1,40 @@
+roles:
+- - mon.a
+ - mon.b
+ - mon.c
+ - osd.0
+ - osd.1
+ - osd.2
+ - mgr.x
+ - client.0
+
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+tasks:
+- install:
+- ceph:
+ pre-mgr-commands:
+ - sudo ceph config set mgr mgr_pool false --force
+ log-ignorelist:
+ - overall HEALTH_
+ - \(OSDMAP_FLAGS\)
+ - \(OSD_
+ - \(PG_
+ - \(POOL_
+ - \(CACHE_POOL_
+ - \(OBJECT_
+ - \(SLOW_OPS\)
+ - \(REQUEST_SLOW\)
+ - \(TOO_FEW_PGS\)
+ - slow request
+ - \(POOL_APP_NOT_ENABLED\)
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(MON_DOWN\)
+ - \(PG_AVAILABILITY\)
+ - \(SLOW_OPS\)
+- cephfs_test_runner:
+ modules:
+ - tasks.mon_connection_score \ No newline at end of file
diff --git a/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
new file mode 100644
index 00000000000..69a54b0f1b7
--- /dev/null
+++ b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
@@ -0,0 +1,57 @@
+roles:
+- - mon.a
+ - mon.b
+ - mgr.a
+ - mgr.b
+ - osd.0
+ - osd.1
+ - osd.2
+ - osd.3
+- - mon.c
+ - mon.d
+ - mgr.c
+ - mgr.d
+ - osd.4
+ - osd.5
+ - osd.6
+ - osd.7
+- - mon.e
+- - client.0
+
+openstack:
+ - volumes: # attached to each instance
+ count: 3
+ size: 10 # GB
+overrides:
+ ceph:
+ conf:
+ global:
+ osd pool default size: 3
+ osd pool default min size: 2
+ mon:
+ debug mon: 30
+tasks:
+- install:
+- ceph:
+ pre-mgr-commands:
+ - sudo ceph config set mgr mgr_pool false --force
+ log-ignorelist:
+ - \(POOL_
+ - \(CACHE_POOL_
+ - overall HEALTH_
+ - \(PG_AVAILABILITY\)
+ - Reduced data availability
+ - \(PG_DEGRADED\)
+ - \(MON_DOWN\)
+ - \(OSD_DATACENTER_DOWN\)
+ - \(OSD_DOWN\)
+ - \(OSD_HOST_DOWN\)
+
+
+- workunit:
+ clients:
+ client.0:
+ - mon/mon-stretch-mode-5-mons-8-osds.sh
+- cephfs_test_runner:
+ modules:
+ - tasks.stretch_mode_disable_enable
diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml
index 635085f7fc8..08070caa387 120000
--- a/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml
+++ b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml
@@ -1 +1 @@
-../thrash-erasure-code/objectstore/bluestore-bitmap.yaml \ No newline at end of file
+../thrash-erasure-code/objectstore/bluestore/bluestore-bitmap.yaml \ No newline at end of file
diff --git a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml
index a8bbbafece0..b916bed1475 100644
--- a/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml
+++ b/qa/suites/rados/thrash-old-clients/workloads/rbd_cls.yaml
@@ -2,6 +2,9 @@ meta:
- desc: |
rbd object class functional tests
tasks:
-- exec:
- client.2:
- - ceph_test_cls_rbd --gtest_filter=-TestClsRbd.get_features:TestClsRbd.parents:TestClsRbd.mirror
+- workunit:
+ clients:
+ client.2:
+ - cls/test_cls_rbd.sh
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
diff --git a/qa/suites/rados/valgrind-leaks/1-start.yaml b/qa/suites/rados/valgrind-leaks/1-start.yaml
index 1cdd8a688e8..cc8c8e53766 100644
--- a/qa/suites/rados/valgrind-leaks/1-start.yaml
+++ b/qa/suites/rados/valgrind-leaks/1-start.yaml
@@ -12,6 +12,7 @@ overrides:
- overall HEALTH_
- \(PG_
- \(POOL_APP_NOT_ENABLED\)
+ - OSD bench result
conf:
global:
osd heartbeat grace: 40
diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml
index c70893893fd..17cf141b0cd 100644
--- a/qa/suites/rados/verify/validater/valgrind.yaml
+++ b/qa/suites/rados/verify/validater/valgrind.yaml
@@ -26,6 +26,8 @@ overrides:
- \(MON_DOWN\)
- \(SLOW_OPS\)
- slow request
+ - OSD bench result
+ - OSD_DOWN
valgrind:
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
osd: [--tool=memcheck]
diff --git a/qa/suites/rbd/iscsi/0-single-container-host.yaml b/qa/suites/rbd/iscsi/0-single-container-host.yaml
deleted file mode 120000
index 7406e749cf5..00000000000
--- a/qa/suites/rbd/iscsi/0-single-container-host.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/single-container-host.yaml \ No newline at end of file
diff --git a/qa/suites/rbd/iscsi/base/install.yaml b/qa/suites/rbd/iscsi/base/install.yaml
index 5c5a6c31f60..cca178cafe8 100644
--- a/qa/suites/rbd/iscsi/base/install.yaml
+++ b/qa/suites/rbd/iscsi/base/install.yaml
@@ -9,6 +9,10 @@ tasks:
- ceph orch host ls
- ceph orch device ls
- install:
- extra_packages:
+ extra_system_packages:
+ deb:
+ - open-iscsi
+ - multipath-tools
+ rpm:
- iscsi-initiator-utils
- device-mapper-multipath
diff --git a/qa/suites/rbd/iscsi/supported-container-hosts$ b/qa/suites/rbd/iscsi/supported-container-hosts$
new file mode 120000
index 00000000000..30a61f1575f
--- /dev/null
+++ b/qa/suites/rbd/iscsi/supported-container-hosts$
@@ -0,0 +1 @@
+.qa/distros/supported-container-hosts/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/% b/qa/suites/rbd/migration-external/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rbd/migration-external/%
diff --git a/qa/suites/rbd/migration-external/.qa b/qa/suites/rbd/migration-external/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/1-base/.qa b/qa/suites/rbd/migration-external/1-base/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/1-base/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/1-base/install.yaml b/qa/suites/rbd/migration-external/1-base/install.yaml
new file mode 100644
index 00000000000..0728d3f206a
--- /dev/null
+++ b/qa/suites/rbd/migration-external/1-base/install.yaml
@@ -0,0 +1,8 @@
+meta:
+- desc: run two ceph clusters
+tasks:
+- install:
+- ceph:
+ cluster: cluster1
+- ceph:
+ cluster: cluster2
diff --git a/qa/suites/rbd/migration-external/2-clusters/.qa b/qa/suites/rbd/migration-external/2-clusters/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/2-clusters/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/2-clusters/2-node.yaml b/qa/suites/rbd/migration-external/2-clusters/2-node.yaml
new file mode 100644
index 00000000000..848e63055e9
--- /dev/null
+++ b/qa/suites/rbd/migration-external/2-clusters/2-node.yaml
@@ -0,0 +1,15 @@
+meta:
+- desc: 2 ceph clusters with 1 mon and 3 osds each
+roles:
+- - cluster1.mon.a
+ - cluster1.mgr.x
+ - cluster1.osd.0
+ - cluster1.osd.1
+ - cluster1.osd.2
+ - cluster1.client.0
+- - cluster2.mon.a
+ - cluster2.mgr.x
+ - cluster2.osd.0
+ - cluster2.osd.1
+ - cluster2.osd.2
+ - cluster2.client.0
diff --git a/qa/suites/rbd/migration-external/3-objectstore b/qa/suites/rbd/migration-external/3-objectstore
new file mode 120000
index 00000000000..c40bd326145
--- /dev/null
+++ b/qa/suites/rbd/migration-external/3-objectstore
@@ -0,0 +1 @@
+.qa/objectstore \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/4-supported-random-distro$ b/qa/suites/rbd/migration-external/4-supported-random-distro$
new file mode 120000
index 00000000000..0862b4457b3
--- /dev/null
+++ b/qa/suites/rbd/migration-external/4-supported-random-distro$
@@ -0,0 +1 @@
+.qa/distros/supported-random-distro$ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/5-data-pool/.qa b/qa/suites/rbd/migration-external/5-data-pool/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/5-data-pool/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/5-data-pool/ec.yaml b/qa/suites/rbd/migration-external/5-data-pool/ec.yaml
new file mode 100644
index 00000000000..f8a39979f97
--- /dev/null
+++ b/qa/suites/rbd/migration-external/5-data-pool/ec.yaml
@@ -0,0 +1,29 @@
+tasks:
+- exec:
+ cluster1.client.0:
+ - sudo ceph --cluster cluster1 osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
+ - sudo ceph --cluster cluster1 osd pool create datapool 4 4 erasure teuthologyprofile
+ - sudo ceph --cluster cluster1 osd pool set datapool allow_ec_overwrites true
+ - rbd --cluster cluster1 pool init datapool
+ cluster2.client.0:
+ - sudo ceph --cluster cluster2 osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2
+ - sudo ceph --cluster cluster2 osd pool create datapool 4 4 erasure teuthologyprofile
+ - sudo ceph --cluster cluster2 osd pool set datapool allow_ec_overwrites true
+ - rbd --cluster cluster2 pool init datapool
+
+overrides:
+ thrashosds:
+ bdev_inject_crash: 2
+ bdev_inject_crash_probability: .5
+ ceph:
+ fs: xfs
+ conf:
+ client:
+ rbd default data pool: datapool
+ osd: # force bluestore since it's required for ec overwrites
+ osd objectstore: bluestore
+ bluestore block size: 96636764160
+ enable experimental unrecoverable data corrupting features: "*"
+ osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
diff --git a/qa/suites/rbd/migration-external/5-data-pool/none.yaml b/qa/suites/rbd/migration-external/5-data-pool/none.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rbd/migration-external/5-data-pool/none.yaml
diff --git a/qa/suites/rbd/migration-external/5-data-pool/replicated.yaml b/qa/suites/rbd/migration-external/5-data-pool/replicated.yaml
new file mode 100644
index 00000000000..3ecbaf8c127
--- /dev/null
+++ b/qa/suites/rbd/migration-external/5-data-pool/replicated.yaml
@@ -0,0 +1,14 @@
+tasks:
+- exec:
+ cluster1.client.0:
+ - sudo ceph --cluster cluster1 osd pool create datapool 4
+ - rbd --cluster cluster1 pool init datapool
+ cluster2.client.0:
+ - sudo ceph --cluster cluster2 osd pool create datapool 4
+ - rbd --cluster cluster2 pool init datapool
+
+overrides:
+ ceph:
+ conf:
+ client:
+ rbd default data pool: datapool
diff --git a/qa/suites/rbd/migration-external/6-prepare/.qa b/qa/suites/rbd/migration-external/6-prepare/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/6-prepare/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml b/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml
new file mode 100644
index 00000000000..2ca92dccfde
--- /dev/null
+++ b/qa/suites/rbd/migration-external/6-prepare/native-clone.yaml
@@ -0,0 +1,29 @@
+tasks:
+ - exec:
+ cluster2.client.0:
+ - echo '{"type":"qcow","stream":{"type":"http","url":"http://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd --cluster cluster2 migration prepare --import-only --source-spec-path - client.0.0-src
+ - rbd --cluster cluster2 migration execute client.0.0-src
+ - rbd --cluster cluster2 migration commit client.0.0-src
+ - rbd --cluster cluster2 snap create client.0.0-src@snap
+ - rbd --cluster cluster2 snap protect client.0.0-src@snap
+ - rbd --cluster cluster2 clone client.0.0-src@snap client.0.0
+ - rbd --cluster cluster2 snap create client.0.0@snap
+ - rbd --cluster cluster2 create --size 1G client.0.1-src
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.1-src
+ - rbd --cluster cluster2 snap create client.0.1-src@snap
+ - rbd --cluster cluster2 snap protect client.0.1-src@snap
+ - rbd --cluster cluster2 clone client.0.1-src@snap client.0.1
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.1
+ - rbd --cluster cluster2 snap create client.0.1@snap
+ - rbd --cluster cluster2 create --size 1G client.0.2-src
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 1M client.0.2-src
+ - rbd --cluster cluster2 snap create client.0.2-src@snap
+ - rbd --cluster cluster2 snap protect client.0.2-src@snap
+ - rbd --cluster cluster2 clone client.0.2-src@snap client.0.2
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.2
+ - rbd --cluster cluster2 snap create client.0.2@snap
+ - exec:
+ cluster1.client.0:
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.0","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.0
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.1","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.1
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.2","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.2
diff --git a/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml b/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml
new file mode 100644
index 00000000000..5fdf4d35c26
--- /dev/null
+++ b/qa/suites/rbd/migration-external/6-prepare/native-standalone.yaml
@@ -0,0 +1,18 @@
+tasks:
+ - exec:
+ cluster2.client.0:
+ - echo '{"type":"qcow","stream":{"type":"http","url":"http://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd --cluster cluster2 migration prepare --import-only --source-spec-path - client.0.0
+ - rbd --cluster cluster2 migration execute client.0.0
+ - rbd --cluster cluster2 migration commit client.0.0
+ - rbd --cluster cluster2 snap create client.0.0@snap
+ - rbd --cluster cluster2 create --size 1G client.0.1
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.1
+ - rbd --cluster cluster2 snap create client.0.1@snap
+ - rbd --cluster cluster2 create --size 1G client.0.2
+ - rbd --cluster cluster2 bench --io-type write --io-pattern rand --io-size 16K --io-threads 1 --io-total 2M client.0.2
+ - rbd --cluster cluster2 snap create client.0.2@snap
+ - exec:
+ cluster1.client.0:
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.0","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.0
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.1","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.1
+ - echo '{"type":"native","cluster_name":"cluster2","client_name":"client.admin","pool_name":"rbd","image_name":"client.0.2","snap_name":"snap"}' | rbd --cluster cluster1 migration prepare --import-only --source-spec-path - client.0.2
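Both prepare blocks above boil down to piping a "native" source-spec JSON document into rbd migration prepare --import-only --source-spec-path - on the destination cluster. A hedged sketch of that step in Python; the cluster, pool, image and snapshot names mirror the yaml and are assumptions, not fixed values:

import json
import subprocess

def prepare_native_import(dst_cluster, src_cluster, pool, image, snap, dst_image):
    # Build the same source-spec the yaml echoes, then feed it on stdin.
    spec = {
        "type": "native",
        "cluster_name": src_cluster,
        "client_name": "client.admin",
        "pool_name": pool,
        "image_name": image,
        "snap_name": snap,
    }
    subprocess.run(
        ["rbd", "--cluster", dst_cluster, "migration", "prepare",
         "--import-only", "--source-spec-path", "-", dst_image],
        input=json.dumps(spec), text=True, check=True)

# e.g. prepare_native_import("cluster1", "cluster2", "rbd", "client.0.0", "snap", "client.0.0")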
diff --git a/qa/suites/rbd/migration-external/7-io-workloads/.qa b/qa/suites/rbd/migration-external/7-io-workloads/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/7-io-workloads/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/7-io-workloads/qemu_xfstests.yaml b/qa/suites/rbd/migration-external/7-io-workloads/qemu_xfstests.yaml
new file mode 100644
index 00000000000..c44011f0837
--- /dev/null
+++ b/qa/suites/rbd/migration-external/7-io-workloads/qemu_xfstests.yaml
@@ -0,0 +1,14 @@
+io_workload:
+ sequential:
+ - qemu:
+ cluster1.client.0:
+ type: block
+ disks:
+ - action: none
+ image_name: client.0.0
+ - action: none
+ image_name: client.0.1
+ - action: none
+ image_name: client.0.2
+ test: qa/run_xfstests_qemu.sh
+exclude_arch: armv7l
diff --git a/qa/suites/rbd/migration-external/8-migrate-workloads/.qa b/qa/suites/rbd/migration-external/8-migrate-workloads/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rbd/migration-external/8-migrate-workloads/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml b/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml
new file mode 100644
index 00000000000..d0afe7175a1
--- /dev/null
+++ b/qa/suites/rbd/migration-external/8-migrate-workloads/execute.yaml
@@ -0,0 +1,14 @@
+tasks:
+ - parallel:
+ - io_workload
+ - migrate_workload
+migrate_workload:
+ sequential:
+ - exec:
+ cluster1.client.0:
+ - sleep $((RANDOM % 600))
+ - rbd --cluster cluster1 migration execute client.0.0
+ - sleep $((RANDOM % 600))
+ - rbd --cluster cluster1 migration commit client.0.0
+ - sleep $((RANDOM % 600))
+ - rbd --cluster cluster1 migration execute client.0.1
diff --git a/qa/suites/rbd/migration-external/conf b/qa/suites/rbd/migration-external/conf
new file mode 120000
index 00000000000..4bc0fe86c63
--- /dev/null
+++ b/qa/suites/rbd/migration-external/conf
@@ -0,0 +1 @@
+.qa/rbd/conf \ No newline at end of file
diff --git a/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml
new file mode 100644
index 00000000000..d2072c41a68
--- /dev/null
+++ b/qa/suites/rbd/migration/6-prepare/qcow2-https.yaml
@@ -0,0 +1,8 @@
+tasks:
+ - exec:
+ client.0:
+ - mkdir /home/ubuntu/cephtest/migration
+ - qemu-img create -f qcow2 /home/ubuntu/cephtest/migration/empty.qcow2 1G
+ - echo '{"type":"qcow","stream":{"type":"http","url":"https://download.ceph.com/qa/ubuntu-12.04.qcow2"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0
+ - rbd migration prepare --import-only --source-spec '{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.1
+ - rbd migration prepare --import-only --source-spec '{"type":"qcow","stream":{"type":"file","file_path":"/home/ubuntu/cephtest/migration/empty.qcow2"}}' client.0.2
diff --git a/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml b/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml
new file mode 100644
index 00000000000..b0e8af4d933
--- /dev/null
+++ b/qa/suites/rbd/migration/6-prepare/qcow2-nbd.yaml
@@ -0,0 +1,12 @@
+tasks:
+ - exec:
+ client.0:
+ - mkdir /home/ubuntu/cephtest/migration
+ - wget -nv -O /home/ubuntu/cephtest/migration/base.client.0.qcow2 http://download.ceph.com/qa/ubuntu-12.04.qcow2
+ - qemu-img create -f qcow2 /home/ubuntu/cephtest/migration/empty.qcow2 1G
+ - qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork /home/ubuntu/cephtest/migration/base.client.0.qcow2
+ - qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork --socket /home/ubuntu/cephtest/migration/qemu-nbd-empty /home/ubuntu/cephtest/migration/empty.qcow2
+ - chmod 0777 /home/ubuntu/cephtest/migration/qemu-nbd-empty
+ - echo '{"type":"raw","stream":{"type":"nbd","uri":"nbd://localhost"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0
+ - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.1
+ - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.2
diff --git a/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml b/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml
new file mode 100644
index 00000000000..d5c2e60fed9
--- /dev/null
+++ b/qa/suites/rbd/migration/6-prepare/raw-nbd.yaml
@@ -0,0 +1,13 @@
+tasks:
+ - exec:
+ client.0:
+ - mkdir /home/ubuntu/cephtest/migration
+ - wget -nv -O /home/ubuntu/cephtest/migration/base.client.0.qcow2 http://download.ceph.com/qa/ubuntu-12.04.qcow2
+ - qemu-img convert -f qcow2 -O raw /home/ubuntu/cephtest/migration/base.client.0.qcow2 /home/ubuntu/cephtest/migration/base.client.0.raw
+ - dd if=/dev/zero of=/home/ubuntu/cephtest/migration/empty.raw count=1 bs=1G
+ - qemu-nbd -f raw --read-only --shared 10 --persistent --fork /home/ubuntu/cephtest/migration/base.client.0.raw
+ - qemu-nbd -f raw --read-only --shared 10 --persistent --fork --socket /home/ubuntu/cephtest/migration/qemu-nbd-empty /home/ubuntu/cephtest/migration/empty.raw
+ - chmod 0777 /home/ubuntu/cephtest/migration/qemu-nbd-empty
+ - echo '{"type":"raw","stream":{"type":"nbd","uri":"nbd://localhost"}}' | rbd migration prepare --import-only --source-spec-path - client.0.0
+ - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.1
+ - rbd migration prepare --import-only --source-spec '{"type":"raw","stream":{"type":"nbd","uri":"nbd+unix:///?socket=/home/ubuntu/cephtest/migration/qemu-nbd-empty"}}' client.0.2
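The raw-nbd and qcow2-nbd fragments differ only in the image type and the stream URI: the base image is imported from qemu-nbd's default TCP export (nbd://localhost), while the empty images come from a unix-socket export (nbd+unix:///?socket=...). A small sketch of composing those stream specs; the socket path is the one created by qemu-nbd above and is an assumption here:

import json

def nbd_stream_spec(image_type="raw", socket=None):
    # unix-socket exports use the nbd+unix URI form, TCP exports plain nbd://
    uri = "nbd+unix:///?socket={}".format(socket) if socket else "nbd://localhost"
    return json.dumps({"type": image_type, "stream": {"type": "nbd", "uri": uri}})

print(nbd_stream_spec())  # spec for the TCP export of base.client.0.raw
print(nbd_stream_spec(socket="/home/ubuntu/cephtest/migration/qemu-nbd-empty"))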
diff --git a/qa/suites/rbd/migration/9-cleanup/cleanup.yaml b/qa/suites/rbd/migration/9-cleanup/cleanup.yaml
index 18c2bb5f4c4..1d724d09086 100644
--- a/qa/suites/rbd/migration/9-cleanup/cleanup.yaml
+++ b/qa/suites/rbd/migration/9-cleanup/cleanup.yaml
@@ -1,4 +1,5 @@
tasks:
- exec:
client.0:
+ - pkill -9 qemu-nbd || true
- rm -rf /home/ubuntu/cephtest/migration
diff --git a/qa/suites/rgw/bucket-logging/% b/qa/suites/rgw/bucket-logging/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/%
diff --git a/qa/suites/rgw/bucket-logging/.qa b/qa/suites/rgw/bucket-logging/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/0-install.yaml b/qa/suites/rgw/bucket-logging/0-install.yaml
new file mode 100644
index 00000000000..6cf82f57476
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/0-install.yaml
@@ -0,0 +1,13 @@
+tasks:
+- install:
+- ceph:
+- openssl_keys:
+- rgw: [client.0]
+- tox: [client.0]
+
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 10
+ osd_max_pg_log_entries: 10
diff --git a/qa/suites/rgw/bucket-logging/beast.yaml b/qa/suites/rgw/bucket-logging/beast.yaml
new file mode 120000
index 00000000000..09ced62c42a
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/beast.yaml
@@ -0,0 +1 @@
+.qa/rgw_frontend/beast.yaml \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/fixed-1.yaml b/qa/suites/rgw/bucket-logging/fixed-1.yaml
new file mode 120000
index 00000000000..02df5dd0cd0
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/fixed-1.yaml
@@ -0,0 +1 @@
+.qa/clusters/fixed-1.yaml \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml
new file mode 120000
index 00000000000..32340b1fa8b
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/ignore-pg-availability.yaml
@@ -0,0 +1 @@
+.qa/rgw/ignore-pg-availability.yaml \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/overrides.yaml b/qa/suites/rgw/bucket-logging/overrides.yaml
new file mode 100644
index 00000000000..a448a323d36
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/overrides.yaml
@@ -0,0 +1,10 @@
+overrides:
+ ceph:
+ conf:
+ client:
+ setuser: ceph
+ setgroup: ceph
+ debug rgw: 20
+ rgw bucket logging obj roll time: 5
+ rgw:
+ storage classes: LUKEWARM, FROZEN
diff --git a/qa/suites/rgw/bucket-logging/s3tests-branch.yaml b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml
new file mode 120000
index 00000000000..bdcaca48ae0
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/s3tests-branch.yaml
@@ -0,0 +1 @@
+.qa/rgw/s3tests-branch.yaml \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/supported-distros b/qa/suites/rgw/bucket-logging/supported-distros
new file mode 120000
index 00000000000..78f2991b407
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/supported-distros
@@ -0,0 +1 @@
+.qa/distros/supported-random-distro$/ \ No newline at end of file
diff --git a/qa/suites/rgw/bucket-logging/tasks/+ b/qa/suites/rgw/bucket-logging/tasks/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/tasks/+
diff --git a/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml
new file mode 100644
index 00000000000..c1d3b7192e1
--- /dev/null
+++ b/qa/suites/rgw/bucket-logging/tasks/s3tests.yaml
@@ -0,0 +1,6 @@
+tasks:
+- s3tests:
+ client.0:
+ boto3_extensions: True
+ rgw_server: client.0
+ extra_attrs: ["bucket_logging"]
diff --git a/qa/suites/rgw/crypt/2-kms/barbican.yaml b/qa/suites/rgw/crypt/2-kms/barbican.yaml
index 9bf5fb81131..e3f78810416 100644
--- a/qa/suites/rgw/crypt/2-kms/barbican.yaml
+++ b/qa/suites/rgw/crypt/2-kms/barbican.yaml
@@ -27,7 +27,7 @@ tasks:
- tox: [ client.0 ]
- keystone:
client.0:
- force-branch: stable/2023.1
+ force-branch: stable/2024.1
services:
- name: swift
type: object-store
@@ -68,7 +68,7 @@ tasks:
project: s3
- barbican:
client.0:
- force-branch: stable/2023.1
+ force-branch: stable/2024.1
use-keystone-role: client.0
keystone_authtoken:
auth_plugin: password
diff --git a/qa/suites/rgw/multifs/0-install.yaml b/qa/suites/rgw/multifs/0-install.yaml
new file mode 100644
index 00000000000..7e83140e64a
--- /dev/null
+++ b/qa/suites/rgw/multifs/0-install.yaml
@@ -0,0 +1,5 @@
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- tox: [client.0]
diff --git a/qa/suites/rgw/multifs/tasks/+ b/qa/suites/rgw/multifs/tasks/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/multifs/tasks/+
diff --git a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
index e07c8b5ccfe..d9526c365c1 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_bucket_quota.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
index bac4f401626..ae32e928661 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_multipart_upload.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
index 66bdff817f5..184555660dc 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
@@ -1,8 +1,4 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- tox: [client.0]
- ragweed:
client.0:
default-branch: ceph-master
diff --git a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
index 92355f04963..573cffbc30a 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
@@ -1,8 +1,4 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- tox: [client.0]
- s3tests:
client.0:
rgw_server: client.0
diff --git a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
index 92c63d2e850..393180e5c17 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_user_quota.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled b/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled
index 06f4cb48909..1266cf9c9c4 100644
--- a/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled
+++ b/qa/suites/rgw/multisite/realms/three-zones.yaml.disabled
@@ -2,7 +2,7 @@ overrides:
rgw-multisite:
realm:
name: test-realm
- is default: true
+ is_default: true
zonegroups:
- name: test-zonegroup
is_master: true
diff --git a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml
index 0836a953d74..ac2104cdd05 100644
--- a/qa/suites/rgw/multisite/realms/two-zonegroup.yaml.disabled
+++ b/qa/suites/rgw/multisite/realms/two-zonegroup.yaml
@@ -2,7 +2,7 @@ overrides:
rgw-multisite:
realm:
name: test-realm
- is default: true
+ is_default: true
zonegroups:
- name: a
is_master: true
@@ -28,4 +28,4 @@ overrides:
- name: b2
endpoints: [c2.client.1]
rgw-multisite-tests:
- args: [tests.py]
+ args: [tests.py, -a, '!fails_with_rgw']
diff --git a/qa/suites/rgw/multisite/realms/two-zones.yaml b/qa/suites/rgw/multisite/realms/two-zones.yaml
index 1bea381077c..9da708bc95e 100644
--- a/qa/suites/rgw/multisite/realms/two-zones.yaml
+++ b/qa/suites/rgw/multisite/realms/two-zones.yaml
@@ -2,7 +2,7 @@ overrides:
rgw-multisite:
realm:
name: test-realm
- is default: true
+ is_default: true
zonegroups:
- name: test-zonegroup
is_master: true
diff --git a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
index 462570e7727..303f98d540e 100644
--- a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
+++ b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
@@ -1,7 +1,7 @@
tasks:
- kafka:
client.0:
- kafka_version: 2.6.0
+ kafka_version: 3.8.1
- notification-tests:
client.0:
extra_attr: ["kafka_test", "data_path_v2_kafka_test"]
diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+
diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml
new file mode 100644
index 00000000000..5c83d5c0d23
--- /dev/null
+++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml
@@ -0,0 +1,20 @@
+tasks:
+- install:
+- ceph:
+- openssl_keys:
+- rgw:
+ client.0:
+
+overrides:
+ install:
+ ceph:
+ extra_system_packages:
+ rpm:
+ - java
+ deb:
+ - default-jre
+ ceph:
+ conf:
+ global:
+ osd_min_pg_log_entries: 10
+ osd_max_pg_log_entries: 10
diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros
new file mode 120000
index 00000000000..46280a42a96
--- /dev/null
+++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros
@@ -0,0 +1 @@
+../../.qa/distros/supported-random-distro$/ \ No newline at end of file
diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml
new file mode 100644
index 00000000000..01d6fc637de
--- /dev/null
+++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml
@@ -0,0 +1,8 @@
+tasks:
+- kafka-failover:
+ client.0:
+ kafka_version: 3.8.1
+- notification-tests:
+ client.0:
+ extra_attr: ["kafka_failover"]
+ rgw_server: client.0
diff --git a/qa/suites/fs/functional/subvol_versions/.qa b/qa/suites/rgw/sts/auth-order/.qa
index fea2489fdf6..fea2489fdf6 120000
--- a/qa/suites/fs/functional/subvol_versions/.qa
+++ b/qa/suites/rgw/sts/auth-order/.qa
diff --git a/qa/suites/rgw/sts/auth-order/local-sts.yaml b/qa/suites/rgw/sts/auth-order/local-sts.yaml
new file mode 100644
index 00000000000..2f7dcc6b128
--- /dev/null
+++ b/qa/suites/rgw/sts/auth-order/local-sts.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw s3 auth order: local, sts, external
diff --git a/qa/suites/rgw/sts/auth-order/sts-local.yaml b/qa/suites/rgw/sts/auth-order/sts-local.yaml
new file mode 100644
index 00000000000..a7b00d00f0b
--- /dev/null
+++ b/qa/suites/rgw/sts/auth-order/sts-local.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw s3 auth order: sts, local, external
diff --git a/qa/suites/rgw/tempest/0-install.yaml b/qa/suites/rgw/tempest/0-install.yaml
index f968db20c2b..b6ef17de4ee 100644
--- a/qa/suites/rgw/tempest/0-install.yaml
+++ b/qa/suites/rgw/tempest/0-install.yaml
@@ -4,7 +4,7 @@ tasks:
- tox: [ client.0 ]
- keystone:
client.0:
- force-branch: stable/2023.1
+ force-branch: stable/2024.1
services:
- name: swift
type: object-store
diff --git a/qa/suites/rgw/tempest/tasks/s3/% b/qa/suites/rgw/tempest/tasks/s3/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/tempest/tasks/s3/%
diff --git a/qa/suites/rgw/tempest/tasks/s3/.qa b/qa/suites/rgw/tempest/tasks/s3/.qa
new file mode 120000
index 00000000000..fea2489fdf6
--- /dev/null
+++ b/qa/suites/rgw/tempest/tasks/s3/.qa
@@ -0,0 +1 @@
+../.qa \ No newline at end of file
diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa
new file mode 120000
index 00000000000..fea2489fdf6
--- /dev/null
+++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/.qa
@@ -0,0 +1 @@
+../.qa \ No newline at end of file
diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml
new file mode 100644
index 00000000000..c46a51e0958
--- /dev/null
+++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/external-local.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw s3 auth order: sts, external, local
diff --git a/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml
new file mode 100644
index 00000000000..a7b00d00f0b
--- /dev/null
+++ b/qa/suites/rgw/tempest/tasks/s3/auth-order/local-external.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ client:
+ rgw s3 auth order: sts, local, external
diff --git a/qa/suites/rgw/tempest/tasks/s3tests.yaml b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml
index 4efb579fa83..4efb579fa83 100644
--- a/qa/suites/rgw/tempest/tasks/s3tests.yaml
+++ b/qa/suites/rgw/tempest/tasks/s3/s3tests.yaml
diff --git a/qa/suites/rgw/verify/overrides.yaml b/qa/suites/rgw/verify/overrides.yaml
index 1b3b5abd7ad..afc368fc98c 100644
--- a/qa/suites/rgw/verify/overrides.yaml
+++ b/qa/suites/rgw/verify/overrides.yaml
@@ -14,6 +14,7 @@ overrides:
rgw bucket counters cache: true
rgw sts key: abcdefghijklmnop
rgw s3 auth use sts: true
+ rgw reshard progress judge interval: 10
rgw:
compression type: random
storage classes: LUKEWARM, FROZEN
diff --git a/qa/suites/rgw/verify/tasks/cls.yaml b/qa/suites/rgw/verify/tasks/cls.yaml
index 8034715353f..26f948d42ec 100644
--- a/qa/suites/rgw/verify/tasks/cls.yaml
+++ b/qa/suites/rgw/verify/tasks/cls.yaml
@@ -1,3 +1,8 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ debug objclass: 20
tasks:
- workunit:
clients:
diff --git a/qa/suites/rgw/verify/tasks/s3tests-java.yaml b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml
index 9ad89cc6790..9ad89cc6790 100644
--- a/qa/suites/rgw/verify/tasks/s3tests-java.yaml
+++ b/qa/suites/rgw/verify/tasks/zzz-s3tests-java.yaml
diff --git a/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml b/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml
index 66cf2bc7593..58e253bf6f4 120000
--- a/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml
+++ b/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml
@@ -1 +1 @@
-.qa/objectstore_debug/bluestore-bitmap.yaml \ No newline at end of file
+.qa/objectstore_debug/bluestore/bluestore-bitmap.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml
index 57e455ba78d..a0adaecf9b2 100644
--- a/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml
+++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/0-cluster/start.yaml
@@ -19,6 +19,20 @@ overrides:
- \(MGR_DOWN\)
- slow request
- \(MON_MSGR2_NOT_ENABLED\)
+ - \(OSD_DOWN\)
+ - \(OSD_HOST_DOWN\)
+ - \(POOL_APP_NOT_ENABLED\)
+ - OSD_DOWN
+ - mons down
+ - mon down
+ - MON_DOWN
+ - out of quorum
+ - PG_DEGRADED
+ - Reduced data availability
+ - Degraded data redundancy
+ - OSDMAP_FLAGS
+ - OSD_ROOT_DOWN
+
conf:
global:
enable experimental unrecoverable data corrupting features: "*"
@@ -30,4 +44,3 @@ roles:
- mgr.x
- osd.0
- osd.1
- - osd.2 \ No newline at end of file
diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml
index e4897db4d35..48cfa2f756f 100644
--- a/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml
+++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/1-ceph-install/quincy.yaml
@@ -18,9 +18,6 @@ tasks:
mon:
mon_warn_on_insecure_global_id_reclaim: false
mon_warn_on_insecure_global_id_reclaim_allowed: false
- log-ignorelist:
- - Not found or unloadable
- - evicting unresponsive client
- exec:
osd.0:
- ceph osd require-osd-release quincy
@@ -30,14 +27,3 @@ overrides:
conf:
mon:
mon warn on osd down out interval zero: false
- log-ignorelist:
- - \(POOL_APP_NOT_ENABLED\)
- - OSD_DOWN
- - mons down
- - mon down
- - MON_DOWN
- - out of quorum
- - PG_DEGRADED
- - Reduced data availability
- - Degraded data redundancy
- - OSDMAP_FLAGS
diff --git a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml
index 6aa429f18b5..fe4ff9bb113 100644
--- a/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml
+++ b/qa/suites/upgrade/quincy-x/filestore-remove-check/2 - upgrade.yaml
@@ -3,14 +3,13 @@ meta:
install upgrade ceph/-x on cluster
restart : mons, osd.*
tasks:
+- print: "**** start install.upgrade of nodes"
- install.upgrade:
- mon.a:
-- exec:
- osd.0:
- - ceph osd require-osd-release quincy
+ all:
- print: "**** done install.upgrade of nodes"
+- print: "**** start ceph.restart of all osds"
- ceph.restart:
- daemons: [mon.a,mgr.x,osd.0,osd.1,osd.2]
+ daemons: [osd.0,osd.1,osd.2]
mon-health-to-clog: false
wait-for-healthy: false
wait-for-osds-up: false
diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
index 40fbcefe728..62fb6427f72 100644
--- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
@@ -32,13 +32,22 @@ overrides:
osd:
osd shutdown pgref assert: true
log-ignorelist:
- - \(POOL_APP_NOT_ENABLED\)
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
- OSD_DOWN
- mons down
- mon down
- MON_DOWN
- out of quorum
+ - PG_AVAILABILITY
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
- OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
index e57e31f2fbe..f7167975aa9 100644
--- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
@@ -1,3 +1,8 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - Telemetry requires re-opt-in
+ - telemetry module includes new collections
tasks:
- install:
branch: quincy
diff --git a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml
index 9c2ff9da185..9a0585cc074 100644
--- a/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml
+++ b/qa/suites/upgrade/quincy-x/parallel/workload/rados_api.yaml
@@ -9,4 +9,6 @@ workload:
clients:
client.0:
- cls
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done end rados_api.yaml"
diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
index a618ee77c11..5641471629e 100644
--- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
+++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
@@ -1,16 +1,25 @@
overrides:
ceph:
log-ignorelist:
- - \(POOL_APP_NOT_ENABLED\)
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
- OSD_DOWN
- mons down
- mon down
- MON_DOWN
- out of quorum
+ - PG_AVAILABILITY
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
- OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
tasks:
- install:
branch: quincy
diff --git a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml
index b722f187361..a55dddf46f7 100644
--- a/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml
+++ b/qa/suites/upgrade/quincy-x/stress-split/2-first-half-tasks/rbd-cls.yaml
@@ -7,4 +7,6 @@ first-half-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml
index 649b024a476..d54ba8039d0 100644
--- a/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml
+++ b/qa/suites/upgrade/quincy-x/stress-split/3-stress-tasks/rbd-cls.yaml
@@ -7,4 +7,6 @@ stress-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
index 3814ea3efdb..62fb6427f72 100644
--- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
@@ -31,3 +31,23 @@ overrides:
conf:
osd:
osd shutdown pgref assert: true
+ log-ignorelist:
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
+ - OSD_DOWN
+ - mons down
+ - mon down
+ - MON_DOWN
+ - out of quorum
+ - PG_AVAILABILITY
+ - PG_DEGRADED
+ - Reduced data availability
+ - Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
+ - OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
index 299e3d1b9a0..b5160c2dd00 100644
--- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
@@ -1,10 +1,8 @@
overrides:
ceph:
log-ignorelist:
- - mons down
- - mon down
- - MON_DOWN
- - out of quorum
+ - Telemetry requires re-opt-in
+ - telemetry module includes new collections
tasks:
- install:
branch: reef
diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml
new file mode 100644
index 00000000000..fa93b2f2ece
--- /dev/null
+++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml
@@ -0,0 +1,19 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - MDS_ALL_DOWN
+ - MDS_UP_LESS_THAN_MAX
+ - OSD_SLOW_PING_TIME
+ - reached quota
+ - running out of quota
+ - overall HEALTH_
+ - CACHE_POOL_NO_HIT_SET
+ - pool\(s\) full
+ - POOL_FULL
+ - SMALLER_PGP_NUM
+ - SLOW_OPS
+ - CACHE_POOL_NEAR_FULL
+ - OBJECT_MISPLACED
+ - slow request
+ - noscrub
+ - nodeep-scrub
diff --git a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml
index a46e34db5dd..79cf1a96601 100644
--- a/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/workload/rados_api.yaml
@@ -9,4 +9,6 @@ workload:
clients:
client.0:
- cls
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done end rados_api.yaml"
diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
index 4cd05432d5f..59ccfe2cd02 100644
--- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
+++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
@@ -1,3 +1,25 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
+ - OSD_DOWN
+ - mons down
+ - mon down
+ - MON_DOWN
+ - out of quorum
+ - PG_AVAILABILITY
+ - PG_DEGRADED
+ - Reduced data availability
+ - Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
+ - OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
tasks:
- install:
branch: reef
diff --git a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml
index f092096f444..79ad2af8ea1 100644
--- a/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml
+++ b/qa/suites/upgrade/reef-x/stress-split/2-first-half-tasks/rbd-cls.yaml
@@ -7,4 +7,6 @@ first-half-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml
index 05bb672b3ac..166327a58f9 100644
--- a/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml
+++ b/qa/suites/upgrade/reef-x/stress-split/3-stress-tasks/rbd-cls.yaml
@@ -7,4 +7,6 @@ stress-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
+ env:
+ CLS_RBD_GTEST_FILTER: '-TestClsRbd.group_snap_set:TestClsRbd.group_snap_remove'
- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml
new file mode 100644
index 00000000000..fa93b2f2ece
--- /dev/null
+++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml
@@ -0,0 +1,19 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - MDS_ALL_DOWN
+ - MDS_UP_LESS_THAN_MAX
+ - OSD_SLOW_PING_TIME
+ - reached quota
+ - running out of quota
+ - overall HEALTH_
+ - CACHE_POOL_NO_HIT_SET
+ - pool\(s\) full
+ - POOL_FULL
+ - SMALLER_PGP_NUM
+ - SLOW_OPS
+ - CACHE_POOL_NEAR_FULL
+ - OBJECT_MISPLACED
+ - slow request
+ - noscrub
+ - nodeep-scrub
diff --git a/qa/tasks/barbican.py b/qa/tasks/barbican.py
index 771304fba92..c32277c3c09 100644
--- a/qa/tasks/barbican.py
+++ b/qa/tasks/barbican.py
@@ -88,6 +88,14 @@ def run_in_barbican_venv(ctx, client, args):
run.Raw('&&')
] + args)
+def get_constraints_url(cconf):
+ version = cconf.get('force-branch', 'master')
+ if '/' in version:
+ # split stable/<version> to <version>
+ version = str(version).split('/')[1]
+ url = f"https://releases.openstack.org/constraints/upper/{version}"
+ return url
+
@contextlib.contextmanager
def setup_venv(ctx, config):
"""
@@ -95,13 +103,14 @@ def setup_venv(ctx, config):
"""
assert isinstance(config, dict)
log.info('Setting up virtualenv for barbican...')
- for (client, _) in config.items():
+ for (client, cconf) in config.items():
run_in_barbican_dir(ctx, client,
['python3', '-m', 'venv', '.barbicanenv'])
run_in_barbican_venv(ctx, client,
['pip', 'install', '--upgrade', 'pip'])
+ url = get_constraints_url(cconf)
run_in_barbican_venv(ctx, client,
- ['pip', 'install', 'pytz',
+ ['pip', 'install', f'-c{url}', 'pytz',
'-e', get_barbican_dir(ctx)])
yield
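get_constraints_url() above pins barbican's pip install to the OpenStack upper-constraints file that matches the configured force-branch, so the venv no longer drifts onto incompatible dependency versions. A standalone restatement of that mapping, assuming the stable/<release> branch naming used in the suites:

def constraints_url(force_branch="master"):
    # 'stable/2024.1' -> '2024.1'; plain branch names pass through unchanged
    release = force_branch.split("/")[1] if "/" in force_branch else force_branch
    return "https://releases.openstack.org/constraints/upper/{}".format(release)

assert constraints_url("stable/2024.1") == \
    "https://releases.openstack.org/constraints/upper/2024.1"
assert constraints_url() == "https://releases.openstack.org/constraints/upper/master"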
diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py
index 84e096520b4..e6a9dc8223c 100644
--- a/qa/tasks/cbt.py
+++ b/qa/tasks/cbt.py
@@ -47,22 +47,11 @@ class CBT(Task):
benchmark_config = self.config.get('benchmarks')
benchmark_type = next(iter(benchmark_config.keys()))
+
if benchmark_type in ['librbdfio', 'fio']:
testdir = misc.get_testdir(self.ctx)
benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio')
- if benchmark_type == 'cosbench':
- # create cosbench_dir and cosbench_xml_dir
- testdir = misc.get_testdir(self.ctx)
- benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos')
- benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml')
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']])
- benchmark_config['cosbench']['controller'] = osd_hosts[0]
-
- # set auth details
- remotes_and_roles = self.ctx.cluster.remotes.items()
- ips = [host for (host, port) in
- (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
- benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0])
+
client_endpoints_config = self.config.get('client_endpoints', None)
monitoring_profiles = self.config.get('monitoring_profiles', {})
@@ -117,77 +106,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- # install cosbench
- self.log.info('install dependencies for cosbench')
- if system_type == 'rpm':
- cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl']
- else:
- cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl']
- self.first_mon.run(args=install_cmd + cosbench_depends)
- testdir = misc.get_testdir(self.ctx)
- cosbench_version = '0.4.2.c3'
- cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip'
- os_version = misc.get_system_type(self.first_mon, False, True)
-
- # additional requirements for bionic
- if os_version == '18.04':
- self.first_mon.run(
- args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*'])
- # use our own version of cosbench
- cosbench_version = 'cosbench-0.4.2.c3.1'
- # contains additional parameter "-N" to nc
- cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip'
- cosbench_dir = os.path.join(testdir, cosbench_version)
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir])
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version
- ]
- )
- else:
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version)
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'ln', '-s', cosbench_version, 'cos',
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'),
- 'chmod', '+x', run.Raw('*.sh'),
- ]
- )
-
- # start cosbench and check info
- self.log.info('start cosbench')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'start-all.sh'
- ]
- )
- self.log.info('check cosbench info')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'cli.sh', 'info'
- ]
- )
-
def checkout_cbt(self):
testdir = misc.get_testdir(self.ctx)
repo = self.config.get('repo', 'https://github.com/ceph/cbt.git')
@@ -269,51 +187,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- os_version = misc.get_system_type(self.first_mon, False, True)
- if os_version == '18.04':
- cosbench_version = 'cosbench-0.4.2.c3.1'
- else:
- cosbench_version = '0.4.2.c3'
- # note: stop-all requires 'nc'
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'stop-all.sh',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'sudo', 'killall', '-9', 'java',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/cos'.format(tdir=testdir),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/xml'.format(tdir=testdir),
- ]
- )
# Collect cbt performance data
cbt_performance = CBTperformance()
cbt_performance.collect(self.ctx, self.config)
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index b01fe370ec0..8f666d2fa9b 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -376,7 +376,7 @@ def module_setup(ctx, config):
cluster_name,
'mgr',
'module',
- 'emable',
+ 'enable',
m,
]
log.info("enabling module %s", m)
@@ -414,6 +414,15 @@ def conf_setup(ctx, config):
for p in procs:
log.debug("waiting for %s", p)
p.wait()
+ cmd = [
+ 'sudo',
+ 'ceph',
+ '--cluster',
+ cluster_name,
+ 'config',
+ 'dump',
+ ]
+ mon_remote.run(args=cmd)
yield
@contextlib.contextmanager
@@ -1197,8 +1206,18 @@ def cluster(ctx, config):
args.extend([
run.Raw('|'), 'head', '-n', '1',
])
- stdout = mon0_remote.sh(args)
- return stdout or None
+ r = mon0_remote.run(
+ stdout=BytesIO(),
+ args=args,
+ stderr=StringIO(),
+ )
+ stdout = r.stdout.getvalue().decode()
+ if stdout:
+ return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
+ return None
if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
config['log_ignorelist']) is not None:
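The first_in_ceph_log() change above captures both streams of the egrep pipeline and falls back to stderr when stdout is empty, so any error text from the pipeline is surfaced instead of being dropped. The same prefer-stdout, else-stderr, else-None pattern, sketched with plain subprocess rather than the teuthology remote wrapper:

import subprocess

def first_match_or_error(cmd):
    # Returns the first matching line if any, the error text if the command
    # only produced stderr, and None when there is neither.
    r = subprocess.run(cmd, capture_output=True, text=True)
    return r.stdout or r.stderr or None

# e.g. first_match_or_error(["egrep", r"\[ERR\]|\[WRN\]|\[SEC\]", "/var/log/ceph/ceph.log"])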
diff --git a/qa/tasks/ceph_iscsi_client.py b/qa/tasks/ceph_iscsi_client.py
index 189b7fa31fe..0b0a355f925 100644
--- a/qa/tasks/ceph_iscsi_client.py
+++ b/qa/tasks/ceph_iscsi_client.py
@@ -31,8 +31,15 @@ def task(ctx, config):
remote.run(args=['sudo', 'systemctl', 'restart', 'iscsid'])
remote.run(args=['sudo', 'modprobe', 'dm_multipath'])
- remote.run(args=['sudo', 'mpathconf', '--enable'])
conf = dedent('''
+ defaults {
+ user_friendly_names yes
+ find_multipaths yes
+ }
+
+ blacklist {
+ }
+
devices {
device {
vendor "LIO-ORG"
@@ -50,7 +57,7 @@ def task(ctx, config):
}
''')
path = "/etc/multipath.conf"
- remote.sudo_write_file(path, conf, append=True)
+ remote.sudo_write_file(path, conf)
remote.run(args=['sudo', 'systemctl', 'start', 'multipathd'])
yield
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index ccf54648d43..57d22f3b5e6 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -2169,6 +2169,10 @@ class CephManager:
when creating an erasure coded pool.
"""
with self.lock:
+ # msr rules require at least squid
+ if 'crush-osds-per-failure-domain' in profile:
+ self.raw_cluster_cmd(
+ 'osd', 'set-require-min-compat-client', 'squid')
args = cmd_erasure_code_profile(profile_name, profile)
self.raw_cluster_cmd(*args)
@@ -2792,6 +2796,59 @@ class CephManager:
num += 1
return num
+ def _print_not_active_clean_pg(self, pgs):
+ """
+ Print the PGs that are not active+clean.
+ """
+ for pg in pgs:
+ if not (pg['state'].count('active') and
+ pg['state'].count('clean') and
+ not pg['state'].count('stale')):
+ log.debug(
+ "PG %s is not active+clean, but %s",
+ pg['pgid'], pg['state']
+ )
+
+ def pg_all_active_clean(self):
+ """
+ Check if all pgs are active+clean
+ return: True if all pgs are active+clean else False
+ """
+ pgs = self.get_pg_stats()
+ result = self._get_num_active_clean(pgs) == len(pgs)
+ if result:
+ log.debug("All PGs are active+clean")
+ else:
+ log.debug("Not all PGs are active+clean")
+ self._print_not_active_clean_pg(pgs)
+ return result
+
+ def _print_not_active_pg(self, pgs):
+ """
+ Print the PGs that are not active.
+ """
+ for pg in pgs:
+ if not (pg['state'].count('active')
+ and not pg['state'].count('stale')):
+ log.debug(
+ "PG %s is not active, but %s",
+ pg['pgid'], pg['state']
+ )
+
+ def pg_all_active(self):
+ """
+ Check if all pgs are active
+ return: True if all pgs are active else False
+ """
+ pgs = self.get_pg_stats()
+ result = self._get_num_active(pgs) == len(pgs)
+ if result:
+ log.debug("All PGs are active")
+ else:
+ log.debug("Not all PGs are active")
+ self._print_not_active_pg(pgs)
+ return result
+
def is_clean(self):
"""
True if all pgs are clean
@@ -3233,6 +3290,26 @@ class CephManager:
self.make_admin_daemon_dir(remote)
self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()
+ def get_crush_rule_id(self, crush_rule_name):
+ """
+ Get crush rule id by name
+ :returns: int -- crush rule id
+ """
+ out = self.raw_cluster_cmd('osd', 'crush', 'rule', 'dump', '--format=json')
+ j = json.loads('\n'.join(out.split('\n')[1:]))
+ for rule in j:
+ if rule['rule_name'] == crush_rule_name:
+ return rule['rule_id']
+ assert False, 'rule %s not found' % crush_rule_name
+
+ def get_mon_dump_json(self):
+ """
+ mon dump --format=json converted to a python object
+ :returns: the python object
+ """
+ out = self.raw_cluster_cmd('mon', 'dump', '--format=json')
+ return json.loads('\n'.join(out.split('\n')[1:]))
+
def get_mon_status(self, mon):
"""
Extract all the monitor status information from the cluster
@@ -3336,6 +3413,23 @@ class CephManager:
self.log(task_status)
return task_status
+ # Stretch mode related functions
+ def is_degraded_stretch_mode(self):
+ """
+ Return whether the cluster is in degraded stretch mode
+ """
+ try:
+ osdmap = self.get_osd_dump_json()
+ stretch_mode = osdmap.get('stretch_mode', {})
+ degraded_stretch_mode = stretch_mode.get('degraded_stretch_mode', 0)
+ self.log("is_degraded_stretch_mode: {0}".format(degraded_stretch_mode))
+ return degraded_stretch_mode == 1
+ except (TypeError, AttributeError) as e:
+ # log the error and report the cluster as not in degraded stretch mode
+ self.log("Error accessing degraded_stretch_mode: {0}".format(e))
+ return False
+
+
def utility_task(name):
"""
Generate ceph_manager subtask corresponding to ceph_manager
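For reference, a minimal standalone sketch of the check that pg_all_active_clean() and _print_not_active_clean_pg() perform above, assuming PG stats shaped like the dicts returned by get_pg_stats() (only the 'pgid' and 'state' keys are used; the helper name is illustrative):

def not_active_clean(pgs):
    """Return pgids that are not active+clean or are stale."""
    return [pg['pgid'] for pg in pgs
            if 'active' not in pg['state']
            or 'clean' not in pg['state']
            or 'stale' in pg['state']]

# example: one PG still backfilling
pgs = [{'pgid': '1.0', 'state': 'active+clean'},
       {'pgid': '1.1', 'state': 'active+undersized+backfilling'}]
assert not_active_clean(pgs) == ['1.1']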
diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py
index 8347b890629..7afcbc2f2eb 100644
--- a/qa/tasks/ceph_test_case.py
+++ b/qa/tasks/ceph_test_case.py
@@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd):
while True:
if condition():
success_time_elapsed = 0
- while success_time_elapsed < success_hold_time:
- if condition():
- success_time_elapsed += 1
- time.sleep(1)
- elapsed += 1
- else:
- break
+ while success_time_elapsed < success_hold_time and condition():
+ success_time_elapsed += 1
+ time.sleep(1)
+ elapsed += 1
if success_time_elapsed == success_hold_time:
log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
return
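The rewritten loop above reduces to one pattern: wait for a condition to become true, then require it to stay true for success_hold_time consecutive one-second checks. A standalone sketch of that pattern, assuming a callable condition (names and the timeout handling are illustrative):

import time

def wait_and_hold(condition, hold_time, timeout):
    """Wait until condition() is true and stays true for hold_time seconds."""
    elapsed = 0
    while elapsed < timeout:
        if condition():
            held = 0
            while held < hold_time and condition():
                held += 1
                time.sleep(1)
                elapsed += 1
            if held == hold_time:
                return                     # condition held long enough
        time.sleep(1)
        elapsed += 1
    raise RuntimeError(f'not held for {hold_time}s within {timeout}s')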
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py
index 166ea9537ee..0cde6050718 100644
--- a/qa/tasks/cephadm.py
+++ b/qa/tasks/cephadm.py
@@ -209,7 +209,9 @@ def normalize_hostnames(ctx):
def download_cephadm(ctx, config, ref):
cluster_name = config['cluster']
- if 'cephadm_binary_url' in config:
+ if 'cephadm_from_container' in config:
+ _fetch_cephadm_from_container(ctx, config)
+ elif 'cephadm_binary_url' in config:
url = config['cephadm_binary_url']
_download_cephadm(ctx, url)
elif config.get('cephadm_mode') != 'cephadm-package':
@@ -232,6 +234,36 @@ def download_cephadm(ctx, config, ref):
_rm_cephadm(ctx)
+def _fetch_cephadm_from_container(ctx, config):
+ image = config['image']
+ cengine = 'podman'
+ try:
+ log.info("Testing if podman is available")
+ ctx.cluster.run(args=['sudo', cengine, '--help'])
+ except CommandFailedError:
+ log.info("Failed to find podman. Using docker")
+ cengine = 'docker'
+
+ ctx.cluster.run(args=['sudo', cengine, 'pull', image])
+ ctx.cluster.run(args=[
+ 'sudo', cengine, 'run', '--rm', '--entrypoint=cat', image, '/usr/sbin/cephadm',
+ run.Raw('>'),
+ ctx.cephadm,
+ ])
+
+ # sanity-check the resulting file and set executable bit
+ cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
+ ctx.cluster.run(
+ args=[
+ 'test', '-s', ctx.cephadm,
+ run.Raw('&&'),
+ 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
+ run.Raw('&&'),
+ 'chmod', '+x', ctx.cephadm,
+ ],
+ )
+
+
def _fetch_cephadm_from_rpm(ctx):
log.info("Copying cephadm installed from an RPM package")
# cephadm already installed from redhat.install task
@@ -443,12 +475,16 @@ def ceph_log(ctx, config):
run.Raw('|'), 'head', '-n', '1',
])
r = ctx.ceph[cluster_name].bootstrap_remote.run(
- stdout=StringIO(),
+ stdout=BytesIO(),
args=args,
+ stderr=StringIO(),
)
- stdout = r.stdout.getvalue()
- if stdout != '':
+ stdout = r.stdout.getvalue().decode()
+ if stdout:
return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
return None
# NOTE: technically the first and third arg to first_in_ceph_log
@@ -1817,6 +1853,12 @@ def conf_setup(ctx, config):
for p in procs:
log.debug("waiting for %s", p)
p.wait()
+ cmd = [
+ 'ceph',
+ 'config',
+ 'dump',
+ ]
+ _shell(ctx, cluster_name, remote, args=cmd)
yield
@contextlib.contextmanager
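The _fetch_cephadm_from_container() helper above boils down to pulling the image and streaming one file out of it via a cat entrypoint. Outside teuthology the same trick can be reproduced with plain subprocess calls; a rough sketch, assuming podman or docker is installed and the image ships /usr/sbin/cephadm:

import os
import shutil
import subprocess

def fetch_cephadm_from_image(image, dest):
    """Copy /usr/sbin/cephadm out of a container image into dest."""
    engine = 'podman' if shutil.which('podman') else 'docker'
    subprocess.run([engine, 'pull', image], check=True)
    out = subprocess.run(
        [engine, 'run', '--rm', '--entrypoint=cat', image, '/usr/sbin/cephadm'],
        check=True, capture_output=True).stdout
    if len(out) <= 1000:                   # same sanity check as the task
        raise RuntimeError('cephadm binary looks truncated')
    with open(dest, 'wb') as f:
        f.write(out)
    os.chmod(dest, 0o755)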
diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
index c1312ec5efc..21b96d2b22b 100644
--- a/qa/tasks/cephfs/cephfs_test_case.py
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -252,8 +252,8 @@ class CephFSTestCase(CephTestCase):
def get_session_data(self, client_id):
return self._session_by_id(client_id)
- def _session_list(self):
- ls_data = self.fs.mds_asok(['session', 'ls'])
+ def _session_list(self, rank=None, status=None):
+ ls_data = self.fs.rank_asok(['session', 'ls'], rank=rank, status=status)
ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
return ls_data
@@ -269,9 +269,9 @@ class CephFSTestCase(CephTestCase):
def perf_dump(self, rank=None, status=None):
return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
- def wait_until_evicted(self, client_id, timeout=30):
+ def wait_until_evicted(self, client_id, rank=None, timeout=30):
def is_client_evicted():
- ls = self._session_list()
+ ls = self._session_list(rank=rank)
for s in ls:
if s['id'] == client_id:
return False
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 1c00a49077d..3846ef23f97 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -640,12 +640,17 @@ class FilesystemBase(MDSClusterBase):
def set_joinable(self, joinable=True):
self.set_var("joinable", joinable)
- def set_max_mds(self, max_mds):
- self.set_var("max_mds", "%d" % max_mds)
+ def set_max_mds(self, max_mds, confirm=True):
+ if confirm:
+ self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it')
+ else:
+ self.set_var("max_mds", f"{max_mds}",)
def set_session_timeout(self, timeout):
self.set_var("session_timeout", "%d" % timeout)
+ def set_session_autoclose(self, autoclose_time):
+ self.set_var("session_autoclose", "%d" % autoclose_time)
def set_allow_standby_replay(self, yes):
self.set_var("allow_standby_replay", yes)
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py
index 3654cde9ca0..52362d853dc 100644
--- a/qa/tasks/cephfs/mount.py
+++ b/qa/tasks/cephfs/mount.py
@@ -775,6 +775,10 @@ class CephFSMountBase(object):
return self.client_remote.run(args=args, **kwargs)
+ def get_shell_stdout(self, args, timeout=300, **kwargs):
+ return self.run_shell(args=args, timeout=timeout, **kwargs).stdout.\
+ getvalue().strip()
+
def run_shell_payload(self, payload, wait=True, timeout=900, **kwargs):
kwargs.setdefault('cwd', self.mountpoint)
kwargs.setdefault('omit_sudo', False)
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index ff9962e7310..beb41019e6d 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -324,6 +324,8 @@ class TestFsStatus(TestAdminCommands):
Test "ceph fs status subcommand.
"""
+ MDSS_REQUIRED = 3
+
def test_fs_status(self):
"""
That `ceph fs status` command functions.
@@ -338,6 +340,31 @@ class TestFsStatus(TestAdminCommands):
mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"]
self.assertEqual(mdsmap[0]["state"], "active")
+ def test_fs_status_standby_replay(self):
+ """
+ That `ceph fs status` correctly reports standby-replay daemons.
+ """
+
+ self.fs.set_allow_standby_replay(True)
+
+ s = self.get_ceph_cmd_stdout("fs", "status")
+ self.assertTrue("active" in s)
+ self.assertTrue("standby-replay" in s)
+ self.assertTrue("0-s" in s)
+ self.assertTrue("standby" in s)
+
+ mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json-pretty"))["mdsmap"]
+ self.assertEqual(mdsmap[0]["state"], "active")
+ self.assertEqual(mdsmap[1]["state"], "standby-replay")
+ self.assertEqual(mdsmap[1]["rank"], "0-s")
+ self.assertEqual(mdsmap[2]["state"], "standby")
+
+ mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"]
+ self.assertEqual(mdsmap[0]["state"], "active")
+ self.assertEqual(mdsmap[1]["state"], "standby-replay")
+ self.assertEqual(mdsmap[1]["rank"], "0-s")
+ self.assertEqual(mdsmap[2]["state"], "standby")
+
class TestAddDataPool(TestAdminCommands):
"""
@@ -2178,9 +2205,6 @@ class TestFsAuthorizeUpdate(CephFSTestCase):
caps mon = "allow r fsname=a"
caps osd = "allow rw tag cephfs data=a"
"""
- self.skipTest('this test is broken ATM, see '
- 'https://tracker.ceph.com/issues/65808')
-
PERM, PATH = 'rw', 'dir1'
self.mount_a.run_shell(f'mkdir {PATH}')
self.captester = CapTester(self.mount_a, PATH)
@@ -2659,3 +2683,241 @@ class TestMDSFail(TestAdminCommands):
errmsgs=health_warn)
self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
+
+
+class TestFSSetMaxMDS(TestAdminCommands):
+
+ def test_when_unhealthy_without_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>" without the
+ confirmation flag (--yes-i-really-mean-it) fails when cluster is
+ unhealthy.
+ '''
+ self.gen_health_warn_mds_cache_oversized()
+
+ with self.assertRaises(CommandFailedError) as cfe:
+ self.fs.set_max_mds(2, confirm=False)
+ self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
+
+ def test_when_unhealthy_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully when cluster is unhealthy.
+ '''
+ self.gen_health_warn_mds_cache_oversized()
+
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+ def test_when_mds_trim_without_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>" without the
+ confirmation flag (--yes-i-really-mean-it) fails when cluster has
+ MDS_TRIM health warning.
+ '''
+ self.gen_health_warn_mds_trim()
+
+ with self.assertRaises(CommandFailedError) as cfe:
+ self.fs.set_max_mds(2, confirm=False)
+ self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
+
+ def test_when_mds_trim_when_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM
+ health warning.
+ '''
+ self.gen_health_warn_mds_trim()
+
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+ def test_when_healthy_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully also when cluster is
+ healthy.
+ '''
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+
+class TestToggleVolumes(CephFSTestCase):
+ '''
+ Contains code for enabling/disabling mgr/volumes plugin.
+ '''
+
+ VOL_MOD_NAME = 'volumes'
+ CONFIRM = '--yes-i-really-mean-it'
+
+ def tearDown(self):
+ '''
+ Ensure that the volumes plugin is enabled after the test has finished
+ running since not doing so might affect tearDown() of CephFSTestCase or
+ other superclasses.
+ '''
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ if 'volumes' in json_output['force_disabled_modules']:
+ self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
+
+ super(TestToggleVolumes, self).tearDown()
+
+ def test_force_disable_with_confirmation(self):
+ '''
+ Test that running "ceph mgr module force disable volumes
+ --yes-i-really-mean-it" successfully disables volumes plugin.
+
+ Also test "ceph mgr module ls" output after this.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+
+ def test_force_disable_fails_without_confirmation(self):
+ '''
+ Test that running "ceph mgr module force disable volumes" fails with
+ EPERM when confirmation flag is not passed along.
+
+ Also test that output of this command suggests user to pass
+ --yes-i-really-mean-it.
+ '''
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable {self.VOL_MOD_NAME}',
+ stderr=StringIO(), check_status=False)
+
+ self.assertEqual(proc.returncode, errno.EPERM)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('EPERM', proc_stderr)
+ # ensure that the confirmation flag was recommended
+ self.assertIn(self.CONFIRM, proc_stderr)
+
+ def test_force_disable_idempotency(self):
+ '''
+ Test that running "ceph mgr module force disable volumes" passes when
+ volumes plugin was already force disabled.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+ sleep(5)
+
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format '
+ 'json-pretty')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+
+ # XXX: in this test, running this command a 2nd time should pass.
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}')
+
+ def test_force_disable_nonexistent_mod(self):
+ '''
+ Test that passing non-existent name to "ceph mgr module force disable"
+ command leads to an error.
+ '''
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable abcd {self.CONFIRM}',
+ check_status=False, stderr=StringIO())
+ self.assertEqual(proc.returncode, errno.EINVAL)
+ self.assertIn('EINVAL', proc.stderr.getvalue())
+
+ def test_force_disable_non_alwayson_mod(self):
+ '''
+ Test that force disabling a module that is not an always-on module
+ leads to an error.
+ '''
+ json_output = self.get_ceph_cmd_stdout(
+ 'mgr module ls --format json-pretty', check_status=False,
+ stderr=StringIO())
+ output_dict = json.loads(json_output)
+ some_non_alwayson_mod = output_dict['enabled_modules'][0]
+
+ proc = self.run_ceph_cmd(
+ f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}',
+ check_status=False, stderr=StringIO())
+ self.assertEqual(proc.returncode, errno.EINVAL)
+ self.assertIn('EINVAL', proc.stderr.getvalue())
+
+ def test_enabled_by_default(self):
+ '''
+ Test that volumes plugin is enabled by default and is also reported as
+ "always on".
+ '''
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ def test_disable_fails(self):
+ '''
+ Test that running "ceph mgr module disable volumes" fails with EPERM.
+
+ This is expected since volumes is an always-on module and therefore
+ it can only be disabled using command "ceph mgr module force disable
+ volumes".
+ '''
+ proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}',
+ stderr=StringIO(), check_status=False)
+ self.assertEqual(proc.returncode, errno.EPERM)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('EPERM', proc_stderr)
+
+ def test_enable_idempotency(self):
+ '''
+ Test that enabling volumes plugin when it is already enabled doesn't
+ exit with non-zero return value.
+
+ Also test that it reports plugin as already enabled.
+ '''
+ proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}',
+ stderr=StringIO())
+ self.assertEqual(proc.returncode, 0)
+
+ proc_stderr = proc.stderr.getvalue()
+ self.assertIn('already enabled', proc_stderr)
+ self.assertIn('always-on', proc_stderr)
+
+ def test_enable_post_disabling(self):
+ '''
+ Test that enabling volumes plugin after (force-)disabling it works
+ successfully.
+
+ Alo test "ceph mgr module ls" output for volumes plugin afterwards.
+ '''
+ self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} '
+ f'{self.CONFIRM}')
+ # give a bit of time for the plugin to be disabled.
+ sleep(5)
+
+ self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}')
+ # give a bit of time for the plugin to be functional again
+ sleep(5)
+ json_output = self.get_ceph_cmd_stdout('mgr module ls --format json')
+ json_output = json.loads(json_output)
+ self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules'])
+ self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules'])
+
+ # plugin is reported properly by "ceph mgr module ls" command, check if
+ # it is also working fine.
+ self.run_ceph_cmd('fs volume ls')
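All of the TestToggleVolumes assertions above reduce to membership checks against the four module lists in "ceph mgr module ls --format json". A small illustrative helper (not part of the test code) that mirrors those checks:

import json

def module_state(mgr_module_ls_json, name):
    """Classify a mgr module from 'ceph mgr module ls --format json' output."""
    data = json.loads(mgr_module_ls_json)
    if name in data.get('force_disabled_modules', []):
        return 'force-disabled'        # always-on module that was force disabled
    if name in data.get('always_on_modules', []):
        return 'always-on'
    if name in data.get('enabled_modules', []):
        return 'enabled'
    if name in data.get('disabled_modules', []):
        return 'disabled'
    return 'unknown'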
diff --git a/qa/tasks/cephfs/test_backtrace.py b/qa/tasks/cephfs/test_backtrace.py
index 6b094569b7b..cd23c114bfb 100644
--- a/qa/tasks/cephfs/test_backtrace.py
+++ b/qa/tasks/cephfs/test_backtrace.py
@@ -100,3 +100,29 @@ class TestBacktrace(CephFSTestCase):
# we don't update the layout in all the old pools whenever it changes
old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name)
self.assertEqual(old_pool_layout['object_size'], 4194304)
+
+ def test_backtrace_flush_on_deleted_data_pool(self):
+ """
+ That the MDS does not go read-only when handling backtrace update errors,
+ when backtrace updates are batched and flushed to RADOS (during journal
+ trim) and some of the pools have been removed.
+ """
+ data_pool = self.fs.get_data_pool_name()
+ extra_data_pool_name_1 = data_pool + '_extra1'
+ self.fs.add_data_pool(extra_data_pool_name_1)
+
+ self.mount_a.run_shell(["mkdir", "dir_x"])
+ self.mount_a.setfattr("dir_x", "ceph.dir.layout.pool", extra_data_pool_name_1)
+ self.mount_a.run_shell(["touch", "dir_x/file_x"])
+ self.fs.flush()
+
+ extra_data_pool_name_2 = data_pool + '_extra2'
+ self.fs.add_data_pool(extra_data_pool_name_2)
+ self.mount_a.setfattr("dir_x/file_x", "ceph.file.layout.pool", extra_data_pool_name_2)
+ self.mount_a.run_shell(["setfattr", "-x", "ceph.dir.layout", "dir_x"])
+ self.run_ceph_cmd("fs", "rm_data_pool", self.fs.name, extra_data_pool_name_1)
+ self.fs.flush()
+
+ # quick test to check if the mds has handled backtrace update failure
+ # on the deleted data pool without going read-only.
+ self.mount_a.run_shell(["mkdir", "dir_y"])
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
index 16de379f54f..468378fce3d 100644
--- a/qa/tasks/cephfs/test_exports.py
+++ b/qa/tasks/cephfs/test_exports.py
@@ -4,6 +4,7 @@ import time
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while, MaxWhileTries
log = logging.getLogger(__name__)
@@ -152,6 +153,8 @@ class TestExportPin(CephFSTestCase):
# vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap
# to be written out every event. Yuck!
self.config_set('mds', 'mds_debug_subtrees', False)
+ # make sure ESubtreeMap is written frequently enough:
+ self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4')
self.config_rm('mds', 'mds bal split size') # don't split /top
self.mount_a.run_shell_payload("rm -rf 1")
@@ -628,3 +631,186 @@ done
log.info("{0} migrations have occured due to the cluster resizing".format(count))
# rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2
self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget
+
+class TestDumpExportStates(CephFSTestCase):
+ MDSS_REQUIRED = 2
+ CLIENTS_REQUIRED = 1
+
+ EXPORT_STATES = ['locking', 'discovering', 'freezing', 'prepping', 'warning', 'exporting']
+
+ def setUp(self):
+ super().setUp()
+
+ self.fs.set_max_mds(self.MDSS_REQUIRED)
+ self.status = self.fs.wait_for_daemons()
+
+ self.mount_a.run_shell_payload('mkdir -p test/export')
+
+ def tearDown(self):
+ super().tearDown()
+
+ def _wait_for_export_target(self, source, target, sleep=2, timeout=10):
+ try:
+ with safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
+ while proceed():
+ info = self.fs.getinfo().get_rank(self.fs.id, source)
+ log.info(f'waiting for rank {target} to be added to the export target')
+ if target in info['export_targets']:
+ return
+ except MaxWhileTries as e:
+ raise RuntimeError(f'rank {target} has not been added to export target after {timeout}s') from e
+
+ def _dump_export_state(self, rank):
+ states = self.fs.rank_asok(['dump_export_states'], rank=rank, status=self.status)
+ self.assertTrue(type(states) is list)
+ self.assertEqual(len(states), 1)
+ return states[0]
+
+ def _test_base(self, path, source, target, state_index, kill):
+ self.fs.rank_asok(['config', 'set', 'mds_kill_import_at', str(kill)], rank=target, status=self.status)
+
+ self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status)
+ self._wait_for_export_target(source, target)
+
+ target_rank = self.fs.get_rank(rank=target, status=self.status)
+ self.delete_mds_coredump(target_rank['name'])
+
+ state = self._dump_export_state(source)
+
+ self.assertTrue(type(state['tid']) is int)
+ self.assertEqual(state['path'], path)
+ self.assertEqual(state['state'], self.EXPORT_STATES[state_index])
+ self.assertEqual(state['peer'], target)
+
+ return state
+
+ def _test_state_history(self, state):
+ history = state['state_history']
+ self.assertTrue(type(history) is dict)
+ size = 0
+ for name in self.EXPORT_STATES:
+ self.assertTrue(type(history[name]) is dict)
+ size += 1
+ if name == state['state']:
+ break
+ self.assertEqual(len(history), size)
+
+ def _test_freeze_tree(self, state, waiters):
+ self.assertTrue(type(state['freeze_tree_time']) is float)
+ self.assertEqual(state['unfreeze_tree_waiters'], waiters)
+
+ def test_discovering(self):
+ state = self._test_base('/test', 0, 1, 1, 1)
+
+ self._test_state_history(state)
+ self._test_freeze_tree(state, 0)
+
+ self.assertEqual(state['last_cum_auth_pins'], 0)
+ self.assertEqual(state['num_remote_waiters'], 0)
+
+ def test_prepping(self):
+ client_id = self.mount_a.get_global_id()
+
+ state = self._test_base('/test', 0, 1, 3, 3)
+
+ self._test_state_history(state)
+ self._test_freeze_tree(state, 0)
+
+ self.assertEqual(state['flushed_clients'], [client_id])
+ self.assertTrue(type(state['warning_ack_waiting']) is list)
+
+ def test_exporting(self):
+ state = self._test_base('/test', 0, 1, 5, 5)
+
+ self._test_state_history(state)
+ self._test_freeze_tree(state, 0)
+
+ self.assertTrue(type(state['notify_ack_waiting']) is list)
+
+class TestKillExports(CephFSTestCase):
+ MDSS_REQUIRED = 2
+ CLIENTS_REQUIRED = 1
+
+ def setUp(self):
+ CephFSTestCase.setUp(self)
+
+ self.fs.set_max_mds(self.MDSS_REQUIRED)
+ self.status = self.fs.wait_for_daemons()
+
+ self.mount_a.run_shell_payload('mkdir -p test/export')
+
+ def tearDown(self):
+ super().tearDown()
+
+ def _kill_export_as(self, rank, kill):
+ self.fs.rank_asok(['config', 'set', 'mds_kill_export_at', str(kill)], rank=rank, status=self.status)
+
+ def _export_dir(self, path, source, target):
+ self.fs.rank_asok(['export', 'dir', path, str(target)], rank=source, status=self.status)
+
+ def _wait_failover(self):
+ self.wait_until_true(lambda: self.fs.status().hadfailover(self.status), timeout=self.fs.beacon_timeout)
+
+ def _clear_coredump(self, rank):
+ crash_rank = self.fs.get_rank(rank=rank, status=self.status)
+ self.delete_mds_coredump(crash_rank['name'])
+
+ def _run_kill_export(self, kill_at, exporter_rank=0, importer_rank=1, restart=True):
+ self._kill_export_as(exporter_rank, kill_at)
+ self._export_dir("/test", exporter_rank, importer_rank)
+ self._wait_failover()
+ self._clear_coredump(exporter_rank)
+
+ if restart:
+ self.fs.rank_restart(rank=exporter_rank, status=self.status)
+ self.status = self.fs.wait_for_daemons()
+
+ def test_session_cleanup(self):
+ """
+ Test importer's session cleanup after an export subtree task is interrupted.
+ Set 'mds_kill_export_at' to 9 or 10 so that the importer will wait for the exporter
+ to restart while the state is 'acking'.
+
+ See https://tracker.ceph.com/issues/61459
+ """
+
+ kill_export_at = [9, 10]
+
+ exporter_rank = 0
+ importer_rank = 1
+
+ for kill in kill_export_at:
+ log.info(f"kill_export_at: {kill}")
+ self._run_kill_export(kill, exporter_rank, importer_rank)
+
+ if len(self._session_list(importer_rank, self.status)) > 0:
+ client_id = self.mount_a.get_global_id()
+ self.fs.rank_asok(['session', 'evict', "%s" % client_id], rank=importer_rank, status=self.status)
+
+ # times out if the eviction is buggy
+ self.wait_until_evicted(client_id, importer_rank)
+
+ # remount so subsequent iterations and tests have a working client
+ self.mount_a.remount()
+
+ def test_client_eviction(self):
+ # modify the timeout so that we don't have to wait too long
+ timeout = 30
+ self.fs.set_session_timeout(timeout)
+ self.fs.set_session_autoclose(timeout + 5)
+
+ kill_export_at = [9, 10]
+
+ exporter_rank = 0
+ importer_rank = 1
+
+ for kill in kill_export_at:
+ log.info(f"kill_export_at: {kill}")
+ self._run_kill_export(kill, exporter_rank, importer_rank)
+
+ client_id = self.mount_a.get_global_id()
+ self.wait_until_evicted(client_id, importer_rank, timeout + 10)
+ time.sleep(1)
+
+ # fails if buggy
+ self.mount_a.ls()
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 29af1e76a4f..46139163ddd 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -1,3 +1,4 @@
+import re
import time
import signal
import logging
@@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase):
self.fs.wait_for_daemons(timeout=90)
+class TestFailoverBeaconHealth(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 1
+
+ def initiate_journal_replay(self, num_files=100):
+ """ Initiate journal replay by creating files and restarting mds server."""
+
+ self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000")
+ self.mounts[0].test_files = [str(x) for x in range(num_files)]
+ self.mounts[0].create_files()
+ self.fs.fail()
+ self.fs.set_joinable()
+
+ def test_replay_beacon_estimated_time(self):
+ """
+ That the beacon emits a warning message with the estimated time to complete replay
+ """
+ self.initiate_journal_replay()
+ self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60)
+ # remove the config so that replay finishes and the cluster
+ # is HEALTH_OK
+ self.config_rm("mds", "mds_delay_journal_replay_for_testing")
+ self.wait_for_health_clear(timeout=60)
+
+ def test_replay_estimated_time_accuracy(self):
+ self.initiate_journal_replay(250)
+ def replay_complete():
+ health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+ codes = [s for s in health['checks']]
+ return 'MDS_ESTIMATED_REPLAY_TIME' not in codes
+
+ def get_estimated_time():
+ completion_percentage = 0.0
+ time_duration = pending_duration = 0
+ with safe_while(sleep=5, tries=360) as proceed:
+ while proceed():
+ health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True)
+ codes = [s for s in health['checks']]
+ if 'MDS_ESTIMATED_REPLAY_TIME' in codes:
+ message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message']
+ ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s"
+ m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message)
+ if not m:
+ continue
+ completion_percentage = float(m.group(1))
+ time_duration = int(m.group(3))
+ pending_duration = int(m.group(4))
+ log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}")
+ if completion_percentage >= 50:
+ return (completion_percentage, time_duration, pending_duration)
+ _, _, pending_duration = get_estimated_time()
+ # wait for 25% more time to avoid false negative failures
+ self.wait_until_true(replay_complete, timeout=pending_duration * 1.25)
+
class TestFailover(CephFSTestCase):
CLIENTS_REQUIRED = 1
MDSS_REQUIRED = 2
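The accuracy test above hinges on parsing the MDS_ESTIMATED_REPLAY_TIME detail message. A standalone sketch of that parse, built around the sample warning string quoted in the test (the regex and return shape are illustrative):

import re

REPLAY_RE = re.compile(r'replay: (\d+(?:\.\d+)?)% complete - '
                       r'elapsed time: (\d+)s, '
                       r'estimated time remaining: (\d+)s')

def parse_replay_progress(message):
    """Return (percent_complete, elapsed_s, remaining_s) or None."""
    m = REPLAY_RE.search(message)
    if m is None:
        return None
    return float(m.group(1)), int(m.group(2)), int(m.group(3))

sample = ("mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, "
          "estimated time remaining: 581s")
assert parse_replay_progress(sample) == (50.0446, 582, 581)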
diff --git a/qa/tasks/cephfs/test_fscrypt.py b/qa/tasks/cephfs/test_fscrypt.py
index d327c43c1fc..c1405415c63 100644
--- a/qa/tasks/cephfs/test_fscrypt.py
+++ b/qa/tasks/cephfs/test_fscrypt.py
@@ -83,9 +83,11 @@ class TestFSCryptRecovery(FSCryptTestCase):
self.fs.set_joinable()
self.fs.wait_for_daemons()
+ # load all inodes into cache (may be cleared by journal reset)
+ self.mount_a.run_shell_payload(f"cd {self.path} && find")
+
verify_alternate_name()
- self.mount_a.run_shell_payload(f"cd {self.path} && find")
self.mount_a.run_shell_payload(f"cd {self.path} && stat {file}")
diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py
index 2f9ebe6b1d5..078db6a4a6d 100644
--- a/qa/tasks/cephfs/test_mirroring.py
+++ b/qa/tasks/cephfs/test_mirroring.py
@@ -204,6 +204,17 @@ class TestMirroring(CephFSTestCase):
self.assertTrue(res[dir_name]['last_synced_snap']['name'] == expected_snap_name)
self.assertTrue(res[dir_name]['snaps_synced'] == expected_snap_count)
+ def check_peer_status_idle(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_name,
+ expected_snap_count):
+ peer_uuid = self.get_peer_uuid(peer_spec)
+ res = self.mirror_daemon_command(f'peer status for fs: {fs_name}',
+ 'fs', 'mirror', 'peer', 'status',
+ f'{fs_name}@{fs_id}', peer_uuid)
+ self.assertTrue(dir_name in res)
+ self.assertTrue('idle' == res[dir_name]['state'])
+ self.assertTrue(expected_snap_name == res[dir_name]['last_synced_snap']['name'])
+ self.assertTrue(expected_snap_count == res[dir_name]['snaps_synced'])
+
def check_peer_status_deleted_snap(self, fs_name, fs_id, peer_spec, dir_name,
expected_delete_count):
peer_uuid = self.get_peer_uuid(peer_spec)
@@ -421,6 +432,34 @@ class TestMirroring(CephFSTestCase):
self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
self.mount_a.run_shell(["rmdir", "d1"])
+ def test_directory_command_ls(self):
+ dir1 = 'dls1'
+ dir2 = 'dls2'
+ self.mount_a.run_shell(["mkdir", dir1])
+ self.mount_a.run_shell(["mkdir", dir2])
+ self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+ try:
+ self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}')
+ self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}')
+ time.sleep(10)
+ dirs_list = json.loads(self.get_ceph_cmd_stdout("fs", "snapshot", "mirror", "ls", self.primary_fs_name))
+ # verify via asok
+ res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+ dir_count = res['snap_dirs']['dir_count']
+ self.assertTrue(len(dirs_list) == dir_count and f'/{dir1}' in dirs_list and f'/{dir2}' in dirs_list)
+ except CommandFailedError:
+ raise RuntimeError('Error listing directories')
+ except AssertionError:
+ raise RuntimeError('Wrong number of directories listed')
+ finally:
+ self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}')
+ self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}')
+
+ self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+ self.mount_a.run_shell(["rmdir", dir1])
+ self.mount_a.run_shell(["rmdir", dir2])
+
def test_add_relative_directory_path(self):
self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
try:
@@ -549,7 +588,7 @@ class TestMirroring(CephFSTestCase):
# create a bunch of files in a directory to snap
self.mount_a.run_shell(["mkdir", "d0"])
- for i in range(50):
+ for i in range(100):
self.mount_a.write_n_mb(os.path.join('d0', f'file.{i}'), 1)
self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
@@ -563,7 +602,7 @@ class TestMirroring(CephFSTestCase):
# take a snapshot
self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- time.sleep(30)
+ time.sleep(60)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
"client.mirror_remote@ceph", '/d0', 'snap0', 1)
self.verify_snapshot('d0', 'snap0')
@@ -575,10 +614,10 @@ class TestMirroring(CephFSTestCase):
self.assertGreater(second["counters"]["last_synced_start"], first["counters"]["last_synced_start"])
self.assertGreater(second["counters"]["last_synced_end"], second["counters"]["last_synced_start"])
self.assertGreater(second["counters"]["last_synced_duration"], 0)
- self.assertEquals(second["counters"]["last_synced_bytes"], 52428800) # last_synced_bytes = 50 files of 1MB size each
+ self.assertEquals(second["counters"]["last_synced_bytes"], 104857600) # last_synced_bytes = 100 files of 1MB size each
# some more IO
- for i in range(75):
+ for i in range(150):
self.mount_a.write_n_mb(os.path.join('d0', f'more_file.{i}'), 1)
time.sleep(60)
@@ -586,7 +625,7 @@ class TestMirroring(CephFSTestCase):
# take another snapshot
self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"])
- time.sleep(60)
+ time.sleep(120)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
"client.mirror_remote@ceph", '/d0', 'snap1', 2)
self.verify_snapshot('d0', 'snap1')
@@ -598,7 +637,7 @@ class TestMirroring(CephFSTestCase):
self.assertGreater(third["counters"]["last_synced_start"], second["counters"]["last_synced_end"])
self.assertGreater(third["counters"]["last_synced_end"], third["counters"]["last_synced_start"])
self.assertGreater(third["counters"]["last_synced_duration"], 0)
- self.assertEquals(third["counters"]["last_synced_bytes"], 78643200) # last_synced_bytes = 75 files of 1MB size each
+ self.assertEquals(third["counters"]["last_synced_bytes"], 157286400) # last_synced_bytes = 150 files of 1MB size each
# delete a snapshot
self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
@@ -1361,7 +1400,7 @@ class TestMirroring(CephFSTestCase):
self.mount_b.umount_wait()
self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
- # create a bunch of files in a directory to snap
+ # create some large files in 3 directories to snap
self.mount_a.run_shell(["mkdir", "d0"])
self.mount_a.run_shell(["mkdir", "d1"])
self.mount_a.run_shell(["mkdir", "d2"])
@@ -1384,30 +1423,38 @@ class TestMirroring(CephFSTestCase):
vbefore = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0]
# take snapshots
log.debug('taking snapshots')
- self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+ snap_name = "snap0"
+ self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"])
- time.sleep(10)
log.debug('checking snap in progress')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d0', 'snap0')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d1', 'snap0')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d2', 'snap0')
+ peer_spec = "client.mirror_remote@ceph"
+ peer_uuid = self.get_peer_uuid(peer_spec)
+ with safe_while(sleep=3, tries=100, action=f'wait for status: {peer_spec}') as proceed:
+ while proceed():
+ res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'peer', 'status',
+ f'{self.primary_fs_name}@{self.primary_fs_id}',
+ peer_uuid)
+ if ('syncing' == res["/d0"]['state'] and 'syncing' == res["/d1"]['state'] and \
+ 'syncing' == res["/d2"]['state']):
+ break
- log.debug('removing directories 1')
+ log.debug('removing directory 1')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
- log.debug('removing directories 2')
+ log.debug('removing directory 2')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
- log.debug('removing directories 3')
+ log.debug('removing directory 3')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
+ # Wait a while for the sync backoff
+ time.sleep(500)
+
log.debug('removing snapshots')
- self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"])
+ self.mount_a.run_shell(["rmdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["rmdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["rmdir", f"d2/.snap/{snap_name}"])
for i in range(4):
filename = f'file.{i}'
@@ -1427,26 +1474,27 @@ class TestMirroring(CephFSTestCase):
self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
log.debug('creating new snapshots...')
- self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+ self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"])
+
+ # Wait for the threads to finish
+ time.sleep(500)
- time.sleep(60)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d0', 'snap0', 1)
- self.verify_snapshot('d0', 'snap0')
+ "client.mirror_remote@ceph", '/d0', f'{snap_name}', 1)
+ self.verify_snapshot('d0', f'{snap_name}')
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d1', 'snap0', 1)
- self.verify_snapshot('d1', 'snap0')
+ "client.mirror_remote@ceph", '/d1', f'{snap_name}', 1)
+ self.verify_snapshot('d1', f'{snap_name}')
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d2', 'snap0', 1)
- self.verify_snapshot('d2', 'snap0')
+ "client.mirror_remote@ceph", '/d2', f'{snap_name}', 1)
+ self.verify_snapshot('d2', f'{snap_name}')
res = self.mirror_daemon_command(f'counter dump for fs: {self.primary_fs_name}', 'counter', 'dump')
vafter = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0]
self.assertGreater(vafter["counters"]["snaps_synced"], vbefore["counters"]["snaps_synced"])
- self.assertGreater(vafter["counters"]["snaps_deleted"], vbefore["counters"]["snaps_deleted"])
self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
@@ -1494,8 +1542,86 @@ class TestMirroring(CephFSTestCase):
"""
That get/set ceph.mirror.dirty_snap_id attribute succeeds in a remote filesystem.
"""
+ log.debug('reconfigure client auth caps')
+ self.get_ceph_cmd_result(
+ 'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+ 'mds', 'allow rw',
+ 'mon', 'allow r',
+ 'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+ self.backup_fs.get_data_pool_name(),
+ self.backup_fs.get_data_pool_name()))
+ log.debug(f'mounting filesystem {self.secondary_fs_name}')
+ self.mount_b.umount_wait()
+ self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+ log.debug('setting ceph.mirror.dirty_snap_id attribute')
self.mount_b.run_shell(["mkdir", "-p", "d1/d2/d3"])
attr = str(random.randint(1, 10))
self.mount_b.setfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id", attr)
+ log.debug('getting ceph.mirror.dirty_snap_id attribute')
val = self.mount_b.getfattr("d1/d2/d3", "ceph.mirror.dirty_snap_id")
self.assertEqual(attr, val, f"Mismatch for ceph.mirror.dirty_snap_id value: {attr} vs {val}")
+
+ def test_cephfs_mirror_remote_snap_corrupt_fails_synced_snapshot(self):
+ """
+ That making manual changes to the remote .snap directory shows 'peer status' state: "failed"
+ for a synced snapshot and then restores to "idle" when those changes are reverted.
+ """
+ log.debug('reconfigure client auth caps')
+ self.get_ceph_cmd_result(
+ 'auth', 'caps', "client.{0}".format(self.mount_b.client_id),
+ 'mds', 'allow rwps',
+ 'mon', 'allow r',
+ 'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+ self.backup_fs.get_data_pool_name(),
+ self.backup_fs.get_data_pool_name()))
+ log.debug(f'mounting filesystem {self.secondary_fs_name}')
+ self.mount_b.umount_wait()
+ self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
+
+ self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+ peer_spec = "client.mirror_remote@ceph"
+ self.peer_add(self.primary_fs_name, self.primary_fs_id, peer_spec, self.secondary_fs_name)
+ dir_name = 'd0'
+ self.mount_a.run_shell(['mkdir', dir_name])
+ self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir_name}')
+
+ # take a snapshot
+ snap_name = "snap_a"
+ expected_snap_count = 1
+ self.mount_a.run_shell(['mkdir', f'{dir_name}/.snap/{snap_name}'])
+
+ time.sleep(30)
+ # confirm snapshot synced and status 'idle'
+ self.check_peer_status_idle(self.primary_fs_name, self.primary_fs_id,
+ peer_spec, f'/{dir_name}', snap_name, expected_snap_count)
+
+ remote_snap_name = 'snap_b'
+ remote_snap_path = f'{dir_name}/.snap/{remote_snap_name}'
+ failure_reason = f"snapshot '{remote_snap_name}' has invalid metadata"
+ dir_name = f'/{dir_name}'
+
+ # create a directory in the remote fs and check status 'failed'
+ self.mount_b.run_shell(['sudo', 'mkdir', remote_snap_path], omit_sudo=False)
+ peer_uuid = self.get_peer_uuid(peer_spec)
+ with safe_while(sleep=1, tries=60, action=f'wait for failed status: {peer_spec}') as proceed:
+ while proceed():
+ res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'peer', 'status',
+ f'{self.primary_fs_name}@{self.primary_fs_id}', peer_uuid)
+ if('failed' == res[dir_name]['state'] and \
+ failure_reason == res.get(dir_name, {}).get('failure_reason', {}) and \
+ snap_name == res[dir_name]['last_synced_snap']['name'] and \
+ expected_snap_count == res[dir_name]['snaps_synced']):
+ break
+ # remove the directory in the remote fs and check status restores to 'idle'
+ self.mount_b.run_shell(['sudo', 'rmdir', remote_snap_path], omit_sudo=False)
+ with safe_while(sleep=1, tries=60, action=f'wait for idle status: {peer_spec}') as proceed:
+ while proceed():
+ res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'peer', 'status',
+ f'{self.primary_fs_name}@{self.primary_fs_id}', peer_uuid)
+ if('idle' == res[dir_name]['state'] and 'failure_reason' not in res and \
+ snap_name == res[dir_name]['last_synced_snap']['name'] and \
+ expected_snap_count == res[dir_name]['snaps_synced']):
+ break
+ self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
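Several of the mirroring changes above replace fixed sleeps with bounded polling on the mirror daemon's peer status. Stripped of teuthology's safe_while, the generic shape of that pattern looks roughly like this (names are illustrative):

import time

def poll_until(fetch_status, predicate, tries=60, sleep=1):
    """Re-fetch status until predicate(status) is true or tries run out."""
    for _ in range(tries):
        status = fetch_status()
        if predicate(status):
            return status
        time.sleep(sleep)
    raise TimeoutError('condition not reached within %ds' % (tries * sleep))

# usage sketch: wait until every mirrored directory reports state 'syncing'
# poll_until(get_peer_status,
#            lambda res: all(res[d]['state'] == 'syncing'
#                            for d in ('/d0', '/d1', '/d2')),
#            tries=100, sleep=3)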
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
index 7917bd9202f..14f54a784e7 100644
--- a/qa/tasks/cephfs/test_misc.py
+++ b/qa/tasks/cephfs/test_misc.py
@@ -558,16 +558,18 @@ class TestSessionClientEvict(CephFSTestCase):
self.assertEqual(ce.exception.exitstatus, errno.EINVAL)
def _evict_with_invalid_id(self, cmd):
+ info_initial = self.fs.rank_asok(cmd + ['ls'])
# with invalid id
- with self.assertRaises(CommandFailedError) as ce:
- self.fs.rank_tell(cmd + ['evict', 'id=1'])
- self.assertEqual(ce.exception.exitstatus, errno.ESRCH)
+ self.fs.rank_tell(cmd + ['evict', 'id=1'])
+ info = self.fs.rank_asok(cmd + ['ls'])
+ self.assertEqual(len(info), len(info_initial)) # session list is status-quo
def _evict_with_negative_id(self, cmd):
+ info_initial = self.fs.rank_asok(cmd + ['ls'])
# with negative id
- with self.assertRaises(CommandFailedError) as ce:
- self.fs.rank_tell(cmd + ['evict', 'id=-9'])
- self.assertEqual(ce.exception.exitstatus, errno.ESRCH)
+ self.fs.rank_tell(cmd + ['evict', 'id=-9'])
+ info = self.fs.rank_asok(cmd + ['ls'])
+ self.assertEqual(len(info), len(info_initial)) # session list is status-quo
def _evict_with_valid_id(self, cmd):
info_initial = self.fs.rank_asok(cmd + ['ls'])
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
index 6d1c65dfb7d..0a1c07dce04 100644
--- a/qa/tasks/cephfs/test_nfs.py
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -8,12 +8,15 @@ from io import BytesIO, StringIO
from tasks.mgr.mgr_test_case import MgrTestCase
from teuthology import contextutil
from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import Raw
log = logging.getLogger(__name__)
NFS_POOL_NAME = '.nfs' # should match mgr_module.py
# TODO Add test for cluster update when ganesha can be deployed on multiple ports.
+
+
class TestNFS(MgrTestCase):
def _cmd(self, *args):
return self.get_ceph_cmd_stdout(args)
@@ -52,15 +55,16 @@ class TestNFS(MgrTestCase):
"squash": "none",
"security_label": True,
"protocols": [
- 4
+ 3, 4
],
"transports": [
"TCP"
],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.1",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": self.fs_name,
+ "cmount_path": "/",
},
"clients": []
}
@@ -118,7 +122,7 @@ class TestNFS(MgrTestCase):
return
self.fail(fail_msg)
- def _check_auth_ls(self, export_id=1, check_in=False):
+ def _check_auth_ls(self, fs_name, check_in=False, user_id=None):
'''
Tests export user id creation or deletion.
:param export_id: Denotes export number
@@ -126,10 +130,12 @@ class TestNFS(MgrTestCase):
'''
output = self._cmd('auth', 'ls')
client_id = f'client.nfs.{self.cluster_id}'
+ search_id = f'client.{user_id}' if user_id else f'{client_id}.{fs_name}'
+
if check_in:
- self.assertIn(f'{client_id}.{export_id}', output)
+ self.assertIn(search_id, output)
else:
- self.assertNotIn(f'{client_id}.{export_id}', output)
+ self.assertNotIn(search_id, output)
def _test_idempotency(self, cmd_func, cmd_args):
'''
@@ -216,7 +222,7 @@ class TestNFS(MgrTestCase):
# Runs the nfs export create command
self._cmd(*export_cmd)
# Check if user id for export is created
- self._check_auth_ls(export_id, check_in=True)
+ self._check_auth_ls(self.fs_name, check_in=True)
res = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'get',
f'export-{export_id}', '-'])
# Check if export object is created
@@ -230,12 +236,12 @@ class TestNFS(MgrTestCase):
self._test_create_cluster()
self._create_export(export_id='1', create_fs=True)
- def _delete_export(self):
+ def _delete_export(self, pseudo_path=None, check_in=False, user_id=None):
'''
Delete an export.
'''
- self._nfs_cmd('export', 'rm', self.cluster_id, self.pseudo_path)
- self._check_auth_ls()
+ self._nfs_cmd('export', 'rm', self.cluster_id, pseudo_path if pseudo_path else self.pseudo_path)
+ self._check_auth_ls(self.fs_name, check_in, user_id)
def _test_list_export(self):
'''
@@ -256,26 +262,27 @@ class TestNFS(MgrTestCase):
self.sample_export['export_id'] = 2
self.sample_export['pseudo'] = self.pseudo_path + '1'
self.sample_export['access_type'] = 'RO'
- self.sample_export['fsal']['user_id'] = f'{self.expected_name}.2'
+ self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603'
self.assertDictEqual(self.sample_export, nfs_output[1])
# Export-3 for subvolume with r only
self.sample_export['export_id'] = 3
self.sample_export['path'] = sub_vol_path
self.sample_export['pseudo'] = self.pseudo_path + '2'
- self.sample_export['fsal']['user_id'] = f'{self.expected_name}.3'
+ self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603'
self.assertDictEqual(self.sample_export, nfs_output[2])
# Export-4 for subvolume
self.sample_export['export_id'] = 4
self.sample_export['pseudo'] = self.pseudo_path + '3'
self.sample_export['access_type'] = 'RW'
- self.sample_export['fsal']['user_id'] = f'{self.expected_name}.4'
+ self.sample_export['fsal']['user_id'] = f'{self.expected_name}.{self.fs_name}.3746f603'
self.assertDictEqual(self.sample_export, nfs_output[3])
- def _get_export(self):
+ def _get_export(self, pseudo_path=None):
'''
Returns export block in json format
'''
- return json.loads(self._nfs_cmd('export', 'info', self.cluster_id, self.pseudo_path))
+ return json.loads(self._nfs_cmd('export', 'info', self.cluster_id,
+ pseudo_path if pseudo_path else self.pseudo_path))
def _test_get_export(self):
'''
@@ -313,7 +320,7 @@ class TestNFS(MgrTestCase):
else:
log.warning(f'{e}, retrying')
- def _test_mnt(self, pseudo_path, port, ip, check=True):
+ def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False):
'''
Test mounting of created exports
:param pseudo_path: It is the pseudo root name
@@ -341,10 +348,64 @@ class TestNFS(MgrTestCase):
self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
try:
+ # Clean up the volumes directory created by 'fs subvolume create' in some tests
+ self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes'])
self.ctx.cluster.run(args=['touch', '/mnt/test'])
out_mnt = self._sys_cmd(['ls', '/mnt'])
self.assertEqual(out_mnt, b'test\n')
+ if datarw:
+ self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1'])
+ out_test1 = self._sys_cmd(['cat', '/mnt/test1'])
+ self.assertEqual(out_test1, b'test data\n')
+ finally:
+ self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+
+ def _test_data_read_write(self, pseudo_path, port, ip):
+ '''
+ Check if read/write works fine
+ '''
+ try:
+ self._test_mnt(pseudo_path, port, ip, True, True)
+ except CommandFailedError as e:
+ self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}")
+
+ def _mnt_nfs(self, pseudo_path, port, ip):
+ '''
+ Mount created export
+ :param pseudo_path: It is the pseudo root name
+ :param port: Port of deployed nfs cluster
+ :param ip: IP of deployed nfs cluster
+ '''
+ tries = 3
+ while True:
+ try:
+ self.ctx.cluster.run(
+ args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}',
+ f'{ip}:{pseudo_path}', '/mnt'])
+ break
+ except CommandFailedError:
+ if tries:
+ tries -= 1
+ time.sleep(2)
+ continue
+ raise
+
+ self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
+
+ def _test_fio(self, pseudo_path, port, ip):
+ '''
+ Run fio with libaio on /mnt/fio
+ :param pseudo_path: pseudo root name of the export
+ :param port: port of the deployed nfs cluster
+ :param ip: IP of the deployed nfs cluster
+ '''
+ try:
+ self._mnt_nfs(pseudo_path, port, ip)
+ self.ctx.cluster.run(args=['mkdir', '/mnt/fio'])
+ fio_cmd=['sudo', 'fio', '--ioengine=libaio', '-directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', '--time_based', '--runtime=300', '--ramp_time=1s', '--percentage_random=0', '--rw=randrw', '--rwmixread=50']
+ self.ctx.cluster.run(args=fio_cmd)
+ except CommandFailedError as e:
+ self.fail(f"expected fio to be successful but failed with {e.exitstatus}")
finally:
+ self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio'])
self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
def _write_to_read_only_export(self, pseudo_path, port, ip):
@@ -506,7 +567,7 @@ class TestNFS(MgrTestCase):
self._test_delete_cluster()
# Check if rados ganesha conf object is deleted
self._check_export_obj_deleted(conf_obj=True)
- self._check_auth_ls()
+ self._check_auth_ls(self.fs_name)
def test_exports_on_mgr_restart(self):
'''
@@ -593,6 +654,30 @@ class TestNFS(MgrTestCase):
self._write_to_read_only_export(self.pseudo_path, port, ip)
self._test_delete_cluster()
+ def test_data_read_write(self):
+ '''
+ Test data read and write on export.
+ '''
+ self._test_create_cluster()
+ self._create_export(export_id='1', create_fs=True,
+ extra_cmd=['--pseudo-path', self.pseudo_path])
+ port, ip = self._get_port_ip_info()
+ self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+ self._test_data_read_write(self.pseudo_path, port, ip)
+ self._test_delete_cluster()
+
+ def test_async_io_fio(self):
+ '''
+ Test async io using fio. Expect completion without hang or crash
+ '''
+ self._test_create_cluster()
+ self._create_export(export_id='1', create_fs=True,
+ extra_cmd=['--pseudo-path', self.pseudo_path])
+ port, ip = self._get_port_ip_info()
+ self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+ self._test_fio(self.pseudo_path, port, ip)
+ self._test_delete_cluster()
+
def test_cluster_info(self):
'''
Test cluster info outputs correct ip and hostname
@@ -935,7 +1020,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.1",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": self.fs_name
}
},
@@ -948,7 +1033,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.2",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": "invalid_fs_name" # invalid fs
}
},
@@ -961,7 +1046,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.3",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": self.fs_name
}
}
@@ -1008,7 +1093,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.1",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": "invalid_fs_name" # invalid fs
}
}
@@ -1048,7 +1133,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.1",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": self.fs_name
}
},
@@ -1061,7 +1146,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.2",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": self.fs_name
}
},
@@ -1075,7 +1160,7 @@ class TestNFS(MgrTestCase):
"protocols": [4],
"fsal": {
"name": "CEPH",
- "user_id": "nfs.test.3",
+ "user_id": "nfs.test.nfs-cephfs.3746f603",
"fs_name": "invalid_fs_name"
}
}
@@ -1211,3 +1296,65 @@ class TestNFS(MgrTestCase):
finally:
self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}/*'])
self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode)
+
+ def test_nfs_export_creation_without_cmount_path(self):
+ """
+ Test that cmount_path is present in the FSAL block even when it is not specified at export creation
+ """
+ self._create_cluster_with_fs(self.fs_name)
+
+ pseudo_path = '/test_without_cmount'
+ self._create_export(export_id='1',
+ extra_cmd=['--pseudo-path', pseudo_path])
+ nfs_output = self._get_export(pseudo_path)
+ self.assertIn('cmount_path', nfs_output['fsal'])
+
+ self._delete_export(pseudo_path)
+
+ def test_nfs_exports_with_same_and_diff_user_id(self):
+ """
+ Test that exports with same FSAL share same user_id
+ """
+ self._create_cluster_with_fs(self.fs_name)
+
+ pseudo_path_1 = '/test1'
+ pseudo_path_2 = '/test2'
+ pseudo_path_3 = '/test3'
+
+ # Create subvolumes
+ self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol_1')
+ self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol_2')
+
+ fs_path_1 = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol_1').strip()
+ fs_path_2 = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol_2').strip()
+ # Both exports should have the same user_id (since cmount_path=/ and fs_name are the same)
+ self._create_export(export_id='1',
+ extra_cmd=['--pseudo-path', pseudo_path_1,
+ '--path', fs_path_1])
+ self._create_export(export_id='2',
+ extra_cmd=['--pseudo-path', pseudo_path_2,
+ '--path', fs_path_2])
+
+ nfs_output_1 = self._get_export(pseudo_path_1)
+ nfs_output_2 = self._get_export(pseudo_path_2)
+ # Check if both exports have same user_id
+ self.assertEqual(nfs_output_2['fsal']['user_id'], nfs_output_1['fsal']['user_id'])
+ self.assertEqual(nfs_output_1['fsal']['user_id'], 'nfs.test.nfs-cephfs.3746f603')
+
+ cmount_path = '/volumes'
+ self._create_export(export_id='3',
+ extra_cmd=['--pseudo-path', pseudo_path_3,
+ '--path', fs_path_1,
+ '--cmount-path', cmount_path])
+
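+ # export 3 uses a different cmount_path, so it should get its own cephx user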
+ nfs_output_3 = self._get_export(pseudo_path_3)
+ self.assertNotEqual(nfs_output_3['fsal']['user_id'], nfs_output_1['fsal']['user_id'])
+ self.assertEqual(nfs_output_3['fsal']['user_id'], 'nfs.test.nfs-cephfs.32cd8545')
+
+ # Deleting export 1 should not delete the user_id since export 2 still shares it
+ self._delete_export(pseudo_path_1, True, nfs_output_1['fsal']['user_id'])
+ # Deleting export 2 should delete the user_id since it's the only export left with that user_id
+ self._delete_export(pseudo_path_2, False, nfs_output_2['fsal']['user_id'])
+
+ # Deleting export 3 should delete the user_id since it's the only export with that user_id
+ self._delete_export(pseudo_path_3, False, nfs_output_3['fsal']['user_id'])
diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py
index b5691c83852..ae1c1f2056c 100644
--- a/qa/tasks/cephfs/test_quota.py
+++ b/qa/tasks/cephfs/test_quota.py
@@ -115,9 +115,11 @@ class TestQuota(CephFSTestCase):
readable_values = {"10K": "10240",
"100Ki": "102400",
+ "100KiB": "102400",
"10M": "10485760",
"100Mi": "104857600",
"2G": "2147483648",
+ "2GB": "2147483648",
"4Gi": "4294967296",
"1T": "1099511627776",
"2Ti": "2199023255552"}
@@ -135,7 +137,8 @@ class TestQuota(CephFSTestCase):
self.mount_a.run_shell(["mkdir", "subdir"])
- invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1"]
+ invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1",
+ "1GT", "2MM", "5Di", "8Bi", "i", "7iB"]
for invalid_value in invalid_values:
with self.assertRaises(CommandFailedError):
self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
diff --git a/qa/tasks/cephfs/test_snap_schedules.py b/qa/tasks/cephfs/test_snap_schedules.py
index 1fff047f468..bdfec3db540 100644
--- a/qa/tasks/cephfs/test_snap_schedules.py
+++ b/qa/tasks/cephfs/test_snap_schedules.py
@@ -1093,6 +1093,56 @@ class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper):
self.mount_a.run_shell(['rmdir', TestSnapSchedulesSnapdir.TEST_DIRECTORY])
+class TestSnapSchedulesFetchForeignConfig(TestSnapSchedulesHelper):
+ def test_fetch_for_mds_max_snaps_per_dir(self):
+ """Test the correctness of snap directory name"""
+ dir_path = TestSnapSchedulesHelper.TEST_DIRECTORY
+ sdn = self.get_snap_dir_name()
+
+ self.mount_a.run_shell(['mkdir', '-p', dir_path])
+
+ # set a schedule on the dir
+ self.fs_snap_schedule_cmd('add', path=dir_path, snap_schedule='1m')
+
+ self.config_set('mds', 'mds_max_snaps_per_dir', 10)
+
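+ # with a 1m schedule and mds_max_snaps_per_dir set to 10, the test expects
+ # the scheduler to retain the snapshot count just under the limit (9 here)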
+ time.sleep(11*60) # wait for 9 snaps to be retained
+
+ snap_path = f"{dir_path}/{sdn}"
+ snapshots = self.mount_a.ls(path=snap_path)
+ fs_count = len(snapshots)
+
+ self.assertTrue(fs_count == 9)
+
+ self.config_set('mds', 'mds_max_snaps_per_dir', 8)
+
+ time.sleep(1*60 + 10) # wait for max_snaps_per_dir limit to be breached
+
+ snap_path = f"{dir_path}/{sdn}"
+ snapshots = self.mount_a.ls(path=snap_path)
+ fs_count = len(snapshots)
+
+ self.assertTrue(fs_count == 7)
+
+ self.config_set('mds', 'mds_max_snaps_per_dir', 10)
+
+ time.sleep(2*60 + 10) # wait for more snaps to be created
+
+ snap_path = f"{dir_path}/{sdn}"
+ snapshots = self.mount_a.ls(path=snap_path)
+ fs_count = len(snapshots)
+
+ self.assertTrue(fs_count == 9)
+
+ # remove snapshot schedule
+ self.fs_snap_schedule_cmd('remove', path=dir_path)
+
+ # remove all scheduled snapshots
+ self.remove_snapshots(dir_path, sdn)
+
+ self.mount_a.run_shell(['rmdir', dir_path])
+
+
"""
Note that the class TestSnapSchedulesMandatoryFSArgument tests snap-schedule
commands only for multi-fs scenario. Commands for a single default fs should
diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py
index ba3bc0fbd8a..c2184c41eff 100644
--- a/qa/tasks/cephfs/test_snapshots.py
+++ b/qa/tasks/cephfs/test_snapshots.py
@@ -376,6 +376,32 @@ class TestSnapshots(CephFSTestCase):
self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")])
+ def test_snapshot_check_access(self):
+ """
+ """
+
+ self.mount_a.run_shell_payload("mkdir -p dir1/dir2")
+ self.mount_a.umount_wait(require_clean=True)
+
+ newid = 'foo'
+ keyring = self.fs.authorize(newid, ('/dir1', 'rws'))
+ keyring_path = self.mount_a.client_remote.mktemp(data=keyring)
+ self.mount_a.remount(client_id=newid, client_keyring_path=keyring_path, cephfs_mntpt='/dir1')
+
+ self.mount_a.run_shell_payload("pushd dir2; dd if=/dev/urandom of=file bs=4k count=1;")
+ self.mount_a.run_shell_payload("mkdir .snap/one")
+ self.mount_a.run_shell_payload("rm -rf dir2")
+ # The MDS is expected to resolve the stray inode back to its original
+ # location (stray_prior_path) so that the caps on /dir1 still allow
+ # access; compare the observed log lines below with what they should be:
+ # Session check_access path ~mds0/stray3/10000000001/file
+ # 2024-07-04T02:05:07.884+0000 7f319ce86640 20 Session check_access: [inode 0x10000000002 [2,2] ~mds0/stray2/10000000001/file ...] caller_uid=1141 caller_gid=1141 caller_gid_list=[1000,1141]
+ # 2024-07-04T02:05:07.884+0000 7f319ce86640 20 Session check_access path ~mds0/stray2/10000000001/file
+ # should be
+ # 2024-07-04T02:11:26.990+0000 7f6b14e71640 20 Session check_access: [inode 0x10000000002 [2,2] ~mds0/stray2/10000000001/file ...] caller_uid=1141 caller_gid=1141 caller_gid_list=[1000,1141]
+ # 2024-07-04T02:11:26.990+0000 7f6b14e71640 20 Session check_access stray_prior_path /dir1/dir2
+ # 2024-07-04T02:11:26.990+0000 7f6b14e71640 10 MDSAuthCap is_capable inode(path /dir1/dir2 owner 1141:1141 mode 0100644) by caller 1141:1141 mask 1 new 0:0 cap: MDSAuthCaps[allow rws fsname=cephfs path="/dir1"]
+ self.mount_a.run_shell_payload("stat .snap/one/dir2/file")
+
+
def test_multimds_mksnap(self):
"""
check if snapshot takes effect across authority subtrees
diff --git a/qa/tasks/cephfs/test_uninlining.py b/qa/tasks/cephfs/test_uninlining.py
new file mode 100644
index 00000000000..91d34a0e277
--- /dev/null
+++ b/qa/tasks/cephfs/test_uninlining.py
@@ -0,0 +1,332 @@
+
+"""
+Test that data is uninlined using scrubbing.
+
+The idea is to untar a linux-5.4.0 kernel tarball's kernel/ dir
+consisting of about 8000 files and uninline about 5145 of those which are
+less than or equal to client_max_inline_size bytes and can be inlined when
+written to while the inline_data config option is enabled.
+
+This test runs with 1 or 2 active MDSs; the dirs under the kernel/ dir are
+pinned alternately to the available MDSs.
+"""
+
+import os
+import logging
+import threading
+import time
+import json
+
+from io import StringIO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+def remote_mntpt_cmd(mount, cmd):
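+ # run the given shell command from within the client's mountpoint and
+ # return its stripped stdout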
+ final_cmd = f'cd {mount.hostfs_mntpt} && ' + cmd
+ out = mount.client_remote.sh(final_cmd, stdout=StringIO())
+ return out.strip()
+
+
+class InlineDataInfo:
+ def __init__(self, length: int, version: int):
+ self.inline_data_length = length
+ self.inline_data_version = version
+
+
+class SnapshotterThread(threading.Thread):
+ def __init__(self, base_dir: str, snap_count: int, mount: CephFSMount):
+ super(SnapshotterThread, self).__init__()
+ self.base_dir: str = base_dir
+ self.snap_count: int = snap_count
+ self.mount = mount
+
+ def run(self):
+ for i in range(self.snap_count):
+ cmd = f"mkdir {self.base_dir}/.snap/snap_{i}"
+ remote_mntpt_cmd(self.mount, cmd)
+ time.sleep(1)
+
+
+class TestDataUninlining(CephFSTestCase):
+ MDSS_REQUIRED = 2
+ CLIENTS_REQUIRED = 2
+
+ # data version number of uninlined inode: ((1 << 64) - 1)
+ CEPH_INLINE_NONE = 18446744073709551615
+
+ NUM_SNAPS = 10
+ DUMP_INODE_RETRIES = 10
+
+ def setUp(self):
+ super(TestDataUninlining, self).setUp()
+ self.cache_info = dict()
+ self.unmount_info = dict()
+ self.mount_openbg_info = dict()
+ self.multimds_info = dict()
+ self.snapshot_info = dict()
+
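+ # human-readable labels for the option codes passed to run_test_worker();
+ # they are used for logging only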
+ self.cache_info[0] = "without clearing cache"
+ self.cache_info[1] = "clear cache before scrub"
+ self.cache_info[2] = "clear cache after scrub"
+ self.unmount_info[0] = "without unmount client"
+ self.unmount_info[1] = "unmount client before scrub"
+ self.unmount_info[2] = "unmount client after scrub"
+ self.mount_openbg_info[0] = "without mount.open_background"
+ self.mount_openbg_info[1] = "with mount.open_background"
+ self.multimds_info[0] = "without multimds"
+ self.multimds_info[1] = "with multimds"
+ self.snapshot_info[0] = "without snapshots"
+ self.snapshot_info[1] = "with snapshots"
+
+ def tearDown(self):
+ super(TestDataUninlining, self).tearDown()
+
+ def extract_inodes(self, files):
+ inodes = []
+ for fil in files:
+ log.debug(f"getting inode for:{fil}")
+ cmd = f'ls -i {fil}'
+ o = remote_mntpt_cmd(self.mount_a, cmd)
+ inodes.append(o.split(' ')[0])
+ return inodes
+
+ def get_inline_data_info(self, inodes, files, dir_pins, num_mds):
+ def get_inode_dump(inode, rank, retries):
+ for i in range(retries):
+ log.debug(f"try #{i+1} - dump inode {inode}")
+ try:
+ json_out = self.fs.rank_tell(['dump', 'inode', inode], rank=rank)
+ if len(json_out) != 0:
+ return json_out
+ except json.decoder.JSONDecodeError:
+ pass
+ # dump came back empty or was not valid JSON; pause and retry
+ time.sleep(1)
+ raise json.decoder.JSONDecodeError(f'No JSON found after {retries} attempts', '', 0)
+
+ info = []
+ for i in range(len(inodes)):
+ inode = inodes[i]
+ log.debug(f"getting inode info #{i+1} of {len(inodes)}:{inode}")
+ path = os.path.dirname(files[i])
+ rank = dir_pins[path] if path in dir_pins else 0
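+ # query the MDS rank that the file's parent dir is pinned to first and
+ # then fall back to the remaining ranks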
+ json_out = None
+ r = rank
+ while r < rank + num_mds:
+ try:
+ json_out = get_inode_dump(inode,
+ r % num_mds,
+ self.DUMP_INODE_RETRIES)
+ break
+ except json.decoder.JSONDecodeError:
+ pass
+ finally:
+ r += 1
+ self.assertTrue(json_out is not None)
+ self.assertTrue('inline_data_length' in json_out)
+ self.assertTrue('inline_data_version' in json_out)
+ info.append(InlineDataInfo(json_out['inline_data_length'],
+ json_out['inline_data_version']))
+ return info
+
+ def run_test_worker(self,
+ opt_clear_cache,
+ opt_unmount,
+ opt_mount_openbg,
+ opt_multimds,
+ opt_snapshot):
+ log.info("Running Data Uninlining test with: "
+ f"{self.cache_info[opt_clear_cache]}, "
+ f"{self.unmount_info[opt_unmount]}, "
+ f"{self.mount_openbg_info[opt_mount_openbg]}, "
+ f"{self.multimds_info[opt_multimds]}, "
+ f"{self.snapshot_info[opt_snapshot]}")
+
+ # Set max_mds to 1 or 2
+ num_mds = 2 if opt_multimds else 1
+ log.debug(f"setting max_mds:{num_mds}")
+ self.fs.set_max_mds(num_mds)
+
+ # Get configured max inline data size
+ log.debug("getting client_max_inline_size")
+ idsize = self.fs.fs_config.get('client_max_inline_size', 4096)
+ idsize = int(idsize)
+ log.debug(f"got client_max_inline_size:{idsize}")
+
+ # IMPORTANT
+ # At this time, the kernel client doesn't work correctly if
+ # client_max_inline_size is greater than 4096
+ self.assertTrue(idsize == 4096)
+
+ snapshotter = None
+ if opt_snapshot:
+ log.debug("starting snapshotter thread")
+ cmd = 'mkdir linux-5.4'
+ remote_mntpt_cmd(self.mount_b, cmd)
+ snapshotter = SnapshotterThread("linux-5.4",
+ self.NUM_SNAPS,
+ self.mount_b)
+ snapshotter.start()
+
+ # Extract test data tarball
+ # FIXME
+ log.debug("extracting tarball")
+ cmd = 'tar -x -z -f linux-5.4.tar.gz linux-5.4/fs/ceph linux-5.4/fs/orangefs linux-5.4/fs/ext2'
+ # cmd = 'tar -x -z -f linux-5.4.tar.gz'
+ remote_mntpt_cmd(self.mount_a, cmd)
+
+ bg_proc = None
+ # the data uninlining or snapshot should cause the caps to be revoked
+ # and get the data uninlined without any problems
+ if opt_mount_openbg:
+ log.debug("opening file in background")
+ cap_test_dir = "linux-5.4/fs/cap_revoke_test"
+ cmd = f"mkdir {cap_test_dir}"
+ remote_mntpt_cmd(self.mount_b, cmd)
+ test_file = f"{cap_test_dir}/test_file"
+ bg_proc = self.mount_b.open_background(test_file, True)
+
+ # Get dirs under linux-5.4.0/kernel/
+ # FIXME
+ log.debug("fetching dir list")
+ cmd = 'find linux-5.4/ -mindepth 2 -maxdepth 2 -type d'
+ # cmd = 'find linux-5.4/ -mindepth 1 -maxdepth 1 -type d'
+ o = remote_mntpt_cmd(self.mount_a, cmd)
+ dirs = o.split('\n')
+
+ # Pin dirs alternately to available mds
+ dir_pins = {}
+ log.debug("distributing dir pins")
+ for i in range(len(dirs)):
+ self.mount_a.setfattr(dirs[i], 'ceph.dir.pin', str(i % num_mds))
+ dir_pins[dirs[i]] = i % num_mds
+
+ # Count files with size <= idsize
+ log.debug(f"listing files with size <= {idsize}")
+ cmd = f'find linux-5.4/ -type f -size -{idsize + 1}c'
+ o = remote_mntpt_cmd(self.mount_a, cmd)
+ files = o.split('\n')
+
+ # Dump file count
+ log.info(f'Found {len(files)} inlined files')
+
+ if opt_unmount == 1:
+ log.debug("unmounting mount_a before scrub")
+ self.mount_a.umount()
+
+ if opt_clear_cache == 1:
+ log.debug("clearing cache")
+ for i in range(num_mds):
+ self.fs.rank_tell(['cache', 'drop'], rank=i)
+
+ # Start recursive scrub on rank 0
+ log.debug("starting scrub")
+ out_json = self.fs.run_scrub(["start", "/", "recursive"])
+ log.debug(f"scrub start response: {out_json}")
+
+ # Wait for scrub completion
+ log.debug("waiting for scrub to complete")
+ status = self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"])
+ self.assertEqual(status, True)
+
+ if opt_unmount == 2:
+ log.debug("unmounting mount_a after scrub")
+ self.mount_a.umount()
+
+ if opt_snapshot:
+ log.debug("joining snapshotter thread")
+ snapshotter.join()
+ for i in range(self.NUM_SNAPS):
+ cmd = f"rmdir linux-5.4/.snap/snap_{i}"
+ remote_mntpt_cmd(self.mount_b, cmd)
+
+ if opt_clear_cache == 2:
+ log.debug("clearing cache")
+ for i in range(num_mds):
+ self.fs.rank_tell(['cache', 'drop'], rank=i)
+
+ if opt_unmount > 0:
+ log.debug("remounting mount_a")
+ self.mount_a.mount()
+
+ # Extract inode numbers of inlined files
+ log.debug("extracting inodes")
+ inodes = self.extract_inodes(files)
+
+ # Dump inode info of files with size <= idsize
+ self.assertEqual(len(files), len(inodes))
+
+ log.debug("getting inline data info")
+ info = self.get_inline_data_info(inodes, files, dir_pins, num_mds)
+
+ # cleanup
+ if opt_mount_openbg:
+ log.debug("killing background open file process")
+ self.mount_b.kill_background(bg_proc)
+
+ log.debug("removing dir linux-5.4")
+ remote_mntpt_cmd(self.mount_a, "rm -rf linux-5.4/")
+
+ self.assertEqual(len(info), len(inodes))
+
+ # Count files with inline_data_length == 0 and validate
+ zero_length_count = 0
+ for finfo in info:
+ if int(finfo.inline_data_length) == 0:
+ zero_length_count += 1
+ log.info(f'Found {zero_length_count} files with '
+ 'inline_data_length == 0')
+ self.assertTrue(zero_length_count == len(files))
+
+ # Count files with inline_data_version == 18446744073709551615
+ # and validate
+ uninlined_version_count = 0
+ for finfo in info:
+ if int(finfo.inline_data_version) == self.CEPH_INLINE_NONE:
+ uninlined_version_count += 1
+ log.info(f'Found {uninlined_version_count} files with '
+ 'inline_data_version == CEPH_INLINE_NONE')
+ self.assertTrue(uninlined_version_count == len(files))
+
+ def test_data_uninlining(self):
+ # Enable inline_data
+ log.debug("setting inline_data:1")
+ self.fs.set_var('inline_data', '1', '--yes-i-really-really-mean-it')
+
+ # Fetch tarball
+ log.debug("fetching tarball")
+ cmd = 'wget http://download.ceph.com/qa/linux-5.4.tar.gz'
+ remote_mntpt_cmd(self.mount_a, cmd)
+
+ # multimds
+ # 0: without multimds
+ # 1: with multimds
+ for opt_multimds in [0, 1]:
+ # unmount
+ # 0: do not unmount
+ # 1: unmount before scrub
+ # 2: unmount after scrub
+ for opt_unmount in [0, 1, 2]:
+ # mount
+ # 0: no mount.open_background
+ # 1: mount.open_background
+ for opt_mount_openbg in [0, 1]:
+ # clear cache
+ # 0: do not clear cache
+ # 1: clear cache before scrub
+ # 2: clear cache after scrub
+ for opt_clear_cache in [0, 1, 2]:
+ # snapshots
+ # 0: without snapshots
+ # 1: with snapshots
+ for opt_snapshot in [0, 1]:
+ self.run_test_worker(opt_clear_cache,
+ opt_unmount,
+ opt_mount_openbg,
+ opt_multimds,
+ opt_snapshot)
+
+ remote_mntpt_cmd(self.mount_a, "rm -f linux-5.4.tar.gz")
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
index 037b046304e..2ee3b6ac052 100644
--- a/qa/tasks/cephfs/test_volumes.py
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -13,10 +13,18 @@ from io import StringIO
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.contextutil import safe_while
from teuthology.exceptions import CommandFailedError
log = logging.getLogger(__name__)
+
+class RsizeDoesntMatch(Exception):
+
+ def __init__(self, msg):
+ super().__init__(msg)
+ self.msg = msg
+
+
class TestVolumesHelper(CephFSTestCase):
"""Helper class for testing FS volume, subvolume group and subvolume operations."""
TEST_FILE_NAME_PREFIX="subvolume_file"
@@ -35,19 +43,26 @@ class TestVolumesHelper(CephFSTestCase):
def _raw_cmd(self, *args):
return self.get_ceph_cmd_stdout(args)
- def __check_clone_state(self, state, clone, clone_group=None, timo=120):
- check = 0
+ def __check_clone_state(self, states, clone, clone_group=None, timo=120):
+ if isinstance(states, str):
+ states = (states, )
+
args = ["clone", "status", self.volname, clone]
if clone_group:
args.append(clone_group)
args = tuple(args)
- while check < timo:
- result = json.loads(self._fs_cmd(*args))
- if result["status"]["state"] == state:
- break
- check += 1
- time.sleep(1)
- self.assertTrue(check < timo)
+
+ msg = (f'Executed cmd "{args}" {timo} times; clone was never in '
+ f'"{states}" state(s).')
+
+ with safe_while(tries=timo, sleep=1, action=msg) as proceed:
+ while proceed():
+ result = json.loads(self._fs_cmd(*args))
+ current_state = result["status"]["state"]
+
+ log.debug(f'current clone state = {current_state}')
+ if current_state in states:
+ return
def _get_clone_status(self, clone, clone_group=None):
args = ["clone", "status", self.volname, clone]
@@ -57,6 +72,23 @@ class TestVolumesHelper(CephFSTestCase):
result = json.loads(self._fs_cmd(*args))
return result
+ def _wait_for_clone_to_be_pending(self, clone, clone_group=None,
+ timo=120):
+ # check for "in-progress" state too along with "pending" state, because
+ # if former has occurred it means latter has occured before (which can
+ # happen for such a small time that it is easy to miss) and it won't
+ # occur again.
+ states = ('pending', 'in-progress')
+ self.__check_clone_state(states, clone, clone_group, timo)
+
+ def _wait_for_clone_to_be_canceled(self, clone, clone_group=None,
+ timo=120):
+ # check for "cancelled" state too along with "complete" state, because
+ # it takes some time for a clone job to be cancelled and in that time
+ # a clone job might finish.
+ states = ('canceled', 'complete')
+ self.__check_clone_state(states, clone, clone_group, timo)
+
def _wait_for_clone_to_complete(self, clone, clone_group=None, timo=120):
self.__check_clone_state("complete", clone, clone_group, timo)
@@ -280,6 +312,8 @@ class TestVolumesHelper(CephFSTestCase):
filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i)
self.mount_a.write_n_mb(os.path.join(io_path, filename), file_size)
+ return number_of_files * file_size * 1024 * 1024
+
def _do_subvolume_io_mixed(self, subvolume, subvolume_group=None):
subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group)
@@ -2333,6 +2367,124 @@ class TestSubvolumes(TestVolumesHelper):
# verify trash dir is clean.
self._wait_for_trash_empty()
+
+ def test_subvolume_create_with_earmark(self):
+ # create subvolume with earmark
+ subvolume = self._gen_subvol_name()
+ earmark = "nfs.test"
+ self._fs_cmd("subvolume", "create", self.volname, subvolume, "--earmark", earmark)
+
+ # make sure it exists
+ subvolpath = self._get_subvolume_path(self.volname, subvolume)
+ self.assertNotEqual(subvolpath, None)
+
+ # verify the earmark
+ get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume)
+ self.assertEqual(get_earmark.rstrip('\n'), earmark)
+
+ def test_subvolume_set_and_get_earmark(self):
+ # create subvolume
+ subvolume = self._gen_subvol_name()
+ self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+ # set earmark
+ earmark = "smb"
+ self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
+
+ # get earmark
+ get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume)
+ self.assertEqual(get_earmark.rstrip('\n'), earmark)
+
+ def test_subvolume_clear_earmark(self):
+ # create subvolume
+ subvolume = self._gen_subvol_name()
+ self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+ # set earmark
+ earmark = "smb"
+ self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
+
+ # remove earmark
+ self._fs_cmd("subvolume", "earmark", "rm", self.volname, subvolume)
+
+ # get earmark
+ get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume)
+ self.assertEqual(get_earmark, "")
+
+ def test_earmark_on_non_existing_subvolume(self):
+ subvolume = "non_existing_subvol"
+ earmark = "nfs.test"
+ commands = [
+ ("set", earmark),
+ ("get", None),
+ ("rm", None),
+ ]
+
+ for action, arg in commands:
+ try:
+ # Build the command arguments
+ cmd_args = ["subvolume", "earmark", action, self.volname, subvolume]
+ if arg is not None:
+ cmd_args.extend(["--earmark", arg])
+
+ # Execute the command with built arguments
+ self._fs_cmd(*cmd_args)
+ except CommandFailedError as ce:
+ self.assertEqual(ce.exitstatus, errno.ENOENT)
+
+ def test_get_remove_earmark_when_not_set(self):
+ # Create a subvolume without setting an earmark
+ subvolume = self._gen_subvol_name()
+ self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+ # Attempt to get an earmark when it's not set
+ get_earmark = self._fs_cmd("subvolume", "earmark", "get", self.volname, subvolume)
+ self.assertEqual(get_earmark, "")
+
+ # Attempt to remove an earmark when it's not set
+ self._fs_cmd("subvolume", "earmark", "rm", self.volname, subvolume)
+
+ def test_set_invalid_earmark(self):
+ # Create a subvolume
+ subvolume = self._gen_subvol_name()
+ self._fs_cmd("subvolume", "create", self.volname, subvolume)
+
+ # Attempt to set an invalid earmark
+ invalid_earmark = "invalid_format"
+ expected_message = (
+ f"Invalid earmark specified: '{invalid_earmark}'. A valid earmark should "
+ "either be empty or start with 'nfs' or 'smb', followed by dot-separated "
+ "non-empty components."
+ )
+ try:
+ self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", invalid_earmark)
+ except CommandFailedError as ce:
+ self.assertEqual(ce.exitstatus, errno.EINVAL, expected_message)
+
+ def test_earmark_on_deleted_subvolume_with_retained_snapshot(self):
+ subvolume = self._gen_subvol_name()
+ snapshot = self._gen_subvol_snap_name()
+
+ # Create subvolume and snapshot
+ self._fs_cmd("subvolume", "create", self.volname, subvolume)
+ self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+ # Delete subvolume while retaining the snapshot
+ self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots")
+
+ # Define the expected error message
+ error_message = f'subvolume "{subvolume}" is removed and has only snapshots retained'
+
+ # Test cases for setting, getting, and removing earmarks
+ for operation in ["get", "rm", "set"]:
+ try:
+ extra_arg = "smb" if operation == "set" else None
+ if operation == "set":
+ self._fs_cmd("subvolume", "earmark", operation, self.volname, subvolume, "--earmark", extra_arg)
+ else:
+ self._fs_cmd("subvolume", "earmark", operation, self.volname, subvolume)
+ except CommandFailedError as ce:
+ self.assertEqual(ce.exitstatus, errno.ENOENT, error_message)
def test_subvolume_expand(self):
"""
@@ -2406,6 +2558,14 @@ class TestSubvolumes(TestVolumesHelper):
for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']:
self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
+ # set earmark
+ earmark = "smb"
+ self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
+
+ subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
+
+ self.assertEqual(subvol_info["earmark"], earmark)
+
# remove subvolumes
self._fs_cmd("subvolume", "rm", self.volname, subvolume)
@@ -5811,7 +5971,7 @@ class TestSubvolumeSnapshotClones(TestVolumesHelper):
self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
# insert delay at the beginning of snapshot clone
- self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5)
+ self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 15)
# disable "capped" clones
self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', False)
@@ -7653,6 +7813,778 @@ class TestSubvolumeSnapshotClones(TestVolumesHelper):
self._wait_for_trash_empty()
+# NOTE: these tests consume a considerable amount of CPU and RAM due to the
+# generation of random files and the multiple cloning jobs that run
+# simultaneously.
+#
+# NOTE: mgr/vol code generates progress bars for cloning jobs and these tests
+# capture them through "ceph status --format json-pretty" and check if they
+# are as expected. If cloning happens too fast, these tests will fail to
+# capture the progress bars, at least in the desired state. Thus, these tests
+# are slightly racy by their very nature.
+#
+# Two measures can be taken to avoid this (and thereby avoid inconsistent
+# results in testing) -
+# 1. Slow down cloning. This was done by adding a sleep after every file is
+# copied. However, this method was rejected since a new config for this would
+# have to be added.
+# 2. Make the amount of data to be cloned big enough that cloning takes enough
+# time for the test code to capture the progress bar in the desired state and
+# finish running. This is the method currently employed. It consumes
+# significantly more time, CPU and RAM in comparison.
+class TestCloneProgressReporter(TestVolumesHelper):
+ '''
+ This class contains tests for features that show how much progress cloning
+ jobs have made.
+ '''
+
+ CLIENTS_REQUIRED = 1
+
+ def setUp(self):
+ super(TestCloneProgressReporter, self).setUp()
+
+ # save this config value so that it can be set again at the end of test
+ # and therefore other tests that might depend on this won't be
+ # disturbed unnecessarily.
+ self.num_of_cloner_threads_def = self.get_ceph_cmd_stdout(
+ 'config get mgr mgr/volumes/max_concurrent_clones').strip()
+
+ # set number of cloner threads to 4, tests in this class depend on this.
+ self.run_ceph_cmd('config set mgr mgr/volumes/max_concurrent_clones 4')
+
+ def tearDown(self):
+ v = self.volname
+ o = self.get_ceph_cmd_stdout('fs volume ls')
+ if self.volname not in o:
+ super(TestCloneProgressReporter, self).tearDown()
+ return
+
+ subvols = self.get_ceph_cmd_stdout(f'fs subvolume ls {v} --format '
+ 'json')
+ subvols = json.loads(subvols)
+ for i in subvols:
+ sv = tuple(i.values())[0]
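+ # subvolumes that are clones are removed directly; only source
+ # subvolumes need their snapshots removed first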
+ if 'clone' in sv:
+ self.run_ceph_cmd(f'fs subvolume rm --force {v} {sv}')
+ continue
+
+ p = self.run_ceph_cmd(f'fs subvolume snapshot ls {v} {sv} '
+ '--format json', stdout=StringIO())
+ snaps = p.stdout.getvalue().strip()
+ snaps = json.loads(snaps)
+ for j in snaps:
+ ss = tuple(j.values())[0]
+ self.run_ceph_cmd('fs subvolume snapshot rm --force '
+ f'--format json {v} {sv} {ss}')
+
+ try:
+ self.run_ceph_cmd(f'fs subvolume rm {v} {sv}')
+ except CommandFailedError as e:
+ if e.exitstatus == errno.ENOENT:
+ log.info(
+ 'ignoring this error; perhaps the subvolume was deleted '
+ 'during the test and the snapshot deleted above was a '
+ 'retained snapshot (a snapshot retained despite '
+ 'deletion of its subvolume). when a retained snapshot '
+ 'is deleted, the subvolume directory is deleted along '
+ 'with it, yet until then the subvolume is still listed '
+ 'by the "subvolume ls" command, which is probably what '
+ 'caused the confusion here')
+ pass
+ else:
+ raise
+
+ # verify trash dir is clean
+ self._wait_for_trash_empty()
+
+ self.run_ceph_cmd('config set mgr mgr/volumes/max_concurrent_clones '
+ f'{self.num_of_cloner_threads_def}')
+
+ # this doesn't work as expected because cleanup is not done when a
+ # volume is deleted.
+ #
+ # delete volumes so that all async purge threads, async cloner
+ # threads, progress bars, etc. associated with it are removed from
+ # Ceph cluster.
+ #self.run_ceph_cmd(f'fs volume rm {self.volname} --yes-i-really-mean-it')
+
+ super(self.__class__, self).tearDown()
+
+ # XXX: it is important to wait for rbytes value to catch up to actual size of
+ # subvolume so that progress bar shows sensible amount of progress
+ def wait_till_rbytes_is_right(self, v_name, sv_name, exp_size,
+ grp_name=None, sleep=2, max_count=60):
+ getpath_cmd = f'fs subvolume getpath {v_name} {sv_name}'
+ if grp_name:
+ getpath_cmd += f' {grp_name}'
+ sv_path = self.get_ceph_cmd_stdout(getpath_cmd)
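+ # 'getpath' returns an absolute path ('/...'); drop the leading '/' so
+ # that getfattr below can use it relative to the client mountpoint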
+ sv_path = sv_path[1:]
+
+ for i in range(max_count):
+ r_size = self.mount_a.get_shell_stdout(
+ f'getfattr -n ceph.dir.rbytes {sv_path}').split('rbytes=')[1]
+ r_size = int(r_size.strip().strip('"'))
+ log.info(f'r_size = {r_size} exp_size = {exp_size}')
+ if exp_size == r_size:
+ break
+
+ time.sleep(sleep)
+ else:
+ msg = ('size reported by rstat is not the expected size.\n'
+ f'expected size = {exp_size}\n'
+ f'size reported by rstat = {r_size}')
+ raise RsizeDoesntMatch(msg)
+
+ def test_progress_is_printed_in_clone_status_output(self):
+ '''
+ Test that the command "ceph fs clone status" prints progress stats
+ for the clone.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ # "clone" must be part of clone name for sake of tearDown()
+ c = 'ss1clone1'
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}')
+ self._wait_for_clone_to_be_in_progress(c)
+
+ with safe_while(tries=120, sleep=1) as proceed:
+ while proceed():
+ o = self.get_ceph_cmd_stdout(f'fs clone status {v} {c}')
+ o = json.loads(o)
+
+ try:
+ p = o['status']['progress_report']['percentage cloned']
+ log.debug(f'percentage cloned = {p}')
+ # progress stats were reported in the "clone status"
+ # output, which is what this test verifies
+ break
+ except KeyError:
+ # progress_report is either not present yet or the clone
+ # has already completed
+ if o['status']['state'] == 'complete':
+ break
+
+ self._wait_for_clone_to_complete(c)
+
+ def filter_in_only_clone_pevs(self, progress_events):
+ '''
+ Progress events dictionary in output of "ceph status --format json"
+ has the progress bars and message associated with each progress bar.
+ Sometimes during testing of clone progress bars, and sometimes
+ otherwise too, an extra progress bar is seen with message "Global
+ Recovery Event". This extra progress bar interferes with testing of
+ progress bars for cloning.
+
+ This helper method goes through this dictionary and picks out only
+ the clone events.
+ '''
+ clone_pevs = {}
+
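+ # these are the event ids under which mgr/volumes publishes its clone
+ # progress bars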
+ for k, v in progress_events.items():
+ if 'mgr-vol-ongoing-clones' in k or 'mgr-vol-total-clones' in k:
+ clone_pevs[k] = v
+
+ return clone_pevs
+
+ def get_pevs_from_ceph_status(self, clones=None, check=True):
+ o = self.get_ceph_cmd_stdout('status --format json-pretty')
+ o = json.loads(o)
+
+ try:
+ pevs = o['progress_events'] # pevs = progress events
+ except KeyError as e:
+ # "progress_events" disappears from "ceph status" output once
+ # nothing is in progress; that is acceptable only if the clones
+ # have already finished.
+ if check and clones:
+ clones = (clones, ) if isinstance(clones, str) else clones
+ try:
+ for c in clones:
+ self._wait_for_clone_to_complete(c, timo=1)
+ except Exception:
+ msg = ('Didn\'t find expected entries in dictionary '
+ '"progress_events" which is obtained from the '
+ 'output of command "ceph status".\n'
+ f'Exception - {e}\noutput -\n{o}')
+ raise Exception(msg)
+ return {}
+
+ pevs = self.filter_in_only_clone_pevs(pevs)
+
+ return pevs
+
+ def test_clones_less_than_cloner_threads(self):
+ '''
+ Test that one progress bar is printed in the output of "ceph status"
+ when the number of clone jobs is less than the number of cloner threads.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ # XXX: "clone" must be part of clone name for sake of tearDown()
+ c = 'ss1clone1'
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 10, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}')
+
+ with safe_while(tries=10, sleep=1) as proceed:
+ while proceed():
+ pev = self.get_pevs_from_ceph_status(c)
+
+ if len(pev) < 1:
+ continue
+ elif len(pev) > 1:
+ raise RuntimeError('For 1 clone "ceph status" output has 2 '
+ 'progress bars, it should have only 1 '
+ f'progress bar.\npev -\n{pev}')
+
+ # ensure that exactly 1 progress bar for cloning is present in
+ # "ceph status" output
+ msg = ('"progress_events" dict in "ceph status" output must have '
+ f'exactly one entry.\nprogress_event dict -\n{pev}')
+ self.assertEqual(len(pev), 1, msg)
+
+ pev_msg = tuple(pev.values())[0]['message']
+ self.assertIn('1 ongoing clones', pev_msg)
+ break
+
+ # allowing clone jobs to finish will consume too much time and space
+ # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def test_clone_to_diff_group_and_less_than_cloner_threads(self):
+ '''
+ Initiate cloning where clone subvolume and source subvolume are located
+ in different groups and then test that when this clone is in progress,
+ one progress bar is printed in output of command "ceph status" that
+ shows progress of this clone.
+ '''
+ v = self.volname
+ group = 'group1'
+ sv = 'sv1'
+ ss = 'ss1'
+ # XXX: "clone" must be part of clone name for sake of tearDown()
+ c = 'ss1clone1'
+
+ self.run_ceph_cmd(f'fs subvolumegroup create {v} {group}')
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} {group} --mode=777')
+ size = self._do_subvolume_io(sv, group, None, 10, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss} {group}')
+ self.wait_till_rbytes_is_right(v, sv, size, group)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c} '
+ f'--group-name {group}')
+
+ with safe_while(tries=10, sleep=1) as proceed:
+ while proceed():
+ pev = self.get_pevs_from_ceph_status(c)
+
+ if len(pev) < 1:
+ continue
+ elif len(pev) > 1:
+ raise RuntimeError('For 1 clone "ceph status" output has 2 '
+ 'progress bars, it should have only 1 '
+ f'progress bar.\npev -\n{pev}')
+
+ # ensure that exactly 1 progress bar for cloning is present in
+ # "ceph status" output
+ msg = ('"progress_events" dict in "ceph status" output must have '
+ f'exactly one entry.\nprogress_event dict -\n{pev}')
+ self.assertEqual(len(pev), 1, msg)
+
+ pev_msg = tuple(pev.values())[0]['message']
+ self.assertIn('1 ongoing clones', pev_msg)
+ break
+
+ # allowing clone jobs to finish will consume too much time and space
+ # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def test_clone_after_subvol_is_removed(self):
+ '''
+ Initiate cloning after source subvolume has been deleted but with
+ snapshots retained and then test that, when this clone is in progress,
+ one progress bar is printed in output of command "ceph status" that
+ shows progress of this clone.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ # XXX: "clone" must be part of clone name for sake of tearDown()
+ c = 'ss1clone1'
+
+ # XXX: without setting mds_snap_rstat to true rstats are not updated on
+ # a subvolume snapshot and therefore clone progress bar will not show
+ # any progress.
+ self.config_set('mds', 'mds_snap_rstat', 'true')
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 10, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots')
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}')
+
+ with safe_while(tries=15, sleep=10) as proceed:
+ while proceed():
+ pev = self.get_pevs_from_ceph_status(c)
+
+ if len(pev) < 1:
+ continue
+ elif len(pev) > 1:
+ raise RuntimeError('For 1 clone "ceph status" output has 2 '
+ 'progress bars, it should have only 1 '
+ f'progress bar.\npev -\n{pev}')
+
+ # ensure that exactly 1 progress bar for cloning is present in
+ # "ceph status" output
+ msg = ('"progress_events" dict in "ceph status" output must have '
+ f'exactly one entry.\nprogress_event dict -\n{pev}')
+ self.assertEqual(len(pev), 1, msg)
+
+ pev_msg = tuple(pev.values())[0]['message']
+ self.assertIn('1 ongoing clones', pev_msg)
+ break
+
+ # allowing clone jobs to finish will consume too much time and space
+ # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def test_clones_equal_to_cloner_threads(self):
+ '''
+ Test that one progress bar is printed in the output of "ceph status"
+ when the number of clone jobs is equal to the number of cloner threads.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ c = self._gen_subvol_clone_name(4)
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 10, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ for i in c:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+
+ with safe_while(tries=10, sleep=1) as proceed:
+ while proceed():
+ pev = self.get_pevs_from_ceph_status(c)
+
+ if len(pev) < 1:
+ time.sleep(1)
+ continue
+ elif len(pev) > 1:
+ raise RuntimeError('With all clones ongoing, "ceph status" '
+ 'output has more than 1 progress bar; it '
+ f'should have only 1.\npev -\n{pev}')
+
+ # ensure that exactly 1 progress bar for cloning is present in
+ # "ceph status" output
+ msg = ('"progress_events" dict in "ceph status" output must have '
+ f'exactly one entry.\nprogress_event dict -\n{pev}')
+ self.assertEqual(len(pev), 1, msg)
+
+ pev_msg = tuple(pev.values())[0]['message']
+ self.assertIn('ongoing clones', pev_msg)
+ break
+
+ # allowing clone jobs to finish will consume too much time and space
+ # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def wait_for_both_progress_bars_to_appear(self, sleep=1, iters=20):
+ pevs = []
+ msg = (f'Waited for {iters*sleep} seconds but couldn\'t find 2 progress '
+ 'bars in the output of the "ceph status" command.')
+ with safe_while(tries=iters, sleep=sleep, action=msg) as proceed:
+ while proceed():
+ o = self.get_ceph_cmd_stdout('status --format json-pretty')
+ o = json.loads(o)
+ pevs = o['progress_events']
+ pevs = self.filter_in_only_clone_pevs(pevs)
+ if len(pevs) == 2:
+ v = tuple(pevs.values())
+ if 'ongoing+pending' in v[1]['message']:
+ self.assertIn('ongoing', v[0]['message'])
+ else:
+ self.assertIn('ongoing', v[1]['message'])
+ self.assertIn('ongoing+pending', v[0]['message'])
+ break
+
+ def test_clones_more_than_cloner_threads(self):
+ '''
+ Test that 2 progress bars are printed in output of "ceph status"
+ command when number of clone jobs is greater than number of cloner
+ threads.
+
+ Also, test that one of these progress bars is for ongoing clones and
+ other progress bar for ongoing+pending clones.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ c = self._gen_subvol_clone_name(7)
+
+ self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false')
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ for i in c:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+
+ msg = ('messages for the snapshot cloning progress bars are not as '
+ 'expected')
+ with safe_while(tries=20, sleep=1, action=msg) as proceed:
+ while proceed():
+ pevs = self.get_pevs_from_ceph_status(c)
+
+ if len(pevs) <= 1:
+ continue # let's wait for second progress bar to appear
+ elif len(pevs) > 2:
+ raise RuntimeError(
+ 'More than 2 progress bars were found in the output '
+ 'of "ceph status" command.\nprogress events -'
+ f'\n{pevs}')
+
+ msg = ('"progress_events" dict in "ceph -s" output must have '
+ f'only two entries.\n{pevs}')
+ self.assertEqual(len(pevs), 2, msg)
+ pev1, pev2 = pevs.values()
+ if ('ongoing clones' in pev1['message'].lower() and
+ 'total ' in pev2['message'].lower()):
+ break
+ elif ('ongoing clones' in pev2['message'].lower() and
+ 'total ' in pev1['message'].lower()):
+ break
+ else:
+ raise RuntimeError(msg)
+
+ # allowing clone jobs to finish will consume too much time, space and
+ # CPU, and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def get_onpen_count(self, pev):
+ '''
+ Return number of clones reported in the message of progress bar for
+ ongoing+pending clones.
+ '''
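+ # the message is expected to look like "<N> ongoing+pending clones";
+ # parse <N> out of it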
+ i = pev['message'].find('ongoing+pending')
+ if i == -1:
+ return
+ count = pev['message'][:i]
+ count = count[:-1] # remove the trailing space
+ count = int(count)
+ return count
+
+ def get_both_progress_fractions_and_onpen_count(self):
+ '''
+ Go through output of "ceph status --format json-pretty" and return
+ progress made by both clones (that is progress fractions) and return
+ number of clones in reported in message of ongoing+pending progress
+ bar.
+ '''
+ msg = 'Expected 2 progress bars but found ' # rest continued in loop
+ with safe_while(tries=20, sleep=1, action=msg) as proceed:
+ while proceed():
+ o = self.get_ceph_cmd_stdout('status --format json-pretty')
+ o = json.loads(o)
+ pevs = o['progress_events']
+ pevs = self.filter_in_only_clone_pevs(pevs)
+ if len(pevs.values()) == 2:
+ break
+ else:
+ msg += f'{len(pevs)} instead'
+
+ log.info(f'pevs -\n{pevs}')
+ # on_p - progress fraction for ongoing clone jobs
+ # onpen_p - progress fraction for ongoing+pending clone jobs
+ pev1, pev2 = tuple(pevs.values())
+ if 'ongoing+pending' in pev1['message']:
+ onpen_p = pev1['progress']
+ onpen_count = self.get_onpen_count(pev1)
+ on_p = pev2['progress']
+ else:
+ onpen_p = pev2['progress']
+ onpen_count = self.get_onpen_count(pev2)
+ on_p = pev1['progress']
+
+ on_p = float(on_p)
+ onpen_p = float(onpen_p)
+
+ return on_p, onpen_p, onpen_count
+
+ # "ceph fs clone cancel" command takes considerable time to finish running.
+ # test cases where more than 4 clones are being cancelled, this error is
+ # seen, and can be safely ignored since it only implies that cloning has
+ # been finished.
+ def cancel_clones_and_ignore_if_finished(self, clones):
+ if isinstance(clones, str):
+ clones = (clones, )
+
+ for c in clones:
+ cmdargs = f'fs clone cancel {self.volname} {c}'
+ proc = self.run_ceph_cmd(args=cmdargs, stderr=StringIO(),
+ check_status=False)
+
+ stderr = proc.stderr.getvalue().strip().lower()
+ if proc.exitstatus == 0:
+ continue
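+ # EINVAL with "clone finished" means the clone completed before it
+ # could be cancelled, which is fine here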
+ elif proc.exitstatus == 22 and 'clone finished' in stderr:
+ continue
+ else:
+ cmdargs = './bin/ceph ' + cmdargs
+ raise CommandFailedError(cmdargs, proc.exitstatus)
+
+ def cancel_clones(self, clones, check_status=True):
+ v = self.volname
+ if not isinstance(clones, (tuple, list)):
+ clones = (clones, )
+
+ for i in clones:
+ self.run_ceph_cmd(f'fs clone cancel {v} {i}',
+ check_status=check_status)
+ time.sleep(2)
+
+ # check status is False since this method is meant to cleanup clones at
+ # the end of a test case and some clones might already be complete.
+ def cancel_clones_and_confirm(self, clones, check_status=False):
+ if not isinstance(clones, (tuple, list)):
+ clones = (clones, )
+
+ self.cancel_clones(clones, check_status)
+
+ for i in clones:
+ self._wait_for_clone_to_be_canceled(i)
+
+ def cancel_clones_and_assert(self, clones):
+ v = self.volname
+ if not isinstance(clones, (tuple, list)):
+ clones = (clones, )
+
+ self.cancel_clones(clones, True)
+
+ for i in clones:
+ o = self.get_ceph_cmd_stdout(f'fs clone status {v} {i}')
+ try:
+ self.assertIn('canceled', o)
+ except AssertionError:
+ self.assertIn('complete', o)
+
+ def test_progress_drops_when_new_jobs_are_added(self):
+ '''
+ Test that progress indicated by progress bar for ongoing+pending clones
+ drops when more clone jobs are launched.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ c = self._gen_subvol_clone_name(20)
+
+ self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false')
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ for i in c[:5]:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+
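+ # with 4 cloner threads (set in setUp()), 5 clone jobs give 4 ongoing and
+ # 1 pending clone, so both progress bars should be present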
+ # get_both_progress_fractions_and_onpen_count() either returns a
+ # 3-tuple or raises, so it can be unpacked directly
+ on_p, onpen_p, onpen_count = self.get_both_progress_fractions_and_onpen_count()
+
+ # this should cause onpen progress bar to go back
+ for i in c[5:]:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+ time.sleep(2)
+
+ with safe_while(tries=30, sleep=0.5) as proceed:
+ while proceed():
+ tuple_ = self.get_both_progress_fractions_and_onpen_count()
+ new_on_p, new_onpen_p, new_onpen_count = tuple_
+ if new_onpen_p < onpen_p:
+ log.info('new_onpen_p is less than onpen_p.')
+ log.info(f'new_onpen_p = {new_onpen_p}; onpen_p = {onpen_p}')
+ break
+ log.info(f'on_p = {on_p} new_on_p = {new_on_p}')
+ log.info(f'onpen_p = {onpen_p} new_onpen_p = {new_onpen_p}')
+ log.info(f'onpen_count = {onpen_count} new_onpen_count = '
+ f'{new_onpen_count}')
+ else:
+ self.cancel_clones_and_ignore_if_finished(c)
+ raise RuntimeError('Test failed: it was expected for '
+ '"new_onpen_p < onpen_p" to be true.')
+
+ # average progress for "ongoing + pending" clone jobs must
+ # reduce since new jobs were added to the pending state
+ self.assertLess(new_onpen_p, onpen_p)
+
+ # allowing clone jobs to finish will consume too much time and space
+ # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
+ def _wait_for_clone_progress_bars_to_be_removed(self):
+ with safe_while(tries=10, sleep=0.5) as proceed:
+ while proceed():
+ o = self.get_ceph_cmd_stdout('status --format json-pretty')
+ o = json.loads(o)
+
+ pevs = o['progress_events'] # pevs = progress events
+ pevs = self.filter_in_only_clone_pevs(pevs)
+ if not pevs:
+ break
+
+ def test_when_clones_cancelled_are_less_than_cloner_threads(self):
+ '''
+ Test that the progress bar that is printed for 1 ongoing clone job is
+ removed from the output of "ceph status" command when a clone is
+ cancelled.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ # "clone" must be part of clone name for sake of tearDown()
+ c = 'ss1clone1'
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+
+ sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}')
+ sv_path = sv_path[1:]
+
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}')
+ time.sleep(1)
+ self.cancel_clones_and_ignore_if_finished(c)
+ self._wait_for_clone_to_be_canceled(c)
+ self._wait_for_clone_progress_bars_to_be_removed()
+
+ # test that cloning had begun but didn't finish.
+ try:
+ sv_path = sv_path.replace(sv, c)
+ o = self.mount_a.run_shell(f'ls -lh {sv_path}')
+ o = o.stdout.getvalue().strip()
+ # ensure that not all files were copied. 'ls -lh' will print 1 file
+ # per line with an extra line for summary, so this command must
+ # print less than 4 lines
+ self.assertLess(len(o.split('\n')), 4)
+ except CommandFailedError as cfe:
+ # if command failed due to errno 2 (no such file or dir), this
+ # means cloning hadn't begun yet. that too is fine
+ if cfe.exitstatus == errno.ENOENT:
+ pass
+ else:
+ raise
+
+ def test_when_clones_cancelled_are_equal_to_cloner_threads(self):
+ '''
+ Test that the progress bars that are printed for 3 ongoing clone jobs are
+ removed from the output of "ceph status" command when all 3 clone jobs
+ are cancelled.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ c = self._gen_subvol_clone_name(3)
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+
+ sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}')
+ sv_path = sv_path[1:]
+
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ for i in c:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+ time.sleep(1)
+ self.cancel_clones_and_ignore_if_finished(c)
+ for i in c:
+ self._wait_for_clone_to_be_canceled(i)
+ self._wait_for_clone_progress_bars_to_be_removed()
+
+ try:
+ sv_path = sv_path.replace(sv, c[0])
+ o = self.mount_a.run_shell(f'ls -lh {sv_path}')
+ o = o.stdout.getvalue().strip()
+ log.info(o)
+ # ensure that not all files were copied. 'ls -lh' will print 1 file
+ # per line with an extra line for summary, so this command must
+ # print less than 4 lines
+ self.assertLess(len(o.split('\n')), 4)
+ except CommandFailedError as cfe:
+ # if command failed due to errno 2 (no such file or dir), this
+ # means cloning hadn't begun yet. that too is fine
+ if cfe.exitstatus == errno.ENOENT:
+ pass
+ else:
+ raise
+
+ def test_when_clones_cancelled_are_more_than_cloner_threads(self):
+ '''
+ Test that both the progress bars, that are printed for all 7 clone
+ jobs, are removed from the output of "ceph status" command when all
+ these clones are cancelled.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+ c = self._gen_subvol_clone_name(7)
+
+ self.config_set('mgr', 'mgr/volumes/snapshot_clone_no_wait', 'false')
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+
+ sv_path = self.get_ceph_cmd_stdout(f'fs subvolume getpath {v} {sv}')
+ sv_path = sv_path[1:]
+
+ size = self._do_subvolume_io(sv, None, None, 3, 1024)
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ for i in c:
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {i}')
+ time.sleep(1)
+ self.cancel_clones_and_ignore_if_finished(c)
+ for i in c:
+ self._wait_for_clone_to_be_canceled(i)
+ self._wait_for_clone_progress_bars_to_be_removed()
+
+ try:
+ sv_path = sv_path.replace(sv, c[0])
+ o = self.mount_a.run_shell(f'ls -lh {sv_path}')
+ o = o.stdout.getvalue().strip()
+ log.info(o)
+ # ensure that not all files were copied. 'ls -lh' will print 1 file
+ # per line with an extra line for summary, so this command must
+ # print less than 4 lines
+ self.assertLess(len(o.split('\n')), 4)
+ except CommandFailedError as cfe:
+ # if command failed due to errno 2 (no such file or dir), this
+ # means cloning hadn't begun yet. that too is fine
+ if cfe.exitstatus == errno.ENOENT:
+ pass
+ else:
+ raise
+
+
class TestMisc(TestVolumesHelper):
"""Miscellaneous tests related to FS volume, subvolume group, and subvolume operations."""
def test_connection_expiration(self):
diff --git a/qa/tasks/check_counter.py b/qa/tasks/check_counter.py
index 40818f3f475..1f63b6a0bd4 100644
--- a/qa/tasks/check_counter.py
+++ b/qa/tasks/check_counter.py
@@ -1,11 +1,14 @@
import logging
import json
+import errno
from teuthology.task import Task
from teuthology import misc
from tasks import ceph_manager
+from tasks.cephfs.filesystem import MDSCluster
+from teuthology.exceptions import CommandFailedError
log = logging.getLogger(__name__)
@@ -61,6 +64,9 @@ class CheckCounter(Task):
mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager'))
active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"]
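+ # the FSMap is needed because MDS perf counters are fetched below via
+ # "ceph tell mds.<gid> perf dump" rather than through a local admin socket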
+ mds_cluster = MDSCluster(self.ctx)
+ status = mds_cluster.status()
+
for daemon_type, counters in targets.items():
# List of 'a', 'b', 'c'...
daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
@@ -80,13 +86,31 @@ class CheckCounter(Task):
else:
log.debug("Getting stats from {0}".format(daemon_id))
- manager = self.ctx.managers[cluster_name]
- proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
- response_data = proc.stdout.getvalue().strip()
+ if daemon_type == 'mds':
+ mds_info = status.get_mds(daemon_id)
+ if not mds_info:
+ continue
+ mds = f"mds.{mds_info['gid']}"
+ if mds_info['state'] != "up:active":
+ log.debug(f"skipping {mds}")
+ continue
+ log.debug(f"Getting stats from {mds}")
+ try:
+ proc = mon_manager.raw_cluster_cmd("tell", mds, "perf", "dump",
+ "--format=json-pretty")
+ response_data = proc.strip()
+ except CommandFailedError as e:
+ if e.exitstatus != errno.ENOENT:
+ raise
+ log.debug(f"Failed to do 'perf dump' on {mds}")
+ continue
+ else:
+ manager = self.ctx.managers[cluster_name]
+ proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
+ response_data = proc.stdout.getvalue().strip()
if response_data:
perf_dump = json.loads(response_data)
else:
- log.warning("No admin socket response from {0}, skipping".format(daemon_id))
+ log.warning("No response from {0}, skipping".format(daemon_id))
continue
minval = ''
diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py
index 2ac92439de6..d955d232c2c 100644
--- a/qa/tasks/fwd_scrub.py
+++ b/qa/tasks/fwd_scrub.py
@@ -33,6 +33,8 @@ class ForwardScrubber(ThrasherGreenlet):
def _run(self):
try:
self.do_scrub()
+ except ThrasherGreenlet.Stopped:
+ pass
except Exception as e:
self.set_thrasher_exception(e)
self.logger.exception("exception:")
diff --git a/qa/tasks/kafka.py b/qa/tasks/kafka.py
index 5e6c208ca30..833f03babf6 100644
--- a/qa/tasks/kafka.py
+++ b/qa/tasks/kafka.py
@@ -4,6 +4,7 @@ Deploy and configure Kafka for Teuthology
import contextlib
import logging
import time
+import os
from teuthology import misc as teuthology
from teuthology import contextutil
@@ -33,6 +34,13 @@ def install_kafka(ctx, config):
assert isinstance(config, dict)
log.info('Installing Kafka...')
+ # programmatically find a nearby mirror so as not to hammer archive.apache.org
+ apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+ "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+ log.info("determining apache mirror by running: " + apache_mirror_cmd)
+ apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+ log.info("chosen apache mirror is " + apache_mirror_url_front)
+
for (client, _) in config.items():
(remote,) = ctx.cluster.only(client).remotes.keys()
test_dir=teuthology.get_testdir(ctx)
@@ -40,7 +48,8 @@ def install_kafka(ctx, config):
kafka_file = kafka_prefix + current_version + '.tgz'
- link1 = 'https://archive.apache.org/dist/kafka/' + current_version + '/' + kafka_file
+ link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+ current_version + '/' + kafka_file
ctx.cluster.only(client).run(
args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1],
)
diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py
new file mode 100644
index 00000000000..3ca60ab84fc
--- /dev/null
+++ b/qa/tasks/kafka_failover.py
@@ -0,0 +1,244 @@
+"""
+Deploy and configure Kafka for Teuthology
+"""
+import contextlib
+import logging
+import time
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def get_kafka_version(config):
+ for client, client_config in config.items():
+ if 'kafka_version' in client_config:
+ kafka_version = client_config.get('kafka_version')
+ return kafka_version
+
+kafka_prefix = 'kafka_2.13-'
+
+def get_kafka_dir(ctx, config):
+ kafka_version = get_kafka_version(config)
+ current_version = kafka_prefix + kafka_version
+ return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx),ver=current_version)
+
+
+@contextlib.contextmanager
+def install_kafka(ctx, config):
+ """
+ Downloading the kafka tar file.
+ """
+ assert isinstance(config, dict)
+ log.info('Installing Kafka...')
+
+ # programmatically find a nearby mirror so as not to hammer archive.apache.org
+ apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+ "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+ log.info("determining apache mirror by running: " + apache_mirror_cmd)
+ apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+ log.info("chosen apache mirror is " + apache_mirror_url_front)
+
+ for (client, _) in config.items():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ test_dir=teuthology.get_testdir(ctx)
+ current_version = get_kafka_version(config)
+
+ kafka_file = kafka_prefix + current_version + '.tgz'
+
+ link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+ current_version + '/' + kafka_file
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1],
+ )
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file],
+ )
+
+ kafka_dir = get_kafka_dir(ctx, config)
+ # create config for second broker
+ second_broker_config_name = "server2.properties"
+ second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+ second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/")
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'),
+ 'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+ 'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir)
+ ],
+ )
+
+ # edit config
+ ctx.cluster.only(client).run(
+ args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+ 'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+ 'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+ 'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'),
+ 'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name)
+ ]
+ )
+
+ try:
+ yield
+ finally:
+ log.info('Removing packaged dependencies of Kafka...')
+ test_dir=get_kafka_dir(ctx, config)
+ current_version = get_kafka_version(config)
+ for (client,_) in config.items():
+ ctx.cluster.only(client).run(
+ args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)],
+ )
+
+ ctx.cluster.only(client).run(
+ args=['rm', '-rf', test_dir],
+ )
+
+ ctx.cluster.only(client).run(
+ args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx),doc=kafka_file)],
+ )
+
+
+@contextlib.contextmanager
+def run_kafka(ctx,config):
+ """
+ This includes two parts:
+ 1. Starting Zookeeper service
+ 2. Starting Kafka service
+ """
+ assert isinstance(config, dict)
+ log.info('Bringing up Zookeeper and Kafka services...')
+ for (client,_) in config.items():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ kafka_dir = get_kafka_dir(ctx, config)
+
+ second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir)
+ second_broker_java_log_dir = "{}/java_logs".format(second_broker_data)
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+ './zookeeper-server-start.sh',
+ '{tir}/config/zookeeper.properties'.format(tir=kafka_dir),
+ run.Raw('&'), 'exit'
+ ],
+ )
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+ './kafka-server-start.sh',
+ '{tir}/config/server.properties'.format(tir=get_kafka_dir(ctx, config)),
+ run.Raw('&'), 'exit'
+ ],
+ )
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'),
+ run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)),
+ './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir),
+ run.Raw('&'), 'exit'
+ ],
+ )
+
+ try:
+ yield
+ finally:
+ log.info('Stopping Zookeeper and Kafka Services...')
+
+ for (client, _) in config.items():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+ './kafka-server-stop.sh',
+ '{tir}/config/kafka.properties'.format(tir=get_kafka_dir(ctx, config)),
+ ],
+ )
+
+ time.sleep(5)
+
+ ctx.cluster.only(client).run(
+ args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+ './zookeeper-server-stop.sh',
+ '{tir}/config/zookeeper.properties'.format(tir=get_kafka_dir(ctx, config)),
+ ],
+ )
+
+ time.sleep(5)
+
+ ctx.cluster.only(client).run(args=['killall', '-9', 'java'])
+
+
+@contextlib.contextmanager
+def run_admin_cmds(ctx,config):
+ """
+    Run Kafka admin commands to verify topic creation and the producer/consumer workflow.
+ """
+ assert isinstance(config, dict)
+ log.info('Checking kafka server through producer/consumer commands...')
+ for (client,_) in config.items():
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+ './kafka-topics.sh', '--create', '--topic', 'quickstart-events',
+ '--bootstrap-server', 'localhost:9092'
+ ],
+ )
+
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+ 'echo', "First", run.Raw('|'),
+ './kafka-console-producer.sh', '--topic', 'quickstart-events',
+ '--bootstrap-server', 'localhost:9092'
+ ],
+ )
+
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'),
+ './kafka-console-consumer.sh', '--topic', 'quickstart-events',
+ '--from-beginning',
+ '--bootstrap-server', 'localhost:9092',
+ run.Raw('&'), 'exit'
+ ],
+ )
+
+ try:
+ yield
+ finally:
+ pass
+
+
+@contextlib.contextmanager
+def task(ctx,config):
+ """
+    The following example shows how to run the kafka task::
+ tasks:
+ - kafka:
+ client.0:
+ kafka_version: 2.6.0
+ """
+ assert config is None or isinstance(config, list) \
+ or isinstance(config, dict), \
+ "task kafka only supports a list or dictionary for configuration"
+
+ all_clients = ['client.{id}'.format(id=id_)
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+ if config is None:
+ config = all_clients
+ if isinstance(config, list):
+ config = dict.fromkeys(config)
+
+ log.debug('Kafka config is %s', config)
+
+ with contextutil.nested(
+ lambda: install_kafka(ctx=ctx, config=config),
+ lambda: run_kafka(ctx=ctx, config=config),
+ lambda: run_admin_cmds(ctx=ctx, config=config),
+ ):
+ yield
+
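
Taken together, the sed edits in install_kafka() leave config/server2.properties differing from the stock server.properties in roughly these keys (<kafka_dir> stands for the directory returned by get_kafka_dir()):

    broker.id=1
    listeners=PLAINTEXT://localhost:19092
    advertised.listeners=PLAINTEXT://localhost:19092
    log.dirs=<kafka_dir>/data/broker02/logs
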
diff --git a/qa/tasks/mgr/dashboard/helper.py b/qa/tasks/mgr/dashboard/helper.py
index d80e238a2a8..55355048a36 100644
--- a/qa/tasks/mgr/dashboard/helper.py
+++ b/qa/tasks/mgr/dashboard/helper.py
@@ -9,7 +9,8 @@ import re
import string
import time
from collections import namedtuple
-from typing import List
+from functools import wraps
+from typing import List, Optional, Tuple, Type, Union
import requests
from tasks.mgr.mgr_test_case import MgrTestCase
@@ -219,13 +220,11 @@ class DashboardTestCase(MgrTestCase):
# To avoid any issues with e.g. unlink bugs, we destroy and recreate
# the filesystem rather than just doing a rm -rf of files
- cls.mds_cluster.mds_stop()
- cls.mds_cluster.mds_fail()
cls.mds_cluster.delete_all_filesystems()
+ cls.mds_cluster.mds_restart() # to reset any run-time configs, etc.
cls.fs = None # is now invalid!
cls.fs = cls.mds_cluster.newfs(create=True)
- cls.fs.mds_restart()
# In case some test messed with auth caps, reset them
# pylint: disable=not-an-iterable
@@ -343,16 +342,16 @@ class DashboardTestCase(MgrTestCase):
@classmethod
def _view_cache_get(cls, url, retries=5):
- retry = True
- while retry and retries > 0:
- retry = False
+ _retry = True
+ while _retry and retries > 0:
+ _retry = False
res = cls._get(url, version=DEFAULT_API_VERSION)
if isinstance(res, dict):
res = [res]
for view in res:
assert 'value' in view
if not view['value']:
- retry = True
+ _retry = True
retries -= 1
if retries == 0:
raise Exception("{} view cache exceeded number of retries={}"
@@ -722,3 +721,25 @@ def _validate_json(val, schema, path=[]):
return _validate_json(val, JLeaf(schema), path)
assert False, str(path)
+
+
+def retry(
+ on_exception: Union[Type[Exception], Tuple[Type[Exception], ...]],
+ tries=3,
+ delay=0,
+ logger: Optional[logging.Logger] = None,
+):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ for i in range(tries):
+ try:
+ return func(*args, **kwargs)
+ except on_exception as e:
+ err = e
+ if logger:
+                        logger.warning(f"Retried #{i+1}/{tries}: '{func.__name__}' raised '{e}'")
+ time.sleep(delay)
+ raise err
+ return wrapper
+ return decorator
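
A minimal usage sketch for the retry() helper added above; the decorated function and the exception type are placeholders, not part of this change:

    # Retries up to 3 times, sleeping 1s between attempts, and re-raises the
    # last ConnectionError if every attempt fails.
    @retry(on_exception=ConnectionError, tries=3, delay=1, logger=logger)
    def fetch_status():
        ...  # any call that may transiently fail
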
diff --git a/qa/tasks/mgr/dashboard/test_auth.py b/qa/tasks/mgr/dashboard/test_auth.py
index a2266229bef..2b9240b635e 100644
--- a/qa/tasks/mgr/dashboard/test_auth.py
+++ b/qa/tasks/mgr/dashboard/test_auth.py
@@ -152,7 +152,8 @@ class AuthTest(DashboardTestCase):
self._post("/api/auth/logout")
self.assertStatus(200)
self.assertJsonBody({
- "redirect_url": "#/login"
+ "redirect_url": "#/login",
+ "protocol": 'local'
})
self._get("/api/host", version='1.1')
self.assertStatus(401)
@@ -167,7 +168,8 @@ class AuthTest(DashboardTestCase):
self._post("/api/auth/logout", set_cookies=True)
self.assertStatus(200)
self.assertJsonBody({
- "redirect_url": "#/login"
+ "redirect_url": "#/login",
+ "protocol": 'local'
})
self._get("/api/host", set_cookies=True, version='1.1')
self.assertStatus(401)
diff --git a/qa/tasks/mgr/dashboard/test_mgr_module.py b/qa/tasks/mgr/dashboard/test_mgr_module.py
index 2b8b672f284..1dbdef23d34 100644
--- a/qa/tasks/mgr/dashboard/test_mgr_module.py
+++ b/qa/tasks/mgr/dashboard/test_mgr_module.py
@@ -4,9 +4,11 @@ from __future__ import absolute_import
import logging
import requests
+from urllib3.exceptions import MaxRetryError
from .helper import (DashboardTestCase, JLeaf, JList, JObj,
- module_options_object_schema, module_options_schema)
+ module_options_object_schema, module_options_schema,
+ retry)
logger = logging.getLogger(__name__)
@@ -14,6 +16,7 @@ logger = logging.getLogger(__name__)
class MgrModuleTestCase(DashboardTestCase):
MGRS_REQUIRED = 1
+ @retry(on_exception=RuntimeError, tries=2, delay=0.5, logger=logger)
def wait_until_rest_api_accessible(self):
"""
Wait until the REST API is accessible.
@@ -22,10 +25,11 @@ class MgrModuleTestCase(DashboardTestCase):
def _check_connection():
try:
# Try reaching an API endpoint successfully.
+ logger.info('Trying to reach the REST API endpoint')
self._get('/api/mgr/module')
if self._resp.status_code == 200:
return True
- except requests.ConnectionError:
+ except (MaxRetryError, requests.ConnectionError):
pass
return False
diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py
index 71cf3d87194..be7afccf331 100644
--- a/qa/tasks/mgr/dashboard/test_osd.py
+++ b/qa/tasks/mgr/dashboard/test_osd.py
@@ -5,12 +5,13 @@ from __future__ import absolute_import
import json
from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple,
- devices_schema)
+ devices_schema, log, retry)
class OsdTest(DashboardTestCase):
AUTH_ROLES = ['cluster-manager']
+ _VERSION = '1.1'
@classmethod
def setUpClass(cls):
@@ -24,7 +25,7 @@ class OsdTest(DashboardTestCase):
@DashboardTestCase.RunAs('test', 'test', ['block-manager'])
def test_access_permissions(self):
- self._get('/api/osd')
+ self._get('/api/osd', version=self._VERSION)
self.assertStatus(403)
self._get('/api/osd/0')
self.assertStatus(403)
@@ -33,7 +34,7 @@ class OsdTest(DashboardTestCase):
self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True))
def test_list(self):
- data = self._get('/api/osd')
+ data = self._get('/api/osd', version=self._VERSION)
self.assertStatus(200)
self.assertGreaterEqual(len(data), 1)
@@ -283,13 +284,18 @@ class OsdFlagsTest(DashboardTestCase):
if osd['osd'] == osd_initial['osd']:
self.assertGreater(len(osd['flags']), len(osd_initial['flags']))
- self._ceph_cmd(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
- flags_removed = self._get('/api/osd/flags/individual')
- self.assertStatus(200)
- for osd in flags_removed:
- if osd['osd'] in [0, 1, 2]:
- self.assertNotIn('noout', osd['flags'])
- self.assertNotIn('noin', osd['flags'])
+ ret = self._ceph_cmd_result(['osd', 'unset-group', 'noout,noin', 'osd.0', 'osd.1', 'osd.2'])
+ self.assertEqual(ret, 0)
+
+ @retry(on_exception=AssertionError, tries=2, delay=0.5, logger=log)
+ def check_osd_flags():
+ flags_removed = self._get('/api/osd/flags/individual')
+ self.assertStatus(200)
+ for osd in flags_removed:
+ if osd['osd'] in [0, 1, 2]:
+ self.assertNotIn('noout', osd['flags'])
+ self.assertNotIn('noin', osd['flags'])
+ check_osd_flags()
def test_add_indiv_flag(self):
flags_update = {'noup': None, 'nodown': None, 'noin': None, 'noout': True}
diff --git a/qa/tasks/mgr/dashboard/test_rbd.py b/qa/tasks/mgr/dashboard/test_rbd.py
index a872645e33e..83b3bf520c2 100644
--- a/qa/tasks/mgr/dashboard/test_rbd.py
+++ b/qa/tasks/mgr/dashboard/test_rbd.py
@@ -869,7 +869,19 @@ class RbdTest(DashboardTestCase):
self.assertEqual(clone_format_version, 2)
self.assertStatus(200)
+        # if an empty list is sent, the config will remain as it is
value = []
+ res = [{'section': "global", 'value': "2"}]
+ self._post('/api/cluster_conf', {
+ 'name': config_name,
+ 'value': value
+ })
+ self.wait_until_equal(
+ lambda: _get_config_by_name(config_name),
+ res,
+ timeout=60)
+
+ value = [{'section': "global", 'value': ""}]
self._post('/api/cluster_conf', {
'name': config_name,
'value': value
diff --git a/qa/tasks/mgr/dashboard/test_rgw.py b/qa/tasks/mgr/dashboard/test_rgw.py
index 5c7b0329675..a9071bc2a3a 100644
--- a/qa/tasks/mgr/dashboard/test_rgw.py
+++ b/qa/tasks/mgr/dashboard/test_rgw.py
@@ -785,7 +785,7 @@ class RgwUserSubuserTest(RgwTestCase):
'access': 'readwrite',
'key_type': 'swift'
})
- self.assertStatus(200)
+ self.assertStatus(201)
data = self.jsonBody()
subuser = self.find_object_in_list('id', 'teuth-test-user:tux', data)
self.assertIsInstance(subuser, object)
@@ -808,7 +808,7 @@ class RgwUserSubuserTest(RgwTestCase):
'access_key': 'yyy',
'secret_key': 'xxx'
})
- self.assertStatus(200)
+ self.assertStatus(201)
data = self.jsonBody()
subuser = self.find_object_in_list('id', 'teuth-test-user:hugo', data)
self.assertIsInstance(subuser, object)
diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py
index 74b1e9d850c..4a5506391f2 100644
--- a/qa/tasks/mgr/mgr_test_case.py
+++ b/qa/tasks/mgr/mgr_test_case.py
@@ -1,5 +1,6 @@
import json
import logging
+import socket
from unittest import SkipTest
@@ -108,7 +109,7 @@ class MgrTestCase(CephTestCase):
# Unload all non-default plugins
loaded = json.loads(cls.mgr_cluster.mon_manager.raw_cluster_cmd(
"mgr", "module", "ls", "--format=json-pretty"))['enabled_modules']
- unload_modules = set(loaded) - {"cephadm", "restful"}
+ unload_modules = set(loaded) - {"cephadm"}
for m in unload_modules:
cls.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -137,7 +138,7 @@ class MgrTestCase(CephTestCase):
raise SkipTest(
"Only have {0} manager daemons, {1} are required".format(
len(cls.mgr_cluster.mgr_ids), cls.MGRS_REQUIRED))
-
+
# We expect laggy OSDs in this testing environment so turn off this warning.
# See https://tracker.ceph.com/issues/61907
cls.mgr_cluster.mon_manager.raw_cluster_cmd('config', 'set', 'mds',
@@ -229,15 +230,22 @@ class MgrTestCase(CephTestCase):
"""
# Start handing out ports well above Ceph's range.
assign_port = min_port
+ ip_addr = cls.mgr_cluster.get_mgr_map()['active_addr'].split(':')[0]
for mgr_id in cls.mgr_cluster.mgr_ids:
cls.mgr_cluster.mgr_stop(mgr_id)
cls.mgr_cluster.mgr_fail(mgr_id)
+
for mgr_id in cls.mgr_cluster.mgr_ids:
- log.debug("Using port {0} for {1} on mgr.{2}".format(
- assign_port, module_name, mgr_id
- ))
+ # Find a port that isn't in use
+ while True:
+ if not cls.is_port_in_use(ip_addr, assign_port):
+ break
+ log.debug(f"Port {assign_port} in use, trying next")
+ assign_port += 1
+
+ log.debug(f"Using port {assign_port} for {module_name} on mgr.{mgr_id}")
cls.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
config_name,
str(assign_port),
@@ -255,3 +263,8 @@ class MgrTestCase(CephTestCase):
mgr_map['active_name'], mgr_map['active_gid']))
return done
cls.wait_until_true(is_available, timeout=30)
+
+ @classmethod
+ def is_port_in_use(cls, ip_addr: str, port: int) -> bool:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ return s.connect_ex((ip_addr, port)) == 0
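
is_port_in_use() relies on socket.connect_ex(), which returns 0 when the TCP connect succeeds (i.e. something is already listening) and an error code otherwise, so the loop in _assign_ports() keeps incrementing until it finds a free port. A small illustration with placeholder values:

    # True only if some process is already listening on 127.0.0.1:8100
    MgrTestCase.is_port_in_use('127.0.0.1', 8100)
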
diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py
index 7ac2960371c..c41a95c71f7 100644
--- a/qa/tasks/mgr/test_module_selftest.py
+++ b/qa/tasks/mgr/test_module_selftest.py
@@ -36,13 +36,6 @@ class TestModuleSelftest(MgrTestCase):
self.mgr_cluster.mon_manager.raw_cluster_cmd(
"mgr", "self-test", "module", module_name)
- def test_zabbix(self):
- # Set these mandatory config fields so that the zabbix module
- # won't trigger health/log errors on load/serve.
- self.mgr_cluster.set_module_conf("zabbix", "zabbix_host", "localhost")
- self.mgr_cluster.set_module_conf("zabbix", "identifier", "foo")
- self._selftest_plugin("zabbix")
-
def test_prometheus(self):
self._assign_ports("prometheus", "server_port", min_port=8100)
self._selftest_plugin("prometheus")
diff --git a/qa/tasks/mon_connection_score.py b/qa/tasks/mon_connection_score.py
new file mode 100644
index 00000000000..3d1fdb2a736
--- /dev/null
+++ b/qa/tasks/mon_connection_score.py
@@ -0,0 +1,95 @@
+from tasks.ceph_test_case import CephTestCase
+import json
+import logging
+log = logging.getLogger(__name__)
+
+
+class TestStretchClusterNew(CephTestCase):
+
+ CLUSTER = "ceph"
+ MONS = {
+ "a": {
+ "rank": 0,
+ },
+ "b": {
+ "rank": 1,
+ },
+ "c": {
+ "rank": 2,
+ }
+ }
+ WRITE_PERIOD = 10
+ RECOVERY_PERIOD = WRITE_PERIOD * 6
+ SUCCESS_HOLD_TIME = 10
+
+ def setUp(self):
+ """
+ Set up the cluster for the test.
+ """
+ super(TestStretchClusterNew, self).setUp()
+
+ def tearDown(self):
+ """
+        Clean up the cluster after the test.
+ """
+ super(TestStretchClusterNew, self).tearDown()
+
+ def _check_connection_score(self):
+ """
+ Check the connection score of all the mons.
+ """
+ for mon, _ in self.MONS.items():
+ # get the connection score
+ cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
+ 'daemon', 'mon.{}'.format(mon),
+ 'connection', 'scores', 'dump')
+ # parse the connection score
+ cscore = json.loads(cscore)
+ # check if the current mon rank is correct
+ if cscore["rank"] != self.MONS[mon]["rank"]:
+ log.error(
+ "Rank mismatch {} != {}".format(
+ cscore["rank"], self.MONS[mon]["rank"]
+ )
+ )
+ return False
+            # check that the current mon has reports from all peers, including itself
+ if len(cscore['reports']) != len(self.MONS):
+ log.error(
+ "Reports count mismatch {}".format(cscore['reports'])
+ )
+ return False
+
+ for report in cscore["reports"]:
+ report_rank = []
+ for peer in report["peer_scores"]:
+ # check if the peer is alive
+ if not peer["peer_alive"]:
+ log.error("Peer {} is not alive".format(peer))
+ return False
+ report_rank.append(peer["peer_rank"])
+
+ # check if current mon has all the ranks and no duplicates
+ expected_ranks = [
+ rank
+ for data in self.MONS.values()
+ for rank in data.values()
+ ]
+                if sorted(report_rank) != sorted(expected_ranks):
+ log.error("Rank mismatch in report {}".format(report))
+ return False
+
+ log.info("Connection score is clean!")
+ return True
+
+ def test_connection_score(self):
+ # check if all mons are in quorum
+ self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
+ # check if all connection scores reflect this
+ self.wait_until_true_and_hold(
+ lambda: self._check_connection_score(),
+ # Wait for 4 minutes for the connection score to recover
+ timeout=self.RECOVERY_PERIOD * 4,
+ # Hold the clean connection score for 60 seconds
+ success_hold_time=self.SUCCESS_HOLD_TIME * 6
+ )
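
For reference, _check_connection_score() only looks at the fields sketched below; this is an abridged, hypothetical shape of the `connection scores dump` output, not a verbatim capture:

    {
      "rank": 0,
      "reports": [
        {"peer_scores": [
          {"peer_rank": 0, "peer_alive": true},
          {"peer_rank": 1, "peer_alive": true},
          {"peer_rank": 2, "peer_alive": true}
        ]},
        ...
      ]
    }
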
diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py
index 34aa1f9cc9e..84b0b6c521b 100644
--- a/qa/tasks/mon_thrash.py
+++ b/qa/tasks/mon_thrash.py
@@ -161,7 +161,7 @@ class MonitorThrasher(Thrasher):
"""
Stop the thrashing process.
"""
- self.stopping = True
+ self.stopping.set()
def join(self):
"""
diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py
index b4697a6f797..f1eae3c89c4 100644
--- a/qa/tasks/notification_tests.py
+++ b/qa/tasks/notification_tests.py
@@ -220,7 +220,7 @@ def run_tests(ctx, config):
for client, client_config in config.items():
(remote,) = ctx.cluster.only(client).remotes.keys()
- attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
+ attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"]
if 'extra_attr' in client_config:
attr = client_config.get('extra_attr')
diff --git a/qa/tasks/nvme_loop.py b/qa/tasks/nvme_loop.py
index 5b29c11f007..fdec467a16d 100644
--- a/qa/tasks/nvme_loop.py
+++ b/qa/tasks/nvme_loop.py
@@ -67,9 +67,10 @@ def task(ctx, config):
with contextutil.safe_while(sleep=1, tries=15) as proceed:
while proceed():
+ remote.run(args=['lsblk'], stdout=StringIO())
p = remote.run(args=['sudo', 'nvme', 'list', '-o', 'json'], stdout=StringIO())
new_devs = []
- # `nvme list -o json` will return the following output:
+            # `nvme list -o json` will return one of the following outputs:
'''{
"Devices" : [
{
@@ -90,12 +91,112 @@ def task(ctx, config):
}
]
}'''
+ '''{
+ "Devices":[
+ {
+ "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4",
+ "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c",
+ "Subsystems":[
+ {
+ "Subsystem":"nvme-subsys0",
+ "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT623300LN400BGN INTEL SSDPEDMD400G4",
+ "Controllers":[
+ {
+ "Controller":"nvme0",
+ "Cntlid":"0",
+ "SerialNumber":"CVFT623300LN400BGN",
+ "ModelNumber":"INTEL SSDPEDMD400G4",
+ "Firmware":"8DV101H0",
+ "Transport":"pcie",
+ "Address":"0000:02:00.0",
+ "Slot":"2",
+ "Namespaces":[
+ {
+ "NameSpace":"nvme0n1",
+ "Generic":"ng0n1",
+ "NSID":1,
+ "UsedBytes":400088457216,
+ "MaximumLBA":781422768,
+ "PhysicalSize":400088457216,
+ "SectorSize":512
+ }
+ ],
+ "Paths":[
+ ]
+ }
+ ],
+ "Namespaces":[
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ '''
+ '''{
+ "Devices":[
+ {
+ "HostNQN":"nqn.2014-08.org.nvmexpress:uuid:00000000-0000-0000-0000-0cc47ada6ba4",
+ "HostID":"898a0e10-da2d-4a42-8017-d9c445089d0c",
+ "Subsystems":[
+ {
+ "Subsystem":"nvme-subsys0",
+ "SubsystemNQN":"nqn.2014.08.org.nvmexpress:80868086CVFT534400C2400BGN INTEL SSDPEDMD400G4",
+ "Controllers":[
+ {
+ "Controller":"nvme0",
+ "Cntlid":"0",
+ "SerialNumber":"CVFT534400C2400BGN",
+ "ModelNumber":"INTEL SSDPEDMD400G4",
+ "Firmware":"8DV101H0",
+ "Transport":"pcie",
+ "Address":"0000:02:00.0",
+ "Slot":"2",
+ "Namespaces":[
+ {
+ "NameSpace":"nvme0n1",
+ "Generic":"ng0n1",
+ "NSID":1,
+ "UsedBytes":400088457216,
+ "MaximumLBA":781422768,
+ "PhysicalSize":400088457216,
+ "SectorSize":512
+ }
+ ],
+ "Paths":[
+ ]
+ }
+ ],
+ "Namespaces":[
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ '''
nvme_list = json.loads(p.stdout.getvalue())
for device in nvme_list['Devices']:
- dev = device['DevicePath']
- vendor = device['ModelNumber']
- if dev.startswith('/dev/') and vendor == 'Linux':
- new_devs.append(dev)
+ try:
+ # first try format 1 / older format
+ dev = device['DevicePath']
+ vendor = device['ModelNumber']
+ if dev.startswith('/dev/') and vendor == 'Linux':
+ new_devs.append(dev)
+ bluestore_zap(remote, dev)
+ except KeyError:
+ for subsystem in device['Subsystems']:
+ # format 2
+ if 'Namespaces' in subsystem and subsystem['Namespaces']:
+ dev = '/dev/' + subsystem['Namespaces'][0]['NameSpace']
+ # try format 3 last
+ else:
+ dev = '/dev/' + subsystem['Controllers'][0]['Namespaces'][0]['NameSpace']
+ # vendor is the same for format 2 and 3
+ vendor = subsystem['Controllers'][0]['ModelNumber']
+ if vendor == 'Linux':
+ new_devs.append(dev)
+ bluestore_zap(remote, dev)
log.info(f'new_devs {new_devs}')
assert len(new_devs) <= len(devs)
if len(new_devs) == len(devs):
@@ -128,3 +229,13 @@ def task(ctx, config):
data=old_scratch_by_remote[remote],
sudo=True
)
+
+def bluestore_zap(remote, device: str) -> None:
+ for offset in [0, 1073741824, 10737418240]:
+ remote.run(args=['sudo', 'dd',
+ 'if=/dev/zero', f'of={device}',
+ f'seek={offset}', 'bs=1',
+ 'count=4096'], stdout=StringIO())
+ remote.run(args=['sudo', 'hexdump', '-n22',
+ '-C', f'-s{offset}', f'{device}'],
+               stdout=StringIO())
\ No newline at end of file
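
The magic offsets in bluestore_zap() are 0, 1 GiB and 10 GiB; zeroing 4 KiB at each presumably wipes BlueStore's device label (and its redundant copies) so a previously used namespace looks clean, and the hexdump that follows makes the result visible in the teuthology log. Equivalently:

    # offsets zeroed by bluestore_zap(), in bytes
    OFFSETS = [0, 1 << 30, 10 * (1 << 30)]   # 0, 1073741824, 10737418240
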
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index b56bcae0d0b..691a6f7dd86 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -17,14 +17,14 @@ from tasks.thrasher import Thrasher
log = logging.getLogger(__name__)
conf_file = '/etc/ceph/nvmeof.env'
-
+gw_yaml_file = '/etc/ceph/nvmeof-gw.yaml'
class Nvmeof(Task):
"""
Setup nvmeof gateway on client and then share gateway config to target host.
- nvmeof:
- client: client.0
+        installer: host.a  # or 'nvmeof.nvmeof.a'
version: default
rbd:
pool_name: mypool
@@ -32,21 +32,18 @@ class Nvmeof(Task):
gateway_config:
namespaces_count: 10
cli_version: latest
+ create_mtls_secrets: False
"""
def setup(self):
super(Nvmeof, self).setup()
try:
- self.client = self.config['client']
+ host = self.config['installer']
except KeyError:
- raise ConfigError('nvmeof requires a client to connect with')
-
- self.cluster_name, type_, self.client_id = misc.split_role(self.client)
- if type_ != 'client':
- msg = 'client role ({0}) must be a client'.format(self.client)
- raise ConfigError(msg)
- self.remote = get_remote_for_role(self.ctx, self.client)
+            raise ConfigError('nvmeof requires an installer host to deploy the service')
+ self.cluster_name, _, _ = misc.split_role(host)
+ self.remote = get_remote_for_role(self.ctx, host)
def begin(self):
super(Nvmeof, self).begin()
@@ -64,6 +61,8 @@ class Nvmeof(Task):
gateway_config = self.config.get('gateway_config', {})
self.cli_image = gateway_config.get('cli_image', 'quay.io/ceph/nvmeof-cli:latest')
+ self.groups_count = gateway_config.get('groups_count', 1)
+ self.groups_prefix = gateway_config.get('groups_prefix', 'mygroup')
self.nqn_prefix = gateway_config.get('subsystem_nqn_prefix', 'nqn.2016-06.io.spdk:cnode')
self.subsystems_count = gateway_config.get('subsystems_count', 1)
self.namespaces_count = gateway_config.get('namespaces_count', 1) # namepsaces per subsystem
@@ -71,6 +70,7 @@ class Nvmeof(Task):
self.serial = gateway_config.get('serial', 'SPDK00000000000001')
self.port = gateway_config.get('port', '4420')
self.srport = gateway_config.get('srport', '5500')
+ self.create_mtls_secrets = gateway_config.get('create_mtls_secrets', False)
def deploy_nvmeof(self):
"""
@@ -114,23 +114,31 @@ class Nvmeof(Task):
'rbd', 'pool', 'init', poolname
])
- log.info(f'[nvmeof]: ceph orch apply nvmeof {poolname}')
- _shell(self.ctx, self.cluster_name, self.remote, [
- 'ceph', 'orch', 'apply', 'nvmeof', poolname,
- '--placement', str(len(nodes)) + ';' + ';'.join(nodes)
- ])
+ group_to_nodes = defaultdict(list)
+ for index, node in enumerate(nodes):
+ group_name = self.groups_prefix + str(index % int(self.groups_count))
+ group_to_nodes[group_name] += [node]
+ for group_name in group_to_nodes:
+ gp_nodes = group_to_nodes[group_name]
+ log.info(f'[nvmeof]: ceph orch apply nvmeof {poolname} {group_name}')
+ _shell(self.ctx, self.cluster_name, self.remote, [
+ 'ceph', 'orch', 'apply', 'nvmeof', poolname, group_name,
+ '--placement', ';'.join(gp_nodes)
+ ])
total_images = int(self.namespaces_count) * int(self.subsystems_count)
log.info(f'[nvmeof]: creating {total_images} images')
+ rbd_create_cmd = []
for i in range(1, total_images + 1):
imagename = self.image_name_prefix + str(i)
- log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}')
- _shell(self.ctx, self.cluster_name, self.remote, [
- 'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}'
- ])
+ rbd_create_cmd += ['rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}', run.Raw(';')]
+ _shell(self.ctx, self.cluster_name, self.remote, rbd_create_cmd)
for role, i in daemons.items():
remote, id_ = i
+ _shell(self.ctx, self.cluster_name, remote, [
+ 'ceph', 'orch', 'ls', 'nvmeof', '--export', run.Raw('>'), gw_yaml_file
+ ])
self.ctx.daemons.register_daemon(
remote, 'nvmeof', id_,
cluster=self.cluster_name,
@@ -140,7 +148,38 @@ class Nvmeof(Task):
started=True,
)
log.info("[nvmeof]: executed deploy_nvmeof successfully!")
-
+
+ def write_mtls_config(self, gateway_ips):
+ log.info("[nvmeof]: writing mtls config...")
+ allowed_ips = ""
+ for ip in gateway_ips:
+ allowed_ips += ("IP:" + ip + ",")
+ self.remote.run(
+ args=[
+ "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/server.key",
+ "-out", "/etc/ceph/server.crt", "-days", "3650", "-subj", "/CN=my.server", "-addext", f"subjectAltName={allowed_ips[:-1]}"
+ ]
+ )
+ self.remote.run(
+ args=[
+ "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/client.key",
+ "-out", "/etc/ceph/client.crt", "-days", "3650", "-subj", "/CN=client1"
+ ]
+ )
+ secrets_files = {"/etc/ceph/server.key": None,
+ "/etc/ceph/server.crt": None,
+ "/etc/ceph/client.key": None,
+ "/etc/ceph/client.crt": None,
+ }
+ for file in secrets_files.keys():
+ secrets_files[file] = self.remote.read_file(path=file, sudo=True)
+
+ for remote in self.ctx.cluster.remotes.keys():
+ for remote_file in secrets_files.keys():
+ data = secrets_files[remote_file]
+ remote.sudo_write_file(path=remote_file, data=data, mode='0644')
+ log.info("[nvmeof]: written mtls config!")
+
def set_gateway_cfg(self):
log.info('[nvmeof]: running set_gateway_cfg...')
ip_address = self.remote.ip_address
@@ -167,6 +206,8 @@ class Nvmeof(Task):
data=conf_data,
sudo=True
)
+ if self.create_mtls_secrets:
+ self.write_mtls_config(gateway_ips)
log.info("[nvmeof]: executed set_gateway_cfg successfully!")
@@ -209,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet):
daemon_max_thrash_times:
For now, NVMeoF daemons have limitation that each daemon can
- be thrashed only 3 times in span of 30 mins. This option
+            be thrashed only 5 times in a span of 30 mins. This option
allows to set the amount of times it could be thrashed in a period
- of time. (default: 3)
+ of time. (default: 5)
daemon_max_thrash_period:
This option goes with the above option. It sets the period of time
over which each daemons can be thrashed for daemon_max_thrash_times
@@ -264,17 +305,17 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1))
# Limits on thrashing each daemon
- self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3))
+ self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 5))
self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds
self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
- self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
+ self.min_revive_delay = int(self.config.get('min_revive_delay', 60))
self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))
def _get_devices(self, remote):
GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \
- "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'"
+ "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'"
devices = remote.sh(GET_DEVICE_CMD).split()
return devices
@@ -305,6 +346,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof',
run.Raw('&&'), 'ceph', 'health', 'detail',
run.Raw('&&'), 'ceph', '-s',
+ run.Raw('&&'), 'sudo', 'nvme', 'list',
]
for dev in self.devices:
check_cmd += [
@@ -335,6 +377,37 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.log('switch_task: done waiting for the other thrasher')
other_thrasher.switch_thrasher.clear()
+ def kill_daemon(self, daemon):
+ kill_methods = [
+ "ceph_daemon_stop", "systemctl_stop",
+ "daemon_remove",
+ ]
+ chosen_method = self.rng.choice(kill_methods)
+ d_name = '%s.%s' % (daemon.type_, daemon.id_)
+ if chosen_method == "ceph_daemon_stop":
+ daemon.remote.run(args=[
+ "ceph", "orch", "daemon", "stop",
+ d_name
+ ], check_status=False)
+ elif chosen_method == "systemctl_stop":
+ daemon.stop()
+ elif chosen_method == "daemon_remove":
+ daemon.remote.run(args=[
+ "ceph", "orch", "daemon", "rm",
+ d_name
+ ], check_status=False)
+ return chosen_method
+
+ def revive_daemon(self, daemon, killed_method):
+ if killed_method == "ceph_daemon_stop":
+ name = '%s.%s' % (daemon.type_, daemon.id_)
+ daemon.remote.run(args=[
+ "ceph", "orch", "daemon", "restart",
+ name
+ ])
+ elif killed_method == "systemctl_stop":
+ daemon.restart()
+
def do_thrash(self):
self.log('start thrashing')
self.log(f'seed: {self.random_seed}, , '\
@@ -346,15 +419,13 @@ class NvmeofThrasher(Thrasher, Greenlet):
summary = []
while not self.stopping.is_set():
- killed_daemons = []
+ killed_daemons = defaultdict(list)
- weight = 1.0 / len(self.daemons)
- count = 0
+ thrash_daemon_num = self.rng.randint(1, self.max_thrash_daemons)
+ selected_daemons = self.rng.sample(self.daemons, thrash_daemon_num)
for daemon in self.daemons:
- skip = self.rng.uniform(0.0, 1.0)
- if weight <= skip:
- self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
- label=daemon.id_, skip=skip, weight=weight))
+ if daemon not in selected_daemons:
+ self.log(f'skipping daemon {daemon.id_} ...')
continue
# For now, nvmeof daemons can only be thrashed 3 times in last 30mins.
@@ -372,18 +443,16 @@ class NvmeofThrasher(Thrasher, Greenlet):
continue
self.log('kill {label}'.format(label=daemon.id_))
- daemon.stop()
+ kill_method = self.kill_daemon(daemon)
- killed_daemons.append(daemon)
+ killed_daemons[kill_method].append(daemon)
daemons_thrash_history[daemon.id_] += [datetime.now()]
- # only thrash max_thrash_daemons amount of daemons
- count += 1
- if count >= self.max_thrash_daemons:
- break
-
if killed_daemons:
- summary += ["killed: " + ", ".join([d.id_ for d in killed_daemons])]
+ iteration_summary = "thrashed- "
+ for kill_method in killed_daemons:
+ iteration_summary += (", ".join([d.id_ for d in killed_daemons[kill_method]]) + f" (by {kill_method}); ")
+ summary += [iteration_summary]
# delay before reviving
revive_delay = self.min_revive_delay
if self.randomize:
@@ -391,15 +460,17 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.log(f'waiting for {revive_delay} secs before reviving')
time.sleep(revive_delay) # blocking wait
- self.log('done waiting before reviving')
+ self.log(f'done waiting before reviving - iteration #{len(summary)}: {iteration_summary}')
self.do_checks()
self.switch_task()
# revive after thrashing
- for daemon in killed_daemons:
- self.log('reviving {label}'.format(label=daemon.id_))
- daemon.restart()
+ for kill_method in killed_daemons:
+ for daemon in killed_daemons[kill_method]:
+ self.log('reviving {label}'.format(label=daemon.id_))
+ # daemon.restart()
+ self.revive_daemon(daemon, kill_method)
# delay before thrashing
thrash_delay = self.min_thrash_delay
@@ -408,7 +479,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
if thrash_delay > 0.0:
self.log(f'waiting for {thrash_delay} secs before thrashing')
time.sleep(thrash_delay) # blocking
- self.log('done waiting before thrashing')
+ self.log('done waiting before thrashing - everything should be up now')
self.do_checks()
self.switch_task()
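
The reworked placement splits the gateway nodes round-robin across groups_count groups and runs one `ceph orch apply nvmeof <pool> <group>` per group. For example, with four nodes and groups_count=2 (prefix left at the default 'mygroup'):

    nodes = ['host.a', 'host.b', 'host.c', 'host.d']   # hypothetical node list
    # index % groups_count yields:
    #   mygroup0 -> host.a, host.c
    #   mygroup1 -> host.b, host.d
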
diff --git a/qa/tasks/qemu.py b/qa/tasks/qemu.py
index 760e4b82b73..e7ce73e45d0 100644
--- a/qa/tasks/qemu.py
+++ b/qa/tasks/qemu.py
@@ -29,7 +29,8 @@ DEFAULT_MEM = 4096 # in megabytes
def normalize_disks(config):
# normalize the 'disks' parameter into a list of dictionaries
- for client, client_config in config.items():
+ for role, client_config in config.items():
+ _, typ, id_ = teuthology.split_role(role)
clone = client_config.get('clone', False)
image_url = client_config.get('image_url', DEFAULT_IMAGE_URL)
device_type = client_config.get('type', 'filesystem')
@@ -39,8 +40,8 @@ def normalize_disks(config):
disks = client_config.get('disks', DEFAULT_NUM_DISKS)
if not isinstance(disks, list):
- disks = [{'image_name': '{client}.{num}'.format(client=client,
- num=i)}
+ disks = [{'image_name': '{typ}.{id_}.{num}'.format(typ=typ, id_=id_,
+ num=i)}
for i in range(int(disks))]
client_config['disks'] = disks
@@ -90,7 +91,7 @@ def normalize_disks(config):
disks.append(clone)
def create_images(ctx, config, managers):
- for client, client_config in config.items():
+ for role, client_config in config.items():
disks = client_config['disks']
for disk in disks:
if disk.get('action') != 'create' or (
@@ -101,7 +102,7 @@ def create_images(ctx, config, managers):
if disk['encryption_format'] != 'none':
image_size += ENCRYPTION_HEADER_SIZE
create_config = {
- client: {
+ role: {
'image_name': disk['image_name'],
'image_format': 2,
'image_size': image_size,
@@ -114,14 +115,14 @@ def create_images(ctx, config, managers):
)
def create_clones(ctx, config, managers):
- for client, client_config in config.items():
+ for role, client_config in config.items():
disks = client_config['disks']
for disk in disks:
if disk['action'] != 'clone':
continue
create_config = {
- client: {
+ role: {
'image_name': disk['image_name'],
'parent_name': disk['parent_name'],
'encryption_format': disk['encryption_format'],
@@ -133,7 +134,7 @@ def create_clones(ctx, config, managers):
)
def create_encrypted_devices(ctx, config, managers):
- for client, client_config in config.items():
+ for role, client_config in config.items():
disks = client_config['disks']
for disk in disks:
if (disk['encryption_format'] == 'none' and
@@ -141,7 +142,7 @@ def create_encrypted_devices(ctx, config, managers):
'device_letter' not in disk:
continue
- dev_config = {client: disk}
+ dev_config = {role: disk}
managers.append(
lambda dev_config=dev_config:
rbd.dev_create(ctx=ctx, config=dev_config)
@@ -153,9 +154,9 @@ def create_dirs(ctx, config):
Handle directory creation and cleanup
"""
testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.items():
+ for role, client_config in config.items():
assert 'test' in client_config, 'You must specify a test to run'
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ (remote,) = ctx.cluster.only(role).remotes.keys()
remote.run(
args=[
'install', '-d', '-m0755', '--',
@@ -166,9 +167,9 @@ def create_dirs(ctx, config):
try:
yield
finally:
- for client, client_config in config.items():
+ for role, client_config in config.items():
assert 'test' in client_config, 'You must specify a test to run'
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ (remote,) = ctx.cluster.only(role).remotes.keys()
remote.run(
args=[
'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true',
@@ -181,20 +182,20 @@ def install_block_rbd_driver(ctx, config):
Make sure qemu rbd block driver (block-rbd.so) is installed
"""
packages = {}
- for client, _ in config.items():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role, _ in config.items():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
if remote.os.package_type == 'rpm':
- packages[client] = ['qemu-kvm-block-rbd']
+ packages[role] = ['qemu-kvm-block-rbd']
else:
- packages[client] = ['qemu-block-extra', 'qemu-utils']
- for pkg in packages[client]:
+ packages[role] = ['qemu-block-extra', 'qemu-utils']
+ for pkg in packages[role]:
install_package(pkg, remote)
try:
yield
finally:
- for client, _ in config.items():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- for pkg in packages[client]:
+ for role, _ in config.items():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ for pkg in packages[role]:
remove_package(pkg, remote)
@contextlib.contextmanager
@@ -210,23 +211,23 @@ def generate_iso(ctx, config):
git_url = teuth_config.get_ceph_qa_suite_git_url()
log.info('Pulling tests from %s ref %s', git_url, refspec)
- for client, client_config in config.items():
+ for role, client_config in config.items():
assert 'test' in client_config, 'You must specify a test to run'
test = client_config['test']
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ (remote,) = ctx.cluster.only(role).remotes.keys()
- clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client)
+ clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=role)
remote.run(args=refspec.clone(git_url, clone_dir))
src_dir = os.path.dirname(__file__)
- userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client)
- metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + client)
+ userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + role)
+ metadata_path = os.path.join(testdir, 'qemu', 'metadata.' + role)
with open(os.path.join(src_dir, 'userdata_setup.yaml')) as f:
test_setup = ''.join(f.readlines())
# configuring the commands to setup the nfs mount
- mnt_dir = "/export/{client}".format(client=client)
+ mnt_dir = "/export/{role}".format(role=role)
test_setup = test_setup.format(
mnt_dir=mnt_dir
)
@@ -285,9 +286,10 @@ def generate_iso(ctx, config):
with open(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f:
remote.write_file(metadata_path, f)
- test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client)
+ test_file = '{tdir}/qemu/{role}.test.sh'.format(tdir=testdir, role=role)
+ cluster, _, _ = teuthology.split_role(role)
- log.info('fetching test %s for %s', test, client)
+ log.info('fetching test %s for %s', test, role)
remote.run(
args=[
'cp', '--', os.path.join(clone_dir, test), test_file,
@@ -299,28 +301,28 @@ def generate_iso(ctx, config):
args=[
'genisoimage', '-quiet', '-input-charset', 'utf-8',
'-volid', 'cidata', '-joliet', '-rock',
- '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+ '-o', '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role),
'-graft-points',
'user-data={userdata}'.format(userdata=userdata_path),
'meta-data={metadata}'.format(metadata=metadata_path),
- 'ceph.conf=/etc/ceph/ceph.conf',
- 'ceph.keyring=/etc/ceph/ceph.keyring',
+ 'ceph.conf=/etc/ceph/{cluster}.conf'.format(cluster=cluster),
+ 'ceph.keyring=/etc/ceph/{cluster}.keyring'.format(cluster=cluster),
'test.sh={file}'.format(file=test_file),
],
)
try:
yield
finally:
- for client in config.keys():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role in config.keys():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
remote.run(
args=[
'rm', '-rf',
- '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
- os.path.join(testdir, 'qemu', 'userdata.' + client),
- os.path.join(testdir, 'qemu', 'metadata.' + client),
- '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
- '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client),
+ '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role),
+ os.path.join(testdir, 'qemu', 'userdata.' + role),
+ os.path.join(testdir, 'qemu', 'metadata.' + role),
+ '{tdir}/qemu/{role}.test.sh'.format(tdir=testdir, role=role),
+ '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=role),
],
)
@@ -331,10 +333,11 @@ def download_image(ctx, config):
testdir = teuthology.get_testdir(ctx)
client_base_files = {}
- for client, client_config in config.items():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role, client_config in config.items():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
- client_base_files[client] = []
+ cluster, _, _ = teuthology.split_role(role)
+ client_base_files[role] = []
disks = client_config['disks']
for disk in disks:
if disk['action'] != 'create' or 'image_url' not in disk:
@@ -342,7 +345,7 @@ def download_image(ctx, config):
base_file = '{tdir}/qemu/base.{name}.qcow2'.format(tdir=testdir,
name=disk['image_name'])
- client_base_files[client].append(base_file)
+ client_base_files[role].append(base_file)
remote.run(
args=[
@@ -354,15 +357,16 @@ def download_image(ctx, config):
remote.run(
args=[
'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw',
- base_file, 'rbd:rbd/{image_name}'.format(image_name=disk['image_name'])
+ base_file,'rbd:rbd/{image_name}:conf=/etc/ceph/{cluster}.conf'.format(
+ image_name=disk['image_name'], cluster=cluster)
]
)
else:
- dev_config = {client: {'image_name': disk['image_name'],
- 'encryption_format': disk['encryption_format']}}
+ dev_config = {role: {'image_name': disk['image_name'],
+ 'encryption_format': disk['encryption_format']}}
raw_file = '{tdir}/qemu/base.{name}.raw'.format(
tdir=testdir, name=disk['image_name'])
- client_base_files[client].append(raw_file)
+ client_base_files[role].append(raw_file)
remote.run(
args=[
'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw',
@@ -373,11 +377,12 @@ def download_image(ctx, config):
remote.run(
args=[
'dd', 'if={name}'.format(name=raw_file),
- 'of={name}'.format(name=dev_config[client]['device_path']),
+ 'of={name}'.format(name=dev_config[role]['device_path']),
'bs=4M', 'conv=fdatasync'
]
)
+ cluster, _, _ = teuthology.split_role(role)
for disk in disks:
if disk['action'] == 'clone' or \
disk['encryption_format'] != 'none' or \
@@ -386,7 +391,7 @@ def download_image(ctx, config):
remote.run(
args=[
- 'rbd', 'resize',
+ 'rbd', '--cluster', cluster, 'resize',
'--size={image_size}M'.format(image_size=disk['image_size']),
disk['image_name'], run.Raw('||'), 'true'
]
@@ -396,8 +401,8 @@ def download_image(ctx, config):
yield
finally:
log.debug('cleaning up base image files')
- for client, base_files in client_base_files.items():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role, base_files in client_base_files.items():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
for base_file in base_files:
remote.run(
args=[
@@ -406,14 +411,14 @@ def download_image(ctx, config):
)
-def _setup_nfs_mount(remote, client, service_name, mount_dir):
+def _setup_nfs_mount(remote, role, service_name, mount_dir):
"""
Sets up an nfs mount on the remote that the guest can use to
store logs. This nfs mount is also used to touch a file
at the end of the test to indicate if the test was successful
or not.
"""
- export_dir = "/export/{client}".format(client=client)
+ export_dir = "/export/{role}".format(role=role)
log.info("Creating the nfs export directory...")
remote.run(args=[
'sudo', 'mkdir', '-p', export_dir,
@@ -442,13 +447,13 @@ def _setup_nfs_mount(remote, client, service_name, mount_dir):
remote.run(args=['sudo', 'systemctl', 'restart', service_name])
-def _teardown_nfs_mount(remote, client, service_name):
+def _teardown_nfs_mount(remote, role, service_name):
"""
Tears down the nfs mount on the remote used for logging and reporting the
status of the tests being ran in the guest.
"""
log.info("Tearing down the nfs mount for {remote}".format(remote=remote))
- export_dir = "/export/{client}".format(client=client)
+ export_dir = "/export/{role}".format(role=role)
log.info("Stopping NFS...")
if remote.os.package_type == "deb":
remote.run(args=[
@@ -483,9 +488,9 @@ def run_qemu(ctx, config):
"""Setup kvm environment and start qemu"""
procs = []
testdir = teuthology.get_testdir(ctx)
- for client, client_config in config.items():
- (remote,) = ctx.cluster.only(client).remotes.keys()
- log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
+ for role, client_config in config.items():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
+ log_dir = '{tdir}/archive/qemu/{role}'.format(tdir=testdir, role=role)
remote.run(
args=[
'mkdir', log_dir, run.Raw('&&'),
@@ -502,7 +507,7 @@ def run_qemu(ctx, config):
# make an nfs mount to use for logging and to
# allow to test to tell teuthology the tests outcome
- _setup_nfs_mount(remote, client, nfs_service_name, log_dir)
+ _setup_nfs_mount(remote, role, nfs_service_name, log_dir)
# Hack to make sure /dev/kvm permissions are set correctly
# See http://tracker.ceph.com/issues/17977 and
@@ -524,13 +529,13 @@ def run_qemu(ctx, config):
'-smp', str(client_config.get('cpus', DEFAULT_CPUS)),
'-m', str(client_config.get('memory', DEFAULT_MEM)),
# cd holding metadata for cloud-init
- '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+ '-cdrom', '{tdir}/qemu/{role}.iso'.format(tdir=testdir, role=role),
]
cachemode = 'none'
- ceph_config = ctx.ceph['ceph'].conf.get('global', {})
- ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
- ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+ cluster, _, id_ = teuthology.split_role(role)
+ ceph_config = ctx.ceph[cluster].conf.get('global', {})
+ ceph_config.update(ctx.ceph[cluster].conf.get('client', {}))
if ceph_config.get('rbd cache', True):
if ceph_config.get('rbd cache max dirty', 1) > 0:
cachemode = 'writeback'
@@ -545,10 +550,8 @@ def run_qemu(ctx, config):
if disk['encryption_format'] == 'none' and \
disk.get('parent_encryption_format', 'none') == 'none':
interface = 'virtio'
- disk_spec = 'rbd:rbd/{img}:id={id}'.format(
- img=disk['image_name'],
- id=client[len('client.'):]
- )
+ disk_spec = 'rbd:rbd/{img}:conf=/etc/ceph/{cluster}.conf:id={id}'.format(
+ img=disk['image_name'], cluster=cluster, id=id_)
else:
# encrypted disks use ide as a temporary workaround for
# a bug in qemu when using virtio over nbd
@@ -570,7 +573,7 @@ def run_qemu(ctx, config):
procs.append(
remote.run(
args=args,
- logger=log.getChild(client),
+ logger=log.getChild(role),
stdin=run.PIPE,
wait=False,
)
@@ -588,12 +591,12 @@ def run_qemu(ctx, config):
time.sleep(time_wait)
log.debug('checking that qemu tests succeeded...')
- for client in config.keys():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role in config.keys():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
# ensure we have permissions to all the logs
- log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir,
- client=client)
+ log_dir = '{tdir}/archive/qemu/{role}'.format(tdir=testdir,
+ role=role)
remote.run(
args=[
'sudo', 'chmod', 'a+rw', '-R', log_dir
@@ -601,20 +604,20 @@ def run_qemu(ctx, config):
)
# teardown nfs mount
- _teardown_nfs_mount(remote, client, nfs_service_name)
+ _teardown_nfs_mount(remote, role, nfs_service_name)
# check for test status
remote.run(
args=[
'test', '-f',
- '{tdir}/archive/qemu/{client}/success'.format(
+ '{tdir}/archive/qemu/{role}/success'.format(
tdir=testdir,
- client=client
+ role=role
),
],
)
log.info("Deleting exported directory...")
- for client in config.keys():
- (remote,) = ctx.cluster.only(client).remotes.keys()
+ for role in config.keys():
+ (remote,) = ctx.cluster.only(role).remotes.keys()
remote.run(args=[
'sudo', 'rm', '-r', '/export'
])
@@ -693,6 +696,14 @@ def task(ctx, config):
test data
type: text/plain
filename: /tmp/data
+
+ This task supports roles that include a ceph cluster, e.g.::
+
+ tasks:
+ - ceph:
+ - qemu:
+ backup.client.0: [foo]
+ client.1: [bar] # cluster is implicitly 'ceph'
"""
assert isinstance(config, dict), \
"task qemu only supports a dictionary for configuration"
diff --git a/qa/tasks/rabbitmq.py b/qa/tasks/rabbitmq.py
index 944233d9775..e9e39cfdf4a 100644
--- a/qa/tasks/rabbitmq.py
+++ b/qa/tasks/rabbitmq.py
@@ -70,22 +70,25 @@ def run_rabbitmq(ctx, config):
(remote,) = ctx.cluster.only(client).remotes.keys()
ctx.cluster.only(client).run(args=[
- 'sudo', 'systemctl', 'enable', 'rabbitmq-server.service'
+ 'echo', 'loopback_users.guest = false', run.Raw('|'), 'sudo', 'tee', '-a', '/etc/rabbitmq/rabbitmq.conf'
],
)
ctx.cluster.only(client).run(args=[
- 'sudo', '/sbin/service', 'rabbitmq-server', 'start'
+ 'sudo', 'systemctl', 'enable', 'rabbitmq-server'
+ ],
+ )
+
+ ctx.cluster.only(client).run(args=[
+ 'sudo', 'systemctl', 'start', 'rabbitmq-server'
],
)
- '''
# To check whether rabbitmq-server is running or not
ctx.cluster.only(client).run(args=[
- 'sudo', '/sbin/service', 'rabbitmq-server', 'status'
+ 'sudo', 'systemctl', 'status', 'rabbitmq-server'
],
)
- '''
try:
yield
@@ -96,7 +99,7 @@ def run_rabbitmq(ctx, config):
(remote,) = ctx.cluster.only(client).remotes.keys()
ctx.cluster.only(client).run(args=[
- 'sudo', '/sbin/service', 'rabbitmq-server', 'stop'
+ 'sudo', 'systemctl', 'stop', 'rabbitmq-server'
],
)
diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py
index d8eac5d886f..96bcc770511 100644
--- a/qa/tasks/rados.py
+++ b/qa/tasks/rados.py
@@ -36,6 +36,8 @@ def task(ctx, config):
write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED.
This mean data don't access in the near future.
Let osd backend don't keep data in cache.
+    pct_update_delay: delay before primary propagates pct on write pause,
+ defaults to 5s if balance_reads is set
For example::
@@ -139,6 +141,7 @@ def task(ctx, config):
object_size = int(config.get('object_size', 4000000))
op_weights = config.get('op_weights', {})
testdir = teuthology.get_testdir(ctx)
+ pct_update_delay = None
args = [
'adjust-ulimits',
'ceph-coverage',
@@ -166,6 +169,7 @@ def task(ctx, config):
args.extend(['--pool-snaps'])
if config.get('balance_reads', False):
args.extend(['--balance-reads'])
+ pct_update_delay = config.get('pct_update_delay', 5);
if config.get('localize_reads', False):
args.extend(['--localize-reads'])
if config.get('max_attr_len', None):
@@ -274,6 +278,10 @@ def task(ctx, config):
if config.get('fast_read', False):
manager.raw_cluster_cmd(
'osd', 'pool', 'set', pool, 'fast_read', 'true')
+ if pct_update_delay:
+ manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool,
+ 'pct_update_delay', str(pct_update_delay));
min_size = config.get('min_size', None);
if min_size is not None:
manager.raw_cluster_cmd(
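
A sketch of a suite fragment that would exercise the new knob; only balance_reads and pct_update_delay matter here, the remaining keys are the usual rados task options:

    tasks:
    - rados:
        clients: [client.0]
        ops: 4000
        objects: 50
        balance_reads: true
        pct_update_delay: 10   # seconds; defaults to 5 when balance_reads is set
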
diff --git a/qa/tasks/radosgw_admin.py b/qa/tasks/radosgw_admin.py
index 3b98702acca..fb82378761b 100644
--- a/qa/tasks/radosgw_admin.py
+++ b/qa/tasks/radosgw_admin.py
@@ -16,6 +16,7 @@ import logging
import time
import datetime
import sys
+import errno
from io import StringIO
from queue import Queue
@@ -725,6 +726,40 @@ def task(ctx, config):
(err, out) = rgwadmin(ctx, client, ['user', 'rm', '--tenant', tenant_name, '--uid', 'tenanteduser'],
check_status=True)
+ account_id = 'RGW12312312312312312'
+ account_name = 'testacct'
+ rgwadmin(ctx, client, [
+ 'account', 'create',
+ '--account-id', account_id,
+ '--account-name', account_name,
+ ], check_status=True)
+ rgwadmin(ctx, client, [
+ 'user', 'create',
+ '--account-id', account_id,
+ '--uid', 'testacctuser',
+ '--display-name', 'accountuser',
+ '--gen-access-key',
+ '--gen-secret',
+ ], check_status=True)
+
+ # TESTCASE 'bucket link', 'bucket', 'account user', 'fails'
+ (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--bucket', bucket_name, '--uid', 'testacctuser'])
+ assert err == errno.EINVAL
+
+ rgwadmin(ctx, client, ['user', 'rm', '--uid', 'testacctuser'], check_status=True)
+
+ # TESTCASE 'bucket link', 'bucket', 'account', 'succeeds'
+ rgwadmin(ctx, client,
+ ['bucket', 'link', '--bucket', bucket_name, '--account-id', account_id],
+ check_status=True)
+
+ # relink the bucket to the first user and delete the account
+ rgwadmin(ctx, client,
+ ['bucket', 'link', '--bucket', bucket_name, '--uid', user1],
+ check_status=True)
+ rgwadmin(ctx, client, ['account', 'rm', '--account-id', account_id],
+ check_status=True)
+
# TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
# upload an object
diff --git a/qa/tasks/rbd.py b/qa/tasks/rbd.py
index b0ffaba8386..026b695fb00 100644
--- a/qa/tasks/rbd.py
+++ b/qa/tasks/rbd.py
@@ -65,6 +65,7 @@ def create_image(ctx, config):
size = properties.get('image_size', 10240)
fmt = properties.get('image_format', 1)
encryption_format = properties.get('encryption_format', 'none')
+ cluster, _, _ = teuthology.split_role(role)
(remote,) = ctx.cluster.only(role).remotes.keys()
log.info('Creating image {name} with size {size}'.format(name=name,
size=size))
@@ -73,6 +74,7 @@ def create_image(ctx, config):
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'rbd',
+ '--cluster', cluster,
'-p', 'rbd',
'create',
'--size', str(size),
@@ -99,6 +101,7 @@ def create_image(ctx, config):
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'rbd',
+ '--cluster', cluster,
'encryption',
'format',
name,
@@ -117,6 +120,7 @@ def create_image(ctx, config):
if properties is None:
properties = {}
name = properties.get('image_name', default_image_name(role))
+ cluster, _, _ = teuthology.split_role(role)
(remote,) = ctx.cluster.only(role).remotes.keys()
remote.run(
args=[
@@ -124,6 +128,7 @@ def create_image(ctx, config):
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'rbd',
+ '--cluster', cluster,
'-p', 'rbd',
'rm',
name,
@@ -160,6 +165,7 @@ def clone_image(ctx, config):
properties = {}
name = properties.get('image_name', default_image_name(role))
+ cluster, _, _ = teuthology.split_role(role)
parent_name = properties.get('parent_name')
assert parent_name is not None, \
"parent_name is required"
@@ -195,7 +201,7 @@ def clone_image(ctx, config):
'adjust-ulimits',
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd', '-p', 'rbd'
+ 'rbd', '--cluster', cluster, '-p', 'rbd'
]
args.extend(cmd)
remote.run(args=args)
@@ -209,6 +215,7 @@ def clone_image(ctx, config):
if properties is None:
properties = {}
name = properties.get('image_name', default_image_name(role))
+ cluster, _, _ = teuthology.split_role(role)
parent_name = properties.get('parent_name')
parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
@@ -221,7 +228,7 @@ def clone_image(ctx, config):
'adjust-ulimits',
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
- 'rbd', '-p', 'rbd'
+ 'rbd', '--cluster', cluster, '-p', 'rbd'
]
args.extend(cmd)
remote.run(args=args)
@@ -305,6 +312,7 @@ def dev_create(ctx, config):
if properties is None:
properties = {}
name = properties.get('image_name', default_image_name(role))
+ cluster, _, _ = teuthology.split_role(role)
parent_encryption_format = properties.get('parent_encryption_format',
'none')
encryption_format = properties.get('encryption_format',
@@ -365,6 +373,7 @@ def dev_create(ctx, config):
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'rbd',
+ '--cluster', cluster,
'--id', role.rsplit('.')[-1],
'-p', 'rbd',
'map',
@@ -609,7 +618,8 @@ def xfstests(ctx, config):
running_xfstests = {}
for role, properties in runs:
- assert role.startswith('client.'), \
+ cluster, typ, _ = teuthology.split_role(role)
+ assert typ == "client", \
"task xfstests can only run on client nodes"
for host, roles_for_host in ctx.cluster.remotes.items():
if role in roles_for_host:
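The --cluster plumbing above relies on role names carrying an optional cluster prefix. A
minimal sketch of the assumed teuthology.misc.split_role semantics (the exact return
values are not part of this diff)::

    from teuthology import misc as teuthology  # assumed import, as used by the qa tasks

    # Roles are assumed to default to the "ceph" cluster when no prefix is given.
    cluster, typ, _id = teuthology.split_role('client.0')
    # expected: ('ceph', 'client', '0') -> rbd ... --cluster ceph
    cluster, typ, _id = teuthology.split_role('backup.client.1')
    # expected: ('backup', 'client', '1') -> rbd ... --cluster backup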
diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py
index f5a6f5a2615..f93ca017fa2 100644
--- a/qa/tasks/rgw_multisite.py
+++ b/qa/tasks/rgw_multisite.py
@@ -139,7 +139,10 @@ class RGWMultisite(Task):
if cluster != cluster1: # already created on master cluster
log.info('pulling realm configuration to %s', cluster.name)
- realm.pull(cluster, master_zone.gateways[0], creds)
+
+ is_default = self.config['realm'].get('is_default', False)
+ args = ['--default'] if is_default else []
+ realm.pull(cluster, master_zone.gateways[0], creds, args)
# use the first zone's cluster to create the zonegroup
if not zonegroup:
@@ -358,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config):
if endpoints:
# replace client names with their gateway endpoints
config['endpoints'] = extract_gateway_endpoints(gateways, endpoints)
+ if not config.get('api_name'): # otherwise it will be set to an empty string
+ config['api_name'] = config['name']
zonegroup = multisite.ZoneGroup(config['name'], period)
# `zonegroup set` needs --default on command line, and 'is_master' in json
args = is_default_arg(config)
diff --git a/qa/tasks/rgw_multisite_tests.py b/qa/tasks/rgw_multisite_tests.py
index 822cbcf7910..e0a38deadd2 100644
--- a/qa/tasks/rgw_multisite_tests.py
+++ b/qa/tasks/rgw_multisite_tests.py
@@ -72,7 +72,9 @@ class RGWMultisiteTests(Task):
# create test account/user
log.info('creating test user..')
user = multisite.User('rgw-multisite-test-user', account='RGW11111111111111111')
- master_zone.cluster.admin(['account', 'create', '--account-id', user.account])
+ arg = ['--account-id', user.account]
+ arg += master_zone.zone_args()
+ master_zone.cluster.admin(['account', 'create'] + arg)
user.create(master_zone, ['--display-name', 'TestUser',
'--gen-access-key', '--gen-secret'])
diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py
index 6cb75173966..fae5ef3bf00 100644
--- a/qa/tasks/rook.py
+++ b/qa/tasks/rook.py
@@ -8,7 +8,7 @@ import json
import logging
import os
import yaml
-from io import BytesIO
+from io import BytesIO, StringIO
from tarfile import ReadError
from tasks.ceph_manager import CephManager
@@ -235,10 +235,14 @@ def ceph_log(ctx, config):
r = ctx.rook[cluster_name].remote.run(
stdout=BytesIO(),
args=args,
+ stderr=StringIO(),
)
stdout = r.stdout.getvalue().decode()
if stdout:
return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
return None
if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
diff --git a/qa/tasks/s3a_hadoop.py b/qa/tasks/s3a_hadoop.py
index 7b77359fcf2..4518a6f397c 100644
--- a/qa/tasks/s3a_hadoop.py
+++ b/qa/tasks/s3a_hadoop.py
@@ -1,5 +1,6 @@
import contextlib
import logging
+import os
from teuthology import misc
from teuthology.orchestra import run
@@ -40,7 +41,7 @@ def task(ctx, config):
# get versions
maven_major = config.get('maven-major', 'maven-3')
- maven_version = config.get('maven-version', '3.6.3')
+ maven_version = config.get('maven-version', '3.9.9')
hadoop_ver = config.get('hadoop-version', '2.9.2')
bucket_name = config.get('bucket-name', 's3atest')
access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F')
@@ -48,11 +49,19 @@ def task(ctx, config):
'secret-key',
'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb')
+ # programmatically find a nearby mirror so as not to hammer archive.apache.org
+ apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+ "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+ log.info("determining apache mirror by running: " + apache_mirror_cmd)
+ apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+ log.info("chosen apache mirror is " + apache_mirror_url_front)
+
# set versions for cloning the repo
apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format(
maven_version=maven_version)
- maven_link = 'http://archive.apache.org/dist/maven/' + \
- '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven
+ maven_link = '{apache_mirror_url_front}/maven/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+ '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + \
+ apache_maven
hadoop_git = 'https://github.com/apache/hadoop'
hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver)
if hadoop_ver == 'trunk':
@@ -204,6 +213,7 @@ def run_s3atest(client, maven_version, testdir, test_options):
run.Raw('&&'),
run.Raw(rm_test),
run.Raw('&&'),
+ run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'),
run.Raw(run_test),
run.Raw(test_options)
]
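For reference, a rough pure-Python equivalent of the mirror-discovery shell pipeline
above (illustrative only; the archive.apache.org fallback is an assumption, not something
the patch adds)::

    import re
    import urllib.request

    # Scrape the first <strong>...</strong> entry from the Apache mirror page,
    # mirroring the curl | grep | sed | head pipeline.
    with urllib.request.urlopen('https://www.apache.org/dyn/closer.cgi') as resp:
        page = resp.read().decode('utf-8', errors='replace')
    match = re.search(r'<strong>([^<]*)</strong>', page)
    apache_mirror_url_front = match.group(1) if match \
        else 'http://archive.apache.org/dist/'
    # like the shell version, the chosen mirror URL normally ends with a trailing slash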
diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py
index cd0cd9d146d..85ab97d23cd 100644
--- a/qa/tasks/s3tests.py
+++ b/qa/tasks/s3tests.py
@@ -57,6 +57,17 @@ def download(ctx, config):
'git', 'reset', '--hard', sha1,
],
)
+ if client_config.get('boto3_extensions'):
+ ctx.cluster.only(client).run(
+ args=['mkdir',
+ '-p',
+ '/home/ubuntu/.aws/models/s3/2006-03-01/']
+ )
+ (remote,) = ctx.cluster.only(client).remotes.keys()
+ remote_file = '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json'
+ local_file = '{qadir}/../examples/rgw/boto3/service-2.sdk-extras.json'.format(qadir=ctx.config.get('suite_path'))
+ remote.put_file(local_file, remote_file)
+
try:
yield
finally:
@@ -70,6 +81,17 @@ def download(ctx, config):
'{tdir}/s3-tests-{client}'.format(tdir=testdir, client=client),
],
)
+ if client_config.get('boto3_extensions'):
+ ctx.cluster.only(client).run(
+ args=[
+ 'rm', '-rf', '/home/ubuntu/.aws/models/s3/2006-03-01/service-2.sdk-extras.json',
+ ],
+ )
+ ctx.cluster.only(client).run(
+ args=[
+ 'cd', '/home/ubuntu/', run.Raw('&&'), 'rmdir', '-p', '.aws/models/s3/2006-03-01/',
+ ],
+ )
def _config_user(s3tests_conf, section, user, email):
@@ -89,6 +111,8 @@ def _config_user(s3tests_conf, section, user, email):
s3tests_conf[section].setdefault('totp_seed',
base64.b32encode(os.urandom(40)).decode())
s3tests_conf[section].setdefault('totp_seconds', '5')
+ if section == 's3 tenant':
+ s3tests_conf[section].setdefault('tenant', 'testx')
@contextlib.contextmanager
@@ -442,8 +466,10 @@ def run_tests(ctx, config):
attrs += ['not fails_with_subdomain']
if not client_config.get('with-sse-s3'):
attrs += ['not sse_s3']
-
+
attrs += client_config.get('extra_attrs', [])
+ if 'bucket_logging' not in attrs:
+ attrs += ['not bucket_logging']
if 'unit_test_scan' in client_config and client_config['unit_test_scan']:
xmlfile_id = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S--") + str(uuid.uuid4())
xmlpath= f'{testdir}/archive/s3test-{xmlfile_id}.xml'
diff --git a/qa/tasks/s3tests_java.py b/qa/tasks/s3tests_java.py
index 3e20e10d06c..a58aa6cf0b4 100644
--- a/qa/tasks/s3tests_java.py
+++ b/qa/tasks/s3tests_java.py
@@ -284,6 +284,7 @@ class S3tests_java(Task):
args = ['cd',
'{tdir}/s3-tests-java'.format(tdir=testdir),
run.Raw('&&'),
+ run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'),
'/opt/gradle/gradle/bin/gradle', 'clean', 'test',
'--rerun-tasks', '--no-build-cache',
]
diff --git a/qa/tasks/stretch_mode_disable_enable.py b/qa/tasks/stretch_mode_disable_enable.py
new file mode 100644
index 00000000000..a84a85bb307
--- /dev/null
+++ b/qa/tasks/stretch_mode_disable_enable.py
@@ -0,0 +1,547 @@
+import logging
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+class TestStretchMode(MgrTestCase):
+ """
+ Test the stretch mode feature of Ceph
+ """
+ POOL = 'stretch_pool'
+ CLUSTER = "ceph"
+ WRITE_PERIOD = 10
+ RECOVERY_PERIOD = WRITE_PERIOD * 6
+ SUCCESS_HOLD_TIME = 7
+ STRETCH_CRUSH_RULE = 'stretch_rule'
+ STRETCH_CRUSH_RULE_ID = None
+ STRETCH_BUCKET_TYPE = 'datacenter'
+ TIEBREAKER_MON_NAME = 'e'
+ DEFAULT_POOL_TYPE = 'replicated'
+ DEFAULT_POOL_CRUSH_RULE = 'replicated_rule'
+ DEFAULT_POOL_SIZE = 3
+ DEFAULT_POOL_MIN_SIZE = 2
+ DEFAULT_POOL_CRUSH_RULE_ID = None
+ # This dictionary maps the datacenter to the osd ids and hosts
+ DC_OSDS = {
+ 'dc1': {
+ "host01": [0, 1],
+ "host02": [2, 3],
+ },
+ 'dc2': {
+ "host03": [4, 5],
+ "host04": [6, 7],
+ },
+ }
+ DC_MONS = {
+ 'dc1': {
+ "host01": ['a'],
+ "host02": ['b'],
+ },
+ 'dc2': {
+ "host03": ['c'],
+ "host04": ['d'],
+ },
+ 'dc3': {
+ "host05": ['e'],
+ }
+ }
+ def _osd_count(self):
+ """
+ Get the number of OSDs in the cluster.
+ """
+ osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ return len(osd_map['osds'])
+
+ def setUp(self):
+ """
+        Set up the cluster and
+ ensure we have a clean condition before the test.
+ """
+        # Ensure we have enough OSDs (skip the test if fewer than 4)
+ super(TestStretchMode, self).setUp()
+ self.DEFAULT_POOL_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.DEFAULT_POOL_CRUSH_RULE)
+ self.STRETCH_CRUSH_RULE_ID = self.mgr_cluster.mon_manager.get_crush_rule_id(self.STRETCH_CRUSH_RULE)
+ if self._osd_count() < 4:
+ self.skipTest("Not enough OSDS!")
+
+ # Remove any filesystems so that we can remove their pools
+ if self.mds_cluster:
+ self.mds_cluster.mds_stop()
+ self.mds_cluster.mds_fail()
+ self.mds_cluster.delete_all_filesystems()
+
+ # Remove all other pools
+ for pool in self.mgr_cluster.mon_manager.get_osd_dump_json()['pools']:
+ try:
+ self.mgr_cluster.mon_manager.remove_pool(pool['pool_name'])
+            except Exception:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'delete',
+ pool['pool_name'],
+ pool['pool_name'],
+ '--yes-i-really-really-mean-it')
+
+ def _setup_pool(
+ self,
+ pool_name=POOL,
+ pg_num=16,
+ pool_type=DEFAULT_POOL_TYPE,
+ crush_rule=DEFAULT_POOL_CRUSH_RULE,
+ size=None,
+ min_size=None
+ ):
+ """
+        Create a pool, and set its size and min_size if specified.
+ """
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'create', pool_name, str(pg_num), pool_type, crush_rule)
+
+ if size is not None:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name, 'size', str(size))
+
+ if min_size is not None:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool_name, 'min_size', str(min_size))
+
+ def _write_some_data(self, t):
+ """
+ Write some data to the pool to simulate a workload.
+ """
+ args = [
+ "rados", "-p", self.POOL, "bench", str(t), "write", "-t", "16"]
+ self.mgr_cluster.admin_remote.run(args=args, wait=True)
+
+ def _get_all_mons_from_all_dc(self):
+ """
+ Get all mons from all datacenters.
+ """
+ return [mon for dc in self.DC_MONS.values() for mons in dc.values() for mon in mons]
+
+ def _bring_back_mon(self, mon):
+ """
+ Bring back the mon.
+ """
+ try:
+ self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
+ except Exception:
+ log.error("Failed to bring back mon.{}".format(str(mon)))
+ pass
+
+ def _get_host(self, osd):
+ """
+ Get the host of the osd.
+ """
+ for dc, nodes in self.DC_OSDS.items():
+ for node, osds in nodes.items():
+ if osd in osds:
+ return node
+ return None
+
+ def _move_osd_back_to_host(self, osd):
+ """
+ Move the osd back to the host.
+ """
+ host = self._get_host(osd)
+ assert host is not None, "The host of osd {} is not found.".format(osd)
+ log.debug("Moving osd.%d back to %s", osd, host)
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'crush', 'move', 'osd.{}'.format(str(osd)),
+ 'host={}'.format(host)
+ )
+
+ def tearDown(self):
+ """
+ Clean up the cluster after the test.
+ """
+ # Remove the pool
+ if self.POOL in self.mgr_cluster.mon_manager.pools:
+ self.mgr_cluster.mon_manager.remove_pool(self.POOL)
+
+ osd_map = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for osd in osd_map['osds']:
+            # mark back in any OSDs that were marked out
+ if osd['weight'] == 0.0:
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'osd', 'in', str(osd['osd']))
+            # Bring back all the OSDs and move them back to their hosts.
+ if osd['up'] == 0:
+ self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
+ self._move_osd_back_to_host(osd['osd'])
+
+ # Bring back all the mons
+ mons = self._get_all_mons_from_all_dc()
+ for mon in mons:
+ self._bring_back_mon(mon)
+ super(TestStretchMode, self).tearDown()
+
+ def _kill_osd(self, osd):
+ """
+ Kill the osd.
+ """
+ try:
+ self.ctx.daemons.get_daemon('osd', osd, self.CLUSTER).stop()
+ except Exception:
+ log.error("Failed to stop osd.{}".format(str(osd)))
+ pass
+
+ def _get_osds_data(self, want_osds):
+ """
+ Get the osd data
+ """
+ all_osds_data = \
+ self.mgr_cluster.mon_manager.get_osd_dump_json()['osds']
+ return [
+ osd_data for osd_data in all_osds_data
+ if int(osd_data['osd']) in want_osds
+ ]
+
+ def _get_osds_by_dc(self, dc):
+ """
+ Get osds by datacenter.
+ """
+ ret = []
+ for host, osds in self.DC_OSDS[dc].items():
+ ret.extend(osds)
+ return ret
+
+ def _fail_over_all_osds_in_dc(self, dc):
+ """
+        Fail over all OSDs in the specified <datacenter>
+ """
+ if not isinstance(dc, str):
+ raise ValueError("dc must be a string")
+ if dc not in self.DC_OSDS:
+ raise ValueError(
+                "dc must be one of the following: %s" % ", ".join(self.DC_OSDS.keys())
+ )
+ log.debug("Failing over all osds in %s", dc)
+ osds = self._get_osds_by_dc(dc)
+ # fail over all the OSDs in the DC
+        log.debug("OSDs to be failed over: %s", osds)
+ for osd_id in osds:
+ self._kill_osd(osd_id)
+ # wait until all the osds are down
+ self.wait_until_true(
+ lambda: all([int(osd['up']) == 0
+ for osd in self._get_osds_data(osds)]),
+ timeout=self.RECOVERY_PERIOD
+ )
+
+ def _check_mons_out_of_quorum(self, want_mons):
+ """
+ Check if the mons are not in quorum.
+ """
+ quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+ return all([mon not in quorum_names for mon in want_mons])
+
+ def _kill_mon(self, mon):
+ """
+ Kill the mon.
+ """
+ try:
+ self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).stop()
+ except Exception:
+ log.error("Failed to stop mon.{}".format(str(mon)))
+ pass
+
+ def _get_mons_by_dc(self, dc):
+ """
+ Get mons by datacenter.
+ """
+ ret = []
+ for host, mons in self.DC_MONS[dc].items():
+ ret.extend(mons)
+ return ret
+
+ def _fail_over_all_mons_in_dc(self, dc):
+ """
+ Fail over all mons in the specified <datacenter>
+ """
+ if not isinstance(dc, str):
+ raise ValueError("dc must be a string")
+ if dc not in self.DC_MONS:
+ raise ValueError("dc must be one of the following: %s" %
+ ", ".join(self.DC_MONS.keys()))
+        log.debug("Failing over all mons in %s", dc)
+ mons = self._get_mons_by_dc(dc)
+ log.debug("Mons to be failed over: %s", mons)
+ for mon in mons:
+ self._kill_mon(mon)
+ # wait until all the mons are out of quorum
+ self.wait_until_true(
+ lambda: self._check_mons_out_of_quorum(mons),
+ timeout=self.RECOVERY_PERIOD
+ )
+
+ def _stretch_mode_enabled_correctly(self):
+ """
+        Evaluate whether stretch mode is enabled correctly
+        by checking the OSDMap and MonMap.
+ """
+ # Checking the OSDMap
+ osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for pool in osdmap['pools']:
+ # expects crush_rule to be stretch_rule
+ self.assertEqual(
+ self.STRETCH_CRUSH_RULE_ID,
+ pool['crush_rule']
+ )
+ # expects pool size to be 4
+ self.assertEqual(
+ 4,
+ pool['size']
+ )
+ # expects pool min_size to be 2
+ self.assertEqual(
+ 2,
+ pool['min_size']
+ )
+ # expects pool is_stretch_pool flag to be true
+ self.assertEqual(
+ True,
+ pool['is_stretch_pool']
+ )
+ # expects peering_crush_bucket_count = 2 (always this value for stretch mode)
+ self.assertEqual(
+ 2,
+ pool['peering_crush_bucket_count']
+ )
+ # expects peering_crush_bucket_target = 2 (always this value for stretch mode)
+ self.assertEqual(
+ 2,
+ pool['peering_crush_bucket_target']
+ )
+ # expects peering_crush_bucket_barrier = 8 (crush type of datacenter is 8)
+ self.assertEqual(
+ 8,
+ pool['peering_crush_bucket_barrier']
+ )
+ # expects stretch_mode_enabled to be True
+ self.assertEqual(
+ True,
+ osdmap['stretch_mode']['stretch_mode_enabled']
+ )
+        # expects stretch_bucket_count to be 2
+ self.assertEqual(
+ 2,
+ osdmap['stretch_mode']['stretch_bucket_count']
+ )
+ # expects degraded_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['degraded_stretch_mode']
+ )
+ # expects recovering_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['recovering_stretch_mode']
+ )
+ # expects stretch_mode_bucket to be 8 (datacenter crush type = 8)
+ self.assertEqual(
+ 8,
+ osdmap['stretch_mode']['stretch_mode_bucket']
+ )
+ # Checking the MonMap
+ monmap = self.mgr_cluster.mon_manager.get_mon_dump_json()
+ # expects stretch_mode to be True
+ self.assertEqual(
+ True,
+ monmap['stretch_mode']
+ )
+ # expects disallowed_leaders to be tiebreaker_mon
+ self.assertEqual(
+ self.TIEBREAKER_MON_NAME,
+ monmap['disallowed_leaders']
+ )
+ # expects tiebreaker_mon to be tiebreaker_mon
+ self.assertEqual(
+ self.TIEBREAKER_MON_NAME,
+ monmap['tiebreaker_mon']
+ )
+
+ def _stretch_mode_disabled_correctly(self):
+ """
+        Evaluate whether stretch mode is disabled correctly
+        by checking the OSDMap and MonMap.
+ """
+ # Checking the OSDMap
+ osdmap = self.mgr_cluster.mon_manager.get_osd_dump_json()
+ for pool in osdmap['pools']:
+ # expects crush_rule to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_CRUSH_RULE_ID,
+ pool['crush_rule']
+ )
+ # expects pool size to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_SIZE,
+ pool['size']
+ )
+ # expects pool min_size to be default
+ self.assertEqual(
+ self.DEFAULT_POOL_MIN_SIZE,
+ pool['min_size']
+ )
+ # expects pool is_stretch_pool flag to be false
+ self.assertEqual(
+ False,
+ pool['is_stretch_pool']
+ )
+ # expects peering_crush_bucket_count = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_count']
+ )
+ # expects peering_crush_bucket_target = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_target']
+ )
+ # expects peering_crush_bucket_barrier = 0
+ self.assertEqual(
+ 0,
+ pool['peering_crush_bucket_barrier']
+ )
+ # expects stretch_mode_enabled to be False
+ self.assertEqual(
+ False,
+ osdmap['stretch_mode']['stretch_mode_enabled']
+ )
+        # expects stretch_bucket_count to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['stretch_bucket_count']
+ )
+ # expects degraded_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['degraded_stretch_mode']
+ )
+ # expects recovering_stretch_mode to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['recovering_stretch_mode']
+ )
+ # expects stretch_mode_bucket to be 0
+ self.assertEqual(
+ 0,
+ osdmap['stretch_mode']['stretch_mode_bucket']
+ )
+ # Checking the MonMap
+ monmap = self.mgr_cluster.mon_manager.get_mon_dump_json()
+ # expects stretch_mode to be False
+ self.assertEqual(
+ False,
+ monmap['stretch_mode']
+ )
+ # expects disallowed_leaders to be empty
+ self.assertEqual(
+ "",
+ monmap['disallowed_leaders']
+ )
+ # expects tiebreaker_mon to be empty
+ self.assertEqual(
+ "",
+ monmap['tiebreaker_mon']
+ )
+
+ def test_disable_stretch_mode(self):
+ """
+ Test disabling stretch mode with the following scenario:
+ 1. Healthy Stretch Mode
+ 2. Degraded Stretch Mode
+ """
+ # Create a pool
+ self._setup_pool(self.POOL, 16, 'replicated', self.STRETCH_CRUSH_RULE, 4, 2)
+ # Write some data to the pool
+ self._write_some_data(self.WRITE_PERIOD)
+ # disable stretch mode without --yes-i-really-mean-it (expects -EPERM 1)
+ self.assertEqual(
+ 1,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode'
+ ))
+ # Disable stretch mode with non-existent crush rule (expects -EINVAL 22)
+ self.assertEqual(
+ 22,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ 'non_existent_rule',
+ '--yes-i-really-mean-it'
+ ))
+ # Disable stretch mode with the current stretch rule (expect -EINVAL 22)
+ self.assertEqual(
+ 22,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ self.STRETCH_CRUSH_RULE,
+                '--yes-i-really-mean-it'
+            ))
+ # Disable stretch mode without crush rule (expect success 0)
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ '--yes-i-really-mean-it'
+ ))
+ # Check if stretch mode is disabled correctly
+ self._stretch_mode_disabled_correctly()
+ # all PGs are active + clean
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+ # write some data to the pool
+ self._write_some_data(self.WRITE_PERIOD)
+ # Enable stretch mode
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'enable_stretch_mode',
+ self.TIEBREAKER_MON_NAME,
+ self.STRETCH_CRUSH_RULE,
+ self.STRETCH_BUCKET_TYPE
+ ))
+ self._stretch_mode_enabled_correctly()
+ # all PGs are active + clean
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active_clean(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+        # write some data to the pool (currently skipped)
+        # self._write_some_data(self.WRITE_PERIOD)
+ # Bring down dc1
+ self._fail_over_all_osds_in_dc('dc1')
+ self._fail_over_all_mons_in_dc('dc1')
+ # should be in degraded stretch mode
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.is_degraded_stretch_mode(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
+ # Disable stretch mode with valid crush rule (expect success 0)
+ self.assertEqual(
+ 0,
+ self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
+ 'mon',
+ 'disable_stretch_mode',
+ self.DEFAULT_POOL_CRUSH_RULE,
+ '--yes-i-really-mean-it'
+ ))
+ # Check if stretch mode is disabled correctly
+ self._stretch_mode_disabled_correctly()
+ # all PGs are active
+ self.wait_until_true_and_hold(
+ lambda: self.mgr_cluster.mon_manager.pg_all_active(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
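To make the intended layout easier to scan, here is the topology encoded by DC_OSDS and
DC_MONS above, restated as a small self-checking sketch (values copied from the class
constants; nothing new is introduced)::

    # dc1: host01 (osd.0-1, mon.a)  host02 (osd.2-3, mon.b)
    # dc2: host03 (osd.4-5, mon.c)  host04 (osd.6-7, mon.d)
    # dc3: host05 (tiebreaker mon.e)
    dc_osds = {'dc1': {'host01': [0, 1], 'host02': [2, 3]},
               'dc2': {'host03': [4, 5], 'host04': [6, 7]}}
    dc_mons = {'dc1': {'host01': ['a'], 'host02': ['b']},
               'dc2': {'host03': ['c'], 'host04': ['d']},
               'dc3': {'host05': ['e']}}
    assert sum(len(osds) for dc in dc_osds.values() for osds in dc.values()) == 8
    assert sum(len(mons) for dc in dc_mons.values() for mons in dc.values()) == 5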
diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml
index b70583a75e1..dbde1ced0db 100644
--- a/qa/tasks/thrashosds-health.yaml
+++ b/qa/tasks/thrashosds-health.yaml
@@ -30,3 +30,4 @@ overrides:
- out of quorum
- noscrub
- nodeep-scrub
+ - is down
diff --git a/qa/tasks/tox.py b/qa/tasks/tox.py
index 61c5b7411b4..4e4dee966d5 100644
--- a/qa/tasks/tox.py
+++ b/qa/tasks/tox.py
@@ -35,7 +35,7 @@ def task(ctx, config):
ctx.cluster.only(client).run(args=[
'source', '{tvdir}/bin/activate'.format(tvdir=tvdir),
run.Raw('&&'),
- 'pip', 'install', 'tox==3.15.0'
+ 'pip', 'install', 'tox'
])
# export the path Keystone and Tempest
diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py
index ca929ba05b4..2ed21431330 100644
--- a/qa/tasks/vstart_runner.py
+++ b/qa/tasks/vstart_runner.py
@@ -233,6 +233,11 @@ class LocalRemoteProcess(object):
else:
self.stderr.write(err)
+ def _handle_subprocess_output(self, output, stream):
+ if isinstance(stream, StringIO):
+ return rm_nonascii_chars(output)
+ return output
+
def wait(self, timeout=None):
# Null subproc.stdin so communicate() does not try flushing/closing it
# again.
@@ -250,7 +255,8 @@ class LocalRemoteProcess(object):
return
out, err = self.subproc.communicate(timeout=timeout)
- out, err = rm_nonascii_chars(out), rm_nonascii_chars(err)
+ out = self._handle_subprocess_output(out, self.stdout)
+ err = self._handle_subprocess_output(err, self.stderr)
self._write_stdout(out)
self._write_stderr(err)
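A standalone sketch of what the new _handle_subprocess_output guard does (the simple
ASCII filter below stands in for rm_nonascii_chars and is an assumption, not the real
helper)::

    from io import BytesIO, StringIO

    def handle_output(output, stream):
        # Only text destinations get the non-ASCII scrub; byte streams pass through unchanged.
        if isinstance(stream, StringIO):
            return ''.join(ch for ch in output if ord(ch) < 128)
        return output

    assert handle_output('ok\xff', StringIO()) == 'ok'
    assert handle_output(b'raw\xff', BytesIO()) == b'raw\xff'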
diff --git a/qa/tasks/workunit.py b/qa/tasks/workunit.py
index f6e55c48cd6..4fd82eaea9d 100644
--- a/qa/tasks/workunit.py
+++ b/qa/tasks/workunit.py
@@ -441,8 +441,10 @@ def _run_tests(ctx, refspec, role, tests, env, basedir,
remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
finally:
log.info('Stopping %s on %s...', tests, role)
+ # N.B. unlike before, don't cleanup path under variable "scratch_tmp"
+ # here! If the mount is broken then rm will hang. For context, see
+ # commit d4b8f94cf8d95ebb277b550fc6ebc3468052a39c.
args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
- # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang.
remote.run(
logger=log.getChild(role),
args=args,
diff --git a/qa/workunits/cephadm/test_iscsi_setup.sh b/qa/workunits/cephadm/test_iscsi_setup.sh
new file mode 100755
index 00000000000..88f379918bc
--- /dev/null
+++ b/qa/workunits/cephadm/test_iscsi_setup.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# very basic set up of iscsi gw and client
+# to make sure things are working
+
+set -ex
+
+if ! grep -q rhel /etc/*-release; then
+ echo "The script only supports CentOS."
+ exit 1
+fi
+
+# teuthology tends to put the cephadm binary built for our testing
+# branch in /home/ubuntu/cephtest/. If it's there, lets just move it
+# branch in /home/ubuntu/cephtest/. If it's there, let's just copy it
+# into /usr/sbin so we don't need to reference the full path.
+ sudo cp /home/ubuntu/cephtest/cephadm /usr/sbin/
+fi
+
+# make sure we haven't already created luns
+! sudo ls /dev/disk/by-path | grep iscsi
+
+sudo dnf install jq -y
+
+ISCSI_CONT_ID=$(sudo podman ps -qa --filter='name=iscsi' | head -n 1)
+ISCSI_DAEMON_NAME=$(sudo cephadm ls --no-detail | jq -r '.[] | select(.name | startswith("iscsi")) | .name')
+ISCSI_DAEMON_ID=$(cut -d '.' -f2- <<< "$ISCSI_DAEMON_NAME")
+HOSTNAME=$(sudo cephadm shell -- ceph orch ps --daemon-id "$ISCSI_DAEMON_ID" -f json | jq -r '.[] | .hostname')
+NODE_IP=$(sudo cephadm shell -- ceph orch host ls --format json | jq --arg HOSTNAME "$HOSTNAME" -r '.[] | select(.hostname == $HOSTNAME) | .addr')
+# The result of this python line is what iscsi will expect for the first gateway name
+FQDN=$(python3 -c 'import socket; print(socket.getfqdn())')
+# I am running this twice on purpose. I don't know why, but in my testing the first time this
+# ran it would return a different result than all subsequent runs (and take significantly longer to run).
+# The result from the first run would cause gateway creation to fail when the return value is used
+# later on. It was likely specific to my env, but it doesn't hurt to run it twice anyway. This
+# was the case whether I ran it through cephadm shell or directly on the host machine.
+FQDN=$(python3 -c 'import socket; print(socket.getfqdn())')
+ISCSI_POOL=$(sudo cephadm shell -- ceph orch ls iscsi --format json | jq -r '.[] | .spec | .pool')
+ISCSI_USER="adminadmin"
+ISCSI_PASSWORD="adminadminadmin"
+
+# gateway setup
+container_gwcli() {
+ sudo podman exec -it ${ISCSI_CONT_ID} gwcli "$@"
+}
+
+container_gwcli /iscsi-targets create iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw
+# I've seen this give a nonzero error code with an error message even when
+# creating the gateway successfully, so this command is allowed to fail.
+# If it actually failed to make the gateway, some of the follow-up commands will fail.
+container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/gateways create ${FQDN} ${NODE_IP} || true
+container_gwcli /disks create pool=${ISCSI_POOL} image=disk_1 size=2G
+container_gwcli /disks create pool=${ISCSI_POOL} image=disk_2 size=2G
+container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts create iqn.1994-05.com.redhat:client1
+container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 auth username=${ISCSI_USER} password=${ISCSI_PASSWORD}
+container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 disk add ${ISCSI_POOL}/disk_1
+container_gwcli /iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw/hosts/iqn.1994-05.com.redhat:client1 disk add ${ISCSI_POOL}/disk_2
+
+# set up multipath and some iscsi config options
+sudo dnf install -y iscsi-initiator-utils device-mapper-multipath
+
+# this next line is purposely being done without "-a" on the tee command to
+# overwrite the current initiatorname.iscsi file if it is there
+echo "GenerateName=no" | sudo tee /etc/iscsi/initiatorname.iscsi
+echo "InitiatorName=iqn.1994-05.com.redhat:client1" | sudo tee -a /etc/iscsi/initiatorname.iscsi
+
+echo "node.session.auth.authmethod = CHAP" | sudo tee -a /etc/iscsi/iscsid.conf
+echo "node.session.auth.username = ${ISCSI_USER}" | sudo tee -a /etc/iscsi/iscsid.conf
+echo "node.session.auth.password = ${ISCSI_PASSWORD}" | sudo tee -a /etc/iscsi/iscsid.conf
+
+sudo tee -a /etc/multipath.conf > /dev/null << EOF
+devices {
+ device {
+ vendor "LIO-ORG"
+ product "TCMU device"
+ hardware_handler "1 alua"
+ path_grouping_policy "failover"
+ path_selector "queue-length 0"
+ failback 60
+ path_checker tur
+ prio alua
+ prio_args exclusive_pref_bit
+ fast_io_fail_tmo 25
+ no_path_retry queue
+ }
+}
+EOF
+sudo systemctl restart multipathd
+sudo systemctl restart iscsid
+
+# client setup
+sudo iscsiadm -m discovery -t st -p ${NODE_IP}
+sudo iscsiadm -m node -T iqn.2003-01.com.redhat.iscsi-gw:iscsi-igw -l
+sudo iscsiadm -m session --rescan
+
+sleep 5
+
+# make sure we can now see luns
+sudo ls /dev/disk/by-path | grep iscsi
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index cdfff17d837..ad5950367e9 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -63,7 +63,7 @@ function retry_eagain()
for count in $(seq 1 $max) ; do
status=0
"$@" > $tmpfile 2>&1 || status=$?
- if test $status = 0 ||
+ if test $status = 0 ||
! grep --quiet EAGAIN $tmpfile ; then
break
fi
@@ -108,7 +108,7 @@ function check_response()
exit 1
fi
- if ! grep --quiet -- "$expected_string" $TMPFILE ; then
+ if ! grep --quiet -- "$expected_string" $TMPFILE ; then
echo "Didn't find $expected_string in output" >&2
cat $TMPFILE >&2
exit 1
@@ -696,7 +696,7 @@ function test_auth_profiles()
ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-ro
ceph -n client.xx-profile-rd -k client.xx.keyring auth del client.xx-profile-rw
-
+
# add a new role-definer with the existing role-definer
ceph -n client.xx-profile-rd -k client.xx.keyring \
auth add client.xx-profile-rd2 mon 'allow profile role-definer'
@@ -730,7 +730,7 @@ function test_mon_caps()
ceph-authtool -n client.bug --cap mon '' $TEMP_DIR/ceph.client.bug.keyring
ceph auth add client.bug -i $TEMP_DIR/ceph.client.bug.keyring
rados lspools --no-mon-config --keyring $TEMP_DIR/ceph.client.bug.keyring -n client.bug >& $TMPFILE || true
- check_response "Permission denied"
+ check_response "Permission denied"
}
function test_mon_misc()
@@ -780,7 +780,6 @@ function test_mon_misc()
ceph mgr dump
ceph mgr dump | jq -e '.active_clients[0].name'
ceph mgr module ls
- ceph mgr module enable restful
expect_false ceph mgr module enable foodne
ceph mgr module enable foodne --force
ceph mgr module disable foodne
@@ -1650,7 +1649,7 @@ function test_mon_osd()
dump_json=$(ceph osd dump --format=json | \
jq -cM '.osds[] | select(.osd == 0)')
[[ "${info_json}" == "${dump_json}" ]]
-
+
info_plain="$(ceph osd info)"
dump_plain="$(ceph osd dump | grep '^osd')"
[[ "${info_plain}" == "${dump_plain}" ]]
@@ -2244,7 +2243,7 @@ function test_mon_pg()
# tell osd version
#
ceph tell osd.0 version
- expect_false ceph tell osd.9999 version
+ expect_false ceph tell osd.9999 version
expect_false ceph tell osd.foo version
# back to pg stuff
@@ -2336,7 +2335,7 @@ function test_mon_osd_pool_set()
ceph osd pool get $TEST_POOL_GETSET deep_scrub_interval | expect_false grep '.'
ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.'
- ceph osd pool set $TEST_POOL_GETSET recovery_priority 5
+ ceph osd pool set $TEST_POOL_GETSET recovery_priority 5
ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: 5'
ceph osd pool set $TEST_POOL_GETSET recovery_priority -5
ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: -5'
@@ -2346,13 +2345,13 @@ function test_mon_osd_pool_set()
expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 11
ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.'
- ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5
+ ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5
ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | grep 'recovery_op_priority: 5'
ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 0
ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.'
ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.'
- ceph osd pool set $TEST_POOL_GETSET scrub_priority 5
+ ceph osd pool set $TEST_POOL_GETSET scrub_priority 5
ceph osd pool get $TEST_POOL_GETSET scrub_priority | grep 'scrub_priority: 5'
ceph osd pool set $TEST_POOL_GETSET scrub_priority 0
ceph osd pool get $TEST_POOL_GETSET scrub_priority | expect_false grep '.'
@@ -2386,10 +2385,10 @@ function test_mon_osd_pool_set()
ceph osd pool set $TEST_POOL_GETSET size 2
wait_for_clean
ceph osd pool set $TEST_POOL_GETSET min_size 2
-
+
expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 0
ceph osd pool set $TEST_POOL_GETSET hashpspool 0 --yes-i-really-mean-it
-
+
expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 1
ceph osd pool set $TEST_POOL_GETSET hashpspool 1 --yes-i-really-mean-it
@@ -2587,7 +2586,7 @@ function test_mon_osd_misc()
ceph osd map 2>$TMPFILE; check_response 'pool' $? 22
# expect error about unused argument foo
- ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22
+ ceph osd ls foo 2>$TMPFILE; check_response 'unused' $? 22
# expect "not in range" for invalid overload percentage
ceph osd reweight-by-utilization 80 2>$TMPFILE; check_response 'higher than 100' $? 22
diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh
new file mode 100755
index 00000000000..88552aa50bd
--- /dev/null
+++ b/qa/workunits/client/test_oc_disabled.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+set -ex
+
+ceph_test_client --client_oc=false
diff --git a/qa/workunits/dencoder/test_readable.py b/qa/workunits/dencoder/test_readable.py
index f032f7a9bbe..6eba0a4eb3f 100755
--- a/qa/workunits/dencoder/test_readable.py
+++ b/qa/workunits/dencoder/test_readable.py
@@ -61,7 +61,7 @@ def process_type(file_path, type):
cmd_determ = [CEPH_DENCODER, "type", type, "is_deterministic"]
determ_res = subprocess.run(cmd_determ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Check if the command failed
- if determ_res.returncode != 0:
+ if determ_res.returncode != 0 and determ_res.returncode != 1:
error_message = determ_res.stderr.decode().strip()
debug_print(f"Error running command: {error_message}")
return 1
@@ -222,7 +222,7 @@ def check_backward_compat():
version_name = version.name
_backward_compat[version_name] = {}
type_dir = archive_dir / version_name / "forward_incompat"
- if type_dir.exists() and type_dir.is_dir():
+ if type_dir.exists():
for type_entry in type_dir.iterdir():
if type_entry.is_dir():
type_name = type_entry.name
@@ -243,7 +243,8 @@ def check_backward_compat():
def process_batch(batch):
results = []
- with concurrent.futures.ThreadPoolExecutor() as executor:
+ max_workers = 15
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(
test_object_wrapper, batch_type, vdir, arversion, current_ver
@@ -259,7 +260,8 @@ def process_batch(batch):
# Create a generator that processes batches asynchronously
def async_process_batches(task_batches):
- with concurrent.futures.ProcessPoolExecutor() as executor:
+ max_workers = 10
+ with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_batch, batch) for batch in task_batches]
for future in concurrent.futures.as_completed(futures):
yield future.result()
diff --git a/qa/workunits/erasure-code/bench.sh b/qa/workunits/erasure-code/bench.sh
index fc75830dfd0..87e997c3500 100755
--- a/qa/workunits/erasure-code/bench.sh
+++ b/qa/workunits/erasure-code/bench.sh
@@ -17,7 +17,8 @@
#
# Test that it works from sources with:
#
-# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \
+# TOTAL_SIZE=$((4 * 1024 * 1024)) SIZE=4096 \
+# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \
# PLUGIN_DIRECTORY=build/lib \
# qa/workunits/erasure-code/bench.sh fplot jerasure |
# tee qa/workunits/erasure-code/bench.js
@@ -34,10 +35,14 @@
# firefox qa/workunits/erasure-code/bench.html
#
# Once it is confirmed to work, it can be run with a more significant
-# volume of data so that the measures are more reliable:
+# volume of data so that the measures are more reliable. Ideally the size
+# of the buffers (SIZE) should be larger than the L3 cache to avoid cache hits.
+# The following example uses an 80MB (80 * 1024 * 1024) buffer.
+# A larger buffer with fewer iterations (iterations = TOTAL_SIZE / SIZE) should result in
+# more time spent encoding/decoding and less time allocating/aligning buffers:
#
-# TOTAL_SIZE=$((4 * 1024 * 1024 * 1024)) \
-# CEPH_ERASURE_CODE_BENCHMARK=src/ceph_erasure_code_benchmark \
+# TOTAL_SIZE=$((100 * 80 * 1024 * 1024)) SIZE=$((80 * 1024 * 1024)) \
+# CEPH_ERASURE_CODE_BENCHMARK=build/bin/ceph_erasure_code_benchmark \
# PLUGIN_DIRECTORY=build/lib \
# qa/workunits/erasure-code/bench.sh fplot jerasure |
# tee qa/workunits/erasure-code/bench.js
@@ -51,8 +56,8 @@ export PATH=/sbin:$PATH
: ${PLUGIN_DIRECTORY:=/usr/lib/ceph/erasure-code}
: ${PLUGINS:=isa jerasure}
: ${TECHNIQUES:=vandermonde cauchy liberation reed_sol_r6_op blaum_roth liber8tion}
-: ${TOTAL_SIZE:=$((1024 * 1024))}
-: ${SIZE:=4096}
+: ${TOTAL_SIZE:=$((100 * 80 * 1024 * 1024))} #TOTAL_SIZE / SIZE = number of encode or decode iterations to run
+: ${SIZE:=$((80 * 1024 * 1024))} #size of buffer to encode/decode
: ${PARAMETERS:=--parameter jerasure-per-chunk-alignment=true}
declare -rA isa_techniques=(
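A quick worked check of the iteration arithmetic described in the comments above (a plain
restatement of TOTAL_SIZE / SIZE with the new defaults; no new parameters)::

    SIZE = 80 * 1024 * 1024      # 80 MiB buffer, intended to exceed a typical L3 cache
    TOTAL_SIZE = 100 * SIZE      # the new default TOTAL_SIZE set above
    iterations = TOTAL_SIZE // SIZE
    assert iterations == 100     # each run performs 100 encode/decode iterations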
diff --git a/qa/workunits/fs/misc/fallocate.sh b/qa/workunits/fs/misc/fallocate.sh
new file mode 100755
index 00000000000..253e6cb7a37
--- /dev/null
+++ b/qa/workunits/fs/misc/fallocate.sh
@@ -0,0 +1,17 @@
+#!/bin/sh -x
+
+# fallocate with mode 0 should fail with EOPNOTSUPP
+set -e
+mkdir -p testdir
+cd testdir
+
+expect_failure() {
+ if "$@"; then return 1; else return 0; fi
+}
+
+expect_failure fallocate -l 1M preallocated.txt
+rm -f preallocated.txt
+
+cd ..
+rmdir testdir
+echo OK
diff --git a/qa/workunits/fs/snaps/snaptest-double-null.sh b/qa/workunits/fs/snaps/snaptest-double-null.sh
index cdf32e4f0ef..833c0fd696b 100755
--- a/qa/workunits/fs/snaps/snaptest-double-null.sh
+++ b/qa/workunits/fs/snaps/snaptest-double-null.sh
@@ -11,6 +11,7 @@ mkdir a
cat > a/foo &
mkdir a/.snap/one
mkdir a/.snap/two
+wait
chmod 777 a/foo
sync # this might crash the mds
ps
diff --git a/qa/workunits/fs/snaps/snaptest-git-ceph.sh b/qa/workunits/fs/snaps/snaptest-git-ceph.sh
index 2b38720c9a5..6079ba8945b 100755
--- a/qa/workunits/fs/snaps/snaptest-git-ceph.sh
+++ b/qa/workunits/fs/snaps/snaptest-git-ceph.sh
@@ -4,7 +4,14 @@ set -e
# increase the cache size
sudo git config --global http.sslVerify false
-sudo git config --global http.postBuffer 1048576000
+sudo git config --global http.postBuffer 1024M # default is 1MB
+sudo git config --global http.maxRequestBuffer 100M # default is 10MB
+sudo git config --global core.compression 0
+
+# enable the debug logs for git clone
+export GIT_TRACE_PACKET=1
+export GIT_TRACE=1
+export GIT_CURL_VERBOSE=1
# try it again if the clone is slow and the second time
retried=false
@@ -19,6 +26,11 @@ timeout 1800 git clone https://git.ceph.com/ceph.git
trap - EXIT
cd ceph
+# disable the debug logs for git clone
+export GIT_TRACE_PACKET=0
+export GIT_TRACE=0
+export GIT_CURL_VERBOSE=0
+
versions=`seq 1 90`
for v in $versions
diff --git a/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh
new file mode 100755
index 00000000000..827fb0a0b13
--- /dev/null
+++ b/qa/workunits/mon/mon-stretch-mode-5-mons-8-osds.sh
@@ -0,0 +1,72 @@
+#!/bin/bash -ex
+
+# A bash script for setting up stretch mode with 5 monitors and 8 OSDs.
+
+NUM_OSDS_UP=$(ceph osd df | grep "up" | wc -l)
+
+if [ $NUM_OSDS_UP -lt 8 ]; then
+ echo "test requires at least 8 OSDs up and running"
+ exit 1
+fi
+
+# ensure election strategy is set to "connectivity"
+# See https://tracker.ceph.com/issues/69107
+ceph mon set election_strategy connectivity
+
+for dc in dc1 dc2
+ do
+ ceph osd crush add-bucket $dc datacenter
+ ceph osd crush move $dc root=default
+ done
+
+ceph osd crush add-bucket host01 host
+ceph osd crush add-bucket host02 host
+ceph osd crush add-bucket host03 host
+ceph osd crush add-bucket host04 host
+
+ceph osd crush move host01 datacenter=dc1
+ceph osd crush move host02 datacenter=dc1
+ceph osd crush move host03 datacenter=dc2
+ceph osd crush move host04 datacenter=dc2
+
+ceph osd crush move osd.0 host=host01
+ceph osd crush move osd.1 host=host01
+ceph osd crush move osd.2 host=host02
+ceph osd crush move osd.3 host=host02
+ceph osd crush move osd.4 host=host03
+ceph osd crush move osd.5 host=host03
+ceph osd crush move osd.6 host=host04
+ceph osd crush move osd.7 host=host04
+
+# set location for monitors
+ceph mon set_location a datacenter=dc1 host=host01
+ceph mon set_location b datacenter=dc1 host=host02
+ceph mon set_location c datacenter=dc2 host=host03
+ceph mon set_location d datacenter=dc2 host=host04
+
+# set location for tiebreaker monitor
+ceph mon set_location e datacenter=dc3 host=host05
+
+# remove the current host from crush map
+hostname=$(hostname -s)
+ceph osd crush remove $hostname
+# create a new crush rule with stretch rule
+ceph osd getcrushmap > crushmap
+crushtool --decompile crushmap > crushmap.txt
+sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt
+cat >> crushmap_modified.txt << EOF
+rule stretch_rule {
+ id 2
+ type replicated
+ step take default
+ step choose firstn 2 type datacenter
+ step chooseleaf firstn 2 type host
+ step emit
+}
+# end crush map
+EOF
+
+crushtool --compile crushmap_modified.txt -o crushmap.bin
+ceph osd setcrushmap -i crushmap.bin
+
+ceph mon enable_stretch_mode e stretch_rule datacenter
diff --git a/qa/workunits/rbd/nvmeof_basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh
index dc6fd1669da..9e7a1f5134e 100755
--- a/qa/workunits/rbd/nvmeof_basic_tests.sh
+++ b/qa/workunits/nvmeof/basic_tests.sh
@@ -38,8 +38,10 @@ disconnect_all() {
connect_all() {
sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600
sleep 5
- output=$(sudo nvme list --output-format=json)
- if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then
+ expected_devices_count=$1
+ actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l)
+ if [ "$actual_devices" -ne "$expected_devices_count" ]; then
+ sudo nvme list --output-format=json
return 1
fi
}
@@ -72,11 +74,13 @@ test_run connect
test_run list_subsys 1
test_run disconnect_all
test_run list_subsys 0
-test_run connect_all
+devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT ))
+test_run connect_all $devices_count
gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT))
test_run list_subsys $multipath_count
+
echo "-------------Test Summary-------------"
echo "[nvmeof] All nvmeof basic tests passed!"
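For clarity, both expected counts used above are simple products; a worked example with
made-up environment values (the real ones come from /etc/ceph/nvmeof.env)::

    # Example only: 4 gateway IPs, 3 subsystems, 10 namespaces per subsystem.
    NVMEOF_GATEWAY_IP_ADDRESSES = '10.0.0.1,10.0.0.2,10.0.0.3,10.0.0.4'
    NVMEOF_SUBSYSTEMS_COUNT = 3
    NVMEOF_NAMESPACES_COUNT = 10

    gateways_count = NVMEOF_GATEWAY_IP_ADDRESSES.count(',') + 1
    devices_count = NVMEOF_NAMESPACES_COUNT * NVMEOF_SUBSYSTEMS_COUNT   # passed to connect_all
    multipath_count = gateways_count * NVMEOF_SUBSYSTEMS_COUNT          # checked by list_subsys
    assert (gateways_count, devices_count, multipath_count) == (4, 30, 12)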
diff --git a/qa/workunits/rbd/nvmeof_fio_test.sh b/qa/workunits/nvmeof/fio_test.sh
index 57d355a6318..f7f783afc67 100755
--- a/qa/workunits/rbd/nvmeof_fio_test.sh
+++ b/qa/workunits/nvmeof/fio_test.sh
@@ -5,6 +5,7 @@ sudo yum -y install sysstat
namespace_range_start=
namespace_range_end=
+random_devices_count=
rbd_iostat=false
while [[ $# -gt 0 ]]; do
@@ -17,6 +18,10 @@ while [[ $# -gt 0 ]]; do
namespace_range_end=$2
shift 2
;;
+ --random_devices)
+ random_devices_count=$2
+ shift 2
+ ;;
--rbd_iostat)
rbd_iostat=true
shift
@@ -29,7 +34,7 @@ done
fio_file=$(mktemp -t nvmeof-fio-XXXX)
all_drives_list=$(sudo nvme list --output-format=json |
- jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath')
+ jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace')
# When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`),
# then fio runs on namespaces only in the defined range (which is 1 to 3 here).
@@ -37,6 +42,8 @@ all_drives_list=$(sudo nvme list --output-format=json |
# run on first 3 namespaces here.
if [ "$namespace_range_start" ] || [ "$namespace_range_end" ]; then
selected_drives=$(echo "${all_drives_list[@]}" | sed -n "${namespace_range_start},${namespace_range_end}p")
+elif [ "$random_devices_count" ]; then
+ selected_drives=$(echo "${all_drives_list[@]}" | shuf -n $random_devices_count)
else
selected_drives="${all_drives_list[@]}"
fi
diff --git a/qa/workunits/nvmeof/mtls_test.sh b/qa/workunits/nvmeof/mtls_test.sh
new file mode 100755
index 00000000000..e13ca530e8d
--- /dev/null
+++ b/qa/workunits/nvmeof/mtls_test.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+set -ex
+source /etc/ceph/nvmeof.env
+
+# install yq
+wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /tmp/yq && chmod +x /tmp/yq
+
+subjectAltName=$(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | sed 's/,/,IP:/g')
+
+# create mtls spec files
+ceph orch ls nvmeof --export > /tmp/gw-conf-original.yaml
+sudo /tmp/yq ".spec.enable_auth=true | \
+ .spec.root_ca_cert=\"mountcert\" | \
+ .spec.client_cert = load_str(\"/etc/ceph/client.crt\") | \
+ .spec.client_key = load_str(\"/etc/ceph/client.key\") | \
+ .spec.server_cert = load_str(\"/etc/ceph/server.crt\") | \
+ .spec.server_key = load_str(\"/etc/ceph/server.key\")" /tmp/gw-conf-original.yaml > /tmp/gw-conf-with-mtls.yaml
+cp /tmp/gw-conf-original.yaml /tmp/gw-conf-without-mtls.yaml
+sudo /tmp/yq '.spec.enable_auth=false' -i /tmp/gw-conf-without-mtls.yaml
+
+wait_for_service() {
+ MAX_RETRIES=30
+ for ((RETRY_COUNT=1; RETRY_COUNT<=MAX_RETRIES; RETRY_COUNT++)); do
+
+ if ceph orch ls --refresh | grep -q "nvmeof"; then
+ echo "Found nvmeof in the output!"
+ break
+ fi
+ if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
+ echo "Reached maximum retries ($MAX_RETRIES). Exiting."
+ break
+ fi
+ sleep 5
+ done
+ ceph orch ps
+ ceph orch ls --refresh
+}
+
+# deploy mtls
+cat /tmp/gw-conf-with-mtls.yaml
+ceph orch apply -i /tmp/gw-conf-with-mtls.yaml
+ceph orch redeploy nvmeof.mypool.mygroup0
+sleep 100
+wait_for_service
+
+
+# test
+IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES"
+for i in "${!gateway_ips[@]}"
+do
+ ip="${gateway_ips[i]}"
+ sudo podman run -v /etc/ceph/server.crt:/server.crt:z -v /etc/ceph/client.crt:/client.crt:z \
+ -v /etc/ceph/client.key:/client.key:z \
+ -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \
+ --client-key /client.key --client-cert /client.crt --server-cert /server.crt --format json subsystem list
+done
+
+
+# remove mtls
+cat /tmp/gw-conf-without-mtls.yaml
+ceph orch apply -i /tmp/gw-conf-without-mtls.yaml
+ceph orch redeploy nvmeof.mypool.mygroup0
+sleep 100
+wait_for_service
+
+
+# test
+IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES"
+for i in "${!gateway_ips[@]}"
+do
+ ip="${gateway_ips[i]}"
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \
+ --format json subsystem list
+done
+
diff --git a/qa/workunits/nvmeof/namespace_test.sh b/qa/workunits/nvmeof/namespace_test.sh
new file mode 100755
index 00000000000..ef331fd085b
--- /dev/null
+++ b/qa/workunits/nvmeof/namespace_test.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -xe
+
+# It's assumed in this test that each subsystem has equal number
+# of namespaces (i.e. NVMEOF_NAMESPACES_COUNT ns per subsystem).
+# This script then adds NEW_NAMESPACES_COUNT amount of namespaces
+# to each subsystem and then deletes those new namespaces.
+
+source /etc/ceph/nvmeof.env
+
+RBD_POOL="${RBD_POOL:-mypool}"
+NEW_IMAGE_SIZE="${RBD_IMAGE_SIZE:-8192}" # 1024*8
+NEW_NAMESPACES_COUNT="${NEW_NAMESPACES_COUNT:-3}"
+
+gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
+new_images_count=$(( $NVMEOF_SUBSYSTEMS_COUNT * $NEW_NAMESPACES_COUNT))
+
+
+assert_namespaces_count() {
+ expected_count_per_subsys=$1
+ actual_count=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list |
+ grep namespace_count | grep $expected_count_per_subsys | wc -l)
+ if [ "$actual_count" -ne "$NVMEOF_SUBSYSTEMS_COUNT" ]; then
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list
+        echo "Expected count of namespaces not found, expected (per subsystem): $expected_count_per_subsys"
+ return 1
+ fi
+}
+
+
+# add rbd images
+for i in $(seq 1 $new_images_count); do
+ image_name="test${i}"
+ rbd create $RBD_POOL/$image_name --size $NEW_IMAGE_SIZE
+done
+
+# add new namespaces
+image_index=1
+for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
+ subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
+ for ns in $(seq 1 $NEW_NAMESPACES_COUNT); do
+ image="test${image_index}"
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace add --subsystem $subsystem_nqn --rbd-pool $RBD_POOL --rbd-image $image --load-balancing-group $(($image_index % $gateways_count + 1))
+ ((image_index++))
+ done
+done
+
+# list namespaces
+for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
+ subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
+done
+
+# verify namespaces added
+expected_count_per_subsys=$(( $NEW_NAMESPACES_COUNT + $NVMEOF_NAMESPACES_COUNT ))
+assert_namespaces_count $expected_count_per_subsys
+
+# delete namespaces
+for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
+ subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
+ NSIDs=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json namespace list --subsystem $subsystem_nqn |
+ jq -r '.namespaces[] | select(.rbd_image_name | startswith("test")) | .nsid')
+
+ for nsid in $NSIDs; do
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace del --subsystem $subsystem_nqn --nsid $nsid
+ done
+done
+
+# verify namespaces deleted
+expected_count_per_subsys=$NVMEOF_NAMESPACES_COUNT
+assert_namespaces_count $expected_count_per_subsys
+
diff --git a/qa/workunits/nvmeof/scalability_test.sh b/qa/workunits/nvmeof/scalability_test.sh
new file mode 100755
index 00000000000..8ede4b7eda2
--- /dev/null
+++ b/qa/workunits/nvmeof/scalability_test.sh
@@ -0,0 +1,66 @@
+#!/bin/bash -xe
+
+
+GATEWAYS=$1 # example "nvmeof.a,nvmeof.b"
+DELAY="${SCALING_DELAYS:-50}"
+POOL="${RBD_POOL:-mypool}"
+GROUP="${NVMEOF_GROUP:-mygroup0}"
+source /etc/ceph/nvmeof.env
+
+if [ -z "$GATEWAYS" ]; then
+ echo "At least one gateway needs to be defined for scalability test"
+ exit 1
+fi
+
+status_checks() {
+ expected_count=$1
+
+ output=$(ceph nvme-gw show $POOL $GROUP)
+ nvme_show=$(echo $output | grep -o '"AVAILABLE"' | wc -l)
+ if [ "$nvme_show" -ne "$expected_count" ]; then
+ return 1
+ fi
+
+ orch_ls=$(ceph orch ls)
+ if ! echo "$orch_ls" | grep -q "$expected_count/$expected_count"; then
+ return 1
+ fi
+
+ output=$(ceph orch ps --service-name nvmeof.$POOL.$GROUP)
+ orch_ps=$(echo $output | grep -o 'running' | wc -l)
+ if [ "$orch_ps" -ne "$expected_count" ]; then
+ return 1
+ fi
+
+ ceph_status=$(ceph -s)
+ if ! echo "$ceph_status" | grep -q "HEALTH_OK"; then
+ return 1
+ fi
+}
+
+total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
+scaled_down_gateways_count=$(( total_gateways_count - $(echo "$GATEWAYS" | tr -cd ',' | wc -c) - 1 ))
+
+
+echo "[nvmeof.scale] Setting up config to remove gateways ${GATEWAYS}"
+ceph orch ls --service-name nvmeof.$POOL.$GROUP --export > /tmp/nvmeof-gw.yaml
+ceph orch ls nvmeof --export > /tmp/nvmeof-gw.yaml
+cat /tmp/nvmeof-gw.yaml
+
+pattern=$(echo $GATEWAYS | sed 's/,/\\|/g')
+sed "/$pattern/d" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml
+cat /tmp/nvmeof-gw-new.yaml
+
+echo "[nvmeof.scale] Starting scale testing by removing ${GATEWAYS}"
+status_checks $total_gateways_count
+ceph orch apply -i /tmp/nvmeof-gw-new.yaml # downscale
+ceph orch redeploy nvmeof.$POOL.$GROUP
+sleep $DELAY
+status_checks $scaled_down_gateways_count
+echo "[nvmeof.scale] Downscale complete - removed gateways (${GATEWAYS}); now scaling back up"
+ceph orch apply -i /tmp/nvmeof-gw.yaml #upscale
+ceph orch redeploy nvmeof.$POOL.$GROUP
+sleep $DELAY
+status_checks $total_gateways_count
+
+echo "[nvmeof.scale] Scale testing passed for ${GATEWAYS}"
diff --git a/qa/workunits/rbd/nvmeof_setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh
index fb72e1d6402..b573647b1e3 100755
--- a/qa/workunits/rbd/nvmeof_setup_subsystem.sh
+++ b/qa/workunits/nvmeof/setup_subsystem.sh
@@ -26,14 +26,21 @@ list_subsystems () {
done
}
+list_namespaces () {
+ for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
+ subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
+ done
+}
+
+echo "[nvmeof] Starting subsystem setup..."
+
# add all subsystems
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
- sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append
done
-list_subsystems
-
# add all gateway listeners
for i in "${!gateway_ips[@]}"
do
@@ -65,11 +72,5 @@ done
list_subsystems
-# list namespaces
-for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
- subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
- sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
-done
-
echo "[nvmeof] Subsystem setup done"
diff --git a/qa/workunits/rados/test_rados_tool.sh b/qa/workunits/rados/test_rados_tool.sh
index b822aa2b823..9febc4a4524 100755
--- a/qa/workunits/rados/test_rados_tool.sh
+++ b/qa/workunits/rados/test_rados_tool.sh
@@ -329,10 +329,10 @@ test_xattr() {
expect_false $RADOS_TOOL -p $POOL setxattr $OBJ 2>/dev/null
expect_false $RADOS_TOOL -p $POOL setxattr $OBJ foo fooval extraarg 2>/dev/null
$RADOS_TOOL -p $POOL setxattr $OBJ foo fooval
- $RADOS_TOOL -p $POOL getxattr $OBJ foo > $V2
+ $RADOS_TOOL -p $POOL getxattr $OBJ foo | tr -d '\n' > $V2
cmp $V1 $V2
cat $V1 | $RADOS_TOOL -p $POOL setxattr $OBJ bar
- $RADOS_TOOL -p $POOL getxattr $OBJ bar > $V2
+ $RADOS_TOOL -p $POOL getxattr $OBJ bar | tr -d '\n' > $V2
cmp $V1 $V2
$RADOS_TOOL -p $POOL listxattr $OBJ > $V1
grep -q foo $V1
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh
index ca9ffde8113..0ceb9ff54cf 100755
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -1,8 +1,6 @@
#!/usr/bin/env bash
set -ex
-. $(dirname $0)/../../standalone/ceph-helpers.sh
-
export RBD_FORCE_ALLOW_V1=1
# make sure rbd pool is EMPTY.. this is a test script!!
@@ -916,6 +914,11 @@ test_namespace() {
rbd group create rbd/test1/group1
rbd group image add rbd/test1/group1 rbd/test1/image1
+ rbd group image add --group-pool rbd --group-namespace test1 --group group1 \
+ --image-pool rbd --image-namespace test1 --image image2
+ rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \
+ --image-pool rbd --image-namespace test1 --image image1
+ rbd group image rm rbd/test1/group1 rbd/test1/image2
rbd group rm rbd/test1/group1
rbd trash move rbd/test1/image1
@@ -935,7 +938,7 @@ get_migration_state() {
local image=$1
rbd --format xml status $image |
- $XMLSTARLET sel -t -v '//status/migration/state'
+ xmlstarlet sel -t -v '//status/migration/state'
}
test_migration() {
@@ -1175,14 +1178,14 @@ test_trash_purge_schedule() {
for i in `seq 12`; do
test "$(rbd trash purge schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd' && break
+ xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd' && break
sleep 10
done
rbd trash purge schedule status
test "$(rbd trash purge schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd'
+ xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd'
test "$(rbd trash purge schedule status -p rbd --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd'
+ xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd'
rbd trash purge schedule add 2d 00:17
rbd trash purge schedule ls | grep 'every 2d starting at 00:17'
@@ -1191,36 +1194,36 @@ test_trash_purge_schedule() {
rbd trash purge schedule ls -p rbd2 -R | grep 'every 2d starting at 00:17'
rbd trash purge schedule ls -p rbd2/ns1 -R | grep 'every 2d starting at 00:17'
test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml |
- $XMLSTARLET sel -t -v '//schedules/schedule/pool')" = "-"
+ xmlstarlet sel -t -v '//schedules/schedule/pool')" = "-"
test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml |
- $XMLSTARLET sel -t -v '//schedules/schedule/namespace')" = "-"
+ xmlstarlet sel -t -v '//schedules/schedule/namespace')" = "-"
test "$(rbd trash purge schedule ls -R -p rbd2/ns1 --format xml |
- $XMLSTARLET sel -t -v '//schedules/schedule/items/item/start_time')" = "00:17:00"
+ xmlstarlet sel -t -v '//schedules/schedule/items/item/start_time')" = "00:17:00"
for i in `seq 12`; do
rbd trash purge schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool' | grep 'rbd2' && break
+ xmlstarlet sel -t -v '//scheduled/item/pool' | grep 'rbd2' && break
sleep 10
done
rbd trash purge schedule status
rbd trash purge schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool' | grep 'rbd2'
+ xmlstarlet sel -t -v '//scheduled/item/pool' | grep 'rbd2'
echo $(rbd trash purge schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool') | grep 'rbd rbd2 rbd2'
+ xmlstarlet sel -t -v '//scheduled/item/pool') | grep 'rbd rbd2 rbd2'
test "$(rbd trash purge schedule status -p rbd --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool')" = 'rbd'
+ xmlstarlet sel -t -v '//scheduled/item/pool')" = 'rbd'
test "$(echo $(rbd trash purge schedule status -p rbd2 --format xml |
- $XMLSTARLET sel -t -v '//scheduled/item/pool'))" = 'rbd2 rbd2'
+ xmlstarlet sel -t -v '//scheduled/item/pool'))" = 'rbd2 rbd2'
test "$(echo $(rbd trash purge schedule ls -R --format xml |
- $XMLSTARLET sel -t -v '//schedules/schedule/items'))" = "2d00:17:00 1d01:30:00"
+ xmlstarlet sel -t -v '//schedules/schedule/items'))" = "2d00:17:00 1d01:30:00"
rbd trash purge schedule add 1d
rbd trash purge schedule ls | grep 'every 2d starting at 00:17'
rbd trash purge schedule ls | grep 'every 1d'
rbd trash purge schedule ls -R --format xml |
- $XMLSTARLET sel -t -v '//schedules/schedule/items' | grep '2d00:17'
+ xmlstarlet sel -t -v '//schedules/schedule/items' | grep '2d00:17'
rbd trash purge schedule rm 1d
rbd trash purge schedule ls | grep 'every 2d starting at 00:17'
@@ -1362,13 +1365,13 @@ test_mirror_snapshot_schedule() {
rbd mirror snapshot schedule status
test "$(rbd mirror snapshot schedule status --format xml |
- $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
+ xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
test "$(rbd mirror snapshot schedule status -p rbd2 --format xml |
- $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
+ xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --format xml |
- $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
+ xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --image test1 --format xml |
- $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
+ xmlstarlet sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
rbd mirror image demote rbd2/ns1/test1
for i in `seq 12`; do
diff --git a/qa/workunits/rbd/cli_migration.sh b/qa/workunits/rbd/cli_migration.sh
index be8e031fd1b..3af19420957 100755
--- a/qa/workunits/rbd/cli_migration.sh
+++ b/qa/workunits/rbd/cli_migration.sh
@@ -1,17 +1,20 @@
#!/usr/bin/env bash
set -ex
-. $(dirname $0)/../../standalone/ceph-helpers.sh
-
TEMPDIR=
IMAGE1=image1
IMAGE2=image2
IMAGE3=image3
-IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3}"
+NAMESPACE1=namespace1
+NAMESPACE2=namespace2
+NAMESPACES="${NAMESPACE1} ${NAMESPACE2}"
+IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3} rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}"
cleanup() {
+ kill_nbd_server
cleanup_tempdir
remove_images
+ remove_namespaces
}
setup_tempdir() {
@@ -22,10 +25,17 @@ cleanup_tempdir() {
rm -rf ${TEMPDIR}
}
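+
+# Defined locally now that ceph-helpers.sh is no longer sourced: succeed only
+# if the given command fails.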
+expect_false() {
+ if "$@"; then return 1; else return 0; fi
+}
+
create_base_image() {
local image=$1
- rbd create --size 1G ${image}
+ # size is not a multiple of object size to trigger an edge case in
+ # list-snaps
+ rbd create --size 1025M ${image}
+
rbd bench --io-type write --io-pattern rand --io-size=4K --io-total 256M ${image}
rbd snap create ${image}@1
rbd bench --io-type write --io-pattern rand --io-size=4K --io-total 64M ${image}
@@ -36,8 +46,11 @@ create_base_image() {
export_raw_image() {
local image=$1
- rm -rf "${TEMPDIR}/${image}"
- rbd export ${image} "${TEMPDIR}/${image}"
+ # Replace slashes (/) with underscores (_) for namespace images
+ local export_image="${image//\//_}"
+
+ rm -rf "${TEMPDIR}/${export_image}"
+ rbd export "${image}" "${TEMPDIR}/${export_image}"
}
export_base_image() {
@@ -63,6 +76,17 @@ remove_images() {
done
}
+remove_namespaces() {
+ for namespace in ${NAMESPACES}
+ do
+ rbd namespace remove rbd/${namespace} || true
+ done
+}
+
+kill_nbd_server() {
+ pkill -9 qemu-nbd || true
+}
+
show_diff()
{
local file1=$1
@@ -80,6 +104,11 @@ compare_images() {
local ret=0
export_raw_image ${dst_image}
+
+ # Replace slashes (/) with underscores (_) for namespace images
+ src_image="${src_image//\//_}"
+ dst_image="${dst_image//\//_}"
+
if ! cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}"
then
show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}"
@@ -89,18 +118,26 @@ compare_images() {
}
test_import_native_format() {
- local base_image=$1
- local dest_image=$2
+ local base_image_spec=$1
+ local dest_image_spec=$2
+
+ # if base image is from namespace
+ local base_namespace=""
+ local base_image=${base_image_spec}
+ if [[ "${base_image_spec}" == rbd/*/* ]]; then
+ base_namespace=$(basename "$(dirname "${base_image_spec}")")
+ base_image=$(basename "${base_image_spec}")
+ fi
- rbd migration prepare --import-only "rbd/${base_image}@2" ${dest_image}
- rbd migration abort ${dest_image}
+ rbd migration prepare --import-only "${base_image_spec}@2" ${dest_image_spec}
+ rbd migration abort ${dest_image_spec}
local pool_id=$(ceph osd pool ls detail --format xml | xmlstarlet sel -t -v "//pools/pool[pool_name='rbd']/pool_id")
cat > ${TEMPDIR}/spec.json <<EOF
{
"type": "native",
"pool_id": ${pool_id},
- "pool_namespace": "",
+ "pool_namespace": "${base_namespace}",
"image_name": "${base_image}",
"snap_name": "2"
}
@@ -108,37 +145,85 @@ EOF
cat ${TEMPDIR}/spec.json
rbd migration prepare --import-only \
- --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec}
- compare_images "${base_image}@1" "${dest_image}@1"
- compare_images "${base_image}@2" "${dest_image}@2"
+ compare_images "${base_image_spec}@1" "${dest_image_spec}@1"
+ compare_images "${base_image_spec}@2" "${dest_image_spec}@2"
- rbd migration abort ${dest_image}
+ rbd migration abort ${dest_image_spec}
rbd migration prepare --import-only \
- --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
- rbd migration execute ${dest_image}
-
- compare_images "${base_image}@1" "${dest_image}@1"
- compare_images "${base_image}@2" "${dest_image}@2"
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec}
+ rbd migration execute ${dest_image_spec}
+
+ compare_images "${base_image_spec}@1" "${dest_image_spec}@1"
+ compare_images "${base_image_spec}@2" "${dest_image_spec}@2"
+
+ rbd migration abort ${dest_image_spec}
+
+ # no snap name or snap id
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\"}" \
+ ${dest_image_spec}
+
+ # invalid source spec JSON
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \
+ ${dest_image_spec}
+
+ # non-existing snap name
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \
+ ${dest_image_spec}
+
+ # invalid snap name
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": 123456}" \
+ ${dest_image_spec}
+
+ # non-existing snap id passed as int
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \
+ ${dest_image_spec}
+
+ # non-existing snap id passed as string
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \
+ ${dest_image_spec}
+
+ # invalid snap id
+ expect_false rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \
+ ${dest_image_spec}
+
+ # snap id passed as int
+ local snap_id=$(rbd snap ls ${base_image_spec} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id")
+ rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \
+ ${dest_image_spec}
+ rbd migration abort ${dest_image_spec}
- rbd migration abort ${dest_image}
+ # snap id passed as string
+ rbd migration prepare --import-only \
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \
+ ${dest_image_spec}
+ rbd migration abort ${dest_image_spec}
rbd migration prepare --import-only \
- --source-spec "{\"type\": \"native\", \"pool_id\": "${pool_id}", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \
- ${dest_image}
- rbd migration abort ${dest_image}
+ --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \
+ ${dest_image_spec}
+ rbd migration abort ${dest_image_spec}
rbd migration prepare --import-only \
- --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \
- ${dest_image}
- rbd migration execute ${dest_image}
- rbd migration commit ${dest_image}
+ --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \
+ ${dest_image_spec}
+ rbd migration execute ${dest_image_spec}
+ rbd migration commit ${dest_image_spec}
- compare_images "${base_image}@1" "${dest_image}@1"
- compare_images "${base_image}@2" "${dest_image}@2"
+ compare_images "${base_image_spec}@1" "${dest_image_spec}@1"
+ compare_images "${base_image_spec}@2" "${dest_image_spec}@2"
- remove_image "${dest_image}"
+ remove_image "${dest_image_spec}"
}
test_import_qcow_format() {
@@ -279,12 +364,12 @@ EOF
cat ${TEMPDIR}/spec.json
cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \
- --source-spec-path - ${dest_image}
+ --source-spec-path - ${dest_image}
compare_images ${base_image} ${dest_image}
rbd migration abort ${dest_image}
rbd migration prepare --import-only \
- --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
rbd migration execute ${dest_image}
rbd migration commit ${dest_image}
@@ -340,6 +425,177 @@ EOF
remove_image "${dest_image}"
}
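+
+# Import an image streamed over NBD (qemu-nbd exposing a qcow2 file) via a
+# raw-type source spec, covering default and named exports, URI edge cases,
+# and expected failures when the server is down or the URI is invalid.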
+test_import_nbd_stream_qcow2() {
+ local base_image=$1
+ local dest_image=$2
+
+ qemu-nbd -f qcow2 --read-only --shared 10 --persistent --fork \
+ ${TEMPDIR}/${base_image}.qcow2
+
+ cat > ${TEMPDIR}/spec.json <<EOF
+{
+ "type": "raw",
+ "stream": {
+ "type": "nbd",
+ "uri": "nbd://localhost"
+ }
+}
+EOF
+ cat ${TEMPDIR}/spec.json
+
+ cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \
+ --source-spec-path - ${dest_image}
+ compare_images ${base_image} ${dest_image}
+ rbd migration abort ${dest_image}
+
+ rbd migration prepare --import-only \
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+ compare_images ${base_image} ${dest_image}
+ rbd migration execute ${dest_image}
+ compare_images ${base_image} ${dest_image}
+ rbd migration commit ${dest_image}
+ compare_images ${base_image} ${dest_image}
+ remove_image "${dest_image}"
+
+ # shortest possible URI
+ rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://"}}' \
+ ${dest_image}
+ rbd migration abort ${dest_image}
+
+ # non-existing export name
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd:///myexport"}}' \
+ ${dest_image}
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://localhost/myexport"}}' \
+ ${dest_image}
+
+ kill_nbd_server
+ qemu-nbd --export-name myexport -f qcow2 --read-only --shared 10 --persistent --fork \
+ ${TEMPDIR}/${base_image}.qcow2
+
+ rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd:///myexport"}}' \
+ ${dest_image}
+ rbd migration abort ${dest_image}
+
+ rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://localhost/myexport"}}' \
+ ${dest_image}
+ rbd migration abort ${dest_image}
+
+ kill_nbd_server
+
+ # server not running
+ expect_false rbd migration prepare --import-only \
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://"}}' \
+ ${dest_image}
+
+ # no URI
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd"}}' \
+ ${dest_image}
+
+ # invalid URI
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": 123456}}' \
+ ${dest_image}
+
+ # libnbd - nbd_get_errno() returns an error
+ # nbd_connect_uri: unknown URI scheme: NULL: Invalid argument (errno = 22)
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": ""}}' \
+ ${dest_image}
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "foo.example.com"}}' \
+ ${dest_image}
+
+ # libnbd - nbd_get_errno() returns 0, EIO fallback
+ # nbd_connect_uri: getaddrinfo: foo.example.com:10809: Name or service not known (errno = 0)
+ expect_false rbd migration prepare --import-only \
+ --source-spec '{"type": "raw", "stream": {"type": "nbd", "uri": "nbd://foo.example.com"}}' \
+ ${dest_image}
+}
+
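+# Import a raw image and its two snapshots, each served by a separate qemu-nbd
+# instance over a unix socket, and verify the data at every migration stage.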
+test_import_nbd_stream_raw() {
+ local base_image=$1
+ local dest_image=$2
+
+ qemu-nbd -f raw --read-only --shared 10 --persistent --fork \
+ --socket ${TEMPDIR}/qemu-nbd-${base_image} ${TEMPDIR}/${base_image}
+ qemu-nbd -f raw --read-only --shared 10 --persistent --fork \
+ --socket ${TEMPDIR}/qemu-nbd-${base_image}@1 ${TEMPDIR}/${base_image}@1
+ qemu-nbd -f raw --read-only --shared 10 --persistent --fork \
+ --socket ${TEMPDIR}/qemu-nbd-${base_image}@2 ${TEMPDIR}/${base_image}@2
+
+ cat > ${TEMPDIR}/spec.json <<EOF
+{
+ "type": "raw",
+ "stream": {
+ "type": "nbd",
+ "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}"
+ },
+ "snapshots": [{
+ "type": "raw",
+ "name": "snap1",
+ "stream": {
+ "type": "nbd",
+ "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}@1"
+ }
+ }, {
+ "type": "raw",
+ "name": "snap2",
+ "stream": {
+ "type": "nbd",
+ "uri": "nbd+unix:///?socket=${TEMPDIR}/qemu-nbd-${base_image}@2"
+ }
+ }]
+}
+EOF
+ cat ${TEMPDIR}/spec.json
+
+ rbd migration prepare --import-only \
+ --source-spec-path ${TEMPDIR}/spec.json ${dest_image}
+
+ rbd snap create ${dest_image}@head
+ rbd bench --io-type write --io-pattern rand --io-size 32K --io-total 4M ${dest_image}
+
+ compare_images "${base_image}@1" "${dest_image}@snap1"
+ compare_images "${base_image}@2" "${dest_image}@snap2"
+ compare_images "${base_image}" "${dest_image}@head"
+
+ rbd migration abort ${dest_image}
+
+ cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \
+ --source-spec-path - ${dest_image}
+
+ rbd snap create ${dest_image}@head
+ rbd bench --io-type write --io-pattern rand --io-size 64K --io-total 8M ${dest_image}
+
+ compare_images "${base_image}@1" "${dest_image}@snap1"
+ compare_images "${base_image}@2" "${dest_image}@snap2"
+ compare_images "${base_image}" "${dest_image}@head"
+
+ rbd migration execute ${dest_image}
+
+ compare_images "${base_image}@1" "${dest_image}@snap1"
+ compare_images "${base_image}@2" "${dest_image}@snap2"
+ compare_images "${base_image}" "${dest_image}@head"
+
+ rbd migration commit ${dest_image}
+
+ compare_images "${base_image}@1" "${dest_image}@snap1"
+ compare_images "${base_image}@2" "${dest_image}@snap2"
+ compare_images "${base_image}" "${dest_image}@head"
+
+ remove_image "${dest_image}"
+
+ kill_nbd_server
+}
+
# make sure rbd pool is EMPTY.. this is a test script!!
rbd ls 2>&1 | wc -l | grep -v '^0$' && echo "nonempty rbd pool, aborting! run this script on an empty test cluster only." && exit 1
@@ -351,7 +607,25 @@ export_base_image ${IMAGE1}
test_import_native_format ${IMAGE1} ${IMAGE2}
test_import_qcow_format ${IMAGE1} ${IMAGE2}
+
test_import_qcow2_format ${IMAGE2} ${IMAGE3}
+test_import_nbd_stream_qcow2 ${IMAGE2} ${IMAGE3}
+
test_import_raw_format ${IMAGE1} ${IMAGE2}
+test_import_nbd_stream_raw ${IMAGE1} ${IMAGE2}
+
+rbd namespace create rbd/${NAMESPACE1}
+rbd namespace create rbd/${NAMESPACE2}
+create_base_image rbd/${NAMESPACE1}/${IMAGE1}
+export_base_image rbd/${NAMESPACE1}/${IMAGE1}
+
+# Migration from namespace to namespace
+test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}
+
+# Migration from namespace to non-namespace
+test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} ${IMAGE2}
+
+# Migration from non-namespace to namespace
+test_import_native_format ${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}
echo OK
diff --git a/qa/workunits/rbd/journal.sh b/qa/workunits/rbd/journal.sh
index ba89e75c926..7652a274243 100755
--- a/qa/workunits/rbd/journal.sh
+++ b/qa/workunits/rbd/journal.sh
@@ -1,8 +1,6 @@
#!/usr/bin/env bash
set -e
-. $(dirname $0)/../../standalone/ceph-helpers.sh
-
function list_tests()
{
echo "AVAILABLE TESTS"
@@ -45,7 +43,7 @@ test_rbd_journal()
rbd create --image-feature exclusive-lock --image-feature journaling \
--size 128 ${image}
local journal=$(rbd info ${image} --format=xml 2>/dev/null |
- $XMLSTARLET sel -t -v "//image/journal")
+ xmlstarlet sel -t -v "//image/journal")
test -n "${journal}"
rbd journal info ${journal}
rbd journal info --journal ${journal}
@@ -54,14 +52,14 @@ test_rbd_journal()
rbd feature disable ${image} journaling
rbd info ${image} --format=xml 2>/dev/null |
- expect_false $XMLSTARLET sel -t -v "//image/journal"
+ expect_false xmlstarlet sel -t -v "//image/journal"
expect_false rbd journal info ${journal}
expect_false rbd journal info --image ${image}
rbd feature enable ${image} journaling
local journal1=$(rbd info ${image} --format=xml 2>/dev/null |
- $XMLSTARLET sel -t -v "//image/journal")
+ xmlstarlet sel -t -v "//image/journal")
test "${journal}" = "${journal1}"
rbd journal info ${journal}
@@ -89,7 +87,7 @@ test_rbd_journal()
rbd create --image-feature exclusive-lock --image-feature journaling \
--size 128 ${image1}
journal1=$(rbd info ${image1} --format=xml 2>/dev/null |
- $XMLSTARLET sel -t -v "//image/journal")
+ xmlstarlet sel -t -v "//image/journal")
save_commit_position ${journal1}
rbd journal import --dest ${image1} $TMPDIR/journal.export
@@ -130,7 +128,7 @@ rbd_assert_eq() {
local expected_val=$4
local val=$(rbd --format xml ${cmd} --image ${image} |
- $XMLSTARLET sel -t -v "${param}")
+ xmlstarlet sel -t -v "${param}")
test "${val}" = "${expected_val}"
}
diff --git a/qa/workunits/rbd/luks-encryption.sh b/qa/workunits/rbd/luks-encryption.sh
index 97cb5a0fe87..b6305cb46c6 100755
--- a/qa/workunits/rbd/luks-encryption.sh
+++ b/qa/workunits/rbd/luks-encryption.sh
@@ -2,7 +2,7 @@
set -ex
CEPH_ID=${CEPH_ID:-admin}
-TMP_FILES="/tmp/passphrase /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2"
+TMP_FILES="/tmp/passphrase /tmp/passphrase1 /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2"
_sudo()
{
@@ -278,8 +278,7 @@ function test_migration_clone() {
rbd migration prepare testimg1 testimg2
# test reading
- # FIXME: https://tracker.ceph.com/issues/63184
- LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
cmp $LIBRBD_DEV /tmp/cmpdata
# trigger copyup for an unwritten area
@@ -297,8 +296,7 @@ function test_migration_clone() {
_sudo rbd device unmap -t nbd $LIBRBD_DEV
# test reading on a fresh mapping
- # FIXME: https://tracker.ceph.com/issues/63184
- LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
cmp $LIBRBD_DEV /tmp/cmpdata
_sudo rbd device unmap -t nbd $LIBRBD_DEV
@@ -320,6 +318,85 @@ function test_migration_clone() {
rbd rm testimg
}
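+
+# Build a three-image encrypted clone chain (luks1 <- luks2 <- luks1) and
+# verify that it can still be mapped with the full passphrase list while any
+# combination of images in the chain is in the migrating state.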
+function test_migration_open_clone_chain() {
+ rbd create --size 32M testimg
+ rbd encryption format testimg luks1 /tmp/passphrase
+ rbd snap create testimg@snap
+ rbd snap protect testimg@snap
+
+ rbd clone testimg@snap testimg1
+ rbd encryption format testimg1 luks2 /tmp/passphrase1
+ rbd snap create testimg1@snap
+ rbd snap protect testimg1@snap
+
+ rbd clone testimg1@snap testimg2
+ rbd encryption format testimg2 luks1 /tmp/passphrase2
+
+ # 1. X <-- X <-- X
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+
+ # 2. X <-- X <-- migrating
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+
+ # 3. X <-- migrating <-- X
+ rbd migration prepare testimg1 testimg1
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg1
+
+ # 4. migrating <-- X <-- X
+ rbd migration prepare testimg testimg
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg
+
+ # 5. migrating <-- migrating <-- X
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg1 testimg1
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg1
+ rbd migration abort testimg
+
+ # 6. migrating <-- X <-- migrating
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+ rbd migration abort testimg
+
+ # 7. X <-- migrating <-- migrating
+ rbd migration prepare testimg1 testimg1
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+ rbd migration abort testimg1
+
+ # 8. migrating <-- migrating <-- migrating
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg1 testimg1
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+
+ rbd migration abort testimg2
+ rbd rm testimg2
+ rbd migration abort testimg1
+ rbd snap unprotect testimg1@snap
+ rbd snap rm testimg1@snap
+ rbd rm testimg1
+ rbd migration abort testimg
+ rbd snap unprotect testimg@snap
+ rbd snap rm testimg@snap
+ rbd rm testimg
+}
+
function get_nbd_device_paths {
rbd device list -t nbd | tail -n +2 | egrep "\s+rbd\s+testimg" | awk '{print $5;}'
}
@@ -343,6 +420,7 @@ function clean_up {
rbd snap unprotect testimg1@snap || true
rbd snap remove testimg1@snap || true
rbd remove testimg1 || true
+ rbd migration abort testimg || true
rbd snap remove testimg@snap2 || true
rbd snap remove testimg@snap1 || true
rbd snap unprotect testimg@snap || true
@@ -371,6 +449,7 @@ dd if=/dev/urandom of=/tmp/testdata2 bs=4M count=4
# create passphrase files
printf "pass\0word\n" > /tmp/passphrase
+printf " passwo\nrd 1,1" > /tmp/passphrase1
printf "\t password2 " > /tmp/passphrase2
# create an image
@@ -401,4 +480,6 @@ test_migration_clone luks1
rbd create --size 48M testimg
test_migration_clone luks2
+test_migration_open_clone_chain
+
echo OK
diff --git a/qa/workunits/rbd/rbd-ggate.sh b/qa/workunits/rbd/rbd-ggate.sh
index 1bf89da382c..d1dd00e4e2d 100755
--- a/qa/workunits/rbd/rbd-ggate.sh
+++ b/qa/workunits/rbd/rbd-ggate.sh
@@ -7,15 +7,6 @@ SIZE=64
DATA=
DEV=
-if which xmlstarlet > /dev/null 2>&1; then
- XMLSTARLET=xmlstarlet
-elif which xml > /dev/null 2>&1; then
- XMLSTARLET=xml
-else
- echo "Missing xmlstarlet binary!"
- exit 1
-fi
-
if [ `uname -K` -ge 1200078 ] ; then
RBD_GGATE_RESIZE_SUPPORTED=1
fi
@@ -148,16 +139,16 @@ _sudo sync
echo trim test
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -eq "${provisioned}" ]
_sudo newfs -E ${DEV}
_sudo sync
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -lt "${provisioned}" ]
echo resize test
diff --git a/qa/workunits/rbd/rbd-nbd.sh b/qa/workunits/rbd/rbd-nbd.sh
index 98b3aff1370..1f9acd14492 100755
--- a/qa/workunits/rbd/rbd-nbd.sh
+++ b/qa/workunits/rbd/rbd-nbd.sh
@@ -1,8 +1,6 @@
#!/usr/bin/env bash
set -ex
-. $(dirname $0)/../../standalone/ceph-helpers.sh
-
POOL=rbd
ANOTHER_POOL=new_default_pool$$
NS=ns
@@ -105,7 +103,7 @@ function get_pid()
local pool=$1
local ns=$2
- PID=$(rbd device --device-type nbd --format xml list | $XMLSTARLET sel -t -v \
+ PID=$(rbd device --device-type nbd --format xml list | xmlstarlet sel -t -v \
"//devices/device[pool='${pool}'][namespace='${ns}'][image='${IMAGE}'][device='${DEV}']/id")
test -n "${PID}" || return 1
ps -p ${PID} -C rbd-nbd
@@ -172,17 +170,17 @@ unmap_device ${DEV} ${PID}
DEV=`_sudo rbd device --device-type nbd --options notrim map ${POOL}/${IMAGE}`
get_pid ${POOL}
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -eq "${provisioned}" ]
# should fail discard as at time of mapping notrim was used
expect_false _sudo blkdiscard ${DEV}
sync
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -eq "${provisioned}" ]
unmap_device ${DEV} ${PID}
@@ -190,17 +188,17 @@ unmap_device ${DEV} ${PID}
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
get_pid ${POOL}
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -eq "${provisioned}" ]
# should honor discard as at time of mapping trim was considered by default
_sudo blkdiscard ${DEV}
sync
provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/provisioned_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/provisioned_size" -v .`
used=`rbd -p ${POOL} --format xml du ${IMAGE} |
- $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
+ xmlstarlet sel -t -m "//stats/images/image/used_size" -v .`
[ "${used}" -lt "${provisioned}" ]
unmap_device ${DEV} ${PID}
diff --git a/qa/workunits/rbd/rbd_groups.sh b/qa/workunits/rbd/rbd_groups.sh
index 450095dabfc..ee3cb506740 100755
--- a/qa/workunits/rbd/rbd_groups.sh
+++ b/qa/workunits/rbd/rbd_groups.sh
@@ -210,15 +210,32 @@ check_snapshot_info()
local snap_name=$2
local image_count=$3
- local snap_info=$(rbd group snap info $group_name@$snap_name --format=json)
- local actual_snap_name=$(jq -r ".name" <<< "$snap_info")
+ local snap_info_json=$(
+ rbd group snap info $group_name@$snap_name --format=json)
+ local actual_snap_name=$(jq -r ".name" <<< "$snap_info_json")
test "$actual_snap_name" = "$snap_name" || return 1
- local snap_state=$(jq -r ".state" <<< "$snap_info")
+ local snap_state=$(jq -r ".state" <<< "$snap_info_json")
test "$snap_state" = "complete" || return 1
- local actual_image_count=$(jq '.images | length' <<< "$snap_info")
- test "$actual_image_count" = "$image_count"
+ local actual_image_count=$(jq '.images | length' <<< "$snap_info_json")
+ test "$actual_image_count" = "$image_count" || return 1
+
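+    # Also check the plain-format output: the 'image snap:' and 'images:'
+    # fields should appear only when the group snapshot contains images.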
+ local image_snap_name=$(jq -r '.image_snap_name' <<< "$snap_info_json")
+ local snap_info=$(rbd group snap info $group_name@$snap_name)
+ local snap_state=$(grep -w 'state:' <<< "$snap_info" | tr -d '\t')
+ test "$snap_state" = "state: complete" || return 1
+ local image_snap_field=$(grep -w 'image snap:' <<< "$snap_info")
+ local images_field=$(grep -w 'images:' <<< "$snap_info")
+ if ((image_count != 0)); then
+ test -n "$image_snap_name" || return 1
+ test -n "$image_snap_field" || return 1
+ test -n "$images_field" || return 1
+ else
+ test -z "$image_snap_name" || return 1
+ test -z "$image_snap_field" || return 1
+ test -z "$images_field" || return 1
+ fi
}
echo "TEST: create remove consistency group"
diff --git a/qa/workunits/rbd/rbd_mirror.sh b/qa/workunits/rbd/rbd_mirror.sh
index 1cda355039e..90d5204b92f 100755
--- a/qa/workunits/rbd/rbd_mirror.sh
+++ b/qa/workunits/rbd/rbd_mirror.sh
@@ -37,12 +37,12 @@ set_image_meta ${CLUSTER2} ${POOL} ${image} "key1" "value1"
set_image_meta ${CLUSTER2} ${POOL} ${image} "key2" "value2"
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'down+unknown'
fi
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
compare_image_meta ${CLUSTER1} ${POOL} ${image} "key1" "value1"
compare_image_meta ${CLUSTER1} ${POOL} ${image} "key2" "value2"
@@ -53,19 +53,19 @@ create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${image1} ${RBD_MIRROR_MODE}
write_image ${CLUSTER2} ${POOL} ${image1} 100
start_mirrors ${CLUSTER1}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image1}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image1}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image1}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1}
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image1} 'down+unknown'
fi
-compare_images ${POOL} ${image1}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image1}
testlog "TEST: test the first image is replaying after restart"
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
testlog "TEST: stop/start/restart mirror via admin socket"
@@ -173,7 +173,7 @@ wait_for_image_in_omap ${CLUSTER2} ${POOL}
create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${image} ${RBD_MIRROR_MODE}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying'
testlog "TEST: failover and failback"
@@ -187,10 +187,10 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
# failover (unmodified)
demote_image ${CLUSTER2} ${POOL} ${image}
@@ -207,10 +207,10 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
# failover
demote_image ${CLUSTER2} ${POOL} ${image}
@@ -220,10 +220,10 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
write_image ${CLUSTER1} ${POOL} ${image} 100
-wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${POOL} ${image}
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
# failback
demote_image ${CLUSTER1} ${POOL} ${image}
@@ -233,10 +233,10 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
testlog "TEST: failover / failback loop"
for i in `seq 1 20`; do
@@ -246,7 +246,7 @@ for i in `seq 1 20`; do
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying'
demote_image ${CLUSTER1} ${POOL} ${image}
@@ -255,7 +255,7 @@ for i in `seq 1 20`; do
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying'
done
@@ -271,7 +271,7 @@ create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${force_promote_image} ${RBD_
write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${force_promote_image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${force_promote_image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${force_promote_image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${force_promote_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${force_promote_image}
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${force_promote_image} 'up+stopped'
promote_image ${CLUSTER1} ${POOL} ${force_promote_image} '--force'
@@ -302,14 +302,14 @@ else
enable_mirror ${CLUSTER2} ${PARENT_POOL} ${parent_image} ${RBD_MIRROR_MODE}
fi
wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} ${parent_image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${parent_image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} ${parent_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} ${parent_image}
-compare_images ${PARENT_POOL} ${parent_image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} ${parent_image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${clone_image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${clone_image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${clone_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${clone_image}
-compare_images ${POOL} ${clone_image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${clone_image}
remove_image_retry ${CLUSTER2} ${POOL} ${clone_image}
testlog " - clone v1"
@@ -383,11 +383,11 @@ create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap1'
write_image ${CLUSTER2} ${POOL} ${dp_image} 100
create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap2'
write_image ${CLUSTER2} ${POOL} ${dp_image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${dp_image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${dp_image}
-compare_images ${POOL} ${dp_image}@snap1
-compare_images ${POOL} ${dp_image}@snap2
-compare_images ${POOL} ${dp_image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}@snap1
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}@snap2
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${dp_image}
remove_image_retry ${CLUSTER2} ${POOL} ${dp_image}
testlog "TEST: disable mirroring / delete non-primary image"
@@ -436,8 +436,8 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
wait_for_image_present ${CLUSTER1} ${POOL} ${i} 'present'
wait_for_snap_present ${CLUSTER1} ${POOL} ${i} 'snap2'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${i}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${i}
- compare_images ${POOL} ${i}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${i}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${i}
done
testlog "TEST: remove mirroring pool"
@@ -454,9 +454,9 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
create_image ${CLUSTER2} ${POOL} ${rdp_image} 128 --data-pool ${pool}
write_image ${CLUSTER2} ${pool} ${image} 100
write_image ${CLUSTER2} ${POOL} ${rdp_image} 100
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${pool} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${pool} ${pool} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${pool} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${rdp_image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${rdp_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${rdp_image}
for cluster in ${CLUSTER1} ${CLUSTER2}; do
CEPH_ARGS='' ceph --cluster ${cluster} osd pool rm ${pool} ${pool} --yes-i-really-really-mean-it
@@ -519,12 +519,12 @@ wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS1} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS2} ${image}
write_image ${CLUSTER2} ${POOL}/${NS1} ${image} 100
write_image ${CLUSTER2} ${POOL}/${NS2} ${image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${POOL}/${NS2} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS1} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS2} ${image}
-compare_images ${POOL}/${NS1} ${image}
-compare_images ${POOL}/${NS2} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS2} ${POOL}/${NS2} ${image}
testlog " - disable mirroring / delete image"
remove_image_retry ${CLUSTER2} ${POOL}/${NS1} ${image}
@@ -533,6 +533,40 @@ wait_for_image_present ${CLUSTER1} ${POOL}/${NS1} ${image} 'deleted'
wait_for_image_present ${CLUSTER1} ${POOL}/${NS2} ${image} 'deleted'
remove_image_retry ${CLUSTER2} ${POOL}/${NS2} ${image}
+testlog "TEST: mirror to a different remote namespace"
+testlog " - replay"
+NS3=ns3
+NS4=ns4
+rbd --cluster ${CLUSTER1} namespace create ${POOL}/${NS3}
+rbd --cluster ${CLUSTER2} namespace create ${POOL}/${NS4}
+rbd --cluster ${CLUSTER1} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE} --remote-namespace ${NS4}
+rbd --cluster ${CLUSTER2} mirror pool enable ${POOL}/${NS4} ${MIRROR_POOL_MODE} --remote-namespace ${NS3}
+create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS4} ${image} ${RBD_MIRROR_MODE}
+wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS3} ${image}
+write_image ${CLUSTER2} ${POOL}/${NS4} ${image} 100
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image}
+wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS3} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS4} ${image}
+
+testlog " - disable mirroring and re-enable without remote-namespace"
+remove_image_retry ${CLUSTER2} ${POOL}/${NS4} ${image}
+wait_for_image_present ${CLUSTER1} ${POOL}/${NS3} ${image} 'deleted'
+rbd --cluster ${CLUSTER1} mirror pool disable ${POOL}/${NS3}
+rbd --cluster ${CLUSTER2} mirror pool disable ${POOL}/${NS4}
+rbd --cluster ${CLUSTER2} namespace create ${POOL}/${NS3}
+rbd --cluster ${CLUSTER2} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE}
+rbd --cluster ${CLUSTER1} mirror pool enable ${POOL}/${NS3} ${MIRROR_POOL_MODE}
+create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS3} ${image} ${RBD_MIRROR_MODE}
+wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS3} ${image}
+write_image ${CLUSTER2} ${POOL}/${NS3} ${image} 100
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS3} ${image}
+wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS3} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS3} ${POOL}/${NS3} ${image}
+remove_image_retry ${CLUSTER2} ${POOL}/${NS3} ${image}
+wait_for_image_present ${CLUSTER1} ${POOL}/${NS3} ${image} 'deleted'
+rbd --cluster ${CLUSTER1} mirror pool disable ${POOL}/${NS3}
+rbd --cluster ${CLUSTER2} mirror pool disable ${POOL}/${NS3}
+
testlog " - data pool"
dp_image=test_data_pool
create_image_and_enable_mirror ${CLUSTER2} ${POOL}/${NS1} ${dp_image} ${RBD_MIRROR_MODE} 128 --data-pool ${PARENT_POOL}
@@ -542,9 +576,9 @@ wait_for_image_replay_started ${CLUSTER1} ${POOL}/${NS1} ${dp_image}
data_pool=$(get_image_data_pool ${CLUSTER1} ${POOL}/${NS1} ${dp_image})
test "${data_pool}" = "${PARENT_POOL}"
write_image ${CLUSTER2} ${POOL}/${NS1} ${dp_image} 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${dp_image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${dp_image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL}/${NS1} ${dp_image}
-compare_images ${POOL}/${NS1} ${dp_image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL}/${NS1} ${POOL}/${NS1} ${dp_image}
remove_image_retry ${CLUSTER2} ${POOL}/${NS1} ${dp_image}
testlog "TEST: simple image resync"
@@ -553,7 +587,7 @@ wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
testlog "TEST: image resync while replayer is stopped"
@@ -566,7 +600,7 @@ if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
- compare_images ${POOL} ${image}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
fi
testlog "TEST: request image resync while daemon is offline"
@@ -577,7 +611,7 @@ wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${POOL} ${image}
-compare_images ${POOL} ${image}
+compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
remove_image_retry ${CLUSTER2} ${POOL} ${image}
if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
@@ -588,7 +622,7 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
testlog " - replay stopped after disconnect"
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
disconnect_image ${CLUSTER2} ${POOL} ${image}
test -z "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
@@ -600,9 +634,9 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
- compare_images ${POOL} ${image}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
testlog " - disconnected after max_concurrent_object_sets reached"
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
@@ -628,25 +662,25 @@ if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
- compare_images ${POOL} ${image}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
testlog " - rbd_mirroring_resync_after_disconnect config option"
set_image_meta ${CLUSTER2} ${POOL} ${image} \
conf_rbd_mirroring_resync_after_disconnect true
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
image_id=$(get_image_id ${CLUSTER1} ${POOL} ${image})
disconnect_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted' ${image_id}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'present'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
test -n "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
- compare_images ${POOL} ${image}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
set_image_meta ${CLUSTER2} ${POOL} ${image} \
conf_rbd_mirroring_resync_after_disconnect false
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
disconnect_image ${CLUSTER2} ${POOL} ${image}
test -z "$(get_mirror_journal_position ${CLUSTER2} ${POOL} ${image})"
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index 412e84c88a6..3ddb0aa219b 100755
--- a/qa/workunits/rbd/rbd_mirror_bootstrap.sh
+++ b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
@@ -38,7 +38,7 @@ create_image_and_enable_mirror ${CLUSTER1} ${POOL} image1
wait_for_image_replay_started ${CLUSTER2} ${POOL} image1
write_image ${CLUSTER1} ${POOL} image1 100
-wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} image1
+wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${POOL} image1
wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${POOL} image1
testlog "TEST: verify rx-tx direction"
@@ -54,12 +54,12 @@ enable_mirror ${CLUSTER2} ${PARENT_POOL} image2
wait_for_image_replay_started ${CLUSTER2} ${PARENT_POOL} image1
write_image ${CLUSTER1} ${PARENT_POOL} image1 100
-wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} image1
+wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${PARENT_POOL} ${PARENT_POOL} image1
wait_for_replaying_status_in_pool_dir ${CLUSTER2} ${PARENT_POOL} image1
wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} image2
write_image ${CLUSTER2} ${PARENT_POOL} image2 100
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} image2
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${PARENT_POOL} image2
wait_for_replaying_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} image2
testlog "TEST: pool replayer and callout cleanup when peer is updated"
diff --git a/qa/workunits/rbd/rbd_mirror_ha.sh b/qa/workunits/rbd/rbd_mirror_ha.sh
index 1e43712a631..e5a086b82ab 100755
--- a/qa/workunits/rbd/rbd_mirror_ha.sh
+++ b/qa/workunits/rbd/rbd_mirror_ha.sh
@@ -71,7 +71,7 @@ test_replay()
wait_for_image_replay_started ${CLUSTER1}:${LEADER} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1}:${LEADER} ${CLUSTER2} ${POOL} \
- ${image}
+ ${POOL} ${image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' \
'primary_position' \
"${MIRROR_USER_ID_PREFIX}${LEADER} on $(hostname -s)"
@@ -79,7 +79,7 @@ test_replay()
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} \
'down+unknown'
fi
- compare_images ${POOL} ${image}
+ compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
done
}
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index abb1d17c8df..1b1436db74d 100755
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -72,15 +72,6 @@
# ../qa/workunits/rbd/rbd_mirror_helpers.sh cleanup
#
-if type xmlstarlet > /dev/null 2>&1; then
- XMLSTARLET=xmlstarlet
-elif type xml > /dev/null 2>&1; then
- XMLSTARLET=xml
-else
- echo "Missing xmlstarlet binary!"
- exit 1
-fi
-
RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-2}
CLUSTER1=cluster1
@@ -752,17 +743,18 @@ wait_for_journal_replay_complete()
{
local local_cluster=$1
local cluster=$2
- local pool=$3
- local image=$4
+ local local_pool=$3
+ local remote_pool=$4
+ local image=$5
local s master_pos mirror_pos last_mirror_pos
local master_tag master_entry mirror_tag mirror_entry
while true; do
for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16 32 32; do
sleep ${s}
- flush "${local_cluster}" "${pool}" "${image}"
- master_pos=$(get_master_journal_position "${cluster}" "${pool}" "${image}")
- mirror_pos=$(get_mirror_journal_position "${cluster}" "${pool}" "${image}")
+ flush "${local_cluster}" "${local_pool}" "${image}"
+ master_pos=$(get_master_journal_position "${cluster}" "${remote_pool}" "${image}")
+ mirror_pos=$(get_mirror_journal_position "${cluster}" "${remote_pool}" "${image}")
test -n "${master_pos}" -a "${master_pos}" = "${mirror_pos}" && return 0
test "${mirror_pos}" != "${last_mirror_pos}" && break
done
@@ -805,21 +797,22 @@ wait_for_snapshot_sync_complete()
{
local local_cluster=$1
local cluster=$2
- local pool=$3
- local image=$4
+ local local_pool=$3
+ local remote_pool=$4
+ local image=$5
- local status_log=${TEMPDIR}/$(mkfname ${cluster}-${pool}-${image}.status)
- local local_status_log=${TEMPDIR}/$(mkfname ${local_cluster}-${pool}-${image}.status)
+ local status_log=${TEMPDIR}/$(mkfname ${cluster}-${remote_pool}-${image}.status)
+ local local_status_log=${TEMPDIR}/$(mkfname ${local_cluster}-${local_pool}-${image}.status)
- mirror_image_snapshot "${cluster}" "${pool}" "${image}"
- get_newest_mirror_snapshot "${cluster}" "${pool}" "${image}" "${status_log}"
+ mirror_image_snapshot "${cluster}" "${remote_pool}" "${image}"
+ get_newest_mirror_snapshot "${cluster}" "${remote_pool}" "${image}" "${status_log}"
local snapshot_id=$(xmlstarlet sel -t -v "//snapshot/id" < ${status_log})
while true; do
for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16 32 32; do
sleep ${s}
- get_newest_mirror_snapshot "${local_cluster}" "${pool}" "${image}" "${local_status_log}"
+ get_newest_mirror_snapshot "${local_cluster}" "${local_pool}" "${image}" "${local_status_log}"
local primary_snapshot_id=$(xmlstarlet sel -t -v "//snapshot/namespace/primary_snap_id" < ${local_status_log})
test "${snapshot_id}" = "${primary_snapshot_id}" && return 0
@@ -834,13 +827,14 @@ wait_for_replay_complete()
{
local local_cluster=$1
local cluster=$2
- local pool=$3
- local image=$4
+ local local_pool=$3
+ local remote_pool=$4
+ local image=$5
if [ "${RBD_MIRROR_MODE}" = "journal" ]; then
- wait_for_journal_replay_complete ${local_cluster} ${cluster} ${pool} ${image}
+ wait_for_journal_replay_complete ${local_cluster} ${cluster} ${local_pool} ${remote_pool} ${image}
elif [ "${RBD_MIRROR_MODE}" = "snapshot" ]; then
- wait_for_snapshot_sync_complete ${local_cluster} ${cluster} ${pool} ${image}
+ wait_for_snapshot_sync_complete ${local_cluster} ${cluster} ${local_pool} ${remote_pool} ${image}
else
return 1
fi
@@ -894,9 +888,9 @@ test_mirror_pool_status_verbose()
--verbose --format xml)
local last_update state
- last_update=$($XMLSTARLET sel -t -v \
+ last_update=$(xmlstarlet sel -t -v \
"//images/image[name='${image}']/last_update" <<< "$status")
- state=$($XMLSTARLET sel -t -v \
+ state=$(xmlstarlet sel -t -v \
"//images/image[name='${image}']/state" <<< "$status")
echo "${state}" | grep "${state_pattern}" ||
@@ -1307,16 +1301,19 @@ show_diff()
compare_images()
{
- local pool=$1
- local image=$2
local ret=0
+ local local_cluster=$1
+ local cluster=$2
+ local local_pool=$3
+ local remote_pool=$4
+ local image=$5
- local rmt_export=${TEMPDIR}/$(mkfname ${CLUSTER2}-${pool}-${image}.export)
- local loc_export=${TEMPDIR}/$(mkfname ${CLUSTER1}-${pool}-${image}.export)
+ local rmt_export=${TEMPDIR}/$(mkfname ${cluster}-${remote_pool}-${image}.export)
+ local loc_export=${TEMPDIR}/$(mkfname ${local_cluster}-${local_pool}-${image}.export)
rm -f ${rmt_export} ${loc_export}
- rbd --cluster ${CLUSTER2} export ${pool}/${image} ${rmt_export}
- rbd --cluster ${CLUSTER1} export ${pool}/${image} ${loc_export}
+ rbd --cluster ${cluster} export ${remote_pool}/${image} ${rmt_export}
+ rbd --cluster ${local_cluster} export ${local_pool}/${image} ${loc_export}
if ! cmp ${rmt_export} ${loc_export}
then
show_diff ${rmt_export} ${loc_export}
@@ -1337,7 +1334,7 @@ compare_image_snapshots()
for snap_name in $(rbd --cluster ${CLUSTER1} --format xml \
snap list ${pool}/${image} | \
- $XMLSTARLET sel -t -v "//snapshot/name" | \
+ xmlstarlet sel -t -v "//snapshot/name" | \
grep -E -v "^\.rbd-mirror\."); do
rm -f ${rmt_export} ${loc_export}
rbd --cluster ${CLUSTER2} export ${pool}/${image}@${snap_name} ${rmt_export}
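With compare_images rewritten as above, both clusters and both pools are passed explicitly instead of being hardcoded to CLUSTER1/CLUSTER2 and one pool. A usage sketch, assuming the environment from this helpers file is sourced; the second call uses hypothetical pool names only to show that the two pools may now differ:

    # same pool name on both clusters, the common case
    compare_images ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}

    # differently named local and remote pools (hypothetical names)
    compare_images ${CLUSTER1} ${CLUSTER2} pool-local pool-remote ${image}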
diff --git a/qa/workunits/rbd/rbd_mirror_stress.sh b/qa/workunits/rbd/rbd_mirror_stress.sh
index baf0c9f1a8f..b0a85e8a48a 100755
--- a/qa/workunits/rbd/rbd_mirror_stress.sh
+++ b/qa/workunits/rbd/rbd_mirror_stress.sh
@@ -111,7 +111,7 @@ do
snap_name="snap${i}"
create_snap ${CLUSTER2} ${POOL} ${image} ${snap_name}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${snap_name}
if [ -n "${clean_snap_name}" ]; then
@@ -124,7 +124,7 @@ do
done
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
-wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${clean_snap_name}
for i in `seq 1 10`
@@ -173,7 +173,7 @@ do
image="image_${i}"
create_snap ${CLUSTER2} ${POOL} ${image} ${snap_name}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
- wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+ wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${POOL} ${image}
wait_for_snap_present ${CLUSTER1} ${POOL} ${image} ${snap_name}
compare_image_snaps ${POOL} ${image} ${snap_name}
done
diff --git a/qa/workunits/rbd/test_admin_socket.sh b/qa/workunits/rbd/test_admin_socket.sh
index 6b960787b5e..110fdd48ea7 100755
--- a/qa/workunits/rbd/test_admin_socket.sh
+++ b/qa/workunits/rbd/test_admin_socket.sh
@@ -5,8 +5,6 @@ TMPDIR=/tmp/rbd_test_admin_socket$$
mkdir $TMPDIR
trap "rm -fr $TMPDIR" 0
-. $(dirname $0)/../../standalone/ceph-helpers.sh
-
function expect_false()
{
set -x
@@ -40,12 +38,12 @@ function rbd_get_perfcounter()
local name
name=$(ceph --format xml --admin-daemon $(rbd_watch_asok ${image}) \
- perf schema | $XMLSTARLET el -d3 |
+ perf schema | xmlstarlet el -d3 |
grep "/librbd-.*-${image}/${counter}\$")
test -n "${name}" || return 1
ceph --format xml --admin-daemon $(rbd_watch_asok ${image}) perf dump |
- $XMLSTARLET sel -t -m "${name}" -v .
+ xmlstarlet sel -t -m "${name}" -v .
}
function rbd_check_perfcounter()
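The test_admin_socket.sh hunks drop the sourced ceph-helpers.sh and the $XMLSTARLET indirection in favor of calling xmlstarlet directly, matching the cleanup in rbd_mirror_helpers.sh above. The perf-counter lookup pattern the test relies on, sketched standalone (asok, image and counter stand in for the values the script derives):

    # look up the full schema path of a librbd perf counter, then dump its value
    name=$(ceph --format xml --admin-daemon ${asok} perf schema |
               xmlstarlet el -d3 |
               grep "/librbd-.*-${image}/${counter}\$")
    test -n "${name}"
    ceph --format xml --admin-daemon ${asok} perf dump |
        xmlstarlet sel -t -m "${name}" -v .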
diff --git a/qa/workunits/rest/test-restful.sh b/qa/workunits/rest/test-restful.sh
deleted file mode 100755
index fde0d107a0b..00000000000
--- a/qa/workunits/rest/test-restful.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh -ex
-
-mydir=`dirname $0`
-
-secret=`ceph config-key get mgr/restful/keys/admin`
-url=$(ceph mgr dump|jq -r .services.restful|sed -e 's/\/$//')
-echo "url $url secret $secret"
-$mydir/test_mgr_rest_api.py $url $secret
-
-echo $0 OK
diff --git a/qa/workunits/rgw/s3_utilities.pm b/qa/workunits/rgw/s3_utilities.pm
index 3c3fae900e8..5a91db9d1fd 100644
--- a/qa/workunits/rgw/s3_utilities.pm
+++ b/qa/workunits/rgw/s3_utilities.pm
@@ -21,7 +21,7 @@ sub get_timestamp {
if ($min < 10) { $min = "0$min"; }
if ($sec < 10) { $sec = "0$sec"; }
$year=$year+1900;
- return $year . '_' . $mon . '_' . $mday . '_' . $hour . '_' . $min . '_' . $sec;
+ return $year . '-' . $mon . '-' . $mday . '-' . $hour . '-' . $min . '-' . $sec;
}
# Function to check if radosgw is already running
@@ -195,11 +195,12 @@ sub run_s3
host => $hostname,
secure => 0,
retry => 1,
+ dns_bucket_names => 0,
}
);
}
-our $bucketname = 'buck_'.get_timestamp();
+our $bucketname = 'buck-'.get_timestamp();
# create a new bucket (the test bucket)
our $bucket = $s3->add_bucket( { bucket => $bucketname } )
or die $s3->err. "bucket $bucketname create failed\n". $s3->errstr;
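The s3_utilities.pm change replaces underscores with hyphens in the generated bucket names (underscores are not valid in DNS-style S3 bucket names) and passes dns_bucket_names => 0 to the S3 client constructor, presumably to force path-style requests. Roughly the bucket-name shape the Perl code now produces, as an illustration only:

    # e.g. buck-2024-01-31-12-05-09 (hyphen-separated localtime fields)
    bucketname="buck-$(date +%Y-%m-%d-%H-%M-%S)"
    echo "${bucketname}"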
diff --git a/qa/workunits/rgw/test_rgw_bucket_check.py b/qa/workunits/rgw/test_rgw_bucket_check.py
index bfa6d65d6e7..33936df2401 100755
--- a/qa/workunits/rgw/test_rgw_bucket_check.py
+++ b/qa/workunits/rgw/test_rgw_bucket_check.py
@@ -173,6 +173,7 @@ def main():
exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}')
out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys')
json_out = json.loads(out)
+ log.info(f'"bucket check unlinked" returned {json_out}, expecting {unlinked_keys}')
assert len(json_out) == len(unlinked_keys)
bucket.object_versions.all().delete()
out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}')
diff --git a/qa/workunits/rgw/test_rgw_reshard.py b/qa/workunits/rgw/test_rgw_reshard.py
index e22050fc27f..18ffb102250 100755
--- a/qa/workunits/rgw/test_rgw_reshard.py
+++ b/qa/workunits/rgw/test_rgw_reshard.py
@@ -76,6 +76,16 @@ def get_bucket_num_shards(bucket_name, bucket_id):
num_shards = json_op['data']['bucket_info']['num_shards']
return num_shards
+def get_bucket_reshard_status(bucket_name):
+ """
+ function to get bucket reshard status
+ """
+ cmd = exec_cmd("radosgw-admin bucket stats --bucket {}".format(bucket_name))
+ json_op = json.loads(cmd)
+ #print(json.dumps(json_op, indent = 4, sort_keys=True))
+ reshard_status = json_op['reshard_status']
+ return reshard_status
+
def run_bucket_reshard_cmd(bucket_name, num_shards, **kwargs):
cmd = 'radosgw-admin bucket reshard --bucket {} --num-shards {}'.format(bucket_name, num_shards)
cmd += ' --rgw-reshard-bucket-lock-duration 30' # reduce to minimum
@@ -104,7 +114,7 @@ def test_bucket_reshard(conn, name, **fault):
# try reshard with fault injection
_, ret = run_bucket_reshard_cmd(name, num_shards_expected, check_retcode=False, **fault)
- if fault.get('error_code') == errno.ECANCELED:
+ if fault.get('error_code') == errno.ECANCELED or fault.get('error_code') == errno.EOPNOTSUPP:
assert(ret == 0) # expect ECANCELED to retry and succeed
else:
assert(ret != 0 and ret != errno.EBUSY)
@@ -139,6 +149,11 @@ def test_bucket_reshard(conn, name, **fault):
bucket.delete_objects(Delete={'Objects':[{'Key':o.key} for o in objs]})
bucket.delete()
+def calc_reshardlog_count(json_op):
+ cnt = 0
+ for shard in json_op:
+ cnt += len(shard['shard_entries'])
+ return cnt
def main():
"""
@@ -210,6 +225,13 @@ def main():
log.error("Resharding failed on bucket {}. Expected number of shards are not created\n".format(BUCKET_NAME))
# TESTCASE 'manual bucket resharding','inject error','fail','check bucket accessibility', 'retry reshard'
+ log.debug('TEST: reshard bucket with EIO injected at init_index\n')
+ test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index')
+ log.debug('TEST: reshard bucket with EOPNOTSUPP injected at init_index\n')
+ test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index', error_code=errno.EOPNOTSUPP)
+ log.debug('TEST: reshard bucket with abort at init_index\n')
+    test_bucket_reshard(connection, 'abort-at-init-index', abort_at='init_index')
+
log.debug('TEST: reshard bucket with EIO injected at set_target_layout\n')
test_bucket_reshard(connection, 'error-at-set-target-layout', error_at='set_target_layout')
log.debug('TEST: reshard bucket with ECANCELED injected at set_target_layout\n')
@@ -217,6 +239,13 @@ def main():
log.debug('TEST: reshard bucket with abort at set_target_layout\n')
test_bucket_reshard(connection, 'abort-at-set-target-layout', abort_at='set_target_layout')
+ log.debug('TEST: reshard bucket with EIO injected at trim_reshard_log_entries\n')
+ test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries')
+ log.debug('TEST: reshard bucket with EOPNOTSUPP injected at trim_reshard_log_entries\n')
+ test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries', error_code=errno.EOPNOTSUPP)
+ log.debug('TEST: reshard bucket with abort at trim_reshard_log_entries\n')
+ test_bucket_reshard(connection, 'abort-at-trim-reshard-log-entries', abort_at='trim_reshard_log_entries')
+
log.debug('TEST: reshard bucket with EIO injected at block_writes\n')
test_bucket_reshard(connection, 'error-at-block-writes', error_at='block_writes')
log.debug('TEST: reshard bucket with abort at block_writes\n')
@@ -234,6 +263,80 @@ def main():
log.debug('TEST: reshard bucket with abort at do_reshard\n')
test_bucket_reshard(connection, 'abort-at-do-reshard', abort_at='do_reshard')
+ log.debug('TEST: reshard bucket with EIO injected at logrecord_writes\n')
+ test_bucket_reshard(connection, 'error-at-logrecord-writes', error_at='logrecord_writes')
+ log.debug('TEST: reshard bucket with abort at logrecord_writes\n')
+ test_bucket_reshard(connection, 'abort-at-logrecord-writes', abort_at='logrecord_writes')
+
+ log.debug('TEST: reshard bucket with EIO injected at change_reshard_state\n')
+ test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state')
+ log.debug('TEST: reshard bucket with ECANCELED injected at change_reshard_state\n')
+ test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state', error_code=errno.ECANCELED)
+ log.debug('TEST: reshard bucket with abort at change_reshard_state\n')
+ test_bucket_reshard(connection, 'abort-at-change-reshard-state', abort_at='change_reshard_state')
+
+    # TESTCASE 'logrecord can be stopped after a failed reshard'
+    log.debug(' test: logrecord can be stopped after a failed reshard')
+ num_shards = get_bucket_stats(BUCKET_NAME).num_shards
+ assert "None" == get_bucket_reshard_status(BUCKET_NAME)
+ _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='change_reshard_state')
+ assert(ret != 0 and ret != errno.EBUSY)
+ assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
+
+ bucket.put_object(Key='put_during_logrecord', Body=b"some_data")
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 1
+
+    # after the reshard lock expires, the next write purges the logrecord and clears the status
+ time.sleep(30)
+ assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
+ bucket.put_object(Key='put_during_logrecord1', Body=b"some_data1")
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 0
+ assert "None" == get_bucket_reshard_status(BUCKET_NAME)
+
+ # TESTCASE 'duplicated entries should be purged before reshard'
+ log.debug(' test: duplicated entries should be purged before reshard')
+ num_shards = get_bucket_stats(BUCKET_NAME).num_shards
+ _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard')
+ assert(ret != 0 and ret != errno.EBUSY)
+ assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
+
+ bucket.put_object(Key='put_during_logrecord2', Body=b"some_data2")
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 1
+
+    # starting a new reshard purges the duplicated entries first
+ time.sleep(30)
+ _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='logrecord_writes')
+ assert(ret != 0 and ret != errno.EBUSY)
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 0
+
+ # TESTCASE 'duplicated entries can be purged manually'
+ log.debug(' test: duplicated entries can be purged manually')
+ time.sleep(30)
+ num_shards = get_bucket_stats(BUCKET_NAME).num_shards
+ _, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard')
+ assert(ret != 0 and ret != errno.EBUSY)
+ assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
+
+ bucket.put_object(Key='put_during_logrecord3', Body=b"some_data3")
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 1
+
+ time.sleep(30)
+ exec_cmd('radosgw-admin reshardlog purge --bucket %s' % BUCKET_NAME)
+ cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
+ json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
+ assert calc_reshardlog_count(json_op) == 0
+ log.debug('check reshard logrecord successfully')
+
# TESTCASE 'versioning reshard-','bucket', reshard','versioning reshard','succeeds'
log.debug(' test: reshard versioned bucket')
num_shards_expected = get_bucket_stats(VER_BUCKET_NAME).num_shards + 1
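The new reshard log test cases above drive everything through radosgw-admin. The equivalent manual flow, as a hedged sketch (the bucket name is a placeholder, jq is assumed to be available, and the entry count mirrors calc_reshardlog_count() above):

    bucket=mybucket
    # reshard status as reported by bucket stats ("None", "InLogrecord", ...)
    radosgw-admin bucket stats --bucket ${bucket} | jq -r '.reshard_status'
    # count reshard log entries across all shards, like calc_reshardlog_count()
    radosgw-admin reshardlog list --bucket ${bucket} |
        jq '[.[].shard_entries | length] | add // 0'
    # purge stale or duplicated entries manually
    radosgw-admin reshardlog purge --bucket ${bucket}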
@@ -287,6 +390,8 @@ def main():
time.sleep(1)
ver_bucket.put_object(Key='put_during_reshard', Body=b"some_data")
log.debug('put object successful')
+    # wait for the delayed reshard to finish
+ time.sleep(5)
# TESTCASE 'check that bucket stats are correct after reshard with unlinked entries'
log.debug('TEST: check that bucket stats are correct after reshard with unlinked entries\n')