Diffstat (limited to 'qa')
-rwxr-xr-x  qa/standalone/scrub/osd-scrub-repair.sh                                                    2
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml                                      3
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml                                     2
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml                                    5
-rw-r--r--  qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml  37
-rw-r--r--  qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml (renamed from qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml)  4
-rw-r--r--  qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml                                    1
-rw-r--r--  qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml                                        2
-rw-r--r--  qa/suites/nvmeof/thrash/workloads/fio.yaml                                                  6
-rw-r--r--  qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml                                     2
-rw-r--r--  qa/tasks/kafka.py                                                                          11
-rw-r--r--  qa/tasks/mgr/dashboard/test_rbd.py                                                         12
-rw-r--r--  qa/tasks/nvmeof.py                                                                         36
-rw-r--r--  qa/tasks/s3a_hadoop.py                                                                     16
-rwxr-xr-x  qa/workunits/nvmeof/basic_tests.sh                                                         10
-rwxr-xr-x  qa/workunits/nvmeof/fio_test.sh                                                             7
-rwxr-xr-x  qa/workunits/nvmeof/scalability_test.sh                                                    51
-rwxr-xr-x  qa/workunits/nvmeof/setup_subsystem.sh                                                     17
-rwxr-xr-x  qa/workunits/rbd/cli_generic.sh                                                             5
19 files changed, 170 insertions, 59 deletions
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 491e46603f7..6dd5b10ae8f 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -5833,7 +5833,7 @@ function TEST_periodic_scrub_replicated() {
flush_pg_stats
# Request a regular scrub and it will be done
- pg_schedule_scrub $pg
+ pg_scrub $pg
grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
# deep-scrub error is no longer present
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
index 7c97edae552..0416ae2ea4e 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
@@ -1,7 +1,8 @@
+# runs on default nvmeof image (i.e. DEFAULT_NVMEOF_IMAGE)
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: default # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
index 9ef37004427..dfe31380bb6 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
@@ -18,6 +18,7 @@ tasks:
clients:
client.0:
- nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
env:
RBD_POOL: mypool
RBD_IMAGE_PREFIX: myimage
@@ -27,7 +28,6 @@ tasks:
timeout: 30m
clients:
client.0:
- - nvmeof/basic_tests.sh
- nvmeof/fio_test.sh --rbd_iostat
client.1:
- nvmeof/basic_tests.sh
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
index 12cb50b408d..d66b6fc8093 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
@@ -31,8 +31,11 @@ tasks:
no_coverage_and_limits: true
timeout: 30m
clients:
- client.0:
+ client.3:
- nvmeof/scalability_test.sh nvmeof.a,nvmeof.b
- nvmeof/scalability_test.sh nvmeof.b,nvmeof.c,nvmeof.d
+ - nvmeof/scalability_test.sh nvmeof.b,nvmeof.c
env:
SCALING_DELAYS: '50'
+ RBD_POOL: mypool
+ NVMEOF_GROUP: mygroup0
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml
new file mode 100644
index 00000000000..83d54cdf5c3
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml
@@ -0,0 +1,37 @@
+tasks:
+- nvmeof:
+ installer: host.a
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ rbd:
+ pool_name: mypool
+ image_name_prefix: myimage
+ gateway_config:
+ subsystems_count: 10
+ namespaces_count: 90 # each subsystem
+ cli_image: quay.io/ceph/nvmeof-cli:latest
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- cephadm.exec:
+ host.a:
+ - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml
+ - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml
+ - "sed -i '/ pool: mypool/a\\ spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml"
+ - cat /tmp/nvmeof-no-huge-page.yaml
+ - ceph orch ls --refresh
+ - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml
+ - ceph orch redeploy nvmeof.mypool.mygroup0
+
+- cephadm.wait_for_service:
+ service: nvmeof.mypool.mygroup0
+
+- workunit:
+ no_coverage_and_limits: true
+ clients:
+ client.0:
+ - nvmeof/setup_subsystem.sh
+ - nvmeof/basic_tests.sh
+ env:
+ RBD_POOL: mypool
+ RBD_IMAGE_PREFIX: myimage
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml
index b4755a6433b..0f7ac011a60 100644
--- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/120-subsys-8-namespace.yaml
@@ -6,8 +6,8 @@ tasks:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
- subsystems_count: 3
- namespaces_count: 20 # each subsystem
+ subsystems_count: 120
+ namespaces_count: 8 # each subsystem
cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
index 19fa2ec605d..46037784d31 100644
--- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
@@ -11,6 +11,7 @@ overrides:
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
+ - is unavailable
- is in error state
- failed cephadm daemon
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
index 80bf0527715..b58dc14d87b 100644
--- a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
@@ -6,9 +6,11 @@ overrides:
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
+ - is unavailable
- is in error state
- failed cephadm daemon
tasks:
- nvmeof.thrash:
checker_host: 'client.0'
+ randomize: False
diff --git a/qa/suites/nvmeof/thrash/workloads/fio.yaml b/qa/suites/nvmeof/thrash/workloads/fio.yaml
index b042b92d6ae..f9a0d0ebde5 100644
--- a/qa/suites/nvmeof/thrash/workloads/fio.yaml
+++ b/qa/suites/nvmeof/thrash/workloads/fio.yaml
@@ -1,11 +1,11 @@
tasks:
- workunit:
no_coverage_and_limits: true
- timeout: 30m
+ timeout: 60m
clients:
client.0:
- - nvmeof/fio_test.sh --rbd_iostat
+ - nvmeof/fio_test.sh --random_devices 200
env:
RBD_POOL: mypool
IOSTAT_INTERVAL: '10'
- RUNTIME: '600'
+ RUNTIME: '1800'
diff --git a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
index 462570e7727..303f98d540e 100644
--- a/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
+++ b/qa/suites/rgw/notifications/tasks/kafka/test_kafka.yaml
@@ -1,7 +1,7 @@
tasks:
- kafka:
client.0:
- kafka_version: 2.6.0
+ kafka_version: 3.8.1
- notification-tests:
client.0:
extra_attr: ["kafka_test", "data_path_v2_kafka_test"]
diff --git a/qa/tasks/kafka.py b/qa/tasks/kafka.py
index 5e6c208ca30..833f03babf6 100644
--- a/qa/tasks/kafka.py
+++ b/qa/tasks/kafka.py
@@ -4,6 +4,7 @@ Deploy and configure Kafka for Teuthology
import contextlib
import logging
import time
+import os
from teuthology import misc as teuthology
from teuthology import contextutil
@@ -33,6 +34,13 @@ def install_kafka(ctx, config):
assert isinstance(config, dict)
log.info('Installing Kafka...')
+ # programmatically find a nearby mirror so as not to hammer archive.apache.org
+ apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+ "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+ log.info("determining apache mirror by running: " + apache_mirror_cmd)
+ apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+ log.info("chosen apache mirror is " + apache_mirror_url_front)
+
for (client, _) in config.items():
(remote,) = ctx.cluster.only(client).remotes.keys()
test_dir=teuthology.get_testdir(ctx)
@@ -40,7 +48,8 @@ def install_kafka(ctx, config):
kafka_file = kafka_prefix + current_version + '.tgz'
- link1 = 'https://archive.apache.org/dist/kafka/' + current_version + '/' + kafka_file
+ link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+ current_version + '/' + kafka_file
ctx.cluster.only(client).run(
args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1],
)
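
The kafka.py hunk above replaces the hard-coded archive.apache.org link with a mirror chosen at runtime: an os.popen() pipeline scrapes https://www.apache.org/dyn/closer.cgi and keeps the first <strong> element, which carries the preferred mirror URL (trailing slash included). Below is a minimal Python sketch of the same lookup done with urllib/re instead of shelling out; the archive.apache.org fallback is an assumption of the sketch, not part of the patch.

    # Sketch only: the mirror lookup the patch runs through os.popen(),
    # re-done with urllib/re. The archive.apache.org fallback is an assumption.
    import re
    import urllib.request

    def find_apache_mirror(fallback='https://archive.apache.org/dist/'):
        try:
            with urllib.request.urlopen('https://www.apache.org/dyn/closer.cgi', timeout=30) as resp:
                page = resp.read().decode('utf-8', errors='replace')
            match = re.search(r'<strong>([^<]*)</strong>', page)
            if match:
                # like the shell pipeline, this keeps the mirror's trailing slash
                return match.group(1).rstrip()
        except Exception:
            pass
        return fallback

    # e.g.: link1 = find_apache_mirror() + 'kafka/' + current_version + '/' + kafka_file
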
diff --git a/qa/tasks/mgr/dashboard/test_rbd.py b/qa/tasks/mgr/dashboard/test_rbd.py
index a872645e33e..83b3bf520c2 100644
--- a/qa/tasks/mgr/dashboard/test_rbd.py
+++ b/qa/tasks/mgr/dashboard/test_rbd.py
@@ -869,7 +869,19 @@ class RbdTest(DashboardTestCase):
self.assertEqual(clone_format_version, 2)
self.assertStatus(200)
+ # if empty list is sent, then the config will remain as it is
value = []
+ res = [{'section': "global", 'value': "2"}]
+ self._post('/api/cluster_conf', {
+ 'name': config_name,
+ 'value': value
+ })
+ self.wait_until_equal(
+ lambda: _get_config_by_name(config_name),
+ res,
+ timeout=60)
+
+ value = [{'section': "global", 'value': ""}]
self._post('/api/cluster_conf', {
'name': config_name,
'value': value
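
The test_rbd.py hunk adds two cases for POST /api/cluster_conf: an empty value list, which is expected to leave the existing setting untouched, followed by an empty value for the global section. Below is a minimal sketch of the same calls made directly against the dashboard REST API; the endpoint URL, the authenticated session, and the option name are assumptions (the real config_name is defined earlier in the test, outside this hunk).

    # Sketch only: the two POSTs the test adds, sent straight at the dashboard
    # REST API. DASHBOARD_URL, the authenticated session, and the option name
    # are assumptions, not taken from the patch.
    import requests

    DASHBOARD_URL = 'https://mgr-host:8443'   # assumed endpoint
    session = requests.Session()              # assumed to be already authenticated
    config_name = 'some_config_option'        # placeholder for the test's config_name

    def set_cluster_conf(name, value):
        return session.post(f'{DASHBOARD_URL}/api/cluster_conf',
                            json={'name': name, 'value': value}, verify=False)

    # An empty list should leave the current value in place ...
    set_cluster_conf(config_name, [])
    # ... then an empty value for the global section, as the test posts next.
    set_cluster_conf(config_name, [{'section': 'global', 'value': ''}])
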
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index 42e357294d9..c58a7267b4e 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -128,12 +128,11 @@ class Nvmeof(Task):
total_images = int(self.namespaces_count) * int(self.subsystems_count)
log.info(f'[nvmeof]: creating {total_images} images')
+ rbd_create_cmd = []
for i in range(1, total_images + 1):
imagename = self.image_name_prefix + str(i)
- log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}')
- _shell(self.ctx, self.cluster_name, self.remote, [
- 'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}'
- ])
+ rbd_create_cmd += ['rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}', run.Raw(';')]
+ _shell(self.ctx, self.cluster_name, self.remote, rbd_create_cmd)
for role, i in daemons.items():
remote, id_ = i
@@ -251,9 +250,9 @@ class NvmeofThrasher(Thrasher, Greenlet):
daemon_max_thrash_times:
For now, NVMeoF daemons have limitation that each daemon can
- be thrashed only 3 times in span of 30 mins. This option
+ be thrashed only 5 times in span of 30 mins. This option
allows to set the amount of times it could be thrashed in a period
- of time. (default: 3)
+ of time. (default: 5)
daemon_max_thrash_period:
This option goes with the above option. It sets the period of time
over which each daemons can be thrashed for daemon_max_thrash_times
@@ -306,12 +305,12 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1))
# Limits on thrashing each daemon
- self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3))
+ self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 5))
self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds
self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
- self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
+ self.min_revive_delay = int(self.config.get('min_revive_delay', 60))
self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))
def _get_devices(self, remote):
@@ -347,6 +346,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof',
run.Raw('&&'), 'ceph', 'health', 'detail',
run.Raw('&&'), 'ceph', '-s',
+ run.Raw('&&'), 'sudo', 'nvme', 'list',
]
for dev in self.devices:
check_cmd += [
@@ -421,13 +421,11 @@ class NvmeofThrasher(Thrasher, Greenlet):
while not self.stopping.is_set():
killed_daemons = defaultdict(list)
- weight = 1.0 / len(self.daemons)
- count = 0
+ thrash_daemon_num = self.rng.randint(1, self.max_thrash_daemons)
+ selected_daemons = self.rng.sample(self.daemons, thrash_daemon_num)
for daemon in self.daemons:
- skip = self.rng.uniform(0.0, 1.0)
- if weight <= skip:
- self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
- label=daemon.id_, skip=skip, weight=weight))
+ if daemon not in selected_daemons:
+ self.log(f'skipping daemon {daemon.id_} ...')
continue
# For now, nvmeof daemons can only be thrashed 3 times in last 30mins.
@@ -445,17 +443,11 @@ class NvmeofThrasher(Thrasher, Greenlet):
continue
self.log('kill {label}'.format(label=daemon.id_))
- # daemon.stop()
kill_method = self.kill_daemon(daemon)
killed_daemons[kill_method].append(daemon)
daemons_thrash_history[daemon.id_] += [datetime.now()]
- # only thrash max_thrash_daemons amount of daemons
- count += 1
- if count >= self.max_thrash_daemons:
- break
-
if killed_daemons:
iteration_summary = "thrashed- "
for kill_method in killed_daemons:
@@ -468,7 +460,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.log(f'waiting for {revive_delay} secs before reviving')
time.sleep(revive_delay) # blocking wait
- self.log('done waiting before reviving')
+ self.log(f'done waiting before reviving - iteration #{len(summary)}: {iteration_summary}')
self.do_checks()
self.switch_task()
@@ -487,7 +479,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
if thrash_delay > 0.0:
self.log(f'waiting for {thrash_delay} secs before thrashing')
time.sleep(thrash_delay) # blocking
- self.log('done waiting before thrashing')
+ self.log('done waiting before thrashing - everything should be up now')
self.do_checks()
self.switch_task()
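
The larger nvmeof.py hunk rewrites how the thrasher picks victims: instead of rolling a per-daemon probability (weight = 1.0 / len(daemons)) and capping the loop at max_thrash_daemons, each iteration now draws a random subset up front with rng.randint() and rng.sample(). A minimal sketch of that selection step, with illustrative daemon names that are not taken from the patch:

    # Sketch of the new selection step in NvmeofThrasher.do_thrash():
    # decide how many gateways to kill this iteration, then sample that many,
    # instead of a per-daemon probability roll. Daemon names are illustrative.
    import random

    def pick_daemons_to_thrash(daemons, max_thrash, rng=None):
        """Return a random subset of 1..max_thrash daemons to kill this iteration."""
        rng = rng or random.Random()
        count = rng.randint(1, max_thrash)
        return rng.sample(daemons, count)

    daemons = ['nvmeof.a', 'nvmeof.b', 'nvmeof.c', 'nvmeof.d']
    victims = pick_daemons_to_thrash(daemons, max_thrash=len(daemons) - 1)
    for d in daemons:
        if d not in victims:
            print(f'skipping daemon {d} ...')
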
diff --git a/qa/tasks/s3a_hadoop.py b/qa/tasks/s3a_hadoop.py
index 7b77359fcf2..4518a6f397c 100644
--- a/qa/tasks/s3a_hadoop.py
+++ b/qa/tasks/s3a_hadoop.py
@@ -1,5 +1,6 @@
import contextlib
import logging
+import os
from teuthology import misc
from teuthology.orchestra import run
@@ -40,7 +41,7 @@ def task(ctx, config):
# get versions
maven_major = config.get('maven-major', 'maven-3')
- maven_version = config.get('maven-version', '3.6.3')
+ maven_version = config.get('maven-version', '3.9.9')
hadoop_ver = config.get('hadoop-version', '2.9.2')
bucket_name = config.get('bucket-name', 's3atest')
access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F')
@@ -48,11 +49,19 @@ def task(ctx, config):
'secret-key',
'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb')
+ # programmatically find a nearby mirror so as not to hammer archive.apache.org
+ apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \
+ "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1"
+ log.info("determining apache mirror by running: " + apache_mirror_cmd)
+ apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/)
+ log.info("chosen apache mirror is " + apache_mirror_url_front)
+
# set versions for cloning the repo
apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format(
maven_version=maven_version)
- maven_link = 'http://archive.apache.org/dist/maven/' + \
- '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven
+ maven_link = '{apache_mirror_url_front}/maven/'.format(apache_mirror_url_front=apache_mirror_url_front) + \
+ '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + \
+ apache_maven
hadoop_git = 'https://github.com/apache/hadoop'
hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver)
if hadoop_ver == 'trunk':
@@ -204,6 +213,7 @@ def run_s3atest(client, maven_version, testdir, test_options):
run.Raw('&&'),
run.Raw(rm_test),
run.Raw('&&'),
+ run.Raw('JAVA_HOME=$(alternatives --list | grep jre_1.8.0 | head -n 1 | awk \'{print $3}\')'),
run.Raw(run_test),
run.Raw(test_options)
]
diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh
index dc6fd1669da..794353348b4 100755
--- a/qa/workunits/nvmeof/basic_tests.sh
+++ b/qa/workunits/nvmeof/basic_tests.sh
@@ -38,8 +38,10 @@ disconnect_all() {
connect_all() {
sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600
sleep 5
- output=$(sudo nvme list --output-format=json)
- if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then
+ expected_devices_count=$1
+ actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l)
+ if [ "$actual_devices" -ne "$expected_devices_count" ]; then
+ sudo nvme list --output-format=json
return 1
fi
}
@@ -72,11 +74,13 @@ test_run connect
test_run list_subsys 1
test_run disconnect_all
test_run list_subsys 0
-test_run connect_all
+devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT))
+test_run connect_all $devices_count
gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT))
test_run list_subsys $multipath_count
+
echo "-------------Test Summary-------------"
echo "[nvmeof] All nvmeof basic tests passed!"
diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh
index 57d355a6318..03fb58693bd 100755
--- a/qa/workunits/nvmeof/fio_test.sh
+++ b/qa/workunits/nvmeof/fio_test.sh
@@ -5,6 +5,7 @@ sudo yum -y install sysstat
namespace_range_start=
namespace_range_end=
+random_devices_count=
rbd_iostat=false
while [[ $# -gt 0 ]]; do
@@ -17,6 +18,10 @@ while [[ $# -gt 0 ]]; do
namespace_range_end=$2
shift 2
;;
+ --random_devices)
+ random_devices_count=$2
+ shift 2
+ ;;
--rbd_iostat)
rbd_iostat=true
shift
@@ -37,6 +42,8 @@ all_drives_list=$(sudo nvme list --output-format=json |
# run on first 3 namespaces here.
if [ "$namespace_range_start" ] || [ "$namespace_range_end" ]; then
selected_drives=$(echo "${all_drives_list[@]}" | sed -n "${namespace_range_start},${namespace_range_end}p")
+elif [ "$random_devices_count" ]; then
+ selected_drives=$(echo "${all_drives_list[@]}" | shuf -n $random_devices_count)
else
selected_drives="${all_drives_list[@]}"
fi
diff --git a/qa/workunits/nvmeof/scalability_test.sh b/qa/workunits/nvmeof/scalability_test.sh
index 5a26b6284f7..8ede4b7eda2 100755
--- a/qa/workunits/nvmeof/scalability_test.sh
+++ b/qa/workunits/nvmeof/scalability_test.sh
@@ -3,37 +3,64 @@
GATEWAYS=$1 # example "nvmeof.a,nvmeof.b"
DELAY="${SCALING_DELAYS:-50}"
+POOL="${RBD_POOL:-mypool}"
+GROUP="${NVMEOF_GROUP:-mygroup0}"
+source /etc/ceph/nvmeof.env
if [ -z "$GATEWAYS" ]; then
echo "At least one gateway needs to be defined for scalability test"
exit 1
fi
-pip3 install yq
-
status_checks() {
- ceph nvme-gw show mypool ''
- ceph orch ls
- ceph orch ps
- ceph -s
+ expected_count=$1
+
+ output=$(ceph nvme-gw show $POOL $GROUP)
+ nvme_show=$(echo $output | grep -o '"AVAILABLE"' | wc -l)
+ if [ "$nvme_show" -ne "$expected_count" ]; then
+ return 1
+ fi
+
+ orch_ls=$(ceph orch ls)
+ if ! echo "$orch_ls" | grep -q "$expected_count/$expected_count"; then
+ return 1
+ fi
+
+ output=$(ceph orch ps --service-name nvmeof.$POOL.$GROUP)
+ orch_ps=$(echo $output | grep -o 'running' | wc -l)
+ if [ "$orch_ps" -ne "$expected_count" ]; then
+ return 1
+ fi
+
+ ceph_status=$(ceph -s)
+ if ! echo "$ceph_status" | grep -q "HEALTH_OK"; then
+ return 1
+ fi
}
+total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
+scaled_down_gateways_count=$(( total_gateways_count - $(echo "$GATEWAYS" | tr -cd ',' | wc -c) - 1 ))
+
echo "[nvmeof.scale] Setting up config to remove gateways ${GATEWAYS}"
+ceph orch ls --service-name nvmeof.$POOL.$GROUP --export > /tmp/nvmeof-gw.yaml
ceph orch ls nvmeof --export > /tmp/nvmeof-gw.yaml
cat /tmp/nvmeof-gw.yaml
-yq "del(.placement.hosts[] | select(. | test(\".*($(echo $GATEWAYS | sed 's/,/|/g'))\")))" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml
+
+pattern=$(echo $GATEWAYS | sed 's/,/\\|/g')
+sed "/$pattern/d" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml
cat /tmp/nvmeof-gw-new.yaml
echo "[nvmeof.scale] Starting scale testing by removing ${GATEWAYS}"
-status_checks
-ceph orch rm nvmeof.mypool && sleep 20 # temp workaround
+status_checks $total_gateways_count
ceph orch apply -i /tmp/nvmeof-gw-new.yaml # downscale
+ceph orch redeploy nvmeof.$POOL.$GROUP
sleep $DELAY
-status_checks
-ceph orch rm nvmeof.mypool && sleep 20 # temp workaround
+status_checks $scaled_down_gateways_count
+echo "[nvmeof.scale] Downscale complete - removed gateways (${GATEWAYS}); now scaling back up"
ceph orch apply -i /tmp/nvmeof-gw.yaml #upscale
+ceph orch redeploy nvmeof.$POOL.$GROUP
sleep $DELAY
-status_checks
+status_checks $total_gateways_count
echo "[nvmeof.scale] Scale testing passed for ${GATEWAYS}"
diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh
index cc4024323eb..b573647b1e3 100755
--- a/qa/workunits/nvmeof/setup_subsystem.sh
+++ b/qa/workunits/nvmeof/setup_subsystem.sh
@@ -26,14 +26,21 @@ list_subsystems () {
done
}
+list_namespaces () {
+ for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
+ subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
+ done
+}
+
+echo "[nvmeof] Starting subsystem setup..."
+
# add all subsystems
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append
done
-list_subsystems
-
# add all gateway listeners
for i in "${!gateway_ips[@]}"
do
@@ -65,11 +72,5 @@ done
list_subsystems
-# list namespaces
-for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
- subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
- sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
-done
-
echo "[nvmeof] Subsystem setup done"
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh
index 2aa27d3d655..0ceb9ff54cf 100755
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -914,6 +914,11 @@ test_namespace() {
rbd group create rbd/test1/group1
rbd group image add rbd/test1/group1 rbd/test1/image1
+ rbd group image add --group-pool rbd --group-namespace test1 --group group1 \
+ --image-pool rbd --image-namespace test1 --image image2
+ rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \
+ --image-pool rbd --image-namespace test1 --image image1
+ rbd group image rm rbd/test1/group1 rbd/test1/image2
rbd group rm rbd/test1/group1
rbd trash move rbd/test1/image1