Diffstat (limited to 'src')
-rw-r--r--src/btrfs_ioc_test.c171
-rw-r--r--src/ceph-volume/ceph_volume/devices/lvm/zap.py183
-rw-r--r--src/ceph-volume/ceph_volume/devices/raw/list.py147
-rw-r--r--src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py2
-rw-r--r--src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py81
-rw-r--r--src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py198
-rw-r--r--src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py102
-rw-r--r--src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py114
-rw-r--r--src/ceph-volume/ceph_volume/tests/test_inventory.py2
-rw-r--r--src/ceph-volume/ceph_volume/util/device.py8
-rw-r--r--src/ceph-volume/ceph_volume/util/disk.py7
-rw-r--r--src/ceph_fuse.cc7
-rw-r--r--src/ceph_mds.cc2
-rw-r--r--src/ceph_mgr.cc2
-rw-r--r--src/ceph_mon.cc2
-rw-r--r--src/ceph_nvmeof_monitor_client.cc2
-rwxr-xr-xsrc/cephadm/build.py3
-rwxr-xr-xsrc/cephadm/cephadm.py33
-rw-r--r--src/cephadm/cephadmlib/call_wrappers.py8
-rw-r--r--src/cephadm/cephadmlib/constants.py19
-rw-r--r--src/cephadm/cephadmlib/container_types.py52
-rw-r--r--src/cephadm/cephadmlib/daemon_identity.py2
-rw-r--r--src/cephadm/cephadmlib/daemons/ingress.py25
-rw-r--r--src/cephadm/cephadmlib/daemons/mgmt_gateway.py2
-rw-r--r--src/cephadm/cephadmlib/daemons/monitoring.py8
-rw-r--r--src/cephadm/cephadmlib/daemons/nfs.py15
-rw-r--r--src/cephadm/cephadmlib/daemons/nvmeof.py43
-rw-r--r--src/cephadm/cephadmlib/daemons/oauth2_proxy.py3
-rw-r--r--src/cephadm/cephadmlib/daemons/smb.py104
-rw-r--r--src/cephadm/cephadmlib/daemons/snmp.py2
-rw-r--r--src/cephadm/cephadmlib/daemons/tracing.py2
-rw-r--r--src/cephadm/cephadmlib/data_utils.py5
-rw-r--r--src/cephadm/cephadmlib/exceptions.py13
-rw-r--r--src/cephadm/cephadmlib/file_utils.py17
-rw-r--r--src/cephadm/cephadmlib/systemd.py8
-rw-r--r--src/cephadm/tests/test_agent.py2
-rw-r--r--src/cephadm/tests/test_cephadm.py34
-rw-r--r--src/cephadm/tests/test_deploy.py1
-rw-r--r--src/cephadm/tox.ini24
-rw-r--r--src/client/Client.cc15
-rw-r--r--src/client/MetaSession.cc2
-rw-r--r--src/client/SyntheticClient.cc2
-rw-r--r--src/common/DecayCounter.h3
-rw-r--r--src/common/Graylog.cc3
-rw-r--r--src/common/Journald.cc4
-rw-r--r--src/common/StackStringStream.h3
-rw-r--r--src/common/Thread.cc22
-rw-r--r--src/common/Thread.h12
-rw-r--r--src/common/Throttle.h2
-rw-r--r--src/common/admin_socket.cc6
-rw-r--r--src/common/assert.cc12
-rw-r--r--src/common/buffer.cc2
-rw-r--r--src/common/ceph_argparse.cc1
-rw-r--r--src/common/ceph_argparse.h2
-rw-r--r--src/common/ceph_time.h2
-rw-r--r--src/common/ceph_timer.h2
-rw-r--r--src/common/code_environment.cc7
-rw-r--r--src/common/compat.cc63
-rw-r--r--src/common/config_cacher.h2
-rw-r--r--src/common/error_code.cc3
-rw-r--r--src/common/error_code.h5
-rw-r--r--src/common/intrusive_timer.h222
-rw-r--r--src/common/obj_bencher.cc4
-rw-r--r--src/common/options.cc1
-rw-r--r--src/common/options.h1
-rw-r--r--src/common/options/mds.yaml.in19
-rw-r--r--src/common/options/mgr.yaml.in2
-rw-r--r--src/common/options/osd.yaml.in128
-rw-r--r--src/common/options/rgw.yaml.in8
-rw-r--r--src/common/perf_counters.cc1
-rw-r--r--src/common/perf_counters.h5
-rw-r--r--src/common/perf_counters_cache.h1
-rw-r--r--src/common/pick_address.cc2
-rw-r--r--src/common/strtol.cc71
-rw-r--r--src/common/sync_filesystem.h56
-rw-r--r--src/crimson/os/alienstore/thread_pool.cc2
-rw-r--r--src/crimson/os/seastore/cache.cc10
-rw-r--r--src/crimson/os/seastore/cache.h7
-rw-r--r--src/crimson/os/seastore/onode.h4
-rw-r--r--src/crimson/os/seastore/random_block_manager/block_rb_manager.cc6
-rw-r--r--src/crimson/os/seastore/root_meta.h76
-rw-r--r--src/crimson/os/seastore/seastore_types.cc2
-rw-r--r--src/crimson/os/seastore/seastore_types.h61
-rw-r--r--src/crimson/os/seastore/transaction_manager.cc2
-rw-r--r--src/crimson/os/seastore/transaction_manager.h61
-rw-r--r--src/crimson/osd/backfill_facades.h8
-rw-r--r--src/crimson/osd/backfill_state.cc30
-rw-r--r--src/crimson/osd/backfill_state.h17
-rw-r--r--src/crimson/osd/ec_backend.cc1
-rw-r--r--src/crimson/osd/ec_backend.h1
-rw-r--r--src/crimson/osd/object_context.h130
-rw-r--r--src/crimson/osd/object_context_loader.cc313
-rw-r--r--src/crimson/osd/object_context_loader.h223
-rw-r--r--src/crimson/osd/ops_executer.cc7
-rw-r--r--src/crimson/osd/ops_executer.h11
-rw-r--r--src/crimson/osd/osd_operations/client_request.cc167
-rw-r--r--src/crimson/osd/osd_operations/client_request.h4
-rw-r--r--src/crimson/osd/osd_operations/internal_client_request.cc40
-rw-r--r--src/crimson/osd/osd_operations/osdop_params.h2
-rw-r--r--src/crimson/osd/osd_operations/peering_event.cc3
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.cc22
-rw-r--r--src/crimson/osd/pg.cc132
-rw-r--r--src/crimson/osd/pg.h36
-rw-r--r--src/crimson/osd/pg_backend.h1
-rw-r--r--src/crimson/osd/pg_recovery.h8
-rw-r--r--src/crimson/osd/replicated_backend.cc116
-rw-r--r--src/crimson/osd/replicated_backend.h12
-rw-r--r--src/crimson/osd/shard_services.cc22
-rw-r--r--src/global/signal_handler.cc2
-rw-r--r--src/include/ceph_fs.h4
-rw-r--r--src/include/compat.h51
-rw-r--r--src/include/elist.h4
-rw-r--r--src/include/str_list.h1
-rw-r--r--src/include/uuid.h4
-rw-r--r--src/kv/KeyValueDB.h6
-rw-r--r--src/kv/RocksDBStore.h6
-rw-r--r--src/librados/librados_cxx.cc43
-rw-r--r--src/log/Entry.h6
-rw-r--r--src/log/Log.cc33
-rw-r--r--src/log/Log.h8
-rw-r--r--src/mds/Beacon.cc16
-rw-r--r--src/mds/CDir.cc5
-rw-r--r--src/mds/CDir.h10
-rw-r--r--src/mds/Capability.h2
-rw-r--r--src/mds/Locker.cc14
-rw-r--r--src/mds/MDCache.cc3
-rw-r--r--src/mds/MDLog.cc40
-rw-r--r--src/mds/MDLog.h8
-rw-r--r--src/mds/MDSCacheObject.h2
-rw-r--r--src/mds/MDSDaemon.cc4
-rw-r--r--src/mds/MDSRank.cc7
-rw-r--r--src/mds/MetricAggregator.cc1
-rw-r--r--src/mds/MetricsHandler.cc1
-rw-r--r--src/mds/Migrator.cc126
-rw-r--r--src/mds/Migrator.h32
-rw-r--r--src/mds/PurgeQueue.cc2
-rw-r--r--src/mds/QuiesceAgent.h2
-rw-r--r--src/mds/QuiesceDbEncoding.h48
-rw-r--r--src/mds/QuiesceDbManager.cc2
-rw-r--r--src/mds/Server.cc39
-rw-r--r--src/mds/Server.h3
-rw-r--r--src/mds/SessionMap.cc1
-rw-r--r--src/mds/SessionMap.h4
-rw-r--r--src/messages/MClientCaps.h15
-rw-r--r--src/messages/MOSDPGPCT.h99
-rw-r--r--src/messages/MOSDPGUpdateLogMissing.h28
-rw-r--r--src/messages/MOSDRepOp.h47
-rw-r--r--src/mgr/PyModuleRegistry.cc15
-rw-r--r--src/mon/FSCommands.cc20
-rw-r--r--src/mon/MDSMonitor.cc8
-rw-r--r--src/mon/MgrMap.h20
-rw-r--r--src/mon/MgrMonitor.cc81
-rw-r--r--src/mon/MonCommands.h13
-rw-r--r--src/mon/MonMap.cc11
-rw-r--r--src/mon/Monitor.cc35
-rw-r--r--src/mon/Monitor.h1
-rw-r--r--src/mon/MonmapMonitor.cc36
-rwxr-xr-xsrc/mon/NVMeofGwMap.cc65
-rwxr-xr-xsrc/mon/NVMeofGwMap.h5
-rw-r--r--src/mon/NVMeofGwMon.cc24
-rw-r--r--src/mon/NVMeofGwMon.h3
-rw-r--r--src/mon/OSDMonitor.cc61
-rw-r--r--src/mon/OSDMonitor.h14
-rw-r--r--src/msg/Message.cc5
-rw-r--r--src/msg/Message.h2
-rw-r--r--src/msg/async/Stack.h2
-rw-r--r--src/msg/async/rdma/RDMAStack.cc2
-rw-r--r--src/os/bluestore/BlueStore.cc34
-rw-r--r--src/os/bluestore/BlueStore.h18
-rw-r--r--src/os/bluestore/bluestore_tool.cc15
-rw-r--r--src/os/fs/btrfs_ioctl.h201
-rw-r--r--src/osd/ECBackend.cc16
-rw-r--r--src/osd/ECBackend.h2
-rw-r--r--src/osd/ECCommon.cc10
-rw-r--r--src/osd/ECCommon.h14
-rw-r--r--src/osd/ECMsgTypes.cc12
-rw-r--r--src/osd/ECMsgTypes.h8
-rw-r--r--src/osd/OSD.cc45
-rw-r--r--src/osd/OSD.h5
-rw-r--r--src/osd/PG.cc5
-rw-r--r--src/osd/PGBackend.h26
-rw-r--r--src/osd/PeeringState.cc94
-rw-r--r--src/osd/PeeringState.h73
-rw-r--r--src/osd/PrimaryLogPG.cc43
-rw-r--r--src/osd/PrimaryLogPG.h32
-rw-r--r--src/osd/ReplicatedBackend.cc111
-rw-r--r--src/osd/ReplicatedBackend.h40
-rw-r--r--src/osd/osd_perf_counters.cc16
-rw-r--r--src/osd/osd_perf_counters.h5
-rw-r--r--src/osd/osd_types.cc16
-rw-r--r--src/osd/osd_types.h22
-rw-r--r--src/osd/pg_features.h26
-rw-r--r--src/osd/scrubber/osd_scrub.cc4
-rw-r--r--src/osd/scrubber/osd_scrub_sched.cc14
-rw-r--r--src/osd/scrubber/pg_scrubber.cc119
-rw-r--r--src/osd/scrubber/pg_scrubber.h33
-rw-r--r--src/osd/scrubber/scrub_queue_entry.h5
-rw-r--r--src/osd/scrubber_common.h13
-rw-r--r--src/osdc/Objecter.cc24
-rw-r--r--src/pybind/cephfs/cephfs.pyx12
-rw-r--r--src/pybind/mgr/CMakeLists.txt4
-rw-r--r--src/pybind/mgr/balancer/module.py40
-rw-r--r--src/pybind/mgr/cephadm/cert_mgr.py11
-rw-r--r--src/pybind/mgr/cephadm/inventory.py36
-rw-r--r--src/pybind/mgr/cephadm/module.py264
-rw-r--r--src/pybind/mgr/cephadm/serve.py16
-rw-r--r--src/pybind/mgr/cephadm/services/cephadmservice.py33
-rw-r--r--src/pybind/mgr/cephadm/services/ingress.py11
-rw-r--r--src/pybind/mgr/cephadm/services/mgmt_gateway.py66
-rw-r--r--src/pybind/mgr/cephadm/services/monitoring.py55
-rw-r--r--src/pybind/mgr/cephadm/services/nvmeof.py14
-rw-r--r--src/pybind/mgr/cephadm/ssl_cert_utils.py9
-rw-r--r--src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j22
-rw-r--r--src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j216
-rw-r--r--src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j210
-rw-r--r--src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j217
-rw-r--r--src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j24
-rw-r--r--src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j220
-rw-r--r--src/pybind/mgr/cephadm/tests/test_cephadm.py26
-rw-r--r--src/pybind/mgr/cephadm/tests/test_services.py128
-rw-r--r--src/pybind/mgr/cephadm/utils.py7
-rw-r--r--src/pybind/mgr/dashboard/cherrypy_backports.py199
-rwxr-xr-xsrc/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh15
-rwxr-xr-xsrc/pybind/mgr/dashboard/ci/cephadm/ceph_cluster.yml11
-rw-r--r--src/pybind/mgr/dashboard/ci/cephadm/dnf.conf.tpl10
-rw-r--r--src/pybind/mgr/dashboard/ci/cephadm/initial-ceph.conf9
-rwxr-xr-xsrc/pybind/mgr/dashboard/ci/cephadm/load-podman-image.sh23
-rwxr-xr-xsrc/pybind/mgr/dashboard/ci/cephadm/quick-bootstrap.sh86
-rwxr-xr-xsrc/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh4
-rw-r--r--src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py2
-rwxr-xr-xsrc/pybind/mgr/dashboard/controllers/rgw.py61
-rw-r--r--src/pybind/mgr/dashboard/frontend/package-lock.json9
-rw-r--r--src/pybind/mgr/dashboard/frontend/package.json2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/mirroring/bootstrap-create-modal/bootstrap-create-modal.component.ts19
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-auth-modal/cephfs-auth-modal.component.ts16
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts8
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html210
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts7
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts23
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.spec.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/upgrade/upgrade-progress/upgrade-progress.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.spec.ts10
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.ts6
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html8
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts14
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html135
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts13
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts37
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts8
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts8
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.spec.ts6
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.ts4
-rwxr-xr-x[-rw-r--r--]src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts185
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts2
-rw-r--r--src/pybind/mgr/dashboard/module.py4
-rw-r--r--src/pybind/mgr/dashboard/openapi.yaml1
-rwxr-xr-xsrc/pybind/mgr/dashboard/run-backend-api-tests.sh2
-rw-r--r--src/pybind/mgr/dashboard/services/nvmeof_cli.py4
-rw-r--r--src/pybind/mgr/dashboard/services/nvmeof_conf.py15
-rw-r--r--src/pybind/mgr/dashboard/services/rgw_iam.py24
-rw-r--r--src/pybind/mgr/dashboard/services/service.py7
-rw-r--r--src/pybind/mgr/mgr_module.py2
-rw-r--r--src/pybind/mgr/nfs/ganesha_conf.py2
-rw-r--r--src/pybind/mgr/nfs/tests/test_nfs.py10
-rw-r--r--src/pybind/mgr/orchestrator/_interface.py10
-rw-r--r--src/pybind/mgr/orchestrator/module.py37
-rw-r--r--src/pybind/mgr/prometheus/module.py16
-rw-r--r--src/pybind/mgr/restful/__init__.py1
-rw-r--r--src/pybind/mgr/restful/api/__init__.py39
-rw-r--r--src/pybind/mgr/restful/api/config.py86
-rw-r--r--src/pybind/mgr/restful/api/crush.py25
-rw-r--r--src/pybind/mgr/restful/api/doc.py15
-rw-r--r--src/pybind/mgr/restful/api/mon.py40
-rw-r--r--src/pybind/mgr/restful/api/osd.py135
-rw-r--r--src/pybind/mgr/restful/api/perf.py27
-rw-r--r--src/pybind/mgr/restful/api/pool.py140
-rw-r--r--src/pybind/mgr/restful/api/request.py93
-rw-r--r--src/pybind/mgr/restful/api/server.py35
-rw-r--r--src/pybind/mgr/restful/common.py156
-rw-r--r--src/pybind/mgr/restful/context.py2
-rw-r--r--src/pybind/mgr/restful/decorators.py81
-rw-r--r--src/pybind/mgr/restful/hooks.py10
-rw-r--r--src/pybind/mgr/restful/module.py633
-rw-r--r--src/pybind/mgr/smb/handler.py89
-rw-r--r--src/pybind/mgr/smb/internal.py40
-rw-r--r--src/pybind/mgr/tox.ini5
-rw-r--r--src/pybind/mgr/volumes/fs/async_cloner.py3
-rw-r--r--src/pybind/mgr/volumes/fs/async_job.py47
-rw-r--r--src/pybind/mgr/volumes/fs/purge_queue.py3
-rw-r--r--src/pybind/mgr/zabbix/__init__.py1
-rw-r--r--src/pybind/mgr/zabbix/module.py476
-rw-r--r--src/pybind/mgr/zabbix/zabbix_template.xml3249
-rw-r--r--src/python-common/CMakeLists.txt2
-rw-r--r--src/python-common/ceph/cephadm/__init__.py2
-rw-r--r--src/python-common/ceph/cephadm/images.py19
-rw-r--r--src/python-common/ceph/deployment/drive_group.py4
-rw-r--r--src/python-common/ceph/deployment/drive_selection/filter.py6
-rw-r--r--src/python-common/ceph/deployment/drive_selection/matchers.py5
-rw-r--r--src/python-common/ceph/deployment/drive_selection/selector.py2
-rw-r--r--src/python-common/ceph/deployment/inventory.py2
-rw-r--r--src/python-common/ceph/deployment/service_spec.py91
-rw-r--r--src/python-common/ceph/deployment/translate.py2
-rw-r--r--src/python-common/ceph/fs/earmarking.py20
-rw-r--r--src/python-common/ceph/tests/utils.py3
-rw-r--r--src/python-common/requirements-lint.txt2
-rw-r--r--src/python-common/tox.ini12
-rw-r--r--src/rgw/driver/dbstore/README.md14
-rw-r--r--src/rgw/driver/posix/README.md12
-rw-r--r--src/rgw/driver/rados/rgw_d3n_datacache.cc2
-rw-r--r--src/rgw/driver/rados/rgw_lc_tier.cc5
-rw-r--r--src/rgw/driver/rados/rgw_notify.cc11
-rw-r--r--src/rgw/driver/rados/rgw_rados.cc7
-rw-r--r--src/rgw/driver/rados/rgw_rados.h2
-rw-r--r--src/rgw/driver/rados/rgw_sal_rados.cc7
-rw-r--r--src/rgw/rgw_amqp.cc9
-rw-r--r--src/rgw/rgw_kafka.cc7
-rw-r--r--src/rgw/rgw_lc.cc17
-rw-r--r--src/rgw/rgw_lua_background.cc6
-rw-r--r--src/rgw/rgw_op.cc18
-rw-r--r--src/rgw/rgw_ratelimit.h4
-rw-r--r--src/rgw/rgw_rest_pubsub.cc6
-rw-r--r--src/rgw/rgw_s3select.cc67
m---------src/s3select0
-rwxr-xr-xsrc/script/ceph-debug-docker.sh8
m---------src/spdk0
-rw-r--r--src/test/ObjectMap/KeyValueDBMemory.h9
-rw-r--r--src/test/admin_socket.cc2
-rw-r--r--src/test/admin_socket_output.h1
-rw-r--r--src/test/bench_log.cc2
-rw-r--r--src/test/bufferlist.cc2
-rw-r--r--src/test/ceph_argparse.cc1
-rw-r--r--src/test/common/Throttle.cc1
-rw-r--r--src/test/common/test_cdc.cc1
-rw-r--r--src/test/common/test_config.cc3
-rw-r--r--src/test/common/test_context.cc3
-rw-r--r--src/test/common/test_shared_cache.cc3
-rw-r--r--src/test/common/test_url_escape.cc2
-rw-r--r--src/test/compressor/test_compression.cc3
-rw-r--r--src/test/crimson/seastar_runner.h13
-rw-r--r--src/test/crimson/seastore/test_object_data_handler.cc24
-rw-r--r--src/test/crimson/seastore/test_transaction_manager.cc104
-rw-r--r--src/test/crimson/test_backfill.cc13
-rw-r--r--src/test/crypto.cc2
-rw-r--r--src/test/daemon_config.cc2
-rw-r--r--src/test/encoding.cc2
-rw-r--r--src/test/erasure-code/ceph_erasure_code_benchmark.cc4
-rw-r--r--src/test/osd/types.cc2
-rw-r--r--src/test/osdc/object_cacher_stress.cc1
-rw-r--r--src/test/perf_counters.cc2
-rw-r--r--src/test/pybind/test_cephfs.py4
-rw-r--r--src/test/strtol.cc27
-rw-r--r--src/test/test_addrs.cc1
-rw-r--r--src/test/test_denc.cc2
-rw-r--r--src/test/test_features.cc2
-rw-r--r--src/test/test_mempool.cc2
-rw-r--r--src/test/test_perf_counters_cache.cc1
-rw-r--r--src/test/test_rewrite_latency.cc1
-rw-r--r--src/test/test_snap_mapper.cc1
-rw-r--r--src/test/test_striper.cc2
-rw-r--r--src/test/test_utime.cc3
-rw-r--r--src/test/test_workqueue.cc3
-rw-r--r--src/test/testcrypto.cc4
-rw-r--r--src/test/testkeys.cc3
-rw-r--r--src/tools/radosacl.cc2
-rwxr-xr-xsrc/vstart.sh47
370 files changed, 5627 insertions, 8535 deletions
diff --git a/src/btrfs_ioc_test.c b/src/btrfs_ioc_test.c
deleted file mode 100644
index e12bad14d1b..00000000000
--- a/src/btrfs_ioc_test.c
+++ /dev/null
@@ -1,171 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <asm/types.h>
-#include <sys/stat.h>
-#include <sys/ioctl.h>
-#include <fcntl.h>
-
-#include "common/safe_io.h"
-#include "os/btrfs_ioctl.h"
-
-void do_open_wr(const char *fname, int *fd)
-{
- *fd = open(fname, O_WRONLY | O_CREAT, 0644);
- if (*fd < 0) {
- perror("open");
- exit(1);
- }
-}
-
-void do_open_rd(const char *fname, int *fd)
-{
- *fd = open(fname, O_RDONLY);
- if (*fd < 0) {
- perror("open");
- exit(1);
- }
-}
-
-void do_lseek(int fd, int ofs)
-{
- int rc = lseek(fd, ofs, SEEK_SET);
- if (rc < 0) {
- perror("lseek");
- exit(1);
- }
-}
-
-void do_write(int fd, int len)
-{
- char *buf = malloc(len);
- int rc;
- if (!buf) {
- printf("not enough memory\n");
- exit(1);
- }
-
- memset(buf, 0, len);
- rc = safe_write(fd, buf, len);
- if (rc) {
- fprintf(stderr, "safe_write failed with error %d (%s)\n",
- rc, strerror(rc));
- exit(1);
- }
-
- if (rc != len) {
- printf("invalid number of bytes written\n");
- exit(1);
- }
-
- free(buf);
-}
-
-void do_link(const char *old, const char *new)
-{
- int rc = link(old, new);
- if (rc < 0) {
- perror("link");
- exit(1);
- }
-}
-
-void do_clone_range(int from, int to, int off, int len)
-{
- struct btrfs_ioctl_clone_range_args a;
- int r;
-
- a.src_fd = from;
- a.src_offset = off;
- a.src_length = len;
- a.dest_offset = off;
- r = ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
- if (r < 0) {
- perror("ioctl");
- exit(1);
- }
-}
-
-void do_snap_async(int fd, const char *name, unsigned long long *transid)
-{
- struct btrfs_ioctl_async_vol_args async_args;
- struct btrfs_ioctl_vol_args volargs;
- int r;
-
- strcpy(volargs.name, name);
- volargs.fd = fd;
-
- async_args.args = &volargs;
- async_args.transid = transid;
-
- r = ioctl(fd, BTRFS_IOC_SNAP_CREATE_ASYNC, &async_args);
-
- if (r < 0) {
- perror("ioctl");
- exit(1);
- }
-}
-
-void do_snap_destroy(int fd, const char *name)
-{
- struct btrfs_ioctl_vol_args volargs;
- int r;
-
- strcpy(volargs.name, name);
- volargs.fd = 0;
-
- r = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &volargs);
-
- if (r < 0) {
- perror("snap_destroy: ioctl");
- exit(1);
- }
-}
-
-void do_snap_wait(int fd, unsigned long long transid)
-{
- int r = ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
- if (r < 0) {
- perror("do_snap_wait: ioctl");
- exit(1);
- }
-}
-
-void usage_exit(char *arg)
-{
- printf("usage: %s <btrfs_base> <snap_name>\n", arg);
- exit(1);
-}
-
-#define TEMP_FILENAME "temp"
-#define DEST_FILENAME "dest"
-#define SRC_FILENAME "src"
-
-int main(int argc, char *argv[])
-{
- const char *base_dir;
- const char *snap_name;
-
- int fd;
- int i;
- unsigned long long transid;
-
- if (argc < 3)
- usage_exit(argv[0]);
-
- base_dir = argv[1];
- snap_name = argv[2];
-
- for (i=0; i<10; i++) {
- printf("%d\n", i);
- do_open_rd(base_dir, &fd);
- do_snap_async(fd, snap_name, &transid);
- sleep(2);
- //do_snap_wait(fd, transid);
- do_snap_destroy(fd, snap_name);
- close(fd);
- }
-
- return 0;
-}
diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
index 388f6aeea27..c278de43eb0 100644
--- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py
+++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
@@ -10,7 +10,8 @@ from ceph_volume.api import lvm as api
from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict
from ceph_volume.util.device import Device
from ceph_volume.systemd import systemctl
-from typing import Any, Dict, List
+from ceph_volume.devices.raw.list import direct_report
+from typing import Any, Dict, List, Set
logger = logging.getLogger(__name__)
mlogger = terminal.MultiLogger(__name__)
@@ -95,83 +96,126 @@ def zap_data(path):
'conv=fsync'
])
-def find_associated_devices(osd_id: str = '', osd_fsid: str = '') -> List[api.Volume]:
- """
- From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the
- system that match those tag values, further detect if any partitions are
- part of the OSD, and then return the set of LVs and partitions (if any).
- """
- lv_tags = {}
- lv_tags = {key: value for key, value in {
- 'ceph.osd_id': osd_id,
- 'ceph.osd_fsid': osd_fsid
- }.items() if value}
- lvs = api.get_lvs(tags=lv_tags)
+class Zap:
+ help = 'Removes all data and filesystems from a logical volume or partition.'
- if not lvs:
- raise RuntimeError('Unable to find any LV for zapping OSD: '
- f'{osd_id or osd_fsid}')
- devices_to_zap = ensure_associated_lvs(lvs, lv_tags)
+ def __init__(self, argv: List[str]) -> None:
+ self.argv = argv
+ self.osd_ids_to_zap: List[str] = []
- return [Device(path) for path in set(devices_to_zap) if path]
+ def ensure_associated_raw(self, raw_report: Dict[str, Any]) -> List[str]:
+ osd_id: str = self.args.osd_id
+ osd_uuid: str = self.args.osd_fsid
+ raw_devices: Set[str] = set()
-def ensure_associated_lvs(lvs: List[api.Volume],
- lv_tags: Dict[str, Any] = {}) -> List[str]:
- """
- Go through each LV and ensure if backing devices (journal, wal, block)
- are LVs or partitions, so that they can be accurately reported.
- """
- # look for many LVs for each backing type, because it is possible to
- # receive a filtering for osd.1, and have multiple failed deployments
- # leaving many journals with osd.1 - usually, only a single LV will be
- # returned
-
- db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'}))
- wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'}))
- backing_devices = [(db_lvs, 'db'),
- (wal_lvs, 'wal')]
-
- verified_devices = []
-
- for lv in lvs:
- # go through each lv and append it, otherwise query `blkid` to find
- # a physical device. Do this for each type (journal,db,wal) regardless
- # if they have been processed in the previous LV, so that bad devices
- # with the same ID can be caught
- for ceph_lvs, _type in backing_devices:
- if ceph_lvs:
- verified_devices.extend([l.lv_path for l in ceph_lvs])
- continue
-
- # must be a disk partition, by querying blkid by the uuid we are
- # ensuring that the device path is always correct
- try:
- device_uuid = lv.tags['ceph.%s_uuid' % _type]
- except KeyError:
- # Bluestore will not have ceph.journal_uuid, and Filestore
- # will not not have ceph.db_uuid
- continue
+ if len([details.get('osd_id') for _, details in raw_report.items() if details.get('osd_id') == osd_id]) > 1:
+ if not osd_uuid:
+ raise RuntimeError(f'Multiple OSDs found with id {osd_id}, pass --osd-fsid')
- osd_device = disk.get_device_from_partuuid(device_uuid)
- if not osd_device:
- # if the osd_device is not found by the partuuid, then it is
- # not possible to ensure this device exists anymore, so skip it
- continue
- verified_devices.append(osd_device)
+ if not osd_uuid:
+ for _, details in raw_report.items():
+ if details.get('osd_id') == int(osd_id):
+ osd_uuid = details.get('osd_uuid')
+ break
- verified_devices.append(lv.lv_path)
+ for osd_uuid, details in raw_report.items():
+ device: str = details.get('device')
+ if details.get('osd_uuid') == osd_uuid:
+ raw_devices.add(device)
- # reduce the list from all the duplicates that were added
- return list(set(verified_devices))
+ return list(raw_devices)
+
+ def find_associated_devices(self) -> List[api.Volume]:
+ """From an ``osd_id`` and/or an ``osd_fsid``, filter out all the Logical Volumes (LVs) in the
+ system that match those tag values, further detect if any partitions are
+ part of the OSD, and then return the set of LVs and partitions (if any).
-class Zap:
- help = 'Removes all data and filesystems from a logical volume or partition.'
+ The function first queries the LVM-based OSDs using the provided `osd_id` or `osd_fsid`.
+ If no matches are found, it then searches the system for RAW-based OSDs.
- def __init__(self, argv: List[str]) -> None:
- self.argv = argv
- self.osd_ids_to_zap: List[str] = []
+ Raises:
+ SystemExit: If no OSDs are found, the function raises a `SystemExit` with an appropriate message.
+
+ Returns:
+ List[api.Volume]: A list of `api.Volume` objects corresponding to the OSD's Logical Volumes (LVs)
+ or partitions that are associated with the given `osd_id` or `osd_fsid`.
+
+ Notes:
+ - If neither `osd_id` nor `osd_fsid` are provided, the function will not be able to find OSDs.
+ - The search proceeds from LVM-based OSDs to RAW-based OSDs if no Logical Volumes are found.
+ """
+ lv_tags = {}
+ lv_tags = {key: value for key, value in {
+ 'ceph.osd_id': self.args.osd_id,
+ 'ceph.osd_fsid': self.args.osd_fsid
+ }.items() if value}
+ devices_to_zap: List[str] = []
+ lvs = api.get_lvs(tags=lv_tags)
+
+ if lvs:
+ devices_to_zap = self.ensure_associated_lvs(lvs, lv_tags)
+ else:
+ mlogger.debug(f'No OSD identified by "{self.args.osd_id or self.args.osd_fsid}" was found among LVM-based OSDs.')
+ mlogger.debug('Proceeding to check RAW-based OSDs.')
+ raw_osds: Dict[str, Any] = direct_report()
+ if raw_osds:
+ devices_to_zap = self.ensure_associated_raw(raw_osds)
+ if not devices_to_zap:
+ raise SystemExit('No OSD were found.')
+
+ return [Device(path) for path in set(devices_to_zap) if path]
+
+ def ensure_associated_lvs(self,
+ lvs: List[api.Volume],
+ lv_tags: Dict[str, Any] = {}) -> List[str]:
+ """
+ Go through each LV and ensure if backing devices (journal, wal, block)
+ are LVs or partitions, so that they can be accurately reported.
+ """
+ # look for many LVs for each backing type, because it is possible to
+ # receive a filtering for osd.1, and have multiple failed deployments
+ # leaving many journals with osd.1 - usually, only a single LV will be
+ # returned
+
+ db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'}))
+ wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'}))
+ backing_devices = [(db_lvs, 'db'),
+ (wal_lvs, 'wal')]
+
+ verified_devices = []
+
+ for lv in lvs:
+ # go through each lv and append it, otherwise query `blkid` to find
+ # a physical device. Do this for each type (journal,db,wal) regardless
+ # if they have been processed in the previous LV, so that bad devices
+ # with the same ID can be caught
+ for ceph_lvs, _type in backing_devices:
+ if ceph_lvs:
+ verified_devices.extend([l.lv_path for l in ceph_lvs])
+ continue
+
+ # must be a disk partition, by querying blkid by the uuid we are
+ # ensuring that the device path is always correct
+ try:
+ device_uuid = lv.tags['ceph.%s_uuid' % _type]
+ except KeyError:
+ # Bluestore will not have ceph.journal_uuid, and Filestore
+ # will not not have ceph.db_uuid
+ continue
+
+ osd_device = disk.get_device_from_partuuid(device_uuid)
+ if not osd_device:
+ # if the osd_device is not found by the partuuid, then it is
+ # not possible to ensure this device exists anymore, so skip it
+ continue
+ verified_devices.append(osd_device)
+
+ verified_devices.append(lv.lv_path)
+
+ # reduce the list from all the duplicates that were added
+ return list(set(verified_devices))
def unmount_lv(self, lv: api.Volume) -> None:
if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'):
@@ -355,7 +399,6 @@ class Zap:
SystemExit: When the device is a mapper and not a mpath device.
"""
devices = self.args.devices
-
for device in devices:
mlogger.info("Zapping: %s", device.path)
if device.is_mapper and not device.is_mpath:
@@ -388,7 +431,7 @@ class Zap:
mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id)
mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id)
raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id)
- self.args.devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
+ self.args.devices = self.find_associated_devices()
self.zap()
def dmcrypt_close(self, dmcrypt_uuid: str) -> None:
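Editor's note: the rewritten Zap.find_associated_devices() above queries LVM-tagged volumes first and only falls back to raw-mode OSDs reported by direct_report() when no LVs match, raising SystemExit if neither yields a device. Below is a minimal sketch of that lookup order, not the actual implementation; get_lvs and direct_report are injected stand-ins for ceph_volume.api.lvm.get_lvs and ceph_volume.devices.raw.list.direct_report.

from typing import Any, Dict, List

def find_osd_devices(osd_id: str, osd_fsid: str, get_lvs, direct_report) -> List[str]:
    # Build the LVM tag filter only from the identifiers that were actually given.
    tags = {k: v for k, v in {'ceph.osd_id': osd_id,
                              'ceph.osd_fsid': osd_fsid}.items() if v}
    lvs = get_lvs(tags=tags)
    if lvs:
        # LVM-based OSD: the LV paths are the devices to zap.
        return [lv.lv_path for lv in lvs]
    # No LVM match: fall back to raw (BlueStore directly on disk/partition) OSDs.
    raw_osds: Dict[str, Any] = direct_report()
    devices = [d.get('device') for d in raw_osds.values()
               if str(d.get('osd_id')) == str(osd_id) or d.get('osd_uuid') == osd_fsid]
    if not devices:
        raise SystemExit('no matching OSD found')
    return devices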
diff --git a/src/ceph-volume/ceph_volume/devices/raw/list.py b/src/ceph-volume/ceph_volume/devices/raw/list.py
index f6ac08eab98..68923216a41 100644
--- a/src/ceph-volume/ceph_volume/devices/raw/list.py
+++ b/src/ceph-volume/ceph_volume/devices/raw/list.py
@@ -5,12 +5,14 @@ import logging
from textwrap import dedent
from ceph_volume import decorators, process
from ceph_volume.util import disk
-from typing import Any, Dict, List as _List
+from ceph_volume.util.device import Device
+from typing import Any, Dict, Optional, List as _List
+from concurrent.futures import ThreadPoolExecutor
logger = logging.getLogger(__name__)
-def direct_report(devices):
+def direct_report(devices: Optional[_List[str]] = None) -> Dict[str, Any]:
"""
Other non-cli consumers of listing information will want to consume the
report without the need to parse arguments or other flags. This helper
@@ -20,27 +22,29 @@ def direct_report(devices):
_list = List([])
return _list.generate(devices)
-def _get_bluestore_info(dev: str) -> Dict[str, Any]:
+def _get_bluestore_info(devices: _List[str]) -> Dict[str, Any]:
result: Dict[str, Any] = {}
- out, err, rc = process.call([
- 'ceph-bluestore-tool', 'show-label',
- '--dev', dev], verbose_on_failure=False)
+ command: _List[str] = ['ceph-bluestore-tool',
+ 'show-label', '--bdev_aio_poll_ms=1']
+ for device in devices:
+ command.extend(['--dev', device])
+ out, err, rc = process.call(command, verbose_on_failure=False)
if rc:
- # ceph-bluestore-tool returns an error (below) if device is not bluestore OSD
- # > unable to read label for <device>: (2) No such file or directory
- # but it's possible the error could be for a different reason (like if the disk fails)
- logger.debug(f'assuming device {dev} is not BlueStore; ceph-bluestore-tool failed to get info from device: {out}\n{err}')
+ logger.debug(f"ceph-bluestore-tool couldn't detect any BlueStore device.\n{out}\n{err}")
else:
oj = json.loads(''.join(out))
- if dev not in oj:
- # should be impossible, so warn
- logger.warning(f'skipping device {dev} because it is not reported in ceph-bluestore-tool output: {out}')
- try:
- result = disk.bluestore_info(dev, oj)
- except KeyError as e:
- # this will appear for devices that have a bluestore header but aren't valid OSDs
- # for example, due to incomplete rollback of OSDs: https://tracker.ceph.com/issues/51869
- logger.error(f'device {dev} does not have all BlueStore data needed to be a valid OSD: {out}\n{e}')
+ for device in devices:
+ if device not in oj:
+ # should be impossible, so warn
+ logger.warning(f'skipping device {device} because it is not reported in ceph-bluestore-tool output: {out}')
+ if oj.get(device):
+ try:
+ osd_uuid = oj[device]['osd_uuid']
+ result[osd_uuid] = disk.bluestore_info(device, oj)
+ except KeyError as e:
+ # this will appear for devices that have a bluestore header but aren't valid OSDs
+ # for example, due to incomplete rollback of OSDs: https://tracker.ceph.com/issues/51869
+ logger.error(f'device {device} does not have all BlueStore data needed to be a valid OSD: {out}\n{e}')
return result
@@ -50,68 +54,67 @@ class List(object):
def __init__(self, argv: _List[str]) -> None:
self.argv = argv
-
- def is_atari_partitions(self, _lsblk: Dict[str, Any]) -> bool:
- dev = _lsblk['NAME']
- if _lsblk.get('PKNAME'):
- parent = _lsblk['PKNAME']
- try:
- if disk.has_bluestore_label(parent):
- logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(dev, parent),
- 'device is likely a phantom Atari partition. device info: {}'.format(_lsblk)))
- return True
- except OSError as e:
- logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(dev),
- 'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e)))
- return True
- return False
-
- def exclude_atari_partitions(self, _lsblk_all: Dict[str, Any]) -> _List[Dict[str, Any]]:
- return [_lsblk for _lsblk in _lsblk_all if not self.is_atari_partitions(_lsblk)]
-
- def generate(self, devs=None):
+ self.info_devices: _List[Dict[str, str]] = []
+ self.devices_to_scan: _List[str] = []
+
+ def exclude_atari_partitions(self) -> None:
+ result: _List[str] = []
+ for info_device in self.info_devices:
+ path = info_device['NAME']
+ parent_device = info_device.get('PKNAME')
+ if parent_device:
+ try:
+ if disk.has_bluestore_label(parent_device):
+ logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(path, parent_device),
+ 'device is likely a phantom Atari partition. device info: {}'.format(info_device)))
+ continue
+ except OSError as e:
+ logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(path),
+ 'failed to determine if parent device {} is BlueStore. err: {}'.format(parent_device, e)))
+ continue
+ result.append(path)
+ self.devices_to_scan = result
+
+ def exclude_lvm_osd_devices(self) -> None:
+ with ThreadPoolExecutor() as pool:
+ filtered_devices_to_scan = pool.map(self.filter_lvm_osd_devices, self.devices_to_scan)
+ self.devices_to_scan = [device for device in filtered_devices_to_scan if device is not None]
+
+ def filter_lvm_osd_devices(self, device: str) -> Optional[str]:
+ d = Device(device)
+ return d.path if not d.ceph_device_lvm else None
+
+ def generate(self, devices: Optional[_List[str]] = None) -> Dict[str, Any]:
logger.debug('Listing block devices via lsblk...')
- info_devices = []
- if not devs or not any(devs):
+ if not devices or not any(devices):
# If no devs are given initially, we want to list ALL devices including children and
# parents. Parent disks with child partitions may be the appropriate device to return if
# the parent disk has a bluestore header, but children may be the most appropriate
# devices to return if the parent disk does not have a bluestore header.
- info_devices = disk.lsblk_all(abspath=True)
- devs = [device['NAME'] for device in info_devices if device.get('NAME',)]
+ self.info_devices = disk.lsblk_all(abspath=True)
+ # Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret
+ # bluestore's on-disk format as an Atari partition table. These false Atari partitions
+ # can be interpreted as real OSDs if a bluestore OSD was previously created on the false
+ # partition. See https://tracker.ceph.com/issues/52060 for more info. If a device has a
+ # parent, it is a child. If the parent is a valid bluestore OSD, the child will only
+ # exist if it is a phantom Atari partition, and the child should be ignored. If the
+ # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to
+ # determine whether a parent is bluestore, we should err on the side of not reporting
+ # the child so as not to give a false negative.
+ self.exclude_atari_partitions()
+ self.exclude_lvm_osd_devices()
+
else:
- for dev in devs:
- info_devices.append(disk.lsblk(dev, abspath=True))
-
- # Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret
- # bluestore's on-disk format as an Atari partition table. These false Atari partitions
- # can be interpreted as real OSDs if a bluestore OSD was previously created on the false
- # partition. See https://tracker.ceph.com/issues/52060 for more info. If a device has a
- # parent, it is a child. If the parent is a valid bluestore OSD, the child will only
- # exist if it is a phantom Atari partition, and the child should be ignored. If the
- # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to
- # determine whether a parent is bluestore, we should err on the side of not reporting
- # the child so as not to give a false negative.
- info_devices = self.exclude_atari_partitions(info_devices)
-
- result = {}
- logger.debug('inspecting devices: {}'.format(devs))
- for info_device in info_devices:
- bs_info = _get_bluestore_info(info_device['NAME'])
- if not bs_info:
- # None is also returned in the rare event that there is an issue reading info from
- # a BlueStore disk, so be sure to log our assumption that it isn't bluestore
- logger.info('device {} does not have BlueStore information'.format(info_device['NAME']))
- continue
- uuid = bs_info['osd_uuid']
- if uuid not in result:
- result[uuid] = {}
- result[uuid].update(bs_info)
+ self.devices_to_scan = devices
+
+ result: Dict[str, Any] = {}
+ logger.debug('inspecting devices: {}'.format(self.devices_to_scan))
+ result = _get_bluestore_info(self.devices_to_scan)
return result
@decorators.needs_root
- def list(self, args):
+ def list(self, args: argparse.Namespace) -> None:
report = self.generate(args.device)
if args.format == 'json':
print(json.dumps(report, indent=4, sort_keys=True))
@@ -120,7 +123,7 @@ class List(object):
raise SystemExit('No valid Ceph devices found')
raise RuntimeError('not implemented yet')
- def main(self):
+ def main(self) -> None:
sub_command_help = dedent("""
List OSDs on raw devices with raw device labels (usually the first
block of the device).
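Editor's note: two things change in raw/list.py: labels are now read for all candidate devices with a single ceph-bluestore-tool show-label invocation (one --dev flag per device) instead of one call per device, and LVM-backed devices are filtered out beforehand in parallel via a ThreadPoolExecutor. A loose sketch of the batched call is below, assuming call stands in for ceph_volume.process.call and ignoring the per-device error handling that the real _get_bluestore_info() delegates to disk.bluestore_info().

import json
from typing import Any, Dict, List

def show_labels(devices: List[str], call) -> Dict[str, Any]:
    cmd = ['ceph-bluestore-tool', 'show-label', '--bdev_aio_poll_ms=1']
    for dev in devices:
        cmd.extend(['--dev', dev])          # one invocation covers every device
    out, err, rc = call(cmd, verbose_on_failure=False)
    if rc:
        return {}                           # nothing looked like a BlueStore OSD
    labels = json.loads(''.join(out))
    # Re-key the per-device labels by osd_uuid so callers can look OSDs up directly.
    return {info['osd_uuid']: dict(info, device=dev)
            for dev, info in labels.items() if 'osd_uuid' in info}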
diff --git a/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py b/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py
index ba3719cd3f3..aa11d553723 100644
--- a/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py
+++ b/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py
@@ -367,7 +367,7 @@ class LvmBlueStore(BlueStore):
if is_encrypted:
osd_lv_path = '/dev/mapper/%s' % osd_block_lv.__dict__['lv_uuid']
lockbox_secret = osd_block_lv.tags['ceph.cephx_lockbox_secret']
- self.with_tpm = bool(osd_block_lv.tags.get('ceph.with_tpm', 0))
+ self.with_tpm = osd_block_lv.tags.get('ceph.with_tpm') == '1'
if not self.with_tpm:
encryption_utils.write_lockbox_keyring(osd_id,
osd_fsid,
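Editor's note: the one-line lvmbluestore.py change is a truthiness fix. LVM tag values are strings, so the old bool(...) conversion treated a stored "0" as "TPM enabled". For illustration:

tags = {'ceph.with_tpm': '0'}
bool(tags.get('ceph.with_tpm', 0))    # True  -- '0' is a non-empty string
tags.get('ceph.with_tpm') == '1'      # False -- the comparison the patch switches to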
diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py
new file mode 100644
index 00000000000..cca64e83ab0
--- /dev/null
+++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py
@@ -0,0 +1,81 @@
+ceph_bluestore_tool_output = '''
+{
+ "/dev/sdb": {
+ "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6",
+ "size": 1099511627776,
+ "btime": "2021-07-23T16:02:22.809186+0000",
+ "description": "main",
+ "bfm_blocks": "268435456",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "1099511627776",
+ "bluefs": "1",
+ "ceph_fsid": "sdb-fsid",
+ "ceph_version_when_created": "ceph version 19.3.0-5537-gb9ba4e48 (b9ba4e48633d6d90d5927a4e66b9ecbb4d7e6e73) squid (dev)",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "0"
+ },
+ "/dev/vdx": {
+ "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6",
+ "size": 214748364800,
+ "btime": "2024-10-16T10:51:05.955279+0000",
+ "description": "main",
+ "bfm_blocks": "52428800",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "214748364800",
+ "bluefs": "1",
+ "ceph_fsid": "2d20bc8c-8a0c-11ef-aaba-525400e54507",
+ "ceph_version_when_created": "ceph version 19.3.0-5537-gb9ba4e48 (b9ba4e48633d6d90d5927a4e66b9ecbb4d7e6e73) squid (dev)",
+ "created_at": "2024-10-16T10:51:09.121455Z",
+ "elastic_shared_blobs": "1",
+ "epoch": "16",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "multi": "yes",
+ "osd_key": "AQCZmg9nxOKTCBAA6EQftuqMuKMHqypSAfqBsQ==",
+ "ready": "ready",
+ "type": "bluestore",
+ "whoami": "5"
+ },
+ "/dev/vdy": {
+ "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6",
+ "size": 214748364800,
+ "btime": "2024-10-16T10:51:05.961279+0000",
+ "description": "bluefs db"
+ },
+ "/dev/vdz": {
+ "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6",
+ "size": 214748364800,
+ "btime": "2024-10-16T10:51:05.961279+0000",
+ "description": "bluefs wal"
+ }
+}
+'''.split('\n')
+
+lsblk_all = ['NAME="/dev/sdb" KNAME="/dev/sdb" PKNAME="" PARTLABEL=""',
+ 'NAME="/dev/sdx" KNAME="/dev/sdx" PKNAME="" PARTLABEL=""',
+ 'NAME="/dev/sdy" KNAME="/dev/sdy" PKNAME="" PARTLABEL=""',
+ 'NAME="/dev/sdz" KNAME="/dev/sdz" PKNAME="" PARTLABEL=""']
+
+blkid_output = ['/dev/ceph-1172bba3-3e0e-45e5-ace6-31ae8401221f/osd-block-5050a85c-d1a7-4d66-b4ba-2e9b1a2970ae: TYPE="ceph_bluestore" USAGE="other"']
+
+udevadm_property = '''DEVNAME=/dev/sdb
+DEVTYPE=disk
+ID_ATA=1
+ID_BUS=ata
+ID_MODEL=SK_hynix_SC311_SATA_512GB
+ID_PART_TABLE_TYPE=gpt
+ID_PART_TABLE_UUID=c8f91d57-b26c-4de1-8884-0c9541da288c
+ID_PATH=pci-0000:00:17.0-ata-3
+ID_PATH_TAG=pci-0000_00_17_0-ata-3
+ID_REVISION=70000P10
+ID_SERIAL=SK_hynix_SC311_SATA_512GB_MS83N71801150416A
+TAGS=:systemd:
+USEC_INITIALIZED=16117769'''.split('\n')
\ No newline at end of file
diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
index efe52c053ff..d9b3bdfd239 100644
--- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
+++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
@@ -1,3 +1,4 @@
+# type: ignore
import os
import pytest
from copy import deepcopy
@@ -5,6 +6,25 @@ from mock.mock import patch, call, Mock
from ceph_volume import process
from ceph_volume.api import lvm as api
from ceph_volume.devices.lvm import zap
+from . import data_zap
+from typing import Tuple, List
+
+
+def process_call(command, **kw):
+ result: Tuple[List[str], List[str], int] = ''
+ if 'udevadm' in command:
+ result = data_zap.udevadm_property, [], 0
+ if 'ceph-bluestore-tool' in command:
+ result = data_zap.ceph_bluestore_tool_output, [], 0
+ if 'is-active' in command:
+ result = [], [], 1
+ if 'lsblk' in command:
+ result = data_zap.lsblk_all, [], 0
+ if 'blkid' in command:
+ result = data_zap.blkid_output, [], 0
+ if 'pvs' in command:
+ result = [], [], 0
+ return result
class TestZap:
@@ -30,10 +50,10 @@ class TestZap:
zap.Zap(argv=['--clear', '/dev/foo']).main()
assert e.value.code == 1
-
-class TestFindAssociatedDevices(object):
-
- def test_no_lvs_found_that_match_id(self, monkeypatch, device_info):
+ @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={}))
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_no_lvs_and_raw_found_that_match_id(self, is_root, monkeypatch, device_info):
tags = 'ceph.osd_id=9,ceph.journal_uuid=x,ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='vg',
lv_tags=tags, lv_path='/dev/VolGroup/lv')
@@ -41,10 +61,15 @@ class TestFindAssociatedDevices(object):
volumes.append(osd)
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {})
- with pytest.raises(RuntimeError):
- zap.find_associated_devices(osd_id=10)
+ z = zap.Zap(['--osd-id', '10'])
- def test_no_lvs_found_that_match_fsid(self, monkeypatch, device_info):
+ with pytest.raises(SystemExit):
+ z.main()
+
+ @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={}))
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_no_lvs_and_raw_found_that_match_fsid(self, is_root, monkeypatch):
tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,'+\
'ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', lv_tags=tags,
@@ -53,10 +78,15 @@ class TestFindAssociatedDevices(object):
volumes.append(osd)
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {})
- with pytest.raises(RuntimeError):
- zap.find_associated_devices(osd_fsid='aaaa-lkjh')
+ z = zap.Zap(['--osd-fsid', 'aaaa-lkjh'])
- def test_no_lvs_found_that_match_id_fsid(self, monkeypatch, device_info):
+ with pytest.raises(SystemExit):
+ z.main()
+
+ @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={}))
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_no_lvs_and_raw_found_that_match_id_fsid(self, is_root, monkeypatch):
tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,'+\
'ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='vg',
@@ -65,45 +95,82 @@ class TestFindAssociatedDevices(object):
volumes.append(osd)
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {})
- with pytest.raises(RuntimeError):
- zap.find_associated_devices(osd_id='9', osd_fsid='aaaa-lkjh')
+ z = zap.Zap(['--osd-id', '9', '--osd-fsid', 'aaaa-lkjh'])
+
+ with pytest.raises(SystemExit):
+ z.main()
- def test_no_ceph_lvs_found(self, monkeypatch):
+ @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={}))
+ def test_no_ceph_lvs_and_no_ceph_raw_found(self, is_root, monkeypatch):
osd = api.Volume(lv_name='volume1', lv_uuid='y', lv_tags='',
lv_path='/dev/VolGroup/lv')
volumes = []
volumes.append(osd)
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {})
- with pytest.raises(RuntimeError):
- zap.find_associated_devices(osd_id=100)
+ z = zap.Zap(['--osd-id', '100'])
+
+ with pytest.raises(SystemExit):
+ z.main()
- def test_lv_is_matched_id(self, monkeypatch):
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_lv_is_matched_id(self, mock_zap, monkeypatch, is_root):
tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='',
lv_path='/dev/VolGroup/lv', lv_tags=tags)
+ volumes = [osd]
+ monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
+
+ z = zap.Zap(['--osd-id', '0'])
+ z.main()
+ assert z.args.devices[0].path == '/dev/VolGroup/lv'
+ mock_zap.assert_called_once()
+
+ # @patch('ceph_volume.devices.lvm.zap.disk.has_bluestore_label', Mock(return_value=True))
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_raw_is_matched_id(self, mock_zap, monkeypatch, is_root):
volumes = []
- volumes.append(osd)
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
- monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0))
- result = zap.find_associated_devices(osd_id='0')
- assert result[0].path == '/dev/VolGroup/lv'
+ z = zap.Zap(['--osd-id', '0'])
+ z.main()
+ assert z.args.devices[0].path == '/dev/sdb'
+ mock_zap.assert_called_once()
- def test_lv_is_matched_fsid(self, monkeypatch):
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ def test_lv_is_matched_fsid(self, mock_zap, monkeypatch, is_root):
tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,' +\
'ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='',
lv_path='/dev/VolGroup/lv', lv_tags=tags)
- volumes = []
- volumes.append(osd)
+ volumes = [osd]
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: deepcopy(volumes))
monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0))
- result = zap.find_associated_devices(osd_fsid='asdf-lkjh')
- assert result[0].path == '/dev/VolGroup/lv'
+ z = zap.Zap(['--osd-fsid', 'asdf-lkjh'])
+ z.main()
+
+ assert z.args.devices[0].path == '/dev/VolGroup/lv'
+ mock_zap.assert_called_once
+
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_raw_is_matched_fsid(self, mock_zap, monkeypatch, is_root):
+ volumes = []
+ monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
+
+ z = zap.Zap(['--osd-fsid', 'd5a496bc-dcb9-4ad0-a12c-393d3200d2b6'])
+ z.main()
- def test_lv_is_matched_id_fsid(self, monkeypatch):
+ assert z.args.devices[0].path == '/dev/sdb'
+ mock_zap.assert_called_once
+
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ def test_lv_is_matched_id_fsid(self, mock_zap, monkeypatch, is_root):
tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,' +\
'ceph.type=data'
osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='',
@@ -113,26 +180,43 @@ class TestFindAssociatedDevices(object):
monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0))
- result = zap.find_associated_devices(osd_id='0', osd_fsid='asdf-lkjh')
- assert result[0].path == '/dev/VolGroup/lv'
-
+ z = zap.Zap(['--osd-id', '0', '--osd-fsid', 'asdf-lkjh', '--no-systemd'])
+ z.main()
-class TestEnsureAssociatedLVs(object):
+ assert z.args.devices[0].path == '/dev/VolGroup/lv'
+ mock_zap.assert_called_once
- @patch('ceph_volume.devices.lvm.zap.api', Mock(return_value=[]))
- def test_nothing_is_found(self):
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb'))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_raw_is_matched_id_fsid(self, mock_zap, monkeypatch, is_root):
volumes = []
- result = zap.ensure_associated_lvs(volumes)
- assert result == []
+ monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
- def test_data_is_found(self, fake_call):
- tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
- osd = api.Volume(
- lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/data', lv_tags=tags)
+ z = zap.Zap(['--osd-id', '0', '--osd-fsid', 'd5a496bc-dcb9-4ad0-a12c-393d3200d2b6'])
+ z.main()
+
+ assert z.args.devices[0].path == '/dev/sdb'
+ mock_zap.assert_called_once
+
+ @patch('ceph_volume.devices.lvm.zap.Zap.zap')
+ @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(side_effect=['/dev/vdx', '/dev/vdy', '/dev/vdz', None]))
+ @patch('ceph_volume.process.call', Mock(side_effect=process_call))
+ def test_raw_multiple_devices(self, mock_zap, monkeypatch, is_root):
volumes = []
- volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
- assert result == ['/dev/VolGroup/data']
+ monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes)
+ z = zap.Zap(['--osd-id', '5'])
+ z.main()
+
+ set([device.path for device in z.args.devices]) == {'/dev/vdx', '/dev/vdy', '/dev/vdz'}
+ mock_zap.assert_called_once
+
+ @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={}))
+ @patch('ceph_volume.devices.lvm.zap.api.get_lvs', Mock(return_value=[]))
+ def test_nothing_is_found(self, is_root):
+ z = zap.Zap(['--osd-id', '0'])
+ with pytest.raises(SystemExit):
+ z.main()
def test_block_is_found(self, fake_call):
tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block'
@@ -140,7 +224,7 @@ class TestEnsureAssociatedLVs(object):
lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags)
volumes = []
volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
+ result = zap.Zap([]).ensure_associated_lvs(volumes)
assert result == ['/dev/VolGroup/block']
def test_success_message_for_fsid(self, factory, is_root, capsys):
@@ -159,28 +243,6 @@ class TestEnsureAssociatedLVs(object):
out, err = capsys.readouterr()
assert "Zapping successful for OSD: 1" in err
- def test_journal_is_found(self, fake_call):
- tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
- osd = api.Volume(
- lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
- volumes = []
- volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
- assert result == ['/dev/VolGroup/lv']
-
- @patch('ceph_volume.api.lvm.process.call', Mock(return_value=('', '', 0)))
- def test_multiple_journals_are_found(self):
- tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
- volumes = []
- for i in range(3):
- osd = api.Volume(
- lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
- volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
- assert '/dev/VolGroup/lv0' in result
- assert '/dev/VolGroup/lv1' in result
- assert '/dev/VolGroup/lv2' in result
-
@patch('ceph_volume.api.lvm.process.call', Mock(return_value=('', '', 0)))
def test_multiple_dbs_are_found(self):
tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=db'
@@ -189,7 +251,7 @@ class TestEnsureAssociatedLVs(object):
osd = api.Volume(
lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
+ result = zap.Zap([]).ensure_associated_lvs(volumes)
assert '/dev/VolGroup/lv0' in result
assert '/dev/VolGroup/lv1' in result
assert '/dev/VolGroup/lv2' in result
@@ -202,7 +264,7 @@ class TestEnsureAssociatedLVs(object):
osd = api.Volume(
lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
+ result = zap.Zap([]).ensure_associated_lvs(volumes)
assert '/dev/VolGroup/lv0' in result
assert '/dev/VolGroup/lv1' in result
assert '/dev/VolGroup/lv2' in result
@@ -215,14 +277,14 @@ class TestEnsureAssociatedLVs(object):
osd = api.Volume(
lv_name='volume%s' % _type, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % _type, lv_tags=tags)
volumes.append(osd)
- result = zap.ensure_associated_lvs(volumes)
+ result = zap.Zap([]).ensure_associated_lvs(volumes)
assert '/dev/VolGroup/lvjournal' in result
assert '/dev/VolGroup/lvwal' in result
assert '/dev/VolGroup/lvdb' in result
@patch('ceph_volume.devices.lvm.zap.api.get_lvs')
def test_ensure_associated_lvs(self, m_get_lvs):
- zap.ensure_associated_lvs([], lv_tags={'ceph.osd_id': '1'})
+ zap.Zap([]).ensure_associated_lvs([], lv_tags={'ceph.osd_id': '1'})
calls = [
call(tags={'ceph.type': 'db', 'ceph.osd_id': '1'}),
call(tags={'ceph.type': 'wal', 'ceph.osd_id': '1'})
diff --git a/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py b/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py
new file mode 100644
index 00000000000..e1d1a48967a
--- /dev/null
+++ b/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py
@@ -0,0 +1,102 @@
+ceph_bluestore_tool_show_label_output: str = '''{
+ "/dev/sdb": {
+ "osd_uuid": "sdb-uuid",
+ "size": 1099511627776,
+ "btime": "2021-07-23T16:02:22.809186+0000",
+ "description": "main",
+ "bfm_blocks": "268435456",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "1099511627776",
+ "bluefs": "1",
+ "ceph_fsid": "sdb-fsid",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "0"
+ },
+ "/dev/sdb2": {
+ "osd_uuid": "sdb2-uuid",
+ "size": 1099511627776,
+ "btime": "2021-07-23T16:02:22.809186+0000",
+ "description": "main",
+ "bfm_blocks": "268435456",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "1099511627776",
+ "bluefs": "1",
+ "ceph_fsid": "sdb2-fsid",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "2"
+ },
+ "/dev/sde1": {
+ "osd_uuid": "sde1-uuid",
+ "size": 214747316224,
+ "btime": "2023-07-26T13:20:19.509457+0000",
+ "description": "main",
+ "bfm_blocks": "268435456",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "214747316224",
+ "bluefs": "1",
+ "ceph_fsid": "sde1-fsid",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQCSHcFkUeLIMBAAjKqANkXafjvVISkXt6FGCA==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "1"
+ },
+ "/dev/mapper/ceph--osd--block--1": {
+ "osd_uuid": "lvm-1-uuid",
+ "size": 549751619584,
+ "btime": "2021-07-23T16:04:37.881060+0000",
+ "description": "main",
+ "bfm_blocks": "134216704",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "549751619584",
+ "bluefs": "1",
+ "ceph_fsid": "lvm-1-fsid",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "2"
+ },
+ "/dev/mapper/ceph--osd--block--1": {
+ "osd_uuid": "lvm-1-uuid",
+ "size": 549751619584,
+ "btime": "2021-07-23T16:04:37.881060+0000",
+ "description": "main",
+ "bfm_blocks": "134216704",
+ "bfm_blocks_per_key": "128",
+ "bfm_bytes_per_block": "4096",
+ "bfm_size": "549751619584",
+ "bluefs": "1",
+ "ceph_fsid": "lvm-1-fsid",
+ "kv_backend": "rocksdb",
+ "magic": "ceph osd volume v026",
+ "mkfs_done": "yes",
+ "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==",
+ "ready": "ready",
+ "require_osd_release": "16",
+ "type": "bluestore",
+ "whoami": "2"
+ }
+}'''
\ No newline at end of file
diff --git a/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py b/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py
index 604fb4faa3e..23d2bfdaa2c 100644
--- a/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py
+++ b/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py
@@ -1,5 +1,7 @@
+# type: ignore
import pytest
-from mock.mock import patch
+from .data_list import ceph_bluestore_tool_show_label_output
+from mock.mock import patch, Mock
from ceph_volume.devices import raw
# Sample lsblk output is below that overviews the test scenario. (--json output for reader clarity)
@@ -74,98 +76,6 @@ def _lsblk_output(dev, parent=None):
ret = 'NAME="{}" KNAME="{}" PKNAME="{}"'.format(dev, dev, parent)
return [ret] # needs to be in a list form
-def _bluestore_tool_label_output_sdb():
- return '''{
- "/dev/sdb": {
- "osd_uuid": "sdb-uuid",
- "size": 1099511627776,
- "btime": "2021-07-23T16:02:22.809186+0000",
- "description": "main",
- "bfm_blocks": "268435456",
- "bfm_blocks_per_key": "128",
- "bfm_bytes_per_block": "4096",
- "bfm_size": "1099511627776",
- "bluefs": "1",
- "ceph_fsid": "sdb-fsid",
- "kv_backend": "rocksdb",
- "magic": "ceph osd volume v026",
- "mkfs_done": "yes",
- "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
- "ready": "ready",
- "require_osd_release": "16",
- "whoami": "0"
- }
-}'''
-
-def _bluestore_tool_label_output_sdb2():
- return '''{
- "/dev/sdb2": {
- "osd_uuid": "sdb2-uuid",
- "size": 1099511627776,
- "btime": "2021-07-23T16:02:22.809186+0000",
- "description": "main",
- "bfm_blocks": "268435456",
- "bfm_blocks_per_key": "128",
- "bfm_bytes_per_block": "4096",
- "bfm_size": "1099511627776",
- "bluefs": "1",
- "ceph_fsid": "sdb2-fsid",
- "kv_backend": "rocksdb",
- "magic": "ceph osd volume v026",
- "mkfs_done": "yes",
- "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
- "ready": "ready",
- "require_osd_release": "16",
- "whoami": "2"
- }
-}'''
-
-def _bluestore_tool_label_output_sde1():
- return '''{
- "/dev/sde1": {
- "osd_uuid": "sde1-uuid",
- "size": 214747316224,
- "btime": "2023-07-26T13:20:19.509457+0000",
- "description": "main",
- "bfm_blocks": "268435456",
- "bfm_blocks_per_key": "128",
- "bfm_bytes_per_block": "4096",
- "bfm_size": "214747316224",
- "bluefs": "1",
- "ceph_fsid": "sde1-fsid",
- "kv_backend": "rocksdb",
- "magic": "ceph osd volume v026",
- "mkfs_done": "yes",
- "osd_key": "AQCSHcFkUeLIMBAAjKqANkXafjvVISkXt6FGCA==",
- "ready": "ready",
- "require_osd_release": "16",
- "whoami": "1"
- }
-}'''
-
-def _bluestore_tool_label_output_dm_okay():
- return '''{
- "/dev/mapper/ceph--osd--block--1": {
- "osd_uuid": "lvm-1-uuid",
- "size": 549751619584,
- "btime": "2021-07-23T16:04:37.881060+0000",
- "description": "main",
- "bfm_blocks": "134216704",
- "bfm_blocks_per_key": "128",
- "bfm_bytes_per_block": "4096",
- "bfm_size": "549751619584",
- "bluefs": "1",
- "ceph_fsid": "lvm-1-fsid",
- "kv_backend": "rocksdb",
- "magic": "ceph osd volume v026",
- "mkfs_done": "yes",
- "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==",
- "ready": "ready",
- "require_osd_release": "16",
- "whoami": "2"
- }
-}'''
-
def _process_call_side_effect(command, **kw):
if "lsblk" in command:
if "/dev/" in command[-1]:
@@ -186,19 +96,7 @@ def _process_call_side_effect(command, **kw):
pytest.fail('command {} needs behavior specified for it'.format(command))
if "ceph-bluestore-tool" in command:
- if "/dev/sdb" in command:
- # sdb is a bluestore OSD
- return _bluestore_tool_label_output_sdb(), '', 0
- if "/dev/sdb2" in command:
- # sdb2 is a phantom atari partition that appears to have some valid bluestore info
- return _bluestore_tool_label_output_sdb2(), '', 0
- if "/dev/sde1" in command:
- return _bluestore_tool_label_output_sde1(), '', 0
- if "/dev/mapper/ceph--osd--block--1" in command:
- # dm device 1 is a valid bluestore OSD (the other is corrupted/invalid)
- return _bluestore_tool_label_output_dm_okay(), '', 0
- # sda and children, sdb's children, sdc, sdd, dm device 2 all do NOT have bluestore OSD data
- return [], 'fake No such file or directory error', 1
+ return ceph_bluestore_tool_show_label_output, '', 0
pytest.fail('command {} needs behavior specified for it'.format(command))
def _has_bluestore_label_side_effect(disk_path):
@@ -224,6 +122,7 @@ def _has_bluestore_label_side_effect(disk_path):
class TestList(object):
+ @patch('ceph_volume.devices.raw.list.List.exclude_lvm_osd_devices', Mock())
@patch('ceph_volume.util.device.disk.get_devices')
@patch('ceph_volume.util.disk.has_bluestore_label')
@patch('ceph_volume.process.call')
@@ -257,6 +156,7 @@ class TestList(object):
assert sde1['ceph_fsid'] == 'sde1-fsid'
assert sde1['type'] == 'bluestore'
+ @patch('ceph_volume.devices.raw.list.List.exclude_lvm_osd_devices', Mock())
@patch('ceph_volume.util.device.disk.get_devices')
@patch('ceph_volume.util.disk.has_bluestore_label')
@patch('ceph_volume.process.call')
@@ -275,4 +175,4 @@ class TestList(object):
result = raw.list.List([]).generate()
assert len(result) == 2
- assert 'sdb-uuid' in result
+ assert {'sdb-uuid', 'sde1-uuid'} == set(result.keys())
diff --git a/src/ceph-volume/ceph_volume/tests/test_inventory.py b/src/ceph-volume/ceph_volume/tests/test_inventory.py
index 832c0836642..29cd1fc4e4d 100644
--- a/src/ceph-volume/ceph_volume/tests/test_inventory.py
+++ b/src/ceph-volume/ceph_volume/tests/test_inventory.py
@@ -118,7 +118,7 @@ def device_data(device_info):
class TestInventory(object):
expected_keys = [
- 'ceph_device',
+ 'ceph_device_lvm',
'path',
'rejected_reasons',
'sys_api',
diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py
index 82ee3266e3f..04eefeac750 100644
--- a/src/ceph-volume/ceph_volume/util/device.py
+++ b/src/ceph-volume/ceph_volume/util/device.py
@@ -86,7 +86,7 @@ class Device(object):
{attr:<25} {value}"""
report_fields = [
- 'ceph_device',
+ 'ceph_device_lvm',
'rejected_reasons',
'available',
'path',
@@ -137,7 +137,7 @@ class Device(object):
self.blkid_api = None
self._exists = None
self._is_lvm_member = None
- self.ceph_device = False
+ self.ceph_device_lvm = False
self.being_replaced: bool = self.is_being_replaced
self._parse()
if self.path in sys_info.devices.keys():
@@ -236,7 +236,7 @@ class Device(object):
self.path = lv.lv_path
self.vg_name = lv.vg_name
self.lv_name = lv.name
- self.ceph_device = lvm.is_ceph_device(lv)
+ self.ceph_device_lvm = lvm.is_ceph_device(lv)
else:
self.lvs = []
if self.lsblk_all:
@@ -366,7 +366,7 @@ class Device(object):
self._is_lvm_member = True
self.lvs.extend(lvm.get_device_lvs(path))
if self.lvs:
- self.ceph_device = any([True if lv.tags.get('ceph.osd_id') else False for lv in self.lvs])
+ self.ceph_device_lvm = any([True if lv.tags.get('ceph.osd_id') else False for lv in self.lvs])
def _get_partitions(self):
"""
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index 30ee56808c7..77b55314f66 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -7,7 +7,7 @@ import json
from ceph_volume import process, allow_loop_devices
from ceph_volume.api import lvm
from ceph_volume.util.system import get_file_contents
-from typing import Dict, List, Any, Union
+from typing import Dict, List, Any, Union, Optional
logger = logging.getLogger(__name__)
@@ -251,7 +251,9 @@ def lsblk(device, columns=None, abspath=False):
return result[0]
-def lsblk_all(device='', columns=None, abspath=False):
+def lsblk_all(device: str = '',
+ columns: Optional[List[str]] = None,
+ abspath: bool = False) -> List[Dict[str, str]]:
"""
Create a dictionary of identifying values for a device using ``lsblk``.
Each supported column is a key, in its *raw* format (all uppercase
@@ -332,7 +334,6 @@ def lsblk_all(device='', columns=None, abspath=False):
if device:
base_command.append('--nodeps')
base_command.append(device)
-
out, err, rc = process.call(base_command)
if rc != 0:
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 3fa5346b463..68fe30760a7 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -81,9 +81,10 @@ static void fuse_usage()
void usage()
{
cout <<
-"usage: ceph-fuse [-n client.username] [-m mon-ip-addr:mon-port] <mount point> [OPTIONS]\n"
-" --client_mountpoint/-r <sub_directory>\n"
-" use sub_directory as the mounted root, rather than the full Ceph tree.\n"
+"\nusage: ceph-fuse [-n client.username] [-m mon-ip-addr:mon-port] [--client_fs <fsname>] [--client_mountpoint/-r <sub_directory>] <mount point> [OPTIONS]\n\n"
+
+" --client_mountpoint/-r: use sub_directory as the mounted root, rather than the full CephFS tree.\n"
+" --client_fs: named file system to mount (default: usually the first file system created).\n"
"\n";
fuse_usage();
generic_client_usage();
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index 5a917fa807c..ba8726a2be3 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -81,7 +81,7 @@ static void handle_mds_signal(int signum)
int main(int argc, const char **argv)
{
- ceph_pthread_setname(pthread_self(), "ceph-mds");
+ ceph_pthread_setname("ceph-mds");
auto args = argv_to_vec(argc, argv);
if (args.empty()) {
diff --git a/src/ceph_mgr.cc b/src/ceph_mgr.cc
index 67bda0c51be..bd2c643bc6b 100644
--- a/src/ceph_mgr.cc
+++ b/src/ceph_mgr.cc
@@ -41,7 +41,7 @@ static void usage()
*/
int main(int argc, const char **argv)
{
- ceph_pthread_setname(pthread_self(), "ceph-mgr");
+ ceph_pthread_setname("ceph-mgr");
auto args = argv_to_vec(argc, argv);
if (args.empty()) {
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 279fdb20ccb..63eb252e38f 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -250,7 +250,7 @@ int main(int argc, const char **argv)
{
// reset our process name, in case we did a respawn, so that it's not
// left as "exe".
- ceph_pthread_setname(pthread_self(), "ceph-mon");
+ ceph_pthread_setname("ceph-mon");
int err;
diff --git a/src/ceph_nvmeof_monitor_client.cc b/src/ceph_nvmeof_monitor_client.cc
index 05457998cb8..fa41bed08ad 100644
--- a/src/ceph_nvmeof_monitor_client.cc
+++ b/src/ceph_nvmeof_monitor_client.cc
@@ -45,7 +45,7 @@ static void usage()
*/
int main(int argc, const char **argv)
{
- ceph_pthread_setname(pthread_self(), "ceph-nvmeof-monitor-client");
+ ceph_pthread_setname("ceph-nvmeof-monitor-client");
auto args = argv_to_vec(argc, argv);
if (args.empty()) {
diff --git a/src/cephadm/build.py b/src/cephadm/build.py
index ed39c84e9af..43bc58a4003 100755
--- a/src/cephadm/build.py
+++ b/src/cephadm/build.py
@@ -269,6 +269,9 @@ def _build(dest, src, config):
mdir.mkdir(parents=True, exist_ok=True)
(mdir / "__init__.py").touch(exist_ok=True)
versioning_vars = config.cli_args.version_vars
+ shutil.copytree(
+ "../python-common/ceph", appdir / "ceph"
+ )
if versioning_vars:
generate_version_file(versioning_vars, mdir / "version.py")
if dinfo:
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index e32e2bc49f3..d2ddf564116 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -57,6 +57,7 @@ from cephadmlib.constants import (
LOG_DIR_MODE,
SYSCTL_DIR,
UNIT_DIR,
+ DAEMON_FAILED_ERROR,
)
from cephadmlib.context import CephadmContext
from cephadmlib.context_getters import (
@@ -72,6 +73,7 @@ from cephadmlib.exceptions import (
ClusterAlreadyExists,
Error,
UnauthorizedRegistryError,
+ DaemonStartException,
)
from cephadmlib.exe_utils import find_executable, find_program
from cephadmlib.call_wrappers import (
@@ -1246,7 +1248,11 @@ def deploy_daemon_units(
call_throws(ctx, ['systemctl', 'enable', unit_name])
if start:
clean_cgroup(ctx, ident.fsid, unit_name)
- call_throws(ctx, ['systemctl', 'start', unit_name])
+ try:
+ call_throws(ctx, ['systemctl', 'start', unit_name])
+ except Exception as e:
+ logger.error(f'systemctl start failed for {unit_name}: {str(e)}')
+ raise DaemonStartException()
def _osd_unit_run_commands(
@@ -1988,11 +1994,15 @@ def get_image_info_from_inspect(out, image):
def get_public_net_from_cfg(ctx: CephadmContext) -> Optional[str]:
"""Get mon public network from configuration file."""
cp = read_config(ctx.config)
- if not cp.has_option('global', 'public_network'):
+ public_network = ''
+ if cp.has_option('mon', 'public_network'):
+ public_network = cp.get('mon', 'public_network').strip('"').strip("'")
+ elif cp.has_option('global', 'public_network'):
+ public_network = cp.get('global', 'public_network').strip('"').strip("'")
+ else:
return None
# Ensure all public CIDR networks are valid
- public_network = cp.get('global', 'public_network').strip('"').strip("'")
rc, _, err_msg = check_subnet(public_network)
if rc:
raise Error(f'Invalid public_network {public_network} parameter: {err_msg}')
@@ -2597,7 +2607,7 @@ def finish_bootstrap_config(
if mon_network:
cp = read_config(ctx.config)
- cfg_section = 'global' if cp.has_option('global', 'public_network') else 'mon'
+ cfg_section = 'mon' if cp.has_option('mon', 'public_network') else 'global'
logger.info(f'Setting public_network to {mon_network} in {cfg_section} config section')
cli(['config', 'set', cfg_section, 'public_network', mon_network])
@@ -3046,7 +3056,10 @@ def get_deployment_type(
@deprecated_command
def command_deploy(ctx):
# type: (CephadmContext) -> None
- _common_deploy(ctx)
+ try:
+ _common_deploy(ctx)
+ except DaemonStartException:
+ sys.exit(DAEMON_FAILED_ERROR)
def apply_deploy_config_to_ctx(
@@ -3089,7 +3102,10 @@ def command_deploy_from(ctx: CephadmContext) -> None:
config_data = read_configuration_source(ctx)
logger.debug('Loaded deploy configuration: %r', config_data)
apply_deploy_config_to_ctx(config_data, ctx)
- _common_deploy(ctx)
+ try:
+ _common_deploy(ctx)
+ except DaemonStartException:
+ sys.exit(DAEMON_FAILED_ERROR)
def _common_deploy(ctx: CephadmContext) -> None:
@@ -4485,8 +4501,9 @@ def _rm_cluster(ctx: CephadmContext, keep_logs: bool, zap_osds: bool) -> None:
##################################
-def check_time_sync(ctx, enabler=None):
- # type: (CephadmContext, Optional[Packager]) -> bool
+def check_time_sync(
+ ctx: CephadmContext, enabler: Optional[Packager] = None
+) -> bool:
units = [
'chrony.service', # 18.04 (at least)
'chronyd.service', # el / opensuse
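The two public_network hunks above change the lookup order so that a value in the [mon] section now takes precedence over [global], both when reading the config and when choosing where to write it at bootstrap. A minimal standalone sketch of the new read path (not part of the patch; the ini snippet and addresses are made up for illustration):

import configparser

cp = configparser.ConfigParser()
cp.read_string('''
[global]
public_network = 10.0.0.0/24
[mon]
public_network = 192.168.1.0/24
''')

# same precedence as the patched get_public_net_from_cfg(): [mon] wins over [global]
if cp.has_option('mon', 'public_network'):
    public_network = cp.get('mon', 'public_network').strip('"').strip("'")
elif cp.has_option('global', 'public_network'):
    public_network = cp.get('global', 'public_network').strip('"').strip("'")
else:
    public_network = None

print(public_network)  # prints 192.168.1.0/24, the [mon] value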
diff --git a/src/cephadm/cephadmlib/call_wrappers.py b/src/cephadm/cephadmlib/call_wrappers.py
index 3fe2171e99d..d3d327c218c 100644
--- a/src/cephadm/cephadmlib/call_wrappers.py
+++ b/src/cephadm/cephadmlib/call_wrappers.py
@@ -311,14 +311,14 @@ def call_throws(
return out, err, ret
-def call_timeout(ctx, command, timeout):
- # type: (CephadmContext, List[str], int) -> int
+def call_timeout(
+ ctx: CephadmContext, command: List[str], timeout: int
+) -> int:
logger.debug(
'Running command (timeout=%s): %s' % (timeout, ' '.join(command))
)
- def raise_timeout(command, timeout):
- # type: (List[str], int) -> NoReturn
+ def raise_timeout(command: List[str], timeout: int) -> NoReturn:
msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
logger.debug(msg)
raise TimeoutExpired(msg)
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index 354c3782398..1df46353fb3 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -4,24 +4,6 @@
DEFAULT_IMAGE = 'quay.ceph.io/ceph-ci/ceph:main'
DEFAULT_IMAGE_IS_MAIN = True
DEFAULT_IMAGE_RELEASE = 'squid'
-DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
-DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
-DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
-DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
-DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
-DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
-DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
-DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
-DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
-DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
-DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
-DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29'
-DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
-DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
-DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
-DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this
# ------------------------------------------------------------------------------
@@ -54,3 +36,4 @@ QUIET_LOG_LEVEL = 9 # DEBUG is 10, so using 9 to be lower level than DEBUG
NO_DEPRECATED = False
UID_NOBODY = 65534
GID_NOGROUP = 65534
+DAEMON_FAILED_ERROR = 17
diff --git a/src/cephadm/cephadmlib/container_types.py b/src/cephadm/cephadmlib/container_types.py
index 791a545538a..f1e829cbdf7 100644
--- a/src/cephadm/cephadmlib/container_types.py
+++ b/src/cephadm/cephadmlib/container_types.py
@@ -8,28 +8,8 @@ import os
from typing import Dict, List, Optional, Any, Union, Tuple, Iterable, cast
from .call_wrappers import call, call_throws, CallVerbosity
-from .constants import (
- DEFAULT_TIMEOUT,
- # default container images
- DEFAULT_ALERT_MANAGER_IMAGE,
- DEFAULT_GRAFANA_IMAGE,
- DEFAULT_LOKI_IMAGE,
- DEFAULT_NODE_EXPORTER_IMAGE,
- DEFAULT_PROMETHEUS_IMAGE,
- DEFAULT_PROMTAIL_IMAGE,
- DEFAULT_HAPROXY_IMAGE,
- DEFAULT_KEEPALIVED_IMAGE,
- DEFAULT_NVMEOF_IMAGE,
- DEFAULT_SNMP_GATEWAY_IMAGE,
- DEFAULT_ELASTICSEARCH_IMAGE,
- DEFAULT_JAEGER_COLLECTOR_IMAGE,
- DEFAULT_JAEGER_AGENT_IMAGE,
- DEFAULT_JAEGER_QUERY_IMAGE,
- DEFAULT_SMB_IMAGE,
- DEFAULT_SMBMETRICS_IMAGE,
- DEFAULT_NGINX_IMAGE,
- DEFAULT_OAUTH2_PROXY_IMAGE,
-)
+from .constants import DEFAULT_TIMEOUT
+import ceph.cephadm.images as default_images
from .container_engines import Docker, Podman
from .context import CephadmContext
from .daemon_identity import DaemonIdentity, DaemonSubIdentity
@@ -687,24 +667,12 @@ def get_mgr_images() -> dict:
"""Return dict of default mgr images"""
mgr_prefix = 'mgr/cephadm/container_image_'
mgr_images = {}
- mgr_images[mgr_prefix + 'prometheus'] = DEFAULT_PROMETHEUS_IMAGE
- mgr_images[mgr_prefix + 'alertmanager'] = DEFAULT_ALERT_MANAGER_IMAGE
- mgr_images[mgr_prefix + 'graphana'] = DEFAULT_GRAFANA_IMAGE
- mgr_images[mgr_prefix + 'loki'] = DEFAULT_LOKI_IMAGE
- mgr_images[mgr_prefix + 'promtail'] = DEFAULT_PROMTAIL_IMAGE
- mgr_images[mgr_prefix + 'node_exporter'] = DEFAULT_NODE_EXPORTER_IMAGE
- mgr_images[mgr_prefix + 'haproxy'] = DEFAULT_HAPROXY_IMAGE
- mgr_images[mgr_prefix + 'keepalived'] = DEFAULT_KEEPALIVED_IMAGE
- mgr_images[mgr_prefix + 'nvmeof'] = DEFAULT_NVMEOF_IMAGE
- mgr_images[mgr_prefix + 'snmp_gateway'] = DEFAULT_SNMP_GATEWAY_IMAGE
- mgr_images[mgr_prefix + 'elasticsearch'] = DEFAULT_ELASTICSEARCH_IMAGE
- mgr_images[
- mgr_prefix + 'jaeger_collector'
- ] = DEFAULT_JAEGER_COLLECTOR_IMAGE
- mgr_images[mgr_prefix + 'jaeger_agent'] = DEFAULT_JAEGER_AGENT_IMAGE
- mgr_images[mgr_prefix + 'jaeger_query'] = DEFAULT_JAEGER_QUERY_IMAGE
- mgr_images[mgr_prefix + 'smb'] = DEFAULT_SMB_IMAGE
- mgr_images[mgr_prefix + 'smbmetrics'] = DEFAULT_SMBMETRICS_IMAGE
- mgr_images[mgr_prefix + 'nginx'] = DEFAULT_NGINX_IMAGE
- mgr_images[mgr_prefix + 'oauth2_proxy'] = DEFAULT_OAUTH2_PROXY_IMAGE
+ images = vars(default_images)
+ for key, value in images.items():
+ if key.startswith('DEFAULT_') and key.endswith('_IMAGE'):
+ # flake8 and black disagree about spaces around ":" hence the noqa comment
+ suffix = key[
+ len('DEFAULT_') : -len('_IMAGE') # noqa: E203
+ ].lower()
+ mgr_images[mgr_prefix + suffix] = value
return mgr_images
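The rewritten get_mgr_images() above derives each mgr option name from the constant's name instead of keeping a hand-maintained table. A standalone sketch of that transformation (not part of the patch; the SimpleNamespace and image tags below are placeholders, not the real contents of ceph.cephadm.images):

from types import SimpleNamespace

# stand-in for the ceph.cephadm.images module; tags are illustrative only
default_images = SimpleNamespace(
    DEFAULT_NVMEOF_IMAGE='quay.io/ceph/nvmeof:x.y',
    DEFAULT_ALERTMANAGER_IMAGE='quay.io/prometheus/alertmanager:vX.Y',
)

mgr_prefix = 'mgr/cephadm/container_image_'
mgr_images = {}
for key, value in vars(default_images).items():
    if key.startswith('DEFAULT_') and key.endswith('_IMAGE'):
        # DEFAULT_NVMEOF_IMAGE -> 'nvmeof'
        suffix = key[len('DEFAULT_'):-len('_IMAGE')].lower()
        mgr_images[mgr_prefix + suffix] = value

print(mgr_images)
# {'mgr/cephadm/container_image_nvmeof': ..., 'mgr/cephadm/container_image_alertmanager': ...}

As a side effect, the old hard-coded 'graphana' key becomes 'grafana', since the suffix is now derived from DEFAULT_GRAFANA_IMAGE rather than typed by hand.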
diff --git a/src/cephadm/cephadmlib/daemon_identity.py b/src/cephadm/cephadmlib/daemon_identity.py
index 52a18092bf0..bfe1a855186 100644
--- a/src/cephadm/cephadmlib/daemon_identity.py
+++ b/src/cephadm/cephadmlib/daemon_identity.py
@@ -157,7 +157,7 @@ class DaemonSubIdentity(DaemonIdentity):
)
def sidecar_script(self, base_data_dir: Union[str, os.PathLike]) -> str:
- sname = f'sidecar-{ self.subcomponent }.run'
+ sname = f'sidecar-{self.subcomponent}.run'
return str(pathlib.Path(self.data_dir(base_data_dir)) / sname)
@property
diff --git a/src/cephadm/cephadmlib/daemons/ingress.py b/src/cephadm/cephadmlib/daemons/ingress.py
index 6064cf538fb..c88e39ac025 100644
--- a/src/cephadm/cephadmlib/daemons/ingress.py
+++ b/src/cephadm/cephadmlib/daemons/ingress.py
@@ -2,9 +2,11 @@ import os
from typing import Dict, List, Optional, Tuple, Union
-from ..constants import (
+from ceph.cephadm.images import (
DEFAULT_HAPROXY_IMAGE,
DEFAULT_KEEPALIVED_IMAGE,
+)
+from ..constants import (
DATA_DIR_MODE,
)
from ..container_daemon_form import ContainerDaemonForm, daemon_to_container
@@ -80,8 +82,7 @@ class HAproxy(ContainerDaemonForm):
def get_daemon_args(self) -> List[str]:
return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']
- def validate(self):
- # type: () -> None
+ def validate(self) -> None:
if not is_fsid(self.fsid):
raise Error('not an fsid: %s' % self.fsid)
if not self.daemon_id:
@@ -97,12 +98,10 @@ class HAproxy(ContainerDaemonForm):
'required file missing from config-json: %s' % fname
)
- def get_daemon_name(self):
- # type: () -> str
+ def get_daemon_name(self) -> str:
return '%s.%s' % (self.daemon_type, self.daemon_id)
- def get_container_name(self, desc=None):
- # type: (Optional[str]) -> str
+ def get_container_name(self, desc: Optional[str] = None) -> str:
cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
if desc:
cname = '%s-%s' % (cname, desc)
@@ -210,8 +209,7 @@ class Keepalived(ContainerDaemonForm):
# populate files from the config-json
populate_files(data_dir, self.files, uid, gid)
- def validate(self):
- # type: () -> None
+ def validate(self) -> None:
if not is_fsid(self.fsid):
raise Error('not an fsid: %s' % self.fsid)
if not self.daemon_id:
@@ -227,20 +225,17 @@ class Keepalived(ContainerDaemonForm):
'required file missing from config-json: %s' % fname
)
- def get_daemon_name(self):
- # type: () -> str
+ def get_daemon_name(self) -> str:
return '%s.%s' % (self.daemon_type, self.daemon_id)
- def get_container_name(self, desc=None):
- # type: (Optional[str]) -> str
+ def get_container_name(self, desc: Optional[str] = None) -> str:
cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
if desc:
cname = '%s-%s' % (cname, desc)
return cname
@staticmethod
- def get_container_envs():
- # type: () -> List[str]
+ def get_container_envs() -> List[str]:
envs = [
'KEEPALIVED_AUTOCONF=false',
'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
diff --git a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
index 01c68a83ba8..85f72495909 100644
--- a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
+++ b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
@@ -11,7 +11,7 @@ from ..context_getters import fetch_configs
from ..daemon_form import register as register_daemon_form
from ..daemon_identity import DaemonIdentity
from ..deployment_utils import to_deployment_container
-from ..constants import DEFAULT_NGINX_IMAGE
+from ceph.cephadm.images import DEFAULT_NGINX_IMAGE
from ..data_utils import dict_get, is_fsid
from ..file_utils import populate_files, makedirs, recursive_chown
from ..exceptions import Error
diff --git a/src/cephadm/cephadmlib/daemons/monitoring.py b/src/cephadm/cephadmlib/daemons/monitoring.py
index fca4da406e5..710093f0f46 100644
--- a/src/cephadm/cephadmlib/daemons/monitoring.py
+++ b/src/cephadm/cephadmlib/daemons/monitoring.py
@@ -3,13 +3,15 @@ import os
from typing import Dict, List, Tuple
from ..call_wrappers import call, CallVerbosity
-from ..constants import (
- DEFAULT_ALERT_MANAGER_IMAGE,
+from ceph.cephadm.images import (
+ DEFAULT_ALERTMANAGER_IMAGE,
DEFAULT_GRAFANA_IMAGE,
DEFAULT_LOKI_IMAGE,
DEFAULT_NODE_EXPORTER_IMAGE,
DEFAULT_PROMETHEUS_IMAGE,
DEFAULT_PROMTAIL_IMAGE,
+)
+from ..constants import (
UID_NOBODY,
GID_NOGROUP,
)
@@ -91,7 +93,7 @@ class Monitoring(ContainerDaemonForm):
],
},
'alertmanager': {
- 'image': DEFAULT_ALERT_MANAGER_IMAGE,
+ 'image': DEFAULT_ALERTMANAGER_IMAGE,
'cpus': '2',
'memory': '2GB',
'args': [
diff --git a/src/cephadm/cephadmlib/daemons/nfs.py b/src/cephadm/cephadmlib/daemons/nfs.py
index f09374d5f46..70ccea65b5b 100644
--- a/src/cephadm/cephadmlib/daemons/nfs.py
+++ b/src/cephadm/cephadmlib/daemons/nfs.py
@@ -42,9 +42,13 @@ class NFSGanesha(ContainerDaemonForm):
return cls.daemon_type == daemon_type
def __init__(
- self, ctx, fsid, daemon_id, config_json, image=DEFAULT_IMAGE
- ):
- # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
+ self,
+ ctx: CephadmContext,
+ fsid: str,
+ daemon_id: Union[int, str],
+ config_json: Dict,
+ image: str = DEFAULT_IMAGE,
+ ) -> None:
self.ctx = ctx
self.fsid = fsid
self.daemon_id = daemon_id
@@ -62,8 +66,9 @@ class NFSGanesha(ContainerDaemonForm):
self.validate()
@classmethod
- def init(cls, ctx, fsid, daemon_id):
- # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
+ def init(
+ cls, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str]
+ ) -> 'NFSGanesha':
return cls(ctx, fsid, daemon_id, fetch_configs(ctx), ctx.image)
@classmethod
diff --git a/src/cephadm/cephadmlib/daemons/nvmeof.py b/src/cephadm/cephadmlib/daemons/nvmeof.py
index 9bce32201bb..d916c7e6391 100644
--- a/src/cephadm/cephadmlib/daemons/nvmeof.py
+++ b/src/cephadm/cephadmlib/daemons/nvmeof.py
@@ -8,7 +8,7 @@ from ..container_types import CephContainer
from ..context_getters import fetch_configs, get_config_and_keyring
from ..daemon_form import register as register_daemon_form
from ..daemon_identity import DaemonIdentity
-from ..constants import DEFAULT_NVMEOF_IMAGE
+from ceph.cephadm.images import DEFAULT_NVMEOF_IMAGE
from ..context import CephadmContext
from ..data_utils import dict_get, is_fsid
from ..deployment_utils import to_deployment_container
@@ -33,9 +33,13 @@ class CephNvmeof(ContainerDaemonForm):
return cls.daemon_type == daemon_type
def __init__(
- self, ctx, fsid, daemon_id, config_json, image=DEFAULT_NVMEOF_IMAGE
- ):
- # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
+ self,
+ ctx: CephadmContext,
+ fsid: str,
+ daemon_id: Union[int, str],
+ config_json: Dict,
+ image: str = DEFAULT_NVMEOF_IMAGE,
+ ) -> None:
self.ctx = ctx
self.fsid = fsid
self.daemon_id = daemon_id
@@ -48,8 +52,9 @@ class CephNvmeof(ContainerDaemonForm):
self.validate()
@classmethod
- def init(cls, ctx, fsid, daemon_id):
- # type: (CephadmContext, str, Union[int, str]) -> CephNvmeof
+ def init(
+ cls, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str]
+ ) -> 'CephNvmeof':
return cls(ctx, fsid, daemon_id, fetch_configs(ctx), ctx.image)
@classmethod
@@ -73,13 +78,18 @@ class CephNvmeof(ContainerDaemonForm):
os.path.join(data_dir, 'ceph-nvmeof.conf')
] = '/src/ceph-nvmeof.conf:z'
mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
- mounts['/dev/hugepages'] = '/dev/hugepages'
- mounts['/dev/vfio/vfio'] = '/dev/vfio/vfio'
mounts[log_dir] = '/var/log/ceph:z'
if mtls_dir:
mounts[mtls_dir] = '/src/mtls:z'
return mounts
+ def _get_huge_pages_mounts(self, files: Dict[str, str]) -> Dict[str, str]:
+ mounts = dict()
+ if 'spdk_mem_size' not in files:
+ mounts['/dev/hugepages'] = '/dev/hugepages'
+ mounts['/dev/vfio/vfio'] = '/dev/vfio/vfio'
+ return mounts
+
def _get_tls_cert_key_mounts(
self, data_dir: str, files: Dict[str, str]
) -> Dict[str, str]:
@@ -111,6 +121,7 @@ class CephNvmeof(ContainerDaemonForm):
)
else:
mounts.update(self._get_container_mounts(data_dir, log_dir))
+ mounts.update(self._get_huge_pages_mounts(self.files))
mounts.update(self._get_tls_cert_key_mounts(data_dir, self.files))
def customize_container_binds(
@@ -198,11 +209,13 @@ class CephNvmeof(ContainerDaemonForm):
)
return cmd.split()
- @staticmethod
- def get_sysctl_settings() -> List[str]:
- return [
- 'vm.nr_hugepages = 4096',
- ]
+ def get_sysctl_settings(self) -> List[str]:
+ if 'spdk_mem_size' not in self.files:
+ return [
+ 'vm.nr_hugepages = 4096',
+ ]
+ else:
+ return []
def container(self, ctx: CephadmContext) -> CephContainer:
ctr = daemon_to_container(ctx, self)
@@ -222,4 +235,6 @@ class CephNvmeof(ContainerDaemonForm):
args.append(ctx.container_engine.unlimited_pids_option)
args.extend(['--ulimit', 'memlock=-1:-1'])
args.extend(['--ulimit', 'nofile=10240'])
- args.extend(['--cap-add=SYS_ADMIN', '--cap-add=CAP_SYS_NICE'])
+ args.extend(['--cap-add=CAP_SYS_NICE'])
+ if 'spdk_mem_size' not in self.files:
+ args.extend(['--cap-add=SYS_ADMIN'])
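The nvmeof changes above gate all hugepage-related setup on the absence of spdk_mem_size in the config-json files: the /dev/hugepages and /dev/vfio/vfio mounts, the vm.nr_hugepages sysctl, and the SYS_ADMIN capability are only applied when the option is not set. A standalone sketch of that gate (not part of the patch; the helper name and return shape are illustrative only):

from typing import Dict, List

def hugepage_settings(files: Dict[str, str]) -> Dict[str, List[str]]:
    # mirrors the patched behaviour: skip hugepage setup when spdk_mem_size is configured
    uses_hugepages = 'spdk_mem_size' not in files
    return {
        'mounts': ['/dev/hugepages', '/dev/vfio/vfio'] if uses_hugepages else [],
        'sysctl': ['vm.nr_hugepages = 4096'] if uses_hugepages else [],
        'caps': ['--cap-add=SYS_ADMIN'] if uses_hugepages else [],
    }

print(hugepage_settings({}))                         # hugepage setup applied
print(hugepage_settings({'spdk_mem_size': '4096'}))  # hugepage setup skipped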
diff --git a/src/cephadm/cephadmlib/daemons/oauth2_proxy.py b/src/cephadm/cephadmlib/daemons/oauth2_proxy.py
index 2b61df9d2e7..14202111c14 100644
--- a/src/cephadm/cephadmlib/daemons/oauth2_proxy.py
+++ b/src/cephadm/cephadmlib/daemons/oauth2_proxy.py
@@ -11,7 +11,8 @@ from ..context_getters import fetch_configs
from ..daemon_form import register as register_daemon_form
from ..daemon_identity import DaemonIdentity
from ..deployment_utils import to_deployment_container
-from ..constants import DEFAULT_OAUTH2_PROXY_IMAGE, UID_NOBODY, GID_NOGROUP
+from ceph.cephadm.images import DEFAULT_OAUTH2_PROXY_IMAGE
+from ..constants import UID_NOBODY, GID_NOGROUP
from ..data_utils import dict_get, is_fsid
from ..file_utils import populate_files, makedirs, recursive_chown
from ..exceptions import Error
diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py
index 82f886e72ec..33d43cbe6ce 100644
--- a/src/cephadm/cephadmlib/daemons/smb.py
+++ b/src/cephadm/cephadmlib/daemons/smb.py
@@ -1,3 +1,4 @@
+import dataclasses
import enum
import json
import logging
@@ -13,7 +14,7 @@ from .. import data_utils
from .. import deployment_utils
from .. import file_utils
from ..call_wrappers import call, CallVerbosity
-from ..constants import DEFAULT_SMB_IMAGE
+from ceph.cephadm.images import DEFAULT_SAMBA_IMAGE
from ..container_daemon_form import ContainerDaemonForm, daemon_to_container
from ..container_engines import Podman
from ..container_types import (
@@ -67,83 +68,32 @@ class ClusterPublicIP(NamedTuple):
return cls(address, destinations)
+@dataclasses.dataclass(frozen=True)
class Config:
identity: DaemonIdentity
instance_id: str
source_config: str
- samba_debug_level: int
- ctdb_log_level: str
- debug_delay: int
domain_member: bool
clustered: bool
- join_sources: List[str]
- user_sources: List[str]
- custom_dns: List[str]
- smb_port: int
- ceph_config_entity: str
- vhostname: str
- metrics_image: str
- metrics_port: int
+ samba_debug_level: int = 0
+ ctdb_log_level: str = ''
+ debug_delay: int = 0
+ join_sources: List[str] = dataclasses.field(default_factory=list)
+ user_sources: List[str] = dataclasses.field(default_factory=list)
+ custom_dns: List[str] = dataclasses.field(default_factory=list)
+ smb_port: int = 0
+ ceph_config_entity: str = 'client.admin'
+ vhostname: str = ''
+ metrics_image: str = ''
+ metrics_port: int = 0
# clustering related values
- rank: int
- rank_generation: int
- cluster_meta_uri: str
- cluster_lock_uri: str
-
- def __init__(
- self,
- *,
- identity: DaemonIdentity,
- instance_id: str,
- source_config: str,
- domain_member: bool,
- clustered: bool,
- samba_debug_level: int = 0,
- ctdb_log_level: str = '',
- debug_delay: int = 0,
- join_sources: Optional[List[str]] = None,
- user_sources: Optional[List[str]] = None,
- custom_dns: Optional[List[str]] = None,
- smb_port: int = 0,
- ceph_config_entity: str = 'client.admin',
- vhostname: str = '',
- metrics_image: str = '',
- metrics_port: int = 0,
- rank: int = -1,
- rank_generation: int = -1,
- cluster_meta_uri: str = '',
- cluster_lock_uri: str = '',
- cluster_public_addrs: Optional[List[ClusterPublicIP]] = None,
- ) -> None:
- self.identity = identity
- self.instance_id = instance_id
- self.source_config = source_config
- self.domain_member = domain_member
- self.clustered = clustered
- self.samba_debug_level = samba_debug_level
- self.ctdb_log_level = ctdb_log_level
- self.debug_delay = debug_delay
- self.join_sources = join_sources or []
- self.user_sources = user_sources or []
- self.custom_dns = custom_dns or []
- self.smb_port = smb_port
- self.ceph_config_entity = ceph_config_entity
- self.vhostname = vhostname
- self.metrics_image = metrics_image
- self.metrics_port = metrics_port
- self.rank = rank
- self.rank_generation = rank_generation
- self.cluster_meta_uri = cluster_meta_uri
- self.cluster_lock_uri = cluster_lock_uri
- self.cluster_public_addrs = cluster_public_addrs
-
- def __str__(self) -> str:
- return (
- f'SMB Config[id={self.instance_id},'
- f' source_config={self.source_config},'
- f' domain_member={self.domain_member},'
- f' clustered={self.clustered}]'
- )
+ rank: int = -1
+ rank_generation: int = -1
+ cluster_meta_uri: str = ''
+ cluster_lock_uri: str = ''
+ cluster_public_addrs: List[ClusterPublicIP] = dataclasses.field(
+ default_factory=list
+ )
def config_uris(self) -> List[str]:
uris = [self.source_config]
@@ -418,7 +368,7 @@ class SMB(ContainerDaemonForm):
daemon_type = 'smb'
daemon_base = '/usr/sbin/smbd'
- default_image = DEFAULT_SMB_IMAGE
+ default_image = DEFAULT_SAMBA_IMAGE
@classmethod
def for_daemon_type(cls, daemon_type: str) -> bool:
@@ -432,7 +382,7 @@ class SMB(ContainerDaemonForm):
self._raw_configs: Dict[str, Any] = context_getters.fetch_configs(ctx)
self._config_keyring = context_getters.get_config_and_keyring(ctx)
self._cached_layout: Optional[ContainerLayout] = None
- self._rank_info = context_getters.fetch_rank_info(ctx)
+ self._rank_info = context_getters.fetch_rank_info(ctx) or (-1, -1)
self.smb_port = 445
self.metrics_port = 9922
self._network_mapper = _NetworkMapper(ctx)
@@ -502,6 +452,7 @@ class SMB(ContainerDaemonForm):
# cache the cephadm networks->devices mapping for later
self._network_mapper.load()
+ rank, rank_gen = self._rank_info
self._instance_cfg = Config(
identity=self._identity,
instance_id=instance_id,
@@ -516,15 +467,12 @@ class SMB(ContainerDaemonForm):
vhostname=vhostname,
metrics_image=metrics_image,
metrics_port=metrics_port,
+ rank=rank,
+ rank_generation=rank_gen,
cluster_meta_uri=cluster_meta_uri,
cluster_lock_uri=cluster_lock_uri,
cluster_public_addrs=_public_addrs,
)
- if self._rank_info:
- (
- self._instance_cfg.rank,
- self._instance_cfg.rank_generation,
- ) = self._rank_info
self._files = files
logger.debug('SMB Instance Config: %s', self._instance_cfg)
logger.debug('Configured files: %s', self._files)
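Converting Config to a frozen dataclass is also why the rank information must now be passed to the constructor rather than patched onto the object afterwards, as the removed "if self._rank_info:" block used to do. A small standalone sketch of that behaviour (not part of the patch; MiniConfig is a made-up stand-in):

import dataclasses

@dataclasses.dataclass(frozen=True)
class MiniConfig:
    instance_id: str
    rank: int = -1
    rank_generation: int = -1

cfg = MiniConfig(instance_id='smb1', rank=0, rank_generation=3)
print(cfg)  # a readable repr comes for free with dataclasses
try:
    cfg.rank = 1  # post-construction mutation is rejected on a frozen dataclass
except dataclasses.FrozenInstanceError as err:
    print('cannot mutate:', err)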
diff --git a/src/cephadm/cephadmlib/daemons/snmp.py b/src/cephadm/cephadmlib/daemons/snmp.py
index f334e5f7652..ab84a302f2c 100644
--- a/src/cephadm/cephadmlib/daemons/snmp.py
+++ b/src/cephadm/cephadmlib/daemons/snmp.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
-from ..constants import DEFAULT_SNMP_GATEWAY_IMAGE
+from ceph.cephadm.images import DEFAULT_SNMP_GATEWAY_IMAGE
from ..container_daemon_form import ContainerDaemonForm, daemon_to_container
from ..container_types import CephContainer
from ..context import CephadmContext
diff --git a/src/cephadm/cephadmlib/daemons/tracing.py b/src/cephadm/cephadmlib/daemons/tracing.py
index 365458a9c57..4cf74339455 100644
--- a/src/cephadm/cephadmlib/daemons/tracing.py
+++ b/src/cephadm/cephadmlib/daemons/tracing.py
@@ -2,7 +2,7 @@ import logging
from typing import Any, Dict, List, Tuple
-from ..constants import (
+from ceph.cephadm.images import (
DEFAULT_ELASTICSEARCH_IMAGE,
DEFAULT_JAEGER_AGENT_IMAGE,
DEFAULT_JAEGER_COLLECTOR_IMAGE,
diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py
index 0ab8b38d2b5..9caef3f72e5 100644
--- a/src/cephadm/cephadmlib/data_utils.py
+++ b/src/cephadm/cephadmlib/data_utils.py
@@ -189,8 +189,9 @@ def normalize_image_digest(digest: str) -> str:
return digest
-def get_legacy_config_fsid(cluster, legacy_dir=None):
- # type: (str, Optional[str]) -> Optional[str]
+def get_legacy_config_fsid(
+ cluster: str, legacy_dir: Optional[str] = None
+) -> Optional[str]:
config_file = '/etc/ceph/%s.conf' % cluster
if legacy_dir is not None:
config_file = os.path.abspath(legacy_dir + config_file)
diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py
index 0d215fdd332..762ce782127 100644
--- a/src/cephadm/cephadmlib/exceptions.py
+++ b/src/cephadm/cephadmlib/exceptions.py
@@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error):
class PortOccupiedError(Error):
pass
+
+
+class DaemonStartException(Exception):
+ """
+ Special exception type we raise when the
+ systemctl start command fails during daemon
+ deployment. Necessary because the cephadm mgr module
+ needs to handle this case differently than a failure
+ earlier in the deploy process where no attempt was made
+ to actually start the daemon
+ """
+
+ pass
diff --git a/src/cephadm/cephadmlib/file_utils.py b/src/cephadm/cephadmlib/file_utils.py
index 399729f2dcc..27e70e31756 100644
--- a/src/cephadm/cephadmlib/file_utils.py
+++ b/src/cephadm/cephadmlib/file_utils.py
@@ -52,8 +52,9 @@ def write_new(
os.rename(tempname, destination)
-def populate_files(config_dir, config_files, uid, gid):
- # type: (str, Dict, int, int) -> None
+def populate_files(
+ config_dir: str, config_files: Dict, uid: int, gid: int
+) -> None:
"""create config files for different services"""
for fname in config_files:
config_file = os.path.join(config_dir, fname)
@@ -71,8 +72,7 @@ def touch(
os.chown(file_path, uid, gid)
-def write_tmp(s, uid, gid):
- # type: (str, int, int) -> IO[str]
+def write_tmp(s: str, uid: int, gid: int) -> IO[str]:
tmp_f = tempfile.NamedTemporaryFile(mode='w', prefix='ceph-tmp')
os.fchown(tmp_f.fileno(), uid, gid)
tmp_f.write(s)
@@ -97,8 +97,7 @@ def recursive_chown(path: str, uid: int, gid: int) -> None:
os.chown(os.path.join(dirpath, filename), uid, gid)
-def read_file(path_list, file_name=''):
- # type: (List[str], str) -> str
+def read_file(path_list: List[str], file_name: str = '') -> str:
"""Returns the content of the first file found within the `path_list`
:param path_list: list of file paths to search
@@ -123,14 +122,12 @@ def read_file(path_list, file_name=''):
return 'Unknown'
-def pathify(p):
- # type: (str) -> str
+def pathify(p: str) -> str:
p = os.path.expanduser(p)
return os.path.abspath(p)
-def get_file_timestamp(fn):
- # type: (str) -> Optional[str]
+def get_file_timestamp(fn: str) -> Optional[str]:
try:
mt = os.path.getmtime(fn)
return datetime.datetime.fromtimestamp(
diff --git a/src/cephadm/cephadmlib/systemd.py b/src/cephadm/cephadmlib/systemd.py
index a07757eccad..1956957d457 100644
--- a/src/cephadm/cephadmlib/systemd.py
+++ b/src/cephadm/cephadmlib/systemd.py
@@ -11,8 +11,7 @@ from .packagers import Packager
logger = logging.getLogger()
-def check_unit(ctx, unit_name):
- # type: (CephadmContext, str) -> Tuple[bool, str, bool]
+def check_unit(ctx: CephadmContext, unit_name: str) -> Tuple[bool, str, bool]:
# NOTE: we ignore the exit code here because systemctl outputs
# various exit codes based on the state of the service, but the
# string result is more explicit (and sufficient).
@@ -56,8 +55,9 @@ def check_unit(ctx, unit_name):
return (enabled, state, installed)
-def check_units(ctx, units, enabler=None):
- # type: (CephadmContext, List[str], Optional[Packager]) -> bool
+def check_units(
+ ctx: CephadmContext, units: List[str], enabler: Optional[Packager] = None
+) -> bool:
for u in units:
(enabled, state, installed) = check_unit(ctx, u)
if enabled and state == 'running':
diff --git a/src/cephadm/tests/test_agent.py b/src/cephadm/tests/test_agent.py
index 52cce74e1fb..8e453e3ac3c 100644
--- a/src/cephadm/tests/test_agent.py
+++ b/src/cephadm/tests/test_agent.py
@@ -668,7 +668,7 @@ def test_mgr_listener_run(_load_cert_chain, _load_verify_locations, _handle_json
agent.mgr_listener.run()
# verify payload was correctly extracted
- assert _handle_json_payload.called_with(json.loads(payload))
+ _handle_json_payload.assert_called_with(json.loads(payload))
FakeConn.send.assert_called_once_with(b'ACK')
# second run, with bad json data received
diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py
index f27b9bcd362..bbaaf2d39f8 100644
--- a/src/cephadm/tests/test_cephadm.py
+++ b/src/cephadm/tests/test_cephadm.py
@@ -1,5 +1,6 @@
# type: ignore
+import contextlib
import copy
import errno
import json
@@ -38,6 +39,13 @@ def get_ceph_conf(
mon_host = {mon_host}
'''
+@contextlib.contextmanager
+def bootstrap_test_ctx(*args, **kwargs):
+ with with_cephadm_ctx(*args, **kwargs) as ctx:
+ ctx.no_cleanup_on_failure = True
+ yield ctx
+
+
class TestCephAdm(object):
@mock.patch('cephadm.logger')
@@ -1432,13 +1440,13 @@ class TestBootstrap(object):
'--config', conf_file,
)
- with with_cephadm_ctx(cmd) as ctx:
+ with bootstrap_test_ctx(cmd) as ctx:
msg = r'No such file or directory'
with pytest.raises(_cephadm.Error, match=msg):
_cephadm.command_bootstrap(ctx)
cephadm_fs.create_file(conf_file)
- with with_cephadm_ctx(cmd) as ctx:
+ with bootstrap_test_ctx(cmd) as ctx:
retval = _cephadm.command_bootstrap(ctx)
assert retval == 0
@@ -1446,7 +1454,7 @@ class TestBootstrap(object):
funkypatch.patch('cephadmlib.systemd.call')
cmd = self._get_cmd()
- with with_cephadm_ctx(cmd) as ctx:
+ with bootstrap_test_ctx(cmd) as ctx:
msg = r'must specify --mon-ip or --mon-addrv'
with pytest.raises(_cephadm.Error, match=msg):
_cephadm.command_bootstrap(ctx)
@@ -1455,13 +1463,13 @@ class TestBootstrap(object):
funkypatch.patch('cephadmlib.systemd.call')
cmd = self._get_cmd('--mon-ip', '192.168.1.1')
- with with_cephadm_ctx(cmd, list_networks={}) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks={}) as ctx:
msg = r'--skip-mon-network'
with pytest.raises(_cephadm.Error, match=msg):
_cephadm.command_bootstrap(ctx)
cmd += ['--skip-mon-network']
- with with_cephadm_ctx(cmd, list_networks={}) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks={}) as ctx:
retval = _cephadm.command_bootstrap(ctx)
assert retval == 0
@@ -1540,12 +1548,12 @@ class TestBootstrap(object):
cmd = self._get_cmd('--mon-ip', mon_ip)
if not result:
- with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx:
msg = r'--skip-mon-network'
with pytest.raises(_cephadm.Error, match=msg):
_cephadm.command_bootstrap(ctx)
else:
- with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx:
retval = _cephadm.command_bootstrap(ctx)
assert retval == 0
@@ -1604,11 +1612,11 @@ class TestBootstrap(object):
cmd = self._get_cmd('--mon-addrv', mon_addrv)
if err:
- with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx:
with pytest.raises(_cephadm.Error, match=err):
_cephadm.command_bootstrap(ctx)
else:
- with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx:
+ with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx:
retval = _cephadm.command_bootstrap(ctx)
assert retval == 0
@@ -1621,13 +1629,13 @@ class TestBootstrap(object):
'--skip-mon-network',
)
- with with_cephadm_ctx(cmd, hostname=hostname) as ctx:
+ with bootstrap_test_ctx(cmd, hostname=hostname) as ctx:
msg = r'--allow-fqdn-hostname'
with pytest.raises(_cephadm.Error, match=msg):
_cephadm.command_bootstrap(ctx)
cmd += ['--allow-fqdn-hostname']
- with with_cephadm_ctx(cmd, hostname=hostname) as ctx:
+ with bootstrap_test_ctx(cmd, hostname=hostname) as ctx:
retval = _cephadm.command_bootstrap(ctx)
assert retval == 0
@@ -1646,7 +1654,7 @@ class TestBootstrap(object):
'--fsid', fsid,
)
- with with_cephadm_ctx(cmd) as ctx:
+ with bootstrap_test_ctx(cmd) as ctx:
if err:
with pytest.raises(_cephadm.Error, match=err):
_cephadm.command_bootstrap(ctx)
@@ -1661,7 +1669,7 @@ class TestShell(object):
fsid = '00000000-0000-0000-0000-0000deadbeef'
cmd = ['shell', '--fsid', fsid]
- with with_cephadm_ctx(cmd) as ctx:
+ with bootstrap_test_ctx(cmd) as ctx:
retval = _cephadm.command_shell(ctx)
assert retval == 0
assert ctx.fsid == fsid
diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py
index 58f212beff6..c5094db335f 100644
--- a/src/cephadm/tests/test_deploy.py
+++ b/src/cephadm/tests/test_deploy.py
@@ -495,6 +495,7 @@ def test_deploy_ceph_exporter_container(cephadm_fs, funkypatch):
def test_deploy_and_rm_iscsi(cephadm_fs, funkypatch):
# Test that the deploy and remove paths for iscsi (which has sidecar container)
# create and remove the correct unit files.
+ funkypatch.patch('shutil.rmtree') # fakefs + shutil.rmtree breaks on py3.12
mocks = _common_patches(funkypatch)
_firewalld = mocks['Firewalld']
fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7'
diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini
index 20608c1681c..d643b1ba74f 100644
--- a/src/cephadm/tox.ini
+++ b/src/cephadm/tox.ini
@@ -12,18 +12,22 @@ skipsdist = true
max-line-length = 100
inline-quotes = '
ignore =
- E501, \
+ E501,
W503,
exclude =
- .tox, \
- .vagrant, \
- __pycache__, \
- *.pyc, \
- templates, \
+ .tox,
+ .vagrant,
+ __pycache__,
+ *.pyc,
+ templates,
.eggs
statistics = True
[testenv]
+setenv =
+ PYTHONPATH = $PYTHONPATH:..:{toxinidir}/../python-common
+passenv =
+ PYTHONPATH
skip_install=true
deps =
-rzipapp-reqs.txt
@@ -35,6 +39,10 @@ deps =
commands=pytest {posargs}
[testenv:mypy]
+setenv =
+ MYPYPATH = {toxinidir}/..:{toxinidir}/../python-common
+passenv =
+ MYPYPATH
deps =
mypy
types-PyYAML
@@ -45,12 +53,12 @@ commands = mypy --config-file ../mypy.ini {posargs:cephadm.py cephadmlib}
[testenv:flake8]
allowlist_externals = bash
deps =
- flake8 == 5.0.4
+ flake8
flake8-quotes
commands =
flake8 --config=tox.ini {posargs:cephadm.py cephadmlib}
bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1'
- bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25'
+ bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 7'
# Downstream distributions may choose to alter this "docker.io" number,
# to make sure no new references to docker.io are creeping in unnoticed.
diff --git a/src/client/Client.cc b/src/client/Client.cc
index f8373095b38..c404057b929 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -3646,6 +3646,9 @@ void Client::put_cap_ref(Inode *in, int cap)
if (last & CEPH_CAP_FILE_CACHE) {
ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
++put_nref;
+
+ ldout(cct, 10) << __func__ << " calling signal_caps_inode" << dendl;
+ signal_caps_inode(in);
}
if (drop)
check_caps(in, 0);
@@ -3840,6 +3843,7 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
want,
flush,
cap->mseq,
+ cap->issue_seq,
cap_epoch_barrier);
/*
* Since the setattr will check the cephx mds auth access before
@@ -3853,7 +3857,6 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
m->caller_uid = -1;
m->caller_gid = -1;
- m->head.issue_seq = cap->issue_seq;
m->set_tid(flush_tid);
m->head.uid = in->uid;
@@ -5518,10 +5521,10 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<
if (it != in->caps.end()) {
Cap &tcap = it->second;
if (tcap.cap_id == m->peer.cap_id &&
- ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
+ ceph_seq_cmp(tcap.seq, m->peer.issue_seq) < 0) {
tcap.cap_id = m->peer.cap_id;
- tcap.seq = m->peer.seq - 1;
- tcap.issue_seq = tcap.seq;
+ tcap.seq = m->peer.issue_seq - 1;
+ tcap.issue_seq = tcap.seq;
tcap.issued |= cap.issued;
tcap.implemented |= cap.issued;
if (&cap == in->auth_cap)
@@ -5531,7 +5534,7 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<
}
} else {
add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
- m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
+ m->peer.issue_seq - 1, m->peer.mseq, (uint64_t)-1,
&cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
cap.latest_perms);
}
@@ -16231,7 +16234,7 @@ int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
if (offset < 0 || length <= 0)
return -CEPHFS_EINVAL;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode == 0 || (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
return -CEPHFS_EOPNOTSUPP;
if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
diff --git a/src/client/MetaSession.cc b/src/client/MetaSession.cc
index b5160a84331..3baa833851f 100644
--- a/src/client/MetaSession.cc
+++ b/src/client/MetaSession.cc
@@ -56,7 +56,7 @@ void MetaSession::enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t
ceph_mds_cap_item i;
i.ino = ino;
i.cap_id = cap_id;
- i.seq = iseq;
+ i.issue_seq = iseq;
i.migrate_seq = mseq;
release->caps.push_back(i);
}
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index 3b408dd3f2d..6b315d2dee3 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -290,6 +290,7 @@ SyntheticClient::SyntheticClient(StandaloneClient *client, int w)
void *synthetic_client_thread_entry(void *ptr)
{
+ ceph_pthread_setname("client");
SyntheticClient *sc = static_cast<SyntheticClient*>(ptr);
//int r =
sc->run();
@@ -945,7 +946,6 @@ int SyntheticClient::start_thread()
pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this);
ceph_assert(thread_id);
- ceph_pthread_setname(thread_id, "client");
return 0;
}
diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h
index 9455ecc5a33..30570c72a30 100644
--- a/src/common/DecayCounter.h
+++ b/src/common/DecayCounter.h
@@ -16,7 +16,6 @@
#define CEPH_DECAYCOUNTER_H
#include "include/buffer.h"
-#include "common/Formatter.h"
#include "common/StackStringStream.h"
#include "common/ceph_time.h"
@@ -24,6 +23,8 @@
#include <list>
#include <sstream>
+namespace ceph { class Formatter; }
+
/**
*
* TODO: normalize value based on some function of half_life,
diff --git a/src/common/Graylog.cc b/src/common/Graylog.cc
index cbd63fab25f..099acacd803 100644
--- a/src/common/Graylog.cc
+++ b/src/common/Graylog.cc
@@ -2,6 +2,9 @@
// vim: ts=8 sw=2 smarttab
#include "Graylog.h"
+
+#include <iostream> // for std::cerr
+
#include "common/Formatter.h"
#include "common/LogEntry.h"
#include "log/Entry.h"
diff --git a/src/common/Journald.cc b/src/common/Journald.cc
index 164b65834a6..12e1a97e998 100644
--- a/src/common/Journald.cc
+++ b/src/common/Journald.cc
@@ -14,6 +14,9 @@
#include <sys/un.h>
#include <syslog.h>
#include <unistd.h>
+
+#include <iostream> // for std::cerr
+
#include <fmt/format.h>
#include <fmt/ostream.h>
@@ -23,7 +26,6 @@
#include "log/SubsystemMap.h"
#include "msg/msg_fmt.h"
-
namespace ceph::logging {
namespace {
diff --git a/src/common/StackStringStream.h b/src/common/StackStringStream.h
index 8cb48ff6fcd..6a144fb938a 100644
--- a/src/common/StackStringStream.h
+++ b/src/common/StackStringStream.h
@@ -18,10 +18,9 @@
#include <boost/container/small_vector.hpp>
#include <algorithm>
-#include <iostream>
#include <memory>
#include <ostream>
-#include <sstream>
+#include <string>
#include <string_view>
#include <vector>
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 3903e8c0ed7..c714aa0aa87 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -83,7 +83,7 @@ void *Thread::entry_wrapper()
if (pid && cpuid >= 0)
_set_affinity(cpuid);
- ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str());
+ ceph_pthread_setname(thread_name.c_str());
return entry();
}
@@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize)
void Thread::create(const char *name, size_t stacksize)
{
ceph_assert(strlen(name) < 16);
- Thread::thread_name = name;
+ thread_name = name;
int ret = try_create(stacksize);
if (ret != 0) {
@@ -203,24 +203,6 @@ int Thread::set_affinity(int id)
// Functions for std::thread
// =========================
-void set_thread_name(std::thread& t, const std::string& s) {
- int r = ceph_pthread_setname(t.native_handle(), s.c_str());
- if (r != 0) {
- throw std::system_error(r, std::generic_category());
- }
-}
-std::string get_thread_name(const std::thread& t) {
- std::string s(256, '\0');
-
- int r = ceph_pthread_getname(const_cast<std::thread&>(t).native_handle(),
- s.data(), s.length());
- if (r != 0) {
- throw std::system_error(r, std::generic_category());
- }
- s.resize(std::strlen(s.data()));
- return s;
-}
-
void kill(std::thread& t, int signal)
{
auto r = ceph_pthread_kill(t.native_handle(), signal);
diff --git a/src/common/Thread.h b/src/common/Thread.h
index d3892c1b36b..8dc0e6c3cbe 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -17,8 +17,8 @@
#define CEPH_THREAD_H
#include <functional>
+#include <string>
#include <string_view>
-#include <system_error>
#include <thread>
#include <cstring>
@@ -27,7 +27,6 @@
#include "include/ceph_assert.h"
#include "include/compat.h"
-#include "include/spinlock.h"
extern pid_t ceph_gettid();
@@ -36,7 +35,7 @@ class Thread {
pthread_t thread_id;
pid_t pid;
int cpuid;
- static inline thread_local std::string thread_name;
+ std::string thread_name;
void *entry_wrapper();
@@ -64,15 +63,10 @@ class Thread {
int join(void **prval = 0);
int detach();
int set_affinity(int cpuid);
- static const std::string get_thread_name() {
- return Thread::thread_name;
- }
};
// Functions for with std::thread
-void set_thread_name(std::thread& t, const std::string& s);
-std::string get_thread_name(const std::thread& t);
void kill(std::thread& t, int signal);
template<typename Fun, typename... Args>
@@ -81,7 +75,7 @@ std::thread make_named_thread(std::string_view n,
Args&& ...args) {
return std::thread([n = std::string(n)](auto&& fun, auto&& ...args) {
- ceph_pthread_setname(pthread_self(), n.data());
+ ceph_pthread_setname(n.data());
std::invoke(std::forward<Fun>(fun),
std::forward<Args>(args)...);
}, std::forward<Fun>(fun), std::forward<Args>(args)...);
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index e190b946c45..fb5d949b438 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -6,7 +6,7 @@
#include <atomic>
#include <chrono>
-#include <iostream>
+#include <iosfwd>
#include <list>
#include <map>
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 1e73ce0836a..55b87de3207 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -12,7 +12,13 @@
*
*/
#include <poll.h>
+#include <signal.h>
#include <sys/un.h>
+
+#ifndef WIN32
+#include <sys/wait.h>
+#endif
+
#include <optional>
#include <stdlib.h>
diff --git a/src/common/assert.cc b/src/common/assert.cc
index 7fb4c2d726b..68ad99c878e 100644
--- a/src/common/assert.cc
+++ b/src/common/assert.cc
@@ -44,8 +44,7 @@ namespace ceph {
g_assert_line = line;
g_assert_func = func;
g_assert_thread = (unsigned long long)pthread_self();
- ceph_pthread_getname(pthread_self(), g_assert_thread_name,
- sizeof(g_assert_thread_name));
+ ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name));
ostringstream tss;
tss << ceph_clock_now();
@@ -122,8 +121,7 @@ namespace ceph {
g_assert_line = line;
g_assert_func = func;
g_assert_thread = (unsigned long long)pthread_self();
- ceph_pthread_getname(pthread_self(), g_assert_thread_name,
- sizeof(g_assert_thread_name));
+ ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name));
BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
BackTrace *bt = new ClibBackTrace(1);
@@ -168,8 +166,7 @@ namespace ceph {
g_assert_line = line;
g_assert_func = func;
g_assert_thread = (unsigned long long)pthread_self();
- ceph_pthread_getname(pthread_self(), g_assert_thread_name,
- sizeof(g_assert_thread_name));
+ ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name));
BackTrace *bt = new ClibBackTrace(1);
snprintf(g_assert_msg, sizeof(g_assert_msg),
@@ -210,8 +207,7 @@ namespace ceph {
g_assert_line = line;
g_assert_func = func;
g_assert_thread = (unsigned long long)pthread_self();
- ceph_pthread_getname(pthread_self(), g_assert_thread_name,
- sizeof(g_assert_thread_name));
+ ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name));
BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
BackTrace *bt = new ClibBackTrace(1);
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index b4640979289..4443ef14124 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -19,6 +19,8 @@
#include <sys/uio.h>
+#include <iostream>
+
#include "include/ceph_assert.h"
#include "include/types.h"
#include "include/buffer_raw.h"
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
index 9b989fe7270..ad12e0b6764 100644
--- a/src/common/ceph_argparse.cc
+++ b/src/common/ceph_argparse.cc
@@ -16,6 +16,7 @@
#include "auth/Auth.h"
#include "common/ceph_argparse.h"
#include "common/config.h"
+#include "common/strtol.h" // for strict_strtof()
#include "common/version.h"
#include "include/str_list.h"
diff --git a/src/common/ceph_argparse.h b/src/common/ceph_argparse.h
index d63a2bdd796..5a160dd0b79 100644
--- a/src/common/ceph_argparse.h
+++ b/src/common/ceph_argparse.h
@@ -29,6 +29,8 @@
#include "common/entity_name.h"
#include "include/encoding.h"
+class entity_addrvec_t;
+
/////////////////////// Types ///////////////////////
class CephInitParameters
{
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h
index bae038862cf..01feff4c063 100644
--- a/src/common/ceph_time.h
+++ b/src/common/ceph_time.h
@@ -16,7 +16,7 @@
#define COMMON_CEPH_TIME_H
#include <chrono>
-#include <iostream>
+#include <iosfwd>
#include <string>
#include <optional>
#include <fmt/chrono.h>
diff --git a/src/common/ceph_timer.h b/src/common/ceph_timer.h
index bc324bfa243..7fb2c7bac12 100644
--- a/src/common/ceph_timer.h
+++ b/src/common/ceph_timer.h
@@ -98,6 +98,7 @@ class timer {
std::thread thread;
void timer_thread() {
+ ceph_pthread_setname("ceph_timer");
std::unique_lock l(lock);
while (!suspended) {
auto now = TC::now();
@@ -155,7 +156,6 @@ class timer {
public:
timer() : suspended(false) {
thread = std::thread(&timer::timer_thread, this);
- set_thread_name(thread, "ceph_timer");
}
// Create a suspended timer, jobs will be executed in order when
diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc
index 14d55f60c30..21633fc5d41 100644
--- a/src/common/code_environment.cc
+++ b/src/common/code_environment.cc
@@ -11,6 +11,7 @@
* Foundation. See file COPYING.
*
*/
+#include "include/compat.h"
#include "common/code_environment.h"
@@ -18,10 +19,6 @@
#include "acconfig.h"
-#ifdef HAVE_PTHREAD_GETNAME_NP
-#include <pthread.h>
-#endif
-
#include <string.h>
code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY;
@@ -57,7 +54,7 @@ int get_process_name(char *buf, int len)
}
// FIPS zeroization audit 20191115: this memset is not security related.
memset(buf, 0, len);
- return pthread_getname_np(pthread_self(), buf, len);
+ return ceph_pthread_getname(buf, len);
}
#elif defined(HAVE_GETPROGNAME)
diff --git a/src/common/compat.cc b/src/common/compat.cc
index 82b57ad94b5..84a395c5a19 100644
--- a/src/common/compat.cc
+++ b/src/common/compat.cc
@@ -565,3 +565,66 @@ ssize_t get_self_exe_path(char* path, int buff_length) {
}
#endif /* _WIN32 */
+
+
+static thread_local char cached_thread_name[256]{};
+
+int ceph_pthread_setname(char const* name)
+{
+ strncpy(cached_thread_name, name, sizeof cached_thread_name - 1);
+#if defined(_WIN32) && defined(__clang__) && \
+ !defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
+ // In this case, llvm doesn't use the pthread api for std::thread.
+ // We cannot use native_handle() with the pthread api, nor can we pass
+ // it to Windows API functions.
+ return 0;
+#elif defined(HAVE_PTHREAD_SETNAME_NP)
+ #if defined(__APPLE__)
+ return pthread_setname_np(name);
+ #else
+ return pthread_setname_np(pthread_self(), name);
+ #endif
+#elif defined(HAVE_PTHREAD_SET_NAME_NP)
+  pthread_set_name_np(pthread_self(), name);
+ return 0;
+#else
+ return 0;
+#endif
+}
+
+int ceph_pthread_getname(char* name, size_t len)
+{
+ if (cached_thread_name[0]) {
+ if (len > 0) {
+ strncpy(name, cached_thread_name, len);
+ name[len-1] = 0;
+ }
+ return 0;
+ } else {
+#if defined(_WIN32) && defined(__clang__) && \
+ !defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
+ if (len > 0) {
+ strcpy(name, "");
+ }
+ return 0;
+#elif defined(HAVE_PTHREAD_GETNAME_NP) || defined(HAVE_PTHREAD_GET_NAME_NP)
+# if defined(HAVE_PTHREAD_GETNAME_NP)
+ int rc = pthread_getname_np(pthread_self(), cached_thread_name, sizeof cached_thread_name);
+# else
+ int rc = pthread_get_name_np(pthread_self(), cached_thread_name, sizeof cached_thread_name);
+# endif
+ if (rc == 0) {
+ strncpy(name, cached_thread_name, len);
+ name[len-1] = 0;
+ return 0;
+ } else {
+ return rc;
+ }
+#else
+ if (len > 0) {
+ strcpy(name, "");
+ }
+ return 0;
+#endif
+ }
+}
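A minimal sketch of the new self-thread API declared in include/compat.h (the buffer size and name below are illustrative); both calls act on the calling thread only, and the thread-local cache makes getname work even on platforms without pthread_getname_np():

#include "include/compat.h"

void name_self_example() {
  char buf[16] = {0};
  ceph_pthread_setname("my_worker");        // caches the name thread-locally
  ceph_pthread_getname(buf, sizeof(buf));   // returns the cached name: "my_worker"
}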
diff --git a/src/common/config_cacher.h b/src/common/config_cacher.h
index a84bad08eee..91b8152dde1 100644
--- a/src/common/config_cacher.h
+++ b/src/common/config_cacher.h
@@ -50,7 +50,7 @@ public:
conf.remove_observer(this);
}
- operator ValueT() const {
+ ValueT operator*() const {
return value_cache.load();
}
};
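With the implicit conversion replaced by operator*, callers read the cached value by dereferencing; a hedged sketch (the cacher variable below is hypothetical):

// before: uint64_t v = my_cached_option;   // implicit operator ValueT()
// after:
uint64_t v = *my_cached_option;             // explicit ValueT operator*()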
diff --git a/src/common/error_code.cc b/src/common/error_code.cc
index ed0e681b22b..9c981a21077 100644
--- a/src/common/error_code.cc
+++ b/src/common/error_code.cc
@@ -13,10 +13,9 @@
* COPYING.
*/
-#include <exception>
+#include "common/error_code.h"
#include <boost/asio/error.hpp>
-#include "common/error_code.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
diff --git a/src/common/error_code.h b/src/common/error_code.h
index e39122f8ce3..93a1bf31c00 100644
--- a/src/common/error_code.h
+++ b/src/common/error_code.h
@@ -16,9 +16,8 @@
#ifndef COMMON_CEPH_ERROR_CODE
#define COMMON_CEPH_ERROR_CODE
-#include <netdb.h>
-
-#include <boost/system.hpp>
+#include <boost/system/error_code.hpp>
+#include <boost/system/system_error.hpp>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
diff --git a/src/common/intrusive_timer.h b/src/common/intrusive_timer.h
new file mode 100644
index 00000000000..b32286a2096
--- /dev/null
+++ b/src/common/intrusive_timer.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+#include <condition_variable>
+
+#include <boost/intrusive/set.hpp>
+
+#include "common/ceph_time.h"
+
+namespace ceph::common {
+
+/**
+ * intrusive_timer
+ *
+ * SafeTimer (common/Timer.h) isn't well suited to usage in high
+ * usage pathways for a few reasons:
+ * - Usage generally requires allocation of a fresh context for each
+ * scheduled operation. One could override Context::complete to avoid
+ * destroying the instance, but actually reusing the instance is tricky
+ * as SafeTimer doesn't guarrantee cancelation if safe_callbacks is false.
+ * - SafeTimer only guarrantees cancelation if safe_timer is true, which
+ * it generally won't be if the user needs to call into SafeTimer while
+ * holding locks taken by callbacks.
+ *
+ * This implementation allows the user to repeatedly schedule and cancel
+ * an object inheriting from the callback_t interface below while
+ * guaranteeing cancelation provided that the user holds the lock
+ * associated with a particular callback while calling into intrusive_timer.
+ */
+class intrusive_timer {
+ using clock_t = ceph::coarse_real_clock;
+
+public:
+ /**
+ * callback_t
+ *
+ * Objects inheriting from callback_t can be scheduled
+ * via intrusive_timer.
+ */
+ class callback_t : public boost::intrusive::set_base_hook<> {
+ friend class intrusive_timer;
+ clock_t::time_point schedule_point;
+ unsigned incarnation = 0;
+
+ public:
+ /**
+ * add_ref, dec_ref
+ *
+ * callback_t must remain live and all methods must remain
+ * safe to call as long as calls to add_ref() outnumber calls
+ * to dec_ref().
+ */
+ virtual void add_ref() = 0;
+ virtual void dec_ref() = 0;
+
+ /**
+ * lock, unlock
+ *
+ * For any specific callback_t, must lock/unlock a lock held while
+ * accessing intrusive_timer public methods for that callback_t
+ * instance.
+ */
+ virtual void lock() = 0;
+ virtual void unlock() = 0;
+
+ /// Invokes callback, will be called with lock held
+ virtual void invoke() = 0;
+
+ /**
+ * is_scheduled
+ *
+ * Return true iff callback is scheduled to be invoked.
+ * May only be validly invoked while lock associated with
+ * callback_t instance is held.
+ */
+ bool is_scheduled() const { return incarnation % 2 == 1; }
+ virtual ~callback_t() = default;
+
+ /// Order callback_t by schedule_point
+ auto operator<=>(const callback_t &rhs) const {
+ return std::make_pair(schedule_point, this) <=>
+ std::make_pair(rhs.schedule_point, &rhs);
+ }
+ };
+
+private:
+ /// protects events, stopping
+ std::mutex lock;
+
+ /// stopping, cv used to signal that t should halt
+ std::condition_variable cv;
+ bool stopping = false;
+
+ /// queued events ordered by callback_t::schedule_point
+ boost::intrusive::set<callback_t> events;
+
+ /// thread responsible for calling scheduled callbacks
+ std::thread t;
+
+ /// peek front of queue, null if empty
+ callback_t *peek() {
+ return events.empty() ? nullptr : &*(events.begin());
+ }
+
+ /// entry point for t
+ void _run() {
+ std::unique_lock l(lock);
+ while (true) {
+ if (stopping) {
+ return;
+ }
+
+ auto next = peek();
+ if (!next) {
+ cv.wait(l);
+ continue;
+ }
+
+ if (next->schedule_point > clock_t::now()) {
+ cv.wait_until(l, next->schedule_point);
+ continue;
+ }
+
+ // we release the reference below
+ events.erase(*next);
+
+ /* cancel() and schedule_after() both hold both intrusive_timer::lock
+ * and the callback_t lock (precondition of both) while mutating
+ * next->incarnation, so this read is safe. We're relying on the
+ * fact that only this method in this thread will access
+ * next->incarnation under only one of the two. */
+ auto incarnation = next->incarnation;
+ l.unlock();
+ {
+ /* Note that intrusive_timer::cancel may observe that
+ * callback_t::is_scheduled() returns true while
+ * callback_t::is_linked() is false since we drop
+ * intrusive_timer::lock between removing next from the
+ * queue and incrementing callback_t::incarnation here
+ * under the callback_t lock. In that case, cancel()
+ * increments incarnation logically canceling the callback
+ * but leaves the reference for us to drop.
+ */
+ std::unique_lock m(*next);
+ if (next->incarnation == incarnation) {
+ /* As above, cancel() and schedule_after() hold both locks so this
+ * mutation and read are safe. */
+ ++next->incarnation;
+ next->invoke();
+ }
+ /* else, next was canceled between l.unlock() and next->lock().
+ * Note that if incarnation does not match, we do nothing to next
+ * other than drop our reference -- it might well have been
+ * rescheduled already! */
+ }
+ next->dec_ref();
+ l.lock();
+ }
+ }
+
+public:
+ intrusive_timer() : t([this] { _run(); }) {}
+
+ /**
+ * schedule_after
+ *
+ * Schedule cb to run after the specified period.
+ * The lock associated with cb must be held.
+ * cb must not already be scheduled.
+ *
+ * @param cb [in] callback to schedule
+ * @param after [in] period after which to schedule cb
+ */
+ template <typename T>
+ void schedule_after(callback_t &cb, T after) {
+ ceph_assert(!cb.is_scheduled());
+ std::unique_lock l(lock);
+ ceph_assert(!cb.is_linked());
+
+ ++cb.incarnation;
+ cb.schedule_point = clock_t::now() + after;
+
+ cb.add_ref();
+ events.insert(cb);
+
+ cv.notify_one();
+ }
+
+ /**
+ * cancel
+ *
+ * Cancel already scheduled cb.
+ * The lock associated with cb must be held.
+ *
+ * @param cb [in] callback to cancel
+ */
+ void cancel(callback_t &cb) {
+ ceph_assert(cb.is_scheduled());
+ std::unique_lock l(lock);
+ ++cb.incarnation;
+
+ if (cb.is_linked()) {
+ events.erase(cb);
+ cb.dec_ref();
+ }
+ }
+
+ /// Stop intrusive_timer
+ void stop() {
+ {
+ std::unique_lock l(lock);
+ stopping = true;
+ cv.notify_one();
+ }
+ t.join();
+ }
+};
+
+}
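A minimal usage sketch for the new intrusive_timer (the callback type, its toy refcount, and its mutex are illustrative; real users are expected to tie add_ref()/dec_ref() into their own lifetime management):

#include <atomic>
#include <chrono>
#include <mutex>
#include "common/intrusive_timer.h"

struct ping_cb : ceph::common::intrusive_timer::callback_t {
  std::mutex m;
  std::atomic<int> refs{0};
  void add_ref() override { ++refs; }
  void dec_ref() override { --refs; }
  void lock() override { m.lock(); }
  void unlock() override { m.unlock(); }
  void invoke() override { /* called by the timer thread with m held */ }
};

void timer_example() {
  ceph::common::intrusive_timer timer;
  ping_cb cb;
  {
    std::lock_guard l(cb);     // the callback's lock must be held around schedule/cancel
    timer.schedule_after(cb, std::chrono::seconds(1));
  }
  {
    std::lock_guard l(cb);
    if (cb.is_scheduled()) {   // cancel only if it has not fired yet
      timer.cancel(cb);
    }
  }
  timer.stop();
}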
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 32ecc958618..f5e744e2339 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -99,6 +99,7 @@ ostream& ObjBencher::out(ostream& os)
}
void *ObjBencher::status_printer(void *_bencher) {
+ ceph_pthread_setname("OB::stat_print");
ObjBencher *bencher = static_cast<ObjBencher *>(_bencher);
bench_data& data = bencher->data;
Formatter *formatter = bencher->formatter;
@@ -453,7 +454,6 @@ int ObjBencher::write_bench(int secondsToRun,
pthread_t print_thread;
pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this);
- ceph_pthread_setname(print_thread, "write_stat");
std::unique_lock locker{lock};
data.finished = 0;
data.start_time = mono_clock::now();
@@ -691,7 +691,6 @@ int ObjBencher::seq_read_bench(
pthread_t print_thread;
pthread_create(&print_thread, NULL, status_printer, (void *)this);
- ceph_pthread_setname(print_thread, "seq_read_stat");
mono_time finish_time = data.start_time + time_to_run;
//start initial reads
@@ -903,7 +902,6 @@ int ObjBencher::rand_read_bench(
pthread_t print_thread;
pthread_create(&print_thread, NULL, status_printer, (void *)this);
- ceph_pthread_setname(print_thread, "rand_read_stat");
mono_time finish_time = data.start_time + time_to_run;
//start initial reads
diff --git a/src/common/options.cc b/src/common/options.cc
index a68e2474a3d..3f6894b01c1 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -5,6 +5,7 @@
#include "options.h"
#include "common/Formatter.h"
#include "common/options/build_options.h"
+#include "common/strtol.h" // for strict_si_cast()
// Helpers for validators
#include "include/stringify.h"
diff --git a/src/common/options.h b/src/common/options.h
index abded4cc0dd..ec6db7770c3 100644
--- a/src/common/options.h
+++ b/src/common/options.h
@@ -4,6 +4,7 @@
#pragma once
#include <chrono>
+#include <iostream> // for std::cerr
#include <string>
#include <variant>
#include <vector>
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index 18efba561ed..94824faef6b 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -586,16 +586,6 @@ options:
min: 1
services:
- mds
-- name: mds_log_major_segment_event_ratio
- type: uint
- level: advanced
- desc: multiple of mds_log_events_per_segment between major segments
- default: 12
- services:
- - mds
- min: 1
- see_also:
- - mds_log_events_per_segment
# segment size for mds log, default to default file_layout_t
- name: mds_log_segment_size
type: size
@@ -1741,3 +1731,12 @@ options:
- mds
flags:
- runtime
+- name: mds_log_minor_segments_per_major_segment
+ type: uint
+ level: advanced
+ desc: number of minor segments per major segment.
+  long_desc: The number of minor mds log segments since the last major segment after which a major segment is started/logged.
+ default: 16
+ services:
+ - mds
+ min: 8
diff --git a/src/common/options/mgr.yaml.in b/src/common/options/mgr.yaml.in
index 773b0d36591..5095710afdf 100644
--- a/src/common/options/mgr.yaml.in
+++ b/src/common/options/mgr.yaml.in
@@ -152,7 +152,7 @@ options:
first started after installation, to populate the list of enabled manager modules. Subsequent
updates are done using the 'mgr module [enable|disable]' commands. List may be
comma or space separated.
- default: restful iostat nfs
+ default: iostat nfs
services:
- mon
- common
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
index e12061cf93c..49099f42b71 100644
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -58,7 +58,10 @@ options:
in recovery and 1 shard of another recovering PG.
fmt_desc: The maximum number of backfills allowed to or from a single OSD.
Note that this is applied separately for read and write operations.
+ This setting is automatically reset when the mClock scheduler is used.
default: 1
+ see_also:
+ - osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
@@ -95,6 +98,7 @@ options:
fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
Increasing this value will slow down recovery operation while
client operations will be less impacted.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
@@ -105,6 +109,7 @@ options:
desc: Time in seconds to sleep before next recovery or backfill op for HDDs
fmt_desc: Time in seconds to sleep before next recovery or backfill op
for HDDs.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0.1
flags:
- runtime
@@ -115,6 +120,7 @@ options:
desc: Time in seconds to sleep before next recovery or backfill op for SSDs
fmt_desc: Time in seconds to sleep before the next recovery or backfill op
for SSDs.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
see_also:
- osd_recovery_sleep
@@ -128,6 +134,7 @@ options:
on HDD and journal is on SSD
fmt_desc: Time in seconds to sleep before the next recovery or backfill op
when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0.025
see_also:
- osd_recovery_sleep
@@ -141,6 +148,7 @@ options:
fmt_desc: Time in seconds to sleep before next snap trim op.
Increasing this value will slow down snap trimming.
This option overrides backend specific variants.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
@@ -149,6 +157,7 @@ options:
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim for HDDs
+ note: This setting is ignored when the mClock scheduler is used.
default: 5
flags:
- runtime
@@ -158,6 +167,7 @@ options:
desc: Time in seconds to sleep before next snap trim for SSDs
fmt_desc: Time in seconds to sleep before next snap trim op
for SSD OSDs (including NVMe).
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
@@ -168,6 +178,7 @@ options:
is on SSD
fmt_desc: Time in seconds to sleep before next snap trim op
when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
+ note: This setting is ignored when the mClock scheduler is used.
default: 2
flags:
- runtime
@@ -182,6 +193,7 @@ options:
desc: Maximum concurrent scrubs on a single OSD
fmt_desc: The maximum number of simultaneous scrub operations for
a Ceph OSD Daemon.
+ note: This setting is ignored when the mClock scheduler is used.
default: 3
with_legacy: true
- name: osd_scrub_during_recovery
@@ -212,11 +224,8 @@ options:
long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
fmt_desc: This restricts scrubbing to this hour of the day or later.
Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
- to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, they define a time
- window, in which the scrubs can happen.
- But a scrub will be performed
- no matter whether the time window allows or not, as long as the placement
- group's scrub interval exceeds ``osd_scrub_max_interval``.
+    to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, they define a time
+    window; periodic scrubs will only be initiated within that window.
default: 0
see_also:
- osd_scrub_end_hour
@@ -228,12 +237,10 @@ options:
level: advanced
desc: Restrict scrubbing to hours of the day earlier than this
long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
- fmt_desc: This restricts scrubbing to the hour earlier than this.
+  fmt_desc: This restricts scrubbing to hours of the day earlier than this.
Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
for the entire day. Along with ``osd_scrub_begin_hour``, they define a time
- window, in which the scrubs can happen. But a scrub will be performed
- no matter whether the time window allows or not, as long as the placement
- group's scrub interval exceeds ``osd_scrub_max_interval``.
+    window; periodic scrubs can only be automatically initiated within that window.
default: 0
see_also:
- osd_scrub_begin_hour
@@ -250,9 +257,7 @@ options:
0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
Along with ``osd_scrub_end_week_day``, they define a time window in which
- scrubs can happen. But a scrub will be performed
- no matter whether the time window allows or not, when the PG's
- scrub interval exceeds ``osd_scrub_max_interval``.
+ periodic scrubs can be automatically initiated.
default: 0
see_also:
- osd_scrub_end_week_day
@@ -269,9 +274,7 @@ options:
0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
Along with ``osd_scrub_begin_week_day``, they define a time
- window, in which the scrubs can happen. But a scrub will be performed
- no matter whether the time window allows or not, as long as the placement
- group's scrub interval exceeds ``osd_scrub_max_interval``.
+ window, in which periodic scrubs can be automatically initiated.
default: 0
see_also:
- osd_scrub_begin_week_day
@@ -282,8 +285,9 @@ options:
type: float
level: advanced
desc: Allow scrubbing when system load divided by number of CPUs is below this value
- fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
- (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
+ fmt_desc: The normalized maximum load. Ceph will not initiate periodic (regular)
+ scrubs when the system load (as defined by ``getloadavg() / number of online CPUs``)
+ is higher than this number.
Default is ``0.5``.
default: 0.5
with_legacy: true
@@ -292,8 +296,7 @@ options:
type: float
level: advanced
desc: The desired interval between scrubs of a specific PG.
- fmt_desc: The desired interval in seconds between scrubs of a specific PG
- when the Ceph Storage Cluster load is low.
+ fmt_desc: The desired interval in seconds between scrubs of a specific PG.
default: 1_day
see_also:
- osd_scrub_max_interval
@@ -303,8 +306,7 @@ options:
type: float
level: advanced
desc: Scrub each PG no less often than this interval
- fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
- irrespective of cluster load.
+ fmt_desc: The maximum interval in seconds for scrubbing each PG.
default: 7_day
see_also:
- osd_scrub_min_interval
@@ -315,7 +317,7 @@ options:
level: advanced
desc: Ratio of scrub interval to randomly vary
long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
- so that they are soon uniformly distributed over the week
+ so that they are uniformly distributed over time.
fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
the next scrub job for a PG. The delay is a random
value less than ``osd_scrub_min_interval`` \*
@@ -344,7 +346,7 @@ options:
default: 5
see_also:
- osd_scrub_chunk_max
- with_legacy: true
+ with_legacy: false
- name: osd_scrub_chunk_max
type: int
level: advanced
@@ -355,7 +357,7 @@ options:
default: 15
see_also:
- osd_scrub_chunk_min
- with_legacy: true
+ with_legacy: false
- name: osd_shallow_scrub_chunk_min
type: int
level: advanced
@@ -367,7 +369,7 @@ options:
see_also:
- osd_shallow_scrub_chunk_max
- osd_scrub_chunk_min
- with_legacy: true
+ with_legacy: false
- name: osd_shallow_scrub_chunk_max
type: int
level: advanced
@@ -378,7 +380,7 @@ options:
see_also:
- osd_shallow_scrub_chunk_min
- osd_scrub_chunk_max
- with_legacy: true
+ with_legacy: false
# sleep between [deep]scrub ops
- name: osd_scrub_sleep
type: float
@@ -387,7 +389,7 @@ options:
fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk).
Increasing this value will slow down the overall rate of scrubbing, reducing scrub
impact on client operations.
- This setting is ignored when the mClock scheduler is used.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
@@ -402,7 +404,7 @@ options:
This configuration value is used for scrubbing out of scrubbing hours.
Increasing this value will slow down the overall rate of scrubbing, reducing scrub
impact on client operations.
- This setting is ignored when the mClock scheduler is used.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
see_also:
- osd_scrub_begin_hour
@@ -1293,12 +1295,33 @@ options:
level: basic
desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore
the OSD bench results for an OSD (for rotational media)
- long_desc: This option specifies the threshold IOPS capacity for an OSD under
- which the OSD bench results can be considered for QoS calculations. Only
- considered for osd_op_queue = mclock_scheduler
+ long_desc: This option specifies the high threshold IOPS capacity for an OSD
+ below which the OSD bench results can be considered for QoS calculations.
+ Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
- ignore OSD bench results for an OSD (for rotational media)
+ ignore OSD bench results for an OSD (for rotational media) and fall back to
+ the last valid or default IOPS capacity defined by
+ ``osd_mclock_max_capacity_iops_hdd``.
default: 500
+ see_also:
+ - osd_mclock_max_capacity_iops_hdd
+ flags:
+ - runtime
+- name: osd_mclock_iops_capacity_low_threshold_hdd
+ type: float
+ level: basic
+ desc: The threshold IOPs capacity (at 4KiB block size) below which to ignore
+ the OSD bench results for an OSD (for rotational media)
+ long_desc: This option specifies the low threshold IOPS capacity of an OSD
+ above which the OSD bench results can be considered for QoS calculations.
+ Only considered when osd_op_queue = mclock_scheduler
+ fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
+ ignore OSD bench results for an OSD (for rotational media) and fall back to
+ the last valid or default IOPS capacity defined by
+ ``osd_mclock_max_capacity_iops_hdd``.
+ default: 50
+ see_also:
+ - osd_mclock_max_capacity_iops_hdd
flags:
- runtime
- name: osd_mclock_iops_capacity_threshold_ssd
@@ -1306,12 +1329,33 @@ options:
level: basic
desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore
the OSD bench results for an OSD (for solid state media)
- long_desc: This option specifies the threshold IOPS capacity for an OSD under
- which the OSD bench results can be considered for QoS calculations. Only
- considered for osd_op_queue = mclock_scheduler
+ long_desc: This option specifies the high threshold IOPS capacity for an OSD
+ below which the OSD bench results can be considered for QoS calculations.
+ Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
- ignore OSD bench results for an OSD (for solid state media)
+ ignore OSD bench results for an OSD (for solid state media) and fall back to
+ the last valid or default IOPS capacity defined by
+ ``osd_mclock_max_capacity_iops_ssd``.
default: 80000
+ see_also:
+ - osd_mclock_max_capacity_iops_ssd
+ flags:
+ - runtime
+- name: osd_mclock_iops_capacity_low_threshold_ssd
+ type: float
+ level: basic
+ desc: The threshold IOPs capacity (at 4KiB block size) below which to ignore
+ the OSD bench results for an OSD (for solid state media)
+ long_desc: This option specifies the low threshold IOPS capacity for an OSD
+ above which the OSD bench results can be considered for QoS calculations.
+ Only considered when osd_op_queue = mclock_scheduler
+ fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
+ ignore OSD bench results for an OSD (for solid state media) and fall back to
+ the last valid or default IOPS capacity defined by
+ ``osd_mclock_max_capacity_iops_ssd``.
+ default: 1000
+ see_also:
+ - osd_mclock_max_capacity_iops_ssd
flags:
- runtime
# Set to true for testing. Users should NOT set this.
@@ -1346,10 +1390,12 @@ options:
is ``0``, which means that the ``hdd`` or ``ssd`` values
(below) are used, depending on the type of the primary
device backing the OSD.
+ This setting is automatically reset when the mClock scheduler is used.
default: 0
see_also:
- osd_recovery_max_active_hdd
- osd_recovery_max_active_ssd
+ - osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
@@ -1360,10 +1406,12 @@ options:
devices)
fmt_desc: The number of active recovery requests per OSD at one time, if the
primary device is rotational.
+ note: This setting is automatically reset when the mClock scheduler is used.
default: 3
see_also:
- osd_recovery_max_active
- osd_recovery_max_active_ssd
+ - osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
@@ -1374,10 +1422,12 @@ options:
solid state devices)
fmt_desc: The number of active recovery requests per OSD at one time, if the
primary device is non-rotational (i.e., an SSD).
+ note: This setting is automatically reset when the mClock scheduler is used.
default: 10
see_also:
- osd_recovery_max_active
- osd_recovery_max_active_hdd
+ - osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
@@ -1472,13 +1522,15 @@ options:
overrides _ssd, _hdd, and _hybrid if non-zero.
fmt_desc: Time in seconds to sleep before the next removal transaction. This
throttles the PG deletion process.
+ note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
- name: osd_delete_sleep_hdd
type: float
level: advanced
- desc: Time in seconds to sleep before next removal transaction for HDDs
+ desc: Time in seconds to sleep before next removal transaction for HDDs.
+ note: This setting is ignored when the mClock scheduler is used.
default: 5
flags:
- runtime
@@ -1486,6 +1538,7 @@ options:
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction for SSDs
+ note: This setting is ignored when the mClock scheduler is used.
default: 1
flags:
- runtime
@@ -1494,6 +1547,7 @@ options:
level: advanced
desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
and OSD journal or WAL+DB is on SSD
+ note: This setting is ignored when the mClock scheduler is used.
default: 1
flags:
- runtime
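The new *_low_threshold_* options pair with the existing high thresholds: an OSD bench result outside the (low, high) window is discarded in favour of the configured max capacity. A hedged sketch of that selection logic (illustrative only, not the OSD's actual implementation):

// Returns the IOPS capacity the mClock scheduler should use.
double pick_iops_capacity(double bench_iops, double low, double high,
                          double configured_max /* osd_mclock_max_capacity_iops_{hdd,ssd} */) {
  if (bench_iops < low || bench_iops > high) {
    return configured_max;   // bench result ignored, fall back
  }
  return bench_iops;         // bench result accepted for QoS calculations
}
// e.g. with the HDD defaults low=50, high=500, a bench result of 700 IOPS falls back.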
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
index 85fe62d2343..0ce5bc332fd 100644
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -59,6 +59,14 @@ options:
services:
- rgw
with_legacy: true
+- name: rgw_parquet_buffer_size
+ type: size
+ level: advanced
+  desc: the maximum parquet buffer size; a limit on memory consumption for parquet reading operations.
+ default: 16_M
+ services:
+ - rgw
+ with_legacy: true
- name: rgw_rados_tracing
type: bool
level: advanced
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
index b5e361b505c..2eeaa80aae8 100644
--- a/src/common/perf_counters.cc
+++ b/src/common/perf_counters.cc
@@ -18,6 +18,7 @@
#include "common/dout.h"
#include "common/valgrind.h"
#include "include/common_fwd.h"
+#include "include/utime.h"
using std::ostringstream;
using std::make_pair;
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
index 942edf6d7e5..0d0fe86a092 100644
--- a/src/common/perf_counters.h
+++ b/src/common/perf_counters.h
@@ -17,6 +17,8 @@
#ifndef CEPH_COMMON_PERF_COUNTERS_H
#define CEPH_COMMON_PERF_COUNTERS_H
+#include <functional>
+#include <set>
#include <string>
#include <vector>
#include <memory>
@@ -24,11 +26,12 @@
#include <cstdint>
#include "common/perf_histogram.h"
-#include "include/utime.h"
#include "include/common_fwd.h"
#include "common/ceph_mutex.h"
#include "common/ceph_time.h"
+class utime_t;
+
namespace TOPNSPC::common {
class CephContext;
class PerfCountersBuilder;
diff --git a/src/common/perf_counters_cache.h b/src/common/perf_counters_cache.h
index 866f56ee350..aa786fc5bf0 100644
--- a/src/common/perf_counters_cache.h
+++ b/src/common/perf_counters_cache.h
@@ -3,6 +3,7 @@
#include "common/perf_counters.h"
#include "common/ceph_context.h"
#include "common/intrusive_lru.h"
+#include "include/utime.h"
namespace ceph::perf_counters {
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
index aa6b765bc56..d125d7171e0 100644
--- a/src/common/pick_address.cc
+++ b/src/common/pick_address.cc
@@ -15,6 +15,7 @@
#include "common/pick_address.h"
#include <bitset>
+#include <ifaddrs.h> // for struct ifaddrs
#include <netdb.h>
#include <netinet/in.h>
#ifdef _WIN32
@@ -40,6 +41,7 @@
#include "common/debug.h"
#include "common/errno.h"
#include "common/numa.h"
+#include "common/safe_io.h"
#ifndef HAVE_IN_ADDR_T
typedef uint32_t in_addr_t;
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
index c9e982b6396..c97942adec5 100644
--- a/src/common/strtol.cc
+++ b/src/common/strtol.cc
@@ -146,43 +146,54 @@ T strict_iec_cast(std::string_view str, std::string *err)
if (u != std::string_view::npos) {
n = str.substr(0, u);
unit = str.substr(u, str.length() - u);
+    // handle cases where prefixes are entered as KB, MB, ...
+    // as well as KiB, MiB, ...
+ if (unit.length() > 1 && unit.back() == 'B') {
+ unit = unit.substr(0, unit.length() - 1);
+ }
// we accept both old si prefixes as well as the proper iec prefixes
// i.e. K, M, ... and Ki, Mi, ...
- if (unit.back() == 'i') {
- if (unit.front() == 'B') {
- *err = "strict_iecstrtoll: illegal prefix \"Bi\"";
- return 0;
- }
- }
if (unit.length() > 2) {
*err = "strict_iecstrtoll: illegal prefix (length > 2)";
return 0;
}
- switch(unit.front()) {
- case 'K':
- m = 10;
- break;
- case 'M':
- m = 20;
- break;
- case 'G':
- m = 30;
- break;
- case 'T':
- m = 40;
- break;
- case 'P':
- m = 50;
- break;
- case 'E':
- m = 60;
- break;
- case 'B':
- break;
- default:
- *err = "strict_iecstrtoll: unit prefix not recognized";
- return 0;
+ if ((unit.back() == 'i') || (unit.length() == 1)) {
+ if (unit.back() == 'i') {
+ if (unit.front() == 'B') {
+ *err = "strict_iecstrtoll: illegal prefix \"Bi\"";
+ return 0;
+ }
+ }
+ switch(unit.front()) {
+ case 'K':
+ m = 10;
+ break;
+ case 'M':
+ m = 20;
+ break;
+ case 'G':
+ m = 30;
+ break;
+ case 'T':
+ m = 40;
+ break;
+ case 'P':
+ m = 50;
+ break;
+ case 'E':
+ m = 60;
+ break;
+ case 'B':
+ break;
+ default:
+ *err = ("strict_iecstrtoll: unit prefix not recognized '" + std::string{unit} + "' ");
+ return 0;
+ }
}
+ else {
+ *err = ("strict_iecstrtoll: illegal prefix '" + std::string{unit} + "' ");
+ return 0;
+ }
}
long long ll = strict_strtoll(n, 10, err);
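After this change both the bare/IEC prefixes (K, Ki, KiB, ...) and the SI-style byte suffixes (KB, MB, ...) parse to the same binary multiple; a short sketch assuming the strict_iecstrtoll() declaration from common/strtol.h:

#include <string>
#include "common/strtol.h"

void iec_parse_example() {
  std::string err;
  auto a = strict_iecstrtoll("10K", &err);    // 10240
  auto b = strict_iecstrtoll("10KiB", &err);  // 10240
  auto c = strict_iecstrtoll("10KB", &err);   // 10240, newly accepted spelling
  // An unrecognized prefix such as "10Qi" leaves a message in err and returns 0.
  (void)a; (void)b; (void)c;
}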
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
deleted file mode 100644
index f457f655df5..00000000000
--- a/src/common/sync_filesystem.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2011 New Dream Network
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_SYNC_FILESYSTEM_H
-#define CEPH_SYNC_FILESYSTEM_H
-
-#include <unistd.h>
-
-#if defined(__linux__)
-#include <sys/ioctl.h>
-#include <syscall.h>
-#include "os/fs/btrfs_ioctl.h"
-#endif
-
-inline int sync_filesystem(int fd)
-{
- /* On Linux, newer versions of glibc have a function called syncfs that
- * performs a sync on only one filesystem. If we don't have this call, we
- * have to fall back on sync(), which synchronizes every filesystem on the
- * computer. */
-#ifdef HAVE_SYS_SYNCFS
- if (syncfs(fd) == 0)
- return 0;
-#elif defined(SYS_syncfs)
- if (syscall(SYS_syncfs, fd) == 0)
- return 0;
-#elif defined(__NR_syncfs)
- if (syscall(__NR_syncfs, fd) == 0)
- return 0;
-#endif
-
-#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
- else if (errno == ENOSYS) {
- sync();
- return 0;
- } else {
- return -errno;
- }
-#else
- sync();
- return 0;
-#endif
-}
-
-#endif
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
index 5cf9590e61e..277055ec51e 100644
--- a/src/crimson/os/alienstore/thread_pool.cc
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -27,7 +27,7 @@ ThreadPool::ThreadPool(size_t n_threads,
pin(*cpus);
}
block_sighup();
- (void) pthread_setname_np(pthread_self(), "alien-store-tp");
+ (void) ceph_pthread_setname("alien-store-tp");
loop(queue_max_wait, i);
});
}
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index 5dcb7514ee1..70fec7caca4 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -172,6 +172,7 @@ void Cache::register_metrics()
{extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
{extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
{extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
+ {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")},
{extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
{extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
{extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
@@ -1093,6 +1094,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
case extent_types_t::LADDR_LEAF:
return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>(
t, length, hint, gen);
+ case extent_types_t::ROOT_META:
+ return alloc_new_non_data_extent<RootMetaBlock>(
+ t, length, hint, gen);
case extent_types_t::ONODE_BLOCK_STAGED:
return alloc_new_non_data_extent<onode::SeastoreNodeExtent>(
t, length, hint, gen);
@@ -2193,6 +2197,12 @@ Cache::do_get_caching_extent_by_type(
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
+ case extent_types_t::ROOT_META:
+ return do_get_caching_extent<RootMetaBlock>(
+ offset, length, std::move(extent_init_func), std::move(on_cache)
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
case extent_types_t::OMAP_INNER:
return do_get_caching_extent<omap_manager::OMapInnerNode>(
offset, length, std::move(extent_init_func), std::move(on_cache)
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
index dba3610e95f..c37d9c5c7cd 100644
--- a/src/crimson/os/seastore/cache.h
+++ b/src/crimson/os/seastore/cache.h
@@ -978,7 +978,8 @@ public:
auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen);
#endif
if (!result) {
- return nullptr;
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
}
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result->bp));
ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
@@ -1019,6 +1020,10 @@ public:
#else
auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen);
#endif
+ if (results.empty()) {
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
+ }
std::vector<TCachedExtentRef<T>> extents;
for (auto &result : results) {
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
index 072c57864be..fa2ed65c0f3 100644
--- a/src/crimson/os/seastore/onode.h
+++ b/src/crimson/os/seastore/onode.h
@@ -36,8 +36,8 @@ struct onode_layout_t {
object_data_le_t object_data;
- char oi[MAX_OI_LENGTH];
- char ss[MAX_SS_LENGTH];
+ char oi[MAX_OI_LENGTH] = {0};
+ char ss[MAX_SS_LENGTH] = {0};
} __attribute__((packed));
class Transaction;
diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
index 9f6a566d15c..97b7902edf5 100644
--- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
+++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
@@ -188,10 +188,10 @@ BlockRBManager::write_ertr::future<> BlockRBManager::write(
void BlockRBManager::prefill_fragmented_device()
{
LOG_PREFIX(BlockRBManager::prefill_fragmented_device);
- // the first 2 blocks must be allocated to lba root
+ // the first 3 blocks must be allocated to lba root
// and backref root during mkfs
- for (size_t block = get_block_size() * 2;
- block <= get_size() - get_block_size() * 2;
+ for (size_t block = get_block_size() * 3;
+ block <= get_size() - get_block_size() * 3;
block += get_block_size() * 2) {
DEBUG("marking {}~{} used",
get_start_rbm_addr() + block,
diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h
new file mode 100644
index 00000000000..edf082f1e38
--- /dev/null
+++ b/src/crimson/os/seastore/root_meta.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+struct RootMetaBlock : LogicalCachedExtent {
+ using meta_t = std::map<std::string, std::string>;
+ using Ref = TCachedExtentRef<RootMetaBlock>;
+ static constexpr size_t SIZE = 4096;
+ static constexpr int MAX_META_LENGTH = 1024;
+
+ explicit RootMetaBlock(ceph::bufferptr &&ptr)
+ : LogicalCachedExtent(std::move(ptr)) {}
+ explicit RootMetaBlock(extent_len_t length)
+ : LogicalCachedExtent(length) {}
+ RootMetaBlock(const RootMetaBlock &rhs)
+ : LogicalCachedExtent(rhs) {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ return CachedExtentRef(new RootMetaBlock(*this));
+ }
+
+ static constexpr extent_types_t TYPE = extent_types_t::ROOT_META;
+ extent_types_t get_type() const final {
+ return extent_types_t::ROOT_META;
+ }
+
+ /// dumps root meta as delta
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH);
+ bl.append(bptr);
+ return bl;
+ }
+
+ /// overwrites root
+ void apply_delta(const ceph::bufferlist &_bl) final
+ {
+ assert(_bl.length() == MAX_META_LENGTH);
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str());
+ }
+
+ meta_t get_meta() const {
+ bufferlist bl;
+ bl.append(get_bptr());
+ meta_t ret;
+ auto iter = bl.cbegin();
+ decode(ret, iter);
+ return ret;
+ }
+
+ void set_meta(const meta_t &m) {
+ ceph::bufferlist bl;
+ encode(m, bl);
+ ceph_assert(bl.length() <= MAX_META_LENGTH);
+ bl.rebuild();
+ get_bptr().zero(0, MAX_META_LENGTH);
+ get_bptr().copy_in(0, bl.length(), bl.front().c_str());
+ }
+
+};
+using RootMetaBlockRef = RootMetaBlock::Ref;
+
+} // crimson::os::seastore
+
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock>
+ : fmt::ostream_formatter {};
+#endif
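For context, a hedged sketch of the encode/decode round-trip that set_meta()/get_meta() perform on the in-extent buffer (the key/value pair below is illustrative):

#include <map>
#include <string>
#include "include/buffer.h"
#include "include/encoding.h"

void meta_roundtrip_example() {
  using ceph::encode;
  using ceph::decode;
  std::map<std::string, std::string> m{{"example_key", "example_value"}};
  ceph::bufferlist bl;
  encode(m, bl);                       // set_meta() asserts bl.length() <= MAX_META_LENGTH
  std::map<std::string, std::string> out;
  auto it = bl.cbegin();
  decode(out, it);                     // get_meta() decodes the same layout back out
}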
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index f379dd0117c..450118e5e75 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
return out << "LADDR_LEAF";
case extent_types_t::ONODE_BLOCK_STAGED:
return out << "ONODE_BLOCK_STAGED";
+ case extent_types_t::ROOT_META:
+ return out << "ROOT_META";
case extent_types_t::OMAP_INNER:
return out << "OMAP_INNER";
case extent_types_t::OMAP_LEAF:
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index df5c184e7ab..65cad878fba 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -1378,23 +1378,24 @@ enum class extent_types_t : uint8_t {
LADDR_INTERNAL = 1,
LADDR_LEAF = 2,
DINK_LADDR_LEAF = 3, // should only be used for unitttests
- OMAP_INNER = 4,
- OMAP_LEAF = 5,
- ONODE_BLOCK_STAGED = 6,
- COLL_BLOCK = 7,
- OBJECT_DATA_BLOCK = 8,
- RETIRED_PLACEHOLDER = 9,
+ ROOT_META = 4,
+ OMAP_INNER = 5,
+ OMAP_LEAF = 6,
+ ONODE_BLOCK_STAGED = 7,
+ COLL_BLOCK = 8,
+ OBJECT_DATA_BLOCK = 9,
+ RETIRED_PLACEHOLDER = 10,
// the following two types are not extent types,
// they are just used to indicates paddr allocation deltas
- ALLOC_INFO = 10,
- JOURNAL_TAIL = 11,
+ ALLOC_INFO = 11,
+ JOURNAL_TAIL = 12,
// Test Block Types
- TEST_BLOCK = 12,
- TEST_BLOCK_PHYSICAL = 13,
- BACKREF_INTERNAL = 14,
- BACKREF_LEAF = 15,
+ TEST_BLOCK = 13,
+ TEST_BLOCK_PHYSICAL = 14,
+ BACKREF_INTERNAL = 15,
+ BACKREF_LEAF = 16,
// None and the number of valid extent_types_t
- NONE = 16,
+ NONE = 17,
};
using extent_types_le_t = uint8_t;
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
@@ -1409,12 +1410,12 @@ constexpr bool is_data_type(extent_types_t type) {
}
constexpr bool is_logical_metadata_type(extent_types_t type) {
- return type >= extent_types_t::OMAP_INNER &&
+ return type >= extent_types_t::ROOT_META &&
type <= extent_types_t::COLL_BLOCK;
}
constexpr bool is_logical_type(extent_types_t type) {
- if ((type >= extent_types_t::OMAP_INNER &&
+ if ((type >= extent_types_t::ROOT_META &&
type <= extent_types_t::OBJECT_DATA_BLOCK) ||
type == extent_types_t::TEST_BLOCK) {
assert(is_logical_metadata_type(type) ||
@@ -1926,44 +1927,18 @@ using backref_root_t = phy_tree_root_t;
* TODO: generalize this to permit more than one lba_manager implementation
*/
struct __attribute__((packed)) root_t {
- using meta_t = std::map<std::string, std::string>;
-
- static constexpr int MAX_META_LENGTH = 1024;
-
backref_root_t backref_root;
lba_root_t lba_root;
laddr_le_t onode_root;
coll_root_le_t collection_root;
+ laddr_le_t meta;
- char meta[MAX_META_LENGTH];
-
- root_t() {
- set_meta(meta_t{});
- }
+ root_t() = default;
void adjust_addrs_from_base(paddr_t base) {
lba_root.adjust_addrs_from_base(base);
backref_root.adjust_addrs_from_base(base);
}
-
- meta_t get_meta() {
- bufferlist bl;
- bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
- meta_t ret;
- auto iter = bl.cbegin();
- decode(ret, iter);
- return ret;
- }
-
- void set_meta(const meta_t &m) {
- ceph::bufferlist bl;
- encode(m, bl);
- ceph_assert(bl.length() < MAX_META_LENGTH);
- bl.rebuild();
- auto &bptr = bl.front();
- ::memset(meta, 0, MAX_META_LENGTH);
- ::memcpy(meta, bptr.c_str(), bl.length());
- }
};
struct alloc_blk_t {
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index f4e3b0858f2..717c3822db9 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -74,6 +74,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return lba_manager->mkfs(t);
}).si_then([this, &t] {
return backref_manager->mkfs(t);
+ }).si_then([this, &t] {
+ return init_root_meta(t);
}).si_then([this, FNAME, &t] {
INFOT("submitting mkfs transaction", t);
return submit_transaction_direct(t);
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index c7a94a9ef11..841c5638abc 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -23,6 +23,7 @@
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/root_meta.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/backref_manager.h"
#include "crimson/os/seastore/journal.h"
@@ -303,10 +304,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (!ext) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extent(
t,
laddr_hint,
@@ -342,10 +339,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (exts.empty()) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extents(
t,
laddr_hint,
@@ -690,9 +683,11 @@ public:
const std::string &key) {
return cache->get_root(
t
- ).si_then([&key, &t](auto root) {
+ ).si_then([&t, this](auto root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([key, &t](auto mblock) {
LOG_PREFIX(TransactionManager::read_root_meta);
- auto meta = root->root.get_meta();
+ auto meta = mblock->get_meta();
auto iter = meta.find(key);
if (iter == meta.end()) {
SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key);
@@ -701,7 +696,35 @@ public:
SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second);
return seastar::make_ready_future<read_root_meta_bare>(iter->second);
}
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
+ }
+
+ /**
+ * init_root_meta
+ *
+ * create the root meta block
+ */
+ using init_root_meta_iertr = base_iertr;
+ using init_root_meta_ret = init_root_meta_iertr::future<>;
+ init_root_meta_ret init_root_meta(Transaction &t) {
+ return alloc_non_data_extent<RootMetaBlock>(
+ t, L_ADDR_MIN, RootMetaBlock::SIZE
+ ).si_then([this, &t](auto meta) {
+ meta->set_meta(RootMetaBlock::meta_t{});
+ return cache->get_root(t
+ ).si_then([this, &t, meta](auto root) {
+ auto mroot = cache->duplicate_for_write(
+ t, root)->template cast<RootBlock>();
+ mroot->root.meta = meta->get_laddr();
+ return seastar::now();
+ });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
}
/**
@@ -719,15 +742,21 @@ public:
SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
return cache->get_root(
t
- ).si_then([this, &t, &key, &value](RootBlockRef root) {
- root = cache->duplicate_for_write(t, root)->cast<RootBlock>();
+ ).si_then([this, &t](RootBlockRef root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([this, key, value, &t](auto mblock) {
+ mblock = get_mutable_extent(t, mblock
+ )->template cast<RootMetaBlock>();
- auto meta = root->root.get_meta();
+ auto meta = mblock->get_meta();
meta[key] = value;
- root->root.set_meta(meta);
+ mblock->set_meta(meta);
return seastar::now();
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
}
/**
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index 522a93a1ddc..64544d4c870 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -36,6 +36,10 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
return peering_state.get_info().log_tail;
}
+ const PGLog& get_pg_log() const override {
+ return peering_state.get_pg_log();
+ }
+
void scan_log_after(eversion_t v, scan_log_func_t f) const override {
peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
}
@@ -73,6 +77,10 @@ struct PGFacade final : BackfillState::PGFacade {
return pg.projected_last_update;
}
+ const PGLog::IndexedLog& get_projected_log() const override {
+ return pg.projected_log;
+ }
+
PGFacade(PG& pg) : pg(pg) {}
};
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 018e58b68f8..837fd2eb2af 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -125,7 +125,6 @@ void BackfillState::Enqueuing::maybe_update_range()
logger().info("{}: bi is current", __func__);
ceph_assert(primary_bi.version == pg().get_projected_last_update());
} else if (primary_bi.version >= peering_state().get_log_tail()) {
-#if 0
if (peering_state().get_pg_log().get_log().empty() &&
pg().get_projected_log().empty()) {
/* Because we don't move log_tail on split, the log might be
@@ -137,13 +136,11 @@ void BackfillState::Enqueuing::maybe_update_range()
ceph_assert(primary_bi.version == eversion_t());
return;
}
-#endif
logger().debug("{}: bi is old, ({}) can be updated with log to {}",
__func__,
primary_bi.version,
pg().get_projected_last_update());
- logger().debug("{}: scanning pg log first", __func__);
- peering_state().scan_log_after(primary_bi.version,
+ auto func =
[&](const pg_log_entry_t& e) {
logger().debug("maybe_update_range(lambda): updating from version {}",
e.version);
@@ -160,7 +157,11 @@ void BackfillState::Enqueuing::maybe_update_range()
primary_bi.objects.erase(e.soid);
}
}
- });
+ };
+ logger().debug("{}: scanning pg log first", __func__);
+ peering_state().scan_log_after(primary_bi.version, func);
+ logger().debug("{}: scanning projected log", __func__);
+ pg().get_projected_log().scan_log_after(primary_bi.version, func);
primary_bi.version = pg().get_projected_last_update();
} else {
ceph_abort_msg(
@@ -378,16 +379,17 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- } else {
+ backfill_listener().maybe_flush();
+ } else if (!primary_bi.empty()) {
auto result = update_on_peers(check);
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- if (!primary_bi.empty()) {
- primary_bi.pop_front();
- }
+ primary_bi.pop_front();
+ backfill_listener().maybe_flush();
+ } else {
+ break;
}
- backfill_listener().maybe_flush();
} while (!all_emptied(primary_bi, backfill_state().peer_backfill_info));
if (backfill_state().progress_tracker->tracked_objects_completed()
@@ -609,4 +611,12 @@ void BackfillState::ProgressTracker::complete_to(
}
}
+void BackfillState::enqueue_standalone_push(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) {
+ progress_tracker->enqueue_push(obj);
+ backfill_machine.backfill_listener.enqueue_push(obj, v, peers);
+}
+
} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
index ddc0cbf7355..072c91e079d 100644
--- a/src/crimson/osd/backfill_state.h
+++ b/src/crimson/osd/backfill_state.h
@@ -14,6 +14,7 @@
#include <boost/statechart/transition.hpp>
#include "osd/recovery_types.h"
+#include "osd/PGLog.h"
namespace crimson::osd {
@@ -285,8 +286,12 @@ public:
struct Done : sc::state<Done, BackfillMachine>,
StateHelper<Done> {
using reactions = boost::mpl::list<
+ sc::custom_reaction<CancelBackfill>,
sc::transition<sc::event_base, Crashed>>;
explicit Done(my_context);
+ sc::result react(CancelBackfill) {
+ return discard_event();
+ }
};
BackfillState(BackfillListener& backfill_listener,
@@ -299,6 +304,15 @@ public:
backfill_machine.process_event(*std::move(evt));
}
+ void enqueue_standalone_push(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+
+ bool is_triggered() const {
+ return backfill_machine.triggering_event() != nullptr;
+ }
+
hobject_t get_last_backfill_started() const {
return last_backfill_started;
}
@@ -363,6 +377,7 @@ struct BackfillState::PeeringFacade {
virtual hobject_t earliest_backfill() const = 0;
virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0;
virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0;
+ virtual const PGLog& get_pg_log() const = 0;
virtual const eversion_t& get_last_update() const = 0;
virtual const eversion_t& get_log_tail() const = 0;
@@ -388,6 +403,8 @@ struct BackfillState::PeeringFacade {
// of behaviour that must be provided by a unit test's mock.
struct BackfillState::PGFacade {
virtual const eversion_t& get_projected_last_update() const = 0;
+ virtual const PGLog::IndexedLog& get_projected_log() const = 0;
+
virtual ~PGFacade() {}
};
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
index 32eaaf02b3f..007d0bf35f3 100644
--- a/src/crimson/osd/ec_backend.cc
+++ b/src/crimson/osd/ec_backend.cc
@@ -26,6 +26,7 @@ ECBackend::_read(const hobject_t& hoid,
ECBackend::rep_op_fut_t
ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
index 90a7e2b1f4d..b14c78c9fc4 100644
--- a/src/crimson/osd/ec_backend.h
+++ b/src/crimson/osd/ec_backend.h
@@ -28,6 +28,7 @@ private:
rep_op_fut_t
submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& req,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
index e17af91e3ad..6f51045931d 100644
--- a/src/crimson/osd/object_context.h
+++ b/src/crimson/osd/object_context.h
@@ -132,22 +132,6 @@ public:
}
private:
- template <typename Lock, typename Func>
- auto _with_lock(Lock& lock, Func&& func) {
- return lock.lock(
- ).then([&lock, func=std::forward<Func>(func), obc=Ref(this)]() mutable {
- return seastar::futurize_invoke(
- func
- ).finally([&lock, obc=std::move(obc)] {
- /* We chain the finally block here because it's possible for lock.lock()
- * above to fail due to a call to ObjectContext::interrupt, which calls
- * tri_mutex::abort. In the event of such an error, the lock isn't
- * actually taken and calling unlock() would be incorrect. */
- lock.unlock();
- });
- });
- }
-
boost::intrusive::list_member_hook<> obc_accessing_hook;
uint64_t list_link_cnt = 0;
bool fully_loaded = false;
@@ -177,117 +161,6 @@ public:
boost::intrusive::list_member_hook<>,
&ObjectContext::obc_accessing_hook>;
- template<RWState::State Type, typename InterruptCond = void, typename Func>
- auto with_lock(Func&& func) {
- if constexpr (!std::is_void_v<InterruptCond>) {
- auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
- switch (Type) {
- case RWState::RWWRITE:
- return _with_lock(lock.for_write(), std::move(wrapper));
- case RWState::RWREAD:
- return _with_lock(lock.for_read(), std::move(wrapper));
- case RWState::RWEXCL:
- return _with_lock(lock.for_excl(), std::move(wrapper));
- case RWState::RWNONE:
- return seastar::futurize_invoke(std::move(wrapper));
- default:
- assert(0 == "noop");
- }
- } else {
- switch (Type) {
- case RWState::RWWRITE:
- return _with_lock(lock.for_write(), std::forward<Func>(func));
- case RWState::RWREAD:
- return _with_lock(lock.for_read(), std::forward<Func>(func));
- case RWState::RWEXCL:
- return _with_lock(lock.for_excl(), std::forward<Func>(func));
- case RWState::RWNONE:
- return seastar::futurize_invoke(std::forward<Func>(func));
- default:
- assert(0 == "noop");
- }
- }
- }
-
- /**
- * load_then_with_lock
- *
- * Takes two functions as arguments -- load_func to be invoked
- * with an exclusive lock, and func to be invoked under the
- * lock type specified by the Type template argument.
- *
- * Caller must ensure that *this is not already locked, presumably
- * by invoking load_then_with_lock immediately after construction.
- *
- * @param [in] load_func Function to be invoked under excl lock
- * @param [in] func Function to be invoked after load_func under
- * lock of type Type.
- */
- template<RWState::State Type, typename Func, typename Func2>
- auto load_then_with_lock(Func &&load_func, Func2 &&func) {
- class lock_state_t {
- tri_mutex *lock = nullptr;
- bool excl = false;
-
- public:
- lock_state_t(tri_mutex &lock) : lock(&lock), excl(true) {
- ceph_assert(lock.try_lock_for_excl());
- }
- lock_state_t(lock_state_t &&o) : lock(o.lock), excl(o.excl) {
- o.lock = nullptr;
- o.excl = false;
- }
- lock_state_t() = delete;
- lock_state_t &operator=(lock_state_t &&o) = delete;
- lock_state_t(const lock_state_t &o) = delete;
- lock_state_t &operator=(const lock_state_t &o) = delete;
-
- void demote() {
- ceph_assert(excl);
- ceph_assert(lock);
- if constexpr (Type == RWState::RWWRITE) {
- lock->demote_to_write();
- } else if constexpr (Type == RWState::RWREAD) {
- lock->demote_to_read();
- } else if constexpr (Type == RWState::RWNONE) {
- lock->unlock_for_excl();
- }
- excl = false;
- }
-
- ~lock_state_t() {
- if (!lock)
- return;
-
- if constexpr (Type == RWState::RWEXCL) {
- lock->unlock_for_excl();
- } else {
- if (excl) {
- lock->unlock_for_excl();
- return;
- }
-
- if constexpr (Type == RWState::RWWRITE) {
- lock->unlock_for_write();
- } else if constexpr (Type == RWState::RWREAD) {
- lock->unlock_for_read();
- }
- }
- }
- };
-
- return seastar::do_with(
- lock_state_t{lock},
- [load_func=std::move(load_func), func=std::move(func)](auto &ls) mutable {
- return std::invoke(
- std::move(load_func)
- ).si_then([func=std::move(func), &ls]() mutable {
- ls.demote();
- return std::invoke(std::move(func));
- });
- });
- }
-
bool empty() const {
return !lock.is_acquired();
}
@@ -336,3 +209,6 @@ std::optional<hobject_t> resolve_oid(const SnapSet &ss,
const hobject_t &oid);
} // namespace crimson::osd
+
+template <>
+struct fmt::formatter<RWState::State> : fmt::ostream_formatter {};
diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc
index 12aa40b925a..869ca91504c 100644
--- a/src/crimson/osd/object_context_loader.cc
+++ b/src/crimson/osd/object_context_loader.cc
@@ -1,3 +1,4 @@
+#include "crimson/common/coroutine.h"
#include "crimson/osd/object_context_loader.h"
#include "osd/osd_types_fmt.h"
#include "osd/object_state_fmt.h"
@@ -8,207 +9,155 @@ namespace crimson::osd {
using crimson::common::local_conf;
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_head_obc(const hobject_t& oid,
- with_obc_func_t&& func)
- {
- return with_locked_obc<State, true /* track */>(
- oid,
- [func=std::move(func)](auto obc) {
- // The template with_obc_func_t wrapper supports two obcs (head and clone).
- // In the 'with_head_obc' case, however, only the head is in use.
- // Pass the same head obc twice in order to
- // to support the generic with_obc sturcture.
- return std::invoke(std::move(func), obc, obc);
- });
- }
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_clone_obc(const hobject_t& oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- LOG_PREFIX(ObjectContextLoader::with_clone_obc);
- assert(!oid.is_head());
- return with_head_obc<RWState::RWREAD>(
- oid.get_head(),
- [FNAME, oid, func=std::move(func), resolve_clone, this]
- (auto head, auto) mutable -> load_obc_iertr::future<> {
- if (!head->obs.exists) {
- ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
- return load_obc_iertr::future<>{
- crimson::ct_error::enoent::make()
- };
- }
- return this->with_clone_obc_only<State>(std::move(head),
- oid,
- std::move(func),
- resolve_clone);
- });
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock_head(Manager &manager, RWState::State lock_type)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock_head);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ auto releaser = manager.get_releaser();
+ // no users pre-populate head_state on this path, so we don't bother to
+ // handle it
+ ceph_assert(manager.target.is_head());
+ ceph_assert(manager.head_state.is_empty());
+ ceph_assert(manager.target_state.is_empty());
+ auto [obc, existed] = obc_registry.get_cached_obc(manager.target);
+ manager.set_state_obc(manager.target_state, obc);
+ manager.set_state_obc(manager.head_state, obc);
+
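+ // a cached obc can be locked directly; a newly created one is first
+ // loaded under an exclusive lock, which is then demoted to lock_type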
+ if (existed) {
+ co_await manager.target_state.lock_to(lock_type);
+ } else {
+ manager.target_state.lock_excl_sync();
+ co_await load_obc(manager.target_state.obc);
+ manager.target_state.demote_excl_to(lock_type);
}
+ releaser.cancel();
+}
+
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock_clone(Manager &manager, RWState::State lock_type)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock_clone);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ auto releaser = manager.get_releaser();
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_clone_obc_only(ObjectContextRef head,
- hobject_t clone_oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
- DEBUGDPP("{}", dpp, clone_oid);
- assert(!clone_oid.is_head());
- if (resolve_clone) {
- auto resolved_oid = resolve_oid(head->get_head_ss(), clone_oid);
- if (!resolved_oid) {
- ERRORDPP("clone {} not found", dpp, clone_oid);
- return load_obc_iertr::future<>{
- crimson::ct_error::enoent::make()
- };
- }
- if (resolved_oid->is_head()) {
- // See resolve_oid
- return std::move(func)(head, head);
- }
- clone_oid = *resolved_oid;
+ ceph_assert(!manager.target.is_head());
+ ceph_assert(manager.target_state.is_empty());
+
+ if (manager.head_state.is_empty()) {
+ auto [obc, existed] = obc_registry.get_cached_obc(manager.target.get_head());
+ manager.set_state_obc(manager.head_state, obc);
+
+ if (existed) {
+ co_await manager.head_state.lock_to(RWState::RWREAD);
+ } else {
+ manager.head_state.lock_excl_sync();
+ co_await load_obc(manager.head_state.obc);
+ manager.head_state.demote_excl_to(RWState::RWREAD);
}
- return with_locked_obc<State, false /* don't track */>(
- clone_oid,
- [head=std::move(head), func=std::move(func)](auto clone) {
- clone->set_clone_ssc(head->ssc);
- return std::move(func)(std::move(head), std::move(clone));
- });
}
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc(hobject_t oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- if (oid.is_head()) {
- return with_head_obc<State>(oid, std::move(func));
- } else {
- return with_clone_obc<State>(oid, std::move(func), resolve_clone);
+ if (manager.options.resolve_clone) {
+ auto resolved_oid = resolve_oid(
+ manager.head_state.obc->get_head_ss(),
+ manager.target);
+ if (!resolved_oid) {
+ ERRORDPP("clone {} not found", dpp, manager.target);
+ co_await load_obc_iertr::future<>(
+ crimson::ct_error::enoent::make()
+ );
}
+ // note: might be head if snap was taken after most recent write!
+ manager.target = *resolved_oid;
}
- template<RWState::State State, bool track, typename Func>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_locked_obc(const hobject_t& oid,
- Func&& func)
- {
- LOG_PREFIX(ObjectContextLoader::with_locked_obc);
- auto [obc, existed] = obc_registry.get_cached_obc(oid);
- DEBUGDPP("object {} existed {}",
- dpp, obc->get_oid(), existed);
- if constexpr (track) {
- obc->append_to(obc_set_accessing);
+ if (manager.target.is_head()) {
+ /* Yes, we assert at the top that manager.target is not head. However, it's
+ * possible that the requested snap (the resolve_clone path above) actually
+ * maps to head (a read on an rbd snapshot more recent than the most recent
+ * write on this specific rbd block, for example).
+ *
+ * In such an event, it's hypothetically possible that lock_type isn't
+ * RWREAD, in which case we need to drop and reacquire the lock. However,
+ * this case is at present impossible. Actual client requests cannot write
+ * to a snapshot and will therefore always be RWREAD. The pathways that
+ * actually can mutate a clone do not set resolve_clone, so target will not
+ * become head here.
+ */
+ manager.set_state_obc(manager.target_state, manager.head_state.obc);
+ if (lock_type != manager.head_state.state) {
+ // This case isn't actually possible at the moment for the above reason.
+ manager.head_state.release_lock();
+ co_await manager.target_state.lock_to(lock_type);
+ } else {
+ manager.target_state.state = manager.head_state.state;
+ manager.head_state.state = RWState::RWNONE;
}
+ } else {
+ auto [obc, existed] = obc_registry.get_cached_obc(manager.target);
+ manager.set_state_obc(manager.target_state, obc);
+
if (existed) {
- return obc->with_lock<State, IOInterruptCondition>(
- [func=std::move(func), obc=ObjectContextRef(obc)] {
- return std::invoke(std::move(func), obc);
- }
- ).finally([FNAME, this, obc=ObjectContextRef(obc)] {
- DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs);
- if constexpr (track) {
- obc->remove_from(obc_set_accessing);
- }
- });
+ co_await manager.target_state.lock_to(RWState::RWREAD);
} else {
- return obc->load_then_with_lock<State> (
- [this, obc=ObjectContextRef(obc)] {
- return load_obc(obc);
- },
- [func=std::move(func), obc=ObjectContextRef(obc)] {
- return std::invoke(std::move(func), obc);
- }
- ).finally([FNAME, this, obc=ObjectContextRef(obc)] {
- DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs);
- if constexpr (track) {
- obc->remove_from(obc_set_accessing);
- }
- });
+ manager.target_state.lock_excl_sync();
+ co_await load_obc(manager.target_state.obc);
+ manager.target_state.obc->set_clone_ssc(manager.head_state.obc->ssc);
+ manager.target_state.demote_excl_to(RWState::RWREAD);
}
}
+ releaser.cancel();
+}
+
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock(Manager &manager, RWState::State lock_type)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ if (manager.target.is_head()) {
+ return load_and_lock_head(manager, lock_type);
+ } else {
+ return load_and_lock_clone(manager, lock_type);
+ }
+}
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::load_obc(ObjectContextRef obc)
- {
- LOG_PREFIX(ObjectContextLoader::load_obc);
- return backend.load_metadata(obc->get_oid())
+ObjectContextLoader::load_obc_iertr::future<>
+ObjectContextLoader::load_obc(ObjectContextRef obc)
+{
+ LOG_PREFIX(ObjectContextLoader::load_obc);
+ return backend.load_metadata(obc->get_oid())
.safe_then_interruptible(
[FNAME, this, obc=std::move(obc)](auto md)
-> load_obc_ertr::future<> {
- const hobject_t& oid = md->os.oi.soid;
- DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
- if (oid.is_head()) {
- if (!md->ssc) {
- ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
- return crimson::ct_error::object_corrupted::make();
- }
- obc->set_head_state(std::move(md->os),
- std::move(md->ssc));
- } else {
- // we load and set the ssc only for head obc.
- // For clones, the head's ssc will be referenced later.
- // See set_clone_ssc
- obc->set_clone_state(std::move(md->os));
- }
- DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
- return seastar::now();
- });
- }
-
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::reload_obc(ObjectContext& obc) const
- {
- LOG_PREFIX(ObjectContextLoader::reload_obc);
- assert(obc.is_head());
- return backend.load_metadata(obc.get_oid())
- .safe_then_interruptible<false>(
- [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> {
- DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid());
- if (!md->ssc) {
- ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid());
- return crimson::ct_error::object_corrupted::make();
- }
- obc.set_head_state(std::move(md->os), std::move(md->ssc));
- return load_obc_ertr::now();
- });
- }
+ const hobject_t& oid = md->os.oi.soid;
+ DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
+ if (oid.is_head()) {
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc->set_head_state(std::move(md->os),
+ std::move(md->ssc));
+ } else {
+ // we load and set the ssc only for head obc.
+ // For clones, the head's ssc will be referenced later.
+ // See set_clone_ssc
+ obc->set_clone_state(std::move(md->os));
+ }
+ DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
+ return seastar::now();
+ });
+}
- void ObjectContextLoader::notify_on_change(bool is_primary)
- {
- LOG_PREFIX(ObjectContextLoader::notify_on_change);
- DEBUGDPP("is_primary: {}", dpp, is_primary);
- for (auto& obc : obc_set_accessing) {
- DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
- obc.interrupt(::crimson::common::actingset_changed(is_primary));
- }
+void ObjectContextLoader::notify_on_change(bool is_primary)
+{
+ LOG_PREFIX(ObjectContextLoader::notify_on_change);
+ DEBUGDPP("is_primary: {}", dpp, is_primary);
+ for (auto& obc : obc_set_accessing) {
+ DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
+ obc.interrupt(::crimson::common::actingset_changed(is_primary));
}
-
- // explicitly instantiate the used instantiations
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
+}
}
diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h
index 277708eca4f..6d007d65176 100644
--- a/src/crimson/osd/object_context_loader.h
+++ b/src/crimson/osd/object_context_loader.h
@@ -1,9 +1,12 @@
#pragma once
#include <seastar/core/future.hh>
+#include <seastar/util/defer.hh>
#include "crimson/common/errorator.h"
+#include "crimson/common/log.h"
#include "crimson/osd/object_context.h"
#include "crimson/osd/pg_backend.h"
+#include "osd/object_state_fmt.h"
namespace crimson::osd {
class ObjectContextLoader {
@@ -29,6 +32,178 @@ public:
::crimson::osd::IOInterruptCondition,
load_obc_ertr>;
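+ /* Manager owns the obc(s) used by an operation (head and/or target)
+ * together with the lock state taken on each. release()/~Manager drop
+ * the locks and the obc_set_accessing registration, so callers only
+ * need to keep the Manager alive for the duration of the operation. */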
+ class Manager {
+ ObjectContextLoader &loader;
+ hobject_t target;
+
+ Manager() = delete;
+ template <typename T>
+ Manager(ObjectContextLoader &loader, T &&t)
+ : loader(loader), target(std::forward<T>(t)) {}
+ Manager(const Manager &) = delete;
+ Manager &operator=(const Manager &o) = delete;
+
+ struct options_t {
+ bool resolve_clone = true;
+ } options;
+
+ struct state_t {
+ RWState::State state = RWState::RWNONE;
+ ObjectContextRef obc;
+ bool is_empty() const { return !obc; }
+
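+ // only called for a freshly created obc with no other users, so the
+ // exclusive lock is guaranteed to be available immediately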
+ void lock_excl_sync() {
+ bool locked = obc->lock.try_lock_for_excl();
+ ceph_assert(locked);
+ state = RWState::RWEXCL;
+ }
+
+ void demote_excl_to(RWState::State lock_type) {
+ assert(state == RWState::RWEXCL);
+ switch (lock_type) {
+ case RWState::RWWRITE:
+ obc->lock.demote_to_write();
+ state = RWState::RWWRITE;
+ break;
+ case RWState::RWREAD:
+ obc->lock.demote_to_read();
+ state = RWState::RWREAD;
+ break;
+ case RWState::RWNONE:
+ obc->lock.unlock_for_excl();
+ state = RWState::RWNONE;
+ break;
+ case RWState::RWEXCL:
+ //noop
+ break;
+ default:
+ ceph_assert(0 == "impossible");
+ }
+ }
+
+ auto lock_to(RWState::State lock_type) {
+ assert(state == RWState::RWNONE);
+ switch (lock_type) {
+ case RWState::RWWRITE:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_write().then([this] {
+ state = RWState::RWWRITE;
+ }));
+ case RWState::RWREAD:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_read().then([this] {
+ state = RWState::RWREAD;
+ }));
+ case RWState::RWNONE:
+ // noop
+ return interruptor::now();
+ case RWState::RWEXCL:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_excl().then([this] {
+ state = RWState::RWEXCL;
+ }));
+ default:
+ ceph_assert(0 == "impossible");
+ return interruptor::now();
+ }
+ }
+
+ void release_lock() {
+ switch (state) {
+ case RWState::RWREAD:
+ obc->lock.unlock_for_read();
+ break;
+ case RWState::RWWRITE:
+ obc->lock.unlock_for_write();
+ break;
+ case RWState::RWEXCL:
+ obc->lock.unlock_for_excl();
+ break;
+ case RWState::RWNONE:
+ // noop
+ break;
+ default:
+ ceph_assert(0 == "invalid");
+ }
+ state = RWState::RWNONE;
+ }
+ };
+ state_t head_state;
+ state_t target_state;
+
+ friend ObjectContextLoader;
+
+ void set_state_obc(state_t &s, ObjectContextRef _obc) {
+ s.obc = std::move(_obc);
+ s.obc->append_to(loader.obc_set_accessing);
+ }
+
+ void release_state(state_t &s) {
+ LOG_PREFIX(ObjectContextLoader::release_state);
+ if (s.is_empty()) return;
+
+ s.release_lock();
+ SUBDEBUGDPP(
+ osd, "released object {}, {}",
+ loader.dpp, s.obc->get_oid(), s.obc->obs);
+ s.obc->remove_from(loader.obc_set_accessing);
+ s = state_t();
+ }
+ public:
+ Manager(Manager &&rhs) : loader(rhs.loader) {
+ std::swap(target, rhs.target);
+ std::swap(options, rhs.options);
+ std::swap(head_state, rhs.head_state);
+ std::swap(target_state, rhs.target_state);
+ }
+
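+ // loader is a reference member, so move assignment destroys *this and
+ // re-constructs it in place from the moved-from Manager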
+ Manager &operator=(Manager &&o) {
+ this->~Manager();
+ new(this) Manager(std::move(o));
+ return *this;
+ }
+
+ ObjectContextRef &get_obc() {
+ ceph_assert(!target_state.is_empty());
+ return target_state.obc;
+ }
+
+ ObjectContextRef &get_head_obc() {
+ ceph_assert(!head_state.is_empty());
+ return head_state.obc;
+ }
+
+ void release() {
+ release_state(head_state);
+ release_state(target_state);
+ }
+
+ auto get_releaser() {
+ return seastar::defer([this] {
+ release();
+ });
+ }
+
+ ~Manager() {
+ release();
+ }
+ };
+ Manager get_obc_manager(hobject_t oid, bool resolve_clone = true) {
+ Manager ret(*this, oid);
+ ret.options.resolve_clone = resolve_clone;
+ return ret;
+ }
+
+ using load_and_lock_ertr = load_obc_ertr;
+ using load_and_lock_iertr = interruptible::interruptible_errorator<
+ IOInterruptCondition, load_and_lock_ertr>;
+ using load_and_lock_fut = load_and_lock_iertr::future<>;
+private:
+ load_and_lock_fut load_and_lock_head(Manager &, RWState::State);
+ load_and_lock_fut load_and_lock_clone(Manager &, RWState::State);
+public:
+ load_and_lock_fut load_and_lock(Manager &, RWState::State);
+
using interruptor = ::crimson::interruptible::interruptor<
::crimson::osd::IOInterruptCondition>;
@@ -43,8 +218,13 @@ public:
// See SnapTrimObjSubEvent::remove_or_update - in_removed_snaps_queue usage.
template<RWState::State State>
load_obc_iertr::future<> with_obc(hobject_t oid,
- with_obc_func_t&& func,
- bool resolve_clone = true);
+ with_obc_func_t func,
+ bool resolve_clone = true) {
+ auto manager = get_obc_manager(oid, resolve_clone);
+ co_await load_and_lock(manager, State);
+ co_await std::invoke(
+ func, manager.get_head_obc(), manager.get_obc());
+ }
// Use this variant in the case where the head object
// obc is already locked and only the clone obc is needed.
@@ -53,10 +233,20 @@ public:
template<RWState::State State>
load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head,
hobject_t clone_oid,
- with_obc_func_t&& func,
- bool resolve_clone = true);
-
- load_obc_iertr::future<> reload_obc(ObjectContext& obc) const;
+ with_obc_func_t func,
+ bool resolve_clone = true) {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
+ SUBDEBUGDPP(osd, "{}", dpp, clone_oid);
+ auto manager = get_obc_manager(clone_oid, resolve_clone);
+ // We populate head_state here with the passed obc assuming that
+ // it has been loaded and locked appropriately. We do not populate
+ // head_state.state because we won't be taking or releasing any
+ // locks on head as part of this call.
+ manager.head_state.obc = head;
+ manager.head_state.obc->append_to(obc_set_accessing);
+ co_await load_and_lock(manager, State);
+ co_await std::invoke(func, head, manager.get_obc());
+ }
void notify_on_change(bool is_primary);
@@ -66,24 +256,9 @@ private:
DoutPrefixProvider& dpp;
obc_accessing_list_t obc_set_accessing;
- template<RWState::State State>
- load_obc_iertr::future<> with_clone_obc(const hobject_t& oid,
- with_obc_func_t&& func,
- bool resolve_clone);
-
- template<RWState::State State>
- load_obc_iertr::future<> with_head_obc(const hobject_t& oid,
- with_obc_func_t&& func);
-
- template<RWState::State State, bool track, typename Func>
- load_obc_iertr::future<> with_locked_obc(const hobject_t& oid,
- Func&& func);
-
- template<RWState::State State>
- load_obc_iertr::future<ObjectContextRef>
- get_or_load_obc(ObjectContextRef obc,
- bool existed);
-
load_obc_iertr::future<> load_obc(ObjectContextRef obc);
};
+
+using ObjectContextManager = ObjectContextLoader::Manager;
+
}
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
index 9bf60140374..97b241fdce4 100644
--- a/src/crimson/osd/ops_executer.cc
+++ b/src/crimson/osd/ops_executer.cc
@@ -828,7 +828,7 @@ void OpsExecuter::fill_op_params(OpsExecuter::modified_by m)
osd_op_params->mtime = msg->get_mtime();
osd_op_params->at_version = pg->get_next_version();
osd_op_params->pg_trim_to = pg->get_pg_trim_to();
- osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk();
+ osd_op_params->pg_committed_to = pg->get_pg_committed_to();
osd_op_params->last_complete = pg->get_info().last_complete;
osd_op_params->user_modify = (m == modified_by::user);
}
@@ -940,6 +940,7 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
};
encode(cloned_snaps, cloning_ctx->log_entry.snaps);
cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size);
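+ // remember the freshly created clone obc; submit_transaction uses it to
+ // decide whether the clone must also be pushed to backfill peers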
+ cloning_ctx->clone_obc = clone_obc;
return cloning_ctx;
}
@@ -966,7 +967,7 @@ void OpsExecuter::update_clone_overlap() {
void OpsExecuter::CloningContext::apply_to(
std::vector<pg_log_entry_t>& log_entries,
- ObjectContext& processed_obc) &&
+ ObjectContext& processed_obc)
{
log_entry.mtime = processed_obc.obs.oi.mtime;
log_entries.insert(log_entries.begin(), std::move(log_entry));
@@ -983,7 +984,7 @@ OpsExecuter::flush_clone_metadata(
assert(!txn.empty());
update_clone_overlap();
if (cloning_ctx) {
- std::move(*cloning_ctx).apply_to(log_entries, *obc);
+ cloning_ctx->apply_to(log_entries, *obc);
}
if (snapc.seq > obc->ssc->snapset.seq) {
// update snapset with latest snap context
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
index e770e825b32..94b64ccebb1 100644
--- a/src/crimson/osd/ops_executer.h
+++ b/src/crimson/osd/ops_executer.h
@@ -197,10 +197,11 @@ private:
struct CloningContext {
SnapSet new_snapset;
pg_log_entry_t log_entry;
+ ObjectContextRef clone_obc;
void apply_to(
std::vector<pg_log_entry_t>& log_entries,
- ObjectContext& processed_obc) &&;
+ ObjectContext& processed_obc);
};
std::unique_ptr<CloningContext> cloning_ctx;
@@ -504,6 +505,7 @@ OpsExecuter::flush_changes_n_do_ops_effects(
ceph_assert(want_mutate);
}
+ apply_stats();
if (want_mutate) {
auto log_entries = flush_clone_metadata(
prepare_transaction(ops),
@@ -519,14 +521,15 @@ OpsExecuter::flush_changes_n_do_ops_effects(
std::move(txn),
std::move(obc),
std::move(*osd_op_params),
- std::move(log_entries));
+ std::move(log_entries),
+ cloning_ctx
+ ? std::move(cloning_ctx->clone_obc)
+ : nullptr);
submitted = std::move(_submitted);
all_completed = std::move(_all_completed);
}
- apply_stats();
-
if (op_effects.size()) [[unlikely]] {
// need extra ref pg due to apply_stats() which can be executed after
// informing snap mapper
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
index a89fb2c84bc..61a56600a57 100644
--- a/src/crimson/osd/osd_operations/client_request.cc
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -14,6 +14,7 @@
#include "crimson/osd/osd_operations/client_request.h"
#include "crimson/osd/osd_connection_priv.h"
#include "osd/object_state_fmt.h"
+#include "osd/osd_perf_counters.h"
SET_SUBSYS(osd);
@@ -42,15 +43,17 @@ void ClientRequest::Orderer::clear_and_cancel(PG &pg)
{
LOG_PREFIX(ClientRequest::Orderer::clear_and_cancel);
for (auto i = list.begin(); i != list.end(); ) {
- DEBUGDPP("{}", pg, *i);
- i->complete_request();
- remove_request(*(i++));
+ auto &req = *i;
+ DEBUGDPP("{}", pg, req);
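+ // advance the iterator first: complete_request() unlinks req from this list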
+ ++i;
+ req.complete_request(pg);
}
}
-void ClientRequest::complete_request()
+void ClientRequest::complete_request(PG &pg)
{
track_event<CompletionEvent>();
+ pg.client_request_orderer.remove_request(*this);
on_complete.set_value();
}
@@ -142,7 +145,6 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
// parts would end up in the same PG so that they could be clone_range'd into
// the same object via librados, but that's not how multipart upload works
// anymore and we no longer support clone_range via librados.
- get_handle().exit();
co_await reply_op_error(pgref, -ENOTSUP);
co_return;
}
@@ -152,8 +154,6 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
std::ref(get_foreign_connection()), m->get_map_epoch()
));
DEBUGDPP("{}: discarding {}", *pgref, *this, this_instance_id);
- pgref->client_request_orderer.remove_request(*this);
- complete_request();
co_return;
}
DEBUGDPP("{}.{}: entering await_map stage",
@@ -190,15 +190,25 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
DEBUGDPP("{}.{}: dropping misdirected op",
pg, *this, this_instance_id);
co_return;
- } else if (const hobject_t& hoid = m->get_hobj();
- !pg.get_peering_state().can_serve_replica_read(hoid)) {
+ }
+
+ pg.get_perf_logger().inc(l_osd_replica_read);
+ if (pg.is_unreadable_object(m->get_hobj())) {
+ DEBUGDPP("{}.{}: {} missing on replica, bouncing to primary",
+ pg, *this, this_instance_id, m->get_hobj());
+ pg.get_perf_logger().inc(l_osd_replica_read_redirect_missing);
+ co_await reply_op_error(pgref, -EAGAIN);
+ co_return;
+ } else if (!pg.get_peering_state().can_serve_replica_read(m->get_hobj())) {
DEBUGDPP("{}.{}: unstable write on replica, bouncing to primary",
pg, *this, this_instance_id);
+ pg.get_perf_logger().inc(l_osd_replica_read_redirect_conflict);
co_await reply_op_error(pgref, -EAGAIN);
co_return;
} else {
DEBUGDPP("{}.{}: serving replica read on oid {}",
pg, *this, this_instance_id, m->get_hobj());
+ pg.get_perf_logger().inc(l_osd_replica_read_served);
}
}
@@ -232,8 +242,6 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
DEBUGDPP("{}.{}: process[_pg]_op complete,"
"removing request from orderer",
*pgref, *this, this_instance_id);
- pgref->client_request_orderer.remove_request(*this);
- complete_request();
}
seastar::future<> ClientRequest::with_pg_process(
@@ -250,7 +258,11 @@ seastar::future<> ClientRequest::with_pg_process(
auto &ihref = *instance_handle;
return interruptor::with_interruption(
[this, pgref, this_instance_id, &ihref]() mutable {
- return with_pg_process_interruptible(pgref, this_instance_id, ihref);
+ return with_pg_process_interruptible(
+ pgref, this_instance_id, ihref
+ ).then_interruptible([this, pgref] {
+ complete_request(*pgref);
+ });
}, [FNAME, this, this_instance_id, pgref](std::exception_ptr eptr) {
DEBUGDPP("{}.{}: interrupted due to {}",
*pgref, *this, this_instance_id, eptr);
@@ -290,29 +302,41 @@ ClientRequest::process_pg_op(
ClientRequest::interruptible_future<>
ClientRequest::recover_missing_snaps(
Ref<PG> pg,
- instance_handle_t &ihref,
- ObjectContextRef head,
std::set<snapid_t> &snaps)
{
LOG_PREFIX(ClientRequest::recover_missing_snaps);
- for (auto &snap : snaps) {
- auto coid = head->obs.oi.soid;
- coid.snap = snap;
- auto oid = resolve_oid(head->get_head_ss(), coid);
- /* Rollback targets may legitimately not exist if, for instance,
- * the object is an rbd block which happened to be sparse and
- * therefore non-existent at the time of the specified snapshot.
- * In such a case, rollback will simply delete the object. Here,
- * we skip the oid as there is no corresponding clone to recover.
- * See https://tracker.ceph.com/issues/63821 */
- if (oid) {
- auto unfound = co_await do_recover_missing(pg, *oid, m->get_reqid());
- if (unfound) {
- DEBUGDPP("{} unfound, hang it for now", *pg, *oid);
- co_await interruptor::make_interruptible(
- pg->get_recovery_backend()->add_unfound(*oid));
+
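+ // resolve the clone oids under the head obc read lock first; the
+ // recovery waits below run after that obc has been released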
+ std::vector<hobject_t> ret;
+ auto resolve_oids = pg->obc_loader.with_obc<RWState::RWREAD>(
+ m->get_hobj().get_head(),
+ [&snaps, &ret](auto head, auto) {
+ for (auto &snap : snaps) {
+ auto coid = head->obs.oi.soid;
+ coid.snap = snap;
+ auto oid = resolve_oid(head->get_head_ss(), coid);
+ /* Rollback targets may legitimately not exist if, for instance,
+ * the object is an rbd block which happened to be sparse and
+ * therefore non-existent at the time of the specified snapshot.
+ * In such a case, rollback will simply delete the object. Here,
+ * we skip the oid as there is no corresponding clone to recover.
+ * See https://tracker.ceph.com/issues/63821 */
+ if (oid) {
+ ret.emplace_back(std::move(*oid));
}
}
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+ co_await std::move(resolve_oids);
+
+ for (auto &oid : ret) {
+ auto unfound = co_await do_recover_missing(pg, oid, m->get_reqid());
+ if (unfound) {
+ DEBUGDPP("{} unfound, hang it for now", *pg, oid);
+ co_await interruptor::make_interruptible(
+ pg->get_recovery_backend()->add_unfound(oid));
+ }
}
}
@@ -337,15 +361,7 @@ ClientRequest::process_op(
std::set<snapid_t> snaps = snaps_need_to_recover();
if (!snaps.empty()) {
- auto with_obc = pg->obc_loader.with_obc<RWState::RWREAD>(
- m->get_hobj().get_head(),
- [&snaps, &ihref, pg, this](auto head, auto) {
- return recover_missing_snaps(pg, ihref, head, snaps);
- }).handle_error_interruptible(
- crimson::ct_error::assert_all("unexpected error")
- );
- // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401
- co_await std::move(with_obc);
+ co_await recover_missing_snaps(pg, snaps);
}
}
@@ -385,51 +401,40 @@ ClientRequest::process_op(
DEBUGDPP("{}.{}: past scrub blocker, getting obc",
*pg, *this, this_instance_id);
- // call with_locked_obc() in order, but wait concurrently for loading.
+
+ auto obc_manager = pg->obc_loader.get_obc_manager(m->get_hobj());
+
+ // initiate load_and_lock in order, but wait concurrently
ihref.enter_stage_sync(
client_pp(*pg).lock_obc, *this);
- auto process = pg->with_locked_obc(
- m->get_hobj(), op_info,
- [FNAME, this, pg, this_instance_id, &ihref] (
- auto head, auto obc
- ) -> interruptible_future<> {
- DEBUGDPP("{}.{}: got obc {}, entering process stage",
- *pg, *this, this_instance_id, obc->obs);
- return ihref.enter_stage<interruptor>(
- client_pp(*pg).process, *this
- ).then_interruptible(
- [FNAME, this, pg, this_instance_id, obc, &ihref]() mutable {
- DEBUGDPP("{}.{}: in process stage, calling do_process",
- *pg, *this, this_instance_id);
- return do_process(
- ihref, pg, obc, this_instance_id
- );
- }
- );
- }).handle_error_interruptible(
- PG::load_obc_ertr::all_same_way(
- [FNAME, this, pg=std::move(pg), this_instance_id](
- const auto &code
- ) -> interruptible_future<> {
- DEBUGDPP("{}.{}: saw error code {}",
- *pg, *this, this_instance_id, code);
- assert(code.value() > 0);
- return reply_op_error(pg, -code.value());
- })
- );
- /* The following works around gcc bug
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401.
- * The specific symptom I observed is the pg param being
- * destructed multiple times resulting in the refcount going
- * rapidly to 0 destoying the PG prematurely.
- *
- * This bug seems to be resolved in gcc 13.2.1.
- *
- * Assigning the intermediate result and moving it into the co_await
- * expression bypasses both bugs.
- */
- co_await std::move(process);
+ int load_err = co_await pg->obc_loader.load_and_lock(
+ obc_manager, pg->get_lock_type(op_info)
+ ).si_then([]() -> int {
+ return 0;
+ }).handle_error_interruptible(
+ PG::load_obc_ertr::all_same_way(
+ [](const auto &code) -> int {
+ return -code.value();
+ })
+ );
+ if (load_err) {
+ DEBUGDPP("{}.{}: saw error code loading obc {}",
+ *pg, *this, this_instance_id, load_err);
+ co_await reply_op_error(pg, load_err);
+ co_return;
+ }
+
+ DEBUGDPP("{}.{}: got obc {}, entering process stage",
+ *pg, *this, this_instance_id, obc_manager.get_obc()->obs);
+ co_await ihref.enter_stage<interruptor>(
+ client_pp(*pg).process, *this);
+
+ DEBUGDPP("{}.{}: in process stage, calling do_process",
+ *pg, *this, this_instance_id);
+ co_await do_process(
+ ihref, pg, obc_manager.get_obc(), this_instance_id
+ );
}
ClientRequest::interruptible_future<>
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
index 6ee57e9874c..6d1043e2783 100644
--- a/src/crimson/osd/osd_operations/client_request.h
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -210,7 +210,7 @@ public:
void requeue(Ref<PG> pg);
void clear_and_cancel(PG &pg);
};
- void complete_request();
+ void complete_request(PG &pg);
static constexpr OperationTypeCode type = OperationTypeCode::client_request;
@@ -285,8 +285,6 @@ private:
interruptible_future<>
recover_missing_snaps(
Ref<PG> pg,
- instance_handle_t &ihref,
- ObjectContextRef head,
std::set<snapid_t> &snaps);
::crimson::interruptible::interruptible_future<
::crimson::osd::IOInterruptCondition> process_op(
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
index 9e5867caf80..4790025065a 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.cc
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -4,6 +4,7 @@
#include <seastar/core/future.hh>
#include "crimson/osd/osd_operations/internal_client_request.h"
+#include "osd/object_state_fmt.h"
namespace {
seastar::logger& logger() {
@@ -81,14 +82,7 @@ InternalClientRequest::interruptible_future<>
InternalClientRequest::with_interruption()
{
LOG_PREFIX(InternalClientRequest::with_interruption);
- co_await enter_stage<interruptor>(
- client_pp().wait_for_active
- );
-
- co_await with_blocking_event<PGActivationBlocker::BlockingEvent,
- interruptor>([this] (auto&& trigger) {
- return pg->wait_for_active_blocker.wait(std::move(trigger));
- });
+ assert(pg->is_active());
co_await enter_stage<interruptor>(client_pp().recover_missing);
@@ -112,21 +106,25 @@ InternalClientRequest::with_interruption()
[[maybe_unused]] const int ret = op_info.set_from_op(
std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
assert(ret == 0);
- // call with_locked_obc() in order, but wait concurrently for loading.
+
+ auto obc_manager = pg->obc_loader.get_obc_manager(get_target_oid());
+
+ // initiate load_and_lock in order, but wait concurrently
enter_stage_sync(client_pp().lock_obc);
- auto fut = pg->with_locked_obc(
- get_target_oid(), op_info,
- [&osd_ops, this](auto, auto obc) {
- return enter_stage<interruptor>(client_pp().process
- ).then_interruptible(
- [obc=std::move(obc), &osd_ops, this]() mutable {
- return do_process(std::move(obc), osd_ops);
- });
- }).handle_error_interruptible(
- crimson::ct_error::assert_all("unexpected error")
- );
- co_await std::move(fut);
+ co_await pg->obc_loader.load_and_lock(
+ obc_manager, pg->get_lock_type(op_info)
+ ).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+
+ DEBUGDPP("{}: got obc {}, entering process stage",
+ *pg, *this, obc_manager.get_obc()->obs);
+ co_await enter_stage<interruptor>(client_pp().process);
+
+ DEBUGDPP("{}: in process stage, calling do_process",
+ *pg, *this);
+ co_await do_process(obc_manager.get_obc(), osd_ops);
logger().debug("{}: complete", *this);
co_await interruptor::make_interruptible(handle.complete());
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
index 102cb7fff6b..14202582100 100644
--- a/src/crimson/osd/osd_operations/osdop_params.h
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -12,7 +12,7 @@ struct osd_op_params_t {
utime_t mtime;
eversion_t at_version;
eversion_t pg_trim_to;
- eversion_t min_last_complete_ondisk;
+ eversion_t pg_committed_to;
eversion_t last_complete;
bool user_modify = false;
ObjectCleanRegions clean_regions;
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
index a8d9fce69b6..fb5696b0a9e 100644
--- a/src/crimson/osd/osd_operations/peering_event.cc
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -166,7 +166,8 @@ void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services)
ctx.send_notify(q.from.osd, {q.query.from, q.query.to,
q.query.epoch_sent,
map_epoch, empty,
- PastIntervals{}});
+ PastIntervals{},
+ PG_FEATURE_CRIMSON_ALL});
}
}
}
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
index 9ed0b73cfb4..8cab6125682 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.cc
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -399,13 +399,20 @@ SnapTrimObjSubEvent::start()
client_pp().check_already_complete_get_obc);
logger().debug("{}: getting obc for {}", *this, coid);
- // end of commonality
- // lock both clone's and head's obcs
- co_await pg->obc_loader.with_obc<RWState::RWWRITE>(
- coid,
- std::bind(&SnapTrimObjSubEvent::process_and_submit,
- this, std::placeholders::_1, std::placeholders::_2),
- false
+
+
+ auto obc_manager = pg->obc_loader.get_obc_manager(
+ coid, false /* resolve_clone */);
+
+ co_await pg->obc_loader.load_and_lock(
+ obc_manager, RWState::RWWRITE
+ ).handle_error_interruptible(
+ remove_or_update_iertr::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"}
+ );
+
+ co_await process_and_submit(
+ obc_manager.get_head_obc(), obc_manager.get_obc()
).handle_error_interruptible(
remove_or_update_iertr::pass_further{},
crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"}
@@ -428,6 +435,7 @@ SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc,
auto [submitted, all_completed] = co_await pg->submit_transaction(
std::move(clone_obc),
+ nullptr,
std::move(txn),
std::move(osd_op_p),
std::move(log_entries)
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index 744a1dbc02b..1e2988efbbe 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -132,6 +132,7 @@ PG::PG(
pool,
name),
osdmap,
+ PG_FEATURE_CRIMSON_ALL,
this,
this),
scrubber(*this),
@@ -392,7 +393,13 @@ void PG::on_replica_activate()
void PG::on_activate_complete()
{
- wait_for_active_blocker.unblock();
+ /* Confusingly, on_activate_complete is invoked when the primary and replicas
+ * have recorded the current interval. At that point, the PG may either become
+ * ACTIVE or PEERED, depending on whether the acting set is eligible for client
+ * IO. Only unblock wait_for_active_blocker if we actually became ACTIVE */
+ if (peering_state.is_active()) {
+ wait_for_active_blocker.unblock();
+ }
if (peering_state.needs_recovery()) {
logger().info("{}: requesting recovery",
@@ -900,11 +907,23 @@ void PG::mutate_object(
}
}
+void PG::enqueue_push_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
+{
+ assert(recovery_handler);
+ assert(recovery_handler->backfill_state);
+ auto backfill_state = recovery_handler->backfill_state.get();
+ backfill_state->enqueue_standalone_push(obj, v, peers);
+}
+
PG::interruptible_future<
std::tuple<PG::interruptible_future<>,
PG::interruptible_future<>>>
PG::submit_transaction(
ObjectContextRef&& obc,
+ ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
std::vector<pg_log_entry_t>&& log_entries)
@@ -917,17 +936,23 @@ PG::submit_transaction(
}
epoch_t map_epoch = get_osdmap_epoch();
+ auto at_version = osd_op_p.at_version;
- peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version);
+ peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, at_version);
peering_state.update_trim_to();
ceph_assert(!log_entries.empty());
ceph_assert(log_entries.rbegin()->version >= projected_last_update);
projected_last_update = log_entries.rbegin()->version;
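+ // mirror the entries into the projected log so in-progress ops are
+ // visible to dup detection (check_in_progress_op) and backfill scanning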
+ for (const auto& entry: log_entries) {
+ projected_log.add(entry);
+ }
+
auto [submitted, all_completed] = co_await backend->submit_transaction(
peering_state.get_acting_recovery_backfill(),
obc->obs.oi.soid,
+ std::move(new_clone),
std::move(txn),
std::move(osd_op_p),
peering_state.get_last_peering_reset(),
@@ -936,8 +961,8 @@ PG::submit_transaction(
co_return std::make_tuple(
std::move(submitted),
all_completed.then_interruptible(
- [this, last_complete=peering_state.get_info().last_complete,
- at_version=osd_op_p.at_version](auto acked) {
+ [this, last_complete=peering_state.get_info().last_complete, at_version]
+ (auto acked) {
for (const auto& peer : acked) {
peering_state.update_peer_last_complete_ondisk(
peer.shard, peer.last_complete_ondisk);
@@ -1034,7 +1059,7 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
ceph::os::Transaction t;
peering_state.merge_new_log_entries(
log_entries, t, peering_state.get_pg_trim_to(),
- peering_state.get_min_last_complete_ondisk());
+ peering_state.get_pg_committed_to());
return seastar::do_with(log_entries, set<pg_shard_t>{},
[this, t=std::move(t), rep_tid](auto& log_entries, auto& waiting_on) mutable {
@@ -1055,7 +1080,7 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
get_last_peering_reset(),
rep_tid,
peering_state.get_pg_trim_to(),
- peering_state.get_min_last_complete_ondisk());
+ peering_state.get_pg_committed_to());
waiting_on.insert(peer);
logger().debug("submit_error_log: sending log"
"missing_request (rep_tid: {} entries: {})"
@@ -1142,11 +1167,13 @@ PG::submit_executer_fut PG::submit_executer(
[FNAME, this](auto&& txn,
auto&& obc,
auto&& osd_op_p,
- auto&& log_entries) {
+ auto&& log_entries,
+ auto&& new_clone) {
DEBUGDPP("object {} submitting txn", *this, obc->get_oid());
mutate_object(obc, txn, osd_op_p);
return submit_transaction(
std::move(obc),
+ std::move(new_clone),
std::move(txn),
std::move(osd_op_p),
std::move(log_entries));
@@ -1215,31 +1242,6 @@ void PG::check_blocklisted_obc_watchers(
}
}
-PG::load_obc_iertr::future<>
-PG::with_locked_obc(const hobject_t &hobj,
- const OpInfo &op_info,
- with_obc_func_t &&f)
-{
- if (__builtin_expect(stopping, false)) {
- throw crimson::common::system_shutdown_exception();
- }
- const hobject_t oid = get_oid(hobj);
- auto wrapper = [f=std::move(f), this](auto head, auto obc) {
- check_blocklisted_obc_watchers(obc);
- return f(head, obc);
- };
- switch (get_lock_type(op_info)) {
- case RWState::RWREAD:
- return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper));
- case RWState::RWWRITE:
- return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper));
- case RWState::RWEXCL:
- return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper));
- default:
- ceph_abort();
- };
-}
-
void PG::update_stats(const pg_stat_t &stat) {
peering_state.update_stats(
[&stat] (auto& history, auto& stats) {
@@ -1272,7 +1274,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
log_operation(std::move(log_entries),
req->pg_trim_to,
req->version,
- req->min_last_complete_ondisk,
+ req->pg_committed_to,
!txn.empty(),
txn,
false);
@@ -1318,27 +1320,23 @@ void PG::log_operation(
std::vector<pg_log_entry_t>&& logv,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &txn,
bool async) {
logger().debug("{}", __func__);
if (is_primary()) {
- ceph_assert(trim_to <= peering_state.get_last_update_ondisk());
+ ceph_assert(trim_to <= peering_state.get_pg_committed_to());
}
- /* TODO: when we add snap mapper and projected log support,
- * we'll likely want to update them here.
- *
- * See src/osd/PrimaryLogPG.h:log_operation for how classic
- * handles these cases.
- */
-#if 0
auto last = logv.rbegin();
if (is_primary() && last != logv.rend()) {
+ logger().debug("{} on primary, trimming projected log",
+ __func__);
projected_log.skip_can_rollback_to_to_head();
- projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ projected_log.trim(shard_services.get_cct(), last->version,
+ nullptr, nullptr, nullptr);
}
-#endif
+
if (!is_primary()) { // && !is_ec_pg()
replica_clear_repop_obc(logv);
}
@@ -1348,7 +1346,7 @@ void PG::log_operation(
peering_state.append_log(std::move(logv),
trim_to,
roll_forward_to,
- min_last_complete_ondisk,
+ pg_committed_to,
txn,
!txn.empty(),
false);
@@ -1387,17 +1385,17 @@ PG::interruptible_future<> PG::do_update_log_missing(
ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
ObjectStore::Transaction t;
- std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ std::optional<eversion_t> op_trim_to, op_pg_committed_to;
if (m->pg_trim_to != eversion_t())
op_trim_to = m->pg_trim_to;
- if (m->pg_roll_forward_to != eversion_t())
- op_roll_forward_to = m->pg_roll_forward_to;
- logger().debug("op_trim_to = {}, op_roll_forward_to = {}",
+ if (m->pg_committed_to != eversion_t())
+ op_pg_committed_to = m->pg_committed_to;
+ logger().debug("op_trim_to = {}, op_pg_committed_to = {}",
op_trim_to.has_value() ? *op_trim_to : eversion_t(),
- op_roll_forward_to.has_value() ? *op_roll_forward_to : eversion_t());
+ op_pg_committed_to.has_value() ? *op_pg_committed_to : eversion_t());
peering_state.append_log_entries_update_missing(
- m->entries, t, op_trim_to, op_roll_forward_to);
+ m->entries, t, op_trim_to, op_pg_committed_to);
return interruptor::make_interruptible(shard_services.get_store().do_transaction(
coll_ref, std::move(t))).then_interruptible(
@@ -1615,14 +1613,21 @@ bool PG::should_send_op(
return true;
bool should_send =
(hoid.pool != (int64_t)get_info().pgid.pool() ||
- (has_backfill_state() && hoid <= get_last_backfill_started()) ||
- hoid <= peering_state.get_peer_info(peer).last_backfill);
+ // An object has been fully pushed to the backfill target if and only if
+ // either of the following conditions is met:
+ // 1. peer_info.last_backfill has passed "hoid"
+ // 2. last_backfill_started has passed "hoid" and "hoid" is not in the peer
+ // missing set
+ hoid <= peering_state.get_peer_info(peer).last_backfill ||
+ (has_backfill_state() && hoid <= get_last_backfill_started() &&
+ !is_missing_on_peer(peer, hoid)));
if (!should_send) {
ceph_assert(is_backfill_target(peer));
logger().debug("{} issue_repop shipping empty opt to osd."
"{}, object {} beyond std::max(last_backfill_started, "
"peer_info[peer].last_backfill {})",
- peer, hoid, peering_state.get_peer_info(peer).last_backfill);
+ __func__, peer, hoid,
+ peering_state.get_peer_info(peer).last_backfill);
}
return should_send;
// TODO: should consider async recovery cases in the future which are not supported
@@ -1637,8 +1642,8 @@ PG::already_complete(const osd_reqid_t& reqid)
int ret;
std::vector<pg_log_op_return_item_t> op_returns;
- if (peering_state.get_pg_log().get_log().get_request(
- reqid, &version, &user_version, &ret, &op_returns)) {
+ if (check_in_progress_op(
+ reqid, &version, &user_version, &ret, &op_returns)) {
complete_op_t dupinfo{
user_version,
version,
@@ -1703,4 +1708,19 @@ void PG::C_PG_FinishRecovery::finish(int r) {
DEBUGDPP("stale recovery finsher", pg);
}
}
+bool PG::check_in_progress_op(
+ const osd_reqid_t& reqid,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns
+ ) const
+{
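+ // an in-progress request may only be recorded in the projected log, so
+ // consult it before falling back to the pg log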
+ return (
+ projected_log.get_request(reqid, version, user_version, return_code,
+ op_returns) ||
+ peering_state.get_pg_log().get_log().get_request(
+ reqid, version, user_version, return_code, op_returns));
+}
+
}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index 604f49005ff..15aeec0e4f3 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -45,6 +45,7 @@
class MQuery;
class OSDMap;
class PGBackend;
+class ReplicatedBackend;
class PGPeeringEvent;
class osd_op_params_t;
@@ -129,8 +130,8 @@ public:
return peering_state.get_pg_trim_to();
}
- eversion_t get_min_last_complete_ondisk() const {
- return peering_state.get_min_last_complete_ondisk();
+ eversion_t get_pg_committed_to() const {
+ return peering_state.get_pg_committed_to();
}
const pg_info_t& get_info() const final {
@@ -376,6 +377,7 @@ public:
void check_blocklisted_watchers() final;
void clear_primary_state() final {
recovery_finisher = nullptr;
+ projected_log = PGLog::IndexedLog();
}
void queue_check_readable(epoch_t last_peering_reset,
@@ -517,6 +519,9 @@ public:
// Utility
+ bool is_active() const {
+ return peering_state.is_active();
+ }
bool is_active_clean() const {
return peering_state.is_active() && peering_state.is_clean();
}
@@ -589,11 +594,6 @@ public:
using with_obc_func_t =
std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>;
- load_obc_iertr::future<> with_locked_obc(
- const hobject_t &hobj,
- const OpInfo &op_info,
- with_obc_func_t&& f);
-
interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m);
void update_stats(const pg_stat_t &stat);
interruptible_future<> update_snap_map(
@@ -603,7 +603,7 @@ public:
std::vector<pg_log_entry_t>&& logv,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &txn,
bool async = false);
@@ -679,6 +679,7 @@ private:
std::tuple<interruptible_future<>, interruptible_future<>>>
submit_transaction(
ObjectContextRef&& obc,
+ ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& oop,
std::vector<pg_log_entry_t>&& log_entries);
@@ -826,8 +827,15 @@ public:
const eversion_t version;
const int err;
};
+ PGLog::IndexedLog projected_log;
interruptible_future<std::optional<complete_op_t>>
already_complete(const osd_reqid_t& reqid);
+ bool check_in_progress_op(
+ const osd_reqid_t& reqid,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const;
int get_recovery_op_priority() const {
int64_t pri = 0;
get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
@@ -879,6 +887,10 @@ private:
friend class SnapTrimObjSubEvent;
private:
+ void enqueue_push_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
void mutate_object(
ObjectContextRef& obc,
ceph::os::Transaction& txn,
@@ -887,7 +899,7 @@ private:
bool can_discard_op(const MOSDOp& m) const;
void context_registry_on_change();
bool is_missing_object(const hobject_t& soid) const {
- return peering_state.get_pg_log().get_missing().get_items().count(soid);
+ return get_local_missing().is_missing(soid);
}
bool is_unreadable_object(const hobject_t &oid,
eversion_t* v = 0) const final {
@@ -895,6 +907,11 @@ private:
!peering_state.get_missing_loc().readable_with_acting(
oid, get_actingset(), v);
}
+ bool is_missing_on_peer(
+ const pg_shard_t &peer,
+ const hobject_t &soid) const {
+ return peering_state.get_peer_missing(peer).is_missing(soid);
+ }
bool is_degraded_or_backfilling_object(const hobject_t& soid) const;
const std::set<pg_shard_t> &get_actingset() const {
return peering_state.get_actingset();
@@ -902,6 +919,7 @@ private:
private:
friend class IOInterruptCondition;
+ friend class ::ReplicatedBackend;
struct log_update_t {
std::set<pg_shard_t> waiting_on;
seastar::shared_promise<> all_committed;
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
index fa1f1405ffe..813218983fd 100644
--- a/src/crimson/osd/pg_backend.h
+++ b/src/crimson/osd/pg_backend.h
@@ -414,6 +414,7 @@ public:
virtual rep_op_fut_t
submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index 705b3176b97..657e6d3e888 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -45,6 +45,10 @@ public:
seastar::future<> stop() { return seastar::now(); }
void on_pg_clean();
+ void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) final;
private:
PGRecoveryListener* pg;
size_t start_primary_recovery_ops(
@@ -108,10 +112,6 @@ private:
const hobject_t& end) final;
void request_primary_scan(
const hobject_t& begin) final;
- void enqueue_push(
- const hobject_t& obj,
- const eversion_t& v,
- const std::vector<pg_shard_t> &peers) final;
void enqueue_drop(
const pg_shard_t& target,
const hobject_t& obj,
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
index cbb8c883e07..f09cd147ea9 100644
--- a/src/crimson/osd/replicated_backend.cc
+++ b/src/crimson/osd/replicated_backend.cc
@@ -36,19 +36,59 @@ ReplicatedBackend::_read(const hobject_t& hoid,
return store->read(coll, ghobject_t{hoid}, off, len, flags);
}
+MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg(
+ const pg_shard_t &pg_shard,
+ const hobject_t &hoid,
+ const bufferlist &encoded_txn,
+ const osd_op_params_t &osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ const std::vector<pg_log_entry_t> &log_entries,
+ bool send_op,
+ ceph_tid_t tid)
+{
+ ceph_assert(pg_shard != whoami);
+ auto m = crimson::make_message<MOSDRepOp>(
+ osd_op_p.req_id,
+ whoami,
+ spg_t{pgid, pg_shard.shard},
+ hoid,
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ map_epoch,
+ min_epoch,
+ tid,
+ osd_op_p.at_version);
+ if (send_op) {
+ m->set_data(encoded_txn);
+ } else {
+ ceph::os::Transaction t;
+ bufferlist bl;
+ encode(t, bl);
+ m->set_data(bl);
+ }
+ encode(log_entries, m->logbl);
+ m->pg_trim_to = osd_op_p.pg_trim_to;
+ m->pg_committed_to = osd_op_p.pg_committed_to;
+ m->pg_stats = pg.get_info().stats;
+ return m;
+}
+
ReplicatedBackend::rep_op_fut_t
-ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
- const hobject_t& hoid,
- ceph::os::Transaction&& t,
- osd_op_params_t&& opp,
- epoch_t min_epoch, epoch_t map_epoch,
- std::vector<pg_log_entry_t>&& logv)
+ReplicatedBackend::submit_transaction(
+ const std::set<pg_shard_t> &pg_shards,
+ const hobject_t& hoid,
+ crimson::osd::ObjectContextRef &&new_clone,
+ ceph::os::Transaction&& t,
+ osd_op_params_t&& opp,
+ epoch_t min_epoch, epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& logv)
{
LOG_PREFIX(ReplicatedBackend::submit_transaction);
DEBUGDPP("object {}", dpp, hoid);
auto log_entries = std::move(logv);
auto txn = std::move(t);
auto osd_op_p = std::move(opp);
+ auto _new_clone = std::move(new_clone);
const ceph_tid_t tid = shard_services.get_tid();
auto pending_txn =
@@ -60,37 +100,34 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
le.mark_unrollbackable();
}
+ std::vector<pg_shard_t> to_push_clone;
auto sends = std::make_unique<std::vector<seastar::future<>>>();
- for (auto pg_shard : pg_shards) {
- if (pg_shard != whoami) {
- auto m = crimson::make_message<MOSDRepOp>(
- osd_op_p.req_id,
- whoami,
- spg_t{pgid, pg_shard.shard},
- hoid,
- CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
- map_epoch,
- min_epoch,
- tid,
- osd_op_p.at_version);
- if (pg.should_send_op(pg_shard, hoid)) {
- m->set_data(encoded_txn);
- } else {
- ceph::os::Transaction t;
- bufferlist bl;
- encode(t, bl);
- m->set_data(bl);
+ for (auto &pg_shard : pg_shards) {
+ if (pg_shard == whoami) {
+ continue;
+ }
+ MURef<MOSDRepOp> m;
+ if (pg.should_send_op(pg_shard, hoid)) {
+ m = new_repop_msg(
+ pg_shard, hoid, encoded_txn, osd_op_p,
+ min_epoch, map_epoch, log_entries, true, tid);
+ } else {
+ m = new_repop_msg(
+ pg_shard, hoid, encoded_txn, osd_op_p,
+ min_epoch, map_epoch, log_entries, false, tid);
+ if (_new_clone && pg.is_missing_on_peer(pg_shard, hoid)) {
+ // The head is in the push queue but hasn't been pushed yet.
+ // We need to ensure that the newly created clone will be
+ // pushed as well, otherwise we might skip it.
+ // See: https://tracker.ceph.com/issues/68808
+ to_push_clone.push_back(pg_shard);
}
- pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
- encode(log_entries, m->logbl);
- m->pg_trim_to = osd_op_p.pg_trim_to;
- m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
- m->pg_stats = pg.get_info().stats;
- // TODO: set more stuff. e.g., pg_states
- sends->emplace_back(
- shard_services.send_to_osd(
- pg_shard.osd, std::move(m), map_epoch));
}
+ pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
+ // TODO: set more stuff. e.g., pg_states
+ sends->emplace_back(
+ shard_services.send_to_osd(
+ pg_shard.osd, std::move(m), map_epoch));
}
co_await pg.update_snap_map(log_entries, txn);
@@ -99,7 +136,7 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
std::move(log_entries),
osd_op_p.pg_trim_to,
osd_op_p.at_version,
- osd_op_p.min_last_complete_ondisk,
+ osd_op_p.pg_committed_to,
true,
txn,
false);
@@ -120,9 +157,16 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
return seastar::now();
}
return peers->all_committed.get_shared_future();
- }).then_interruptible([pending_txn, this] {
+ }).then_interruptible([pending_txn, this, _new_clone,
+ to_push_clone=std::move(to_push_clone)] {
auto acked_peers = std::move(pending_txn->second.acked_peers);
pending_trans.erase(pending_txn);
+ if (_new_clone && !to_push_clone.empty()) {
+ pg.enqueue_push_for_backfill(
+ _new_clone->obs.oi.soid,
+ _new_clone->obs.oi.version,
+ to_push_clone);
+ }
return seastar::make_ready_future<
crimson::osd::acked_peers_t>(std::move(acked_peers));
});
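
Reviewer note: the hunk above defers a push for a freshly created clone when a replica skipped the op payload and is still missing the head object; only after all replicas have committed is the clone queued for backfill. Below is a standalone sketch of just that bookkeeping, using plain standard-library types instead of Crimson's ObjectContextRef/pg_shard_t (all names in the sketch are illustrative, not Ceph API):

#include <iostream>
#include <optional>
#include <set>
#include <string>
#include <vector>

// Simplified stand-ins for the PG-backend types used in the hunk above.
using Shard = int;
struct Clone { std::string oid; unsigned version; };

// Decide, per replica, whether the new clone must be queued for a later
// backfill push: only replicas that did not receive the op payload and are
// still missing the head need the clone pushed explicitly.
std::vector<Shard> clones_to_push(const std::set<Shard>& shards,
                                  Shard whoami,
                                  const std::optional<Clone>& new_clone,
                                  const std::set<Shard>& missing_head,
                                  const std::set<Shard>& send_op_to) {
  std::vector<Shard> out;
  for (Shard s : shards) {
    if (s == whoami) continue;                 // never push to ourselves
    if (send_op_to.count(s)) continue;         // replica applies the op itself
    if (new_clone && missing_head.count(s))    // head not pushed yet: queue clone too
      out.push_back(s);
  }
  return out;
}

int main() {
  std::set<Shard> shards{0, 1, 2};
  std::optional<Clone> clone = Clone{"rbd_data.1234:snap1", 42};
  auto peers = clones_to_push(shards, /*whoami=*/0, clone,
                              /*missing_head=*/{2}, /*send_op_to=*/{1});
  for (Shard s : peers)
    std::cout << "enqueue clone push for shard " << s << "\n";  // prints shard 2
}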
diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h
index fb8704d8742..d5844b23a0c 100644
--- a/src/crimson/osd/replicated_backend.h
+++ b/src/crimson/osd/replicated_backend.h
@@ -35,6 +35,7 @@ private:
rep_op_fut_t submit_transaction(
const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
@@ -60,6 +61,17 @@ private:
pending_transactions_t pending_trans;
crimson::osd::PG& pg;
+ MURef<MOSDRepOp> new_repop_msg(
+ const pg_shard_t &pg_shard,
+ const hobject_t &hoid,
+ const bufferlist &encoded_txn,
+ const osd_op_params_t &osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ const std::vector<pg_log_entry_t> &log_entries,
+ bool send_op,
+ ceph_tid_t tid);
+
seastar::future<> request_committed(
const osd_reqid_t& reqid, const eversion_t& at_version) final;
};
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index a053d9d5044..c2340898929 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -802,15 +802,19 @@ seastar::future<> ShardServices::dispatch_context_messages(
seastar::future<> ShardServices::dispatch_context(
crimson::os::CollectionRef col,
- PeeringCtx &&ctx)
-{
- ceph_assert(col || ctx.transaction.empty());
- return seastar::when_all_succeed(
- dispatch_context_messages(
- BufferedRecoveryMessages{ctx}),
- col ? dispatch_context_transaction(col, ctx) : seastar::now()
- ).then_unpack([] {
- return seastar::now();
+ PeeringCtx &&pctx)
+{
+ return seastar::do_with(
+ std::move(pctx),
+ [this, col](auto &ctx) {
+ ceph_assert(col || ctx.transaction.empty());
+ return seastar::when_all_succeed(
+ dispatch_context_messages(
+ BufferedRecoveryMessages{ctx}),
+ col ? dispatch_context_transaction(col, ctx) : seastar::now()
+ ).then_unpack([] {
+ return seastar::now();
+ });
});
}
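
Reviewer note: the rewrite above moves the rvalue PeeringCtx into seastar::do_with so it stays alive until both concurrently dispatched futures complete, instead of referencing the caller's temporary. A rough standalone analogue of that lifetime pattern, using std::shared_ptr and std::async in place of Seastar (illustrative only, not the Crimson API):

#include <future>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Ctx { std::vector<std::string> messages; std::string transaction; };

// Keep the moved-in context alive until every async consumer has finished,
// analogous to wrapping the PeeringCtx in seastar::do_with above.
std::future<void> dispatch(Ctx&& c) {
  auto ctx = std::make_shared<Ctx>(std::move(c));   // owns the context for the whole operation
  auto send = std::async(std::launch::async, [ctx] {
    for (auto& m : ctx->messages) std::cout << "send " << m << "\n";
  });
  auto apply = std::async(std::launch::async, [ctx] {
    if (!ctx->transaction.empty()) std::cout << "apply " << ctx->transaction << "\n";
  });
  // Resolve once both sub-futures have completed.
  return std::async(std::launch::deferred,
                    [s = std::move(send), a = std::move(apply)]() mutable { s.get(); a.get(); });
}

int main() {
  dispatch(Ctx{{"pg 1.0 notify", "pg 1.1 info"}, "txn-1"}).get();
}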
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index d3387267871..b8149718724 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -307,7 +307,7 @@ static void handle_oneshot_fatal_signal(int signum)
char buf[1024];
char pthread_name[16] = {0}; //limited by 16B include terminating null byte.
- int r = ceph_pthread_getname(pthread_self(), pthread_name, sizeof(pthread_name));
+ int r = ceph_pthread_getname(pthread_name, sizeof(pthread_name));
(void)r;
#if defined(__sun)
char message[SIG2STR_MAX];
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 627f4a3e85b..137669c1963 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -1005,7 +1005,7 @@ extern const char *ceph_cap_op_name(int op);
/* extra info for cap import/export */
struct ceph_mds_cap_peer {
__le64 cap_id;
- __le32 seq;
+ __le32 issue_seq;
__le32 mseq;
__le32 mds;
__u8 flags;
@@ -1058,7 +1058,7 @@ struct ceph_mds_cap_release {
struct ceph_mds_cap_item {
__le64 ino;
__le64 cap_id;
- __le32 migrate_seq, seq;
+ __le32 migrate_seq, issue_seq;
} __attribute__ ((packed));
#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
diff --git a/src/include/compat.h b/src/include/compat.h
index 53285243d91..a7d10fc5425 100644
--- a/src/include/compat.h
+++ b/src/include/compat.h
@@ -179,60 +179,12 @@ struct cpu_set_t;
#define MSG_DONTWAIT MSG_NONBLOCK
#endif
-/* compiler warning free success noop */
-#define pthread_setname_noop_helper(thread, name) ({ \
- int __i = 0; \
- __i; })
-
-#define pthread_getname_noop_helper(thread, name, len) ({ \
- if (name != NULL) \
- *name = '\0'; \
- 0; })
-
#define pthread_kill_unsupported_helper(thread, signal) ({ \
int __i = -ENOTSUP; \
__i; })
#if defined(_WIN32) && defined(__clang__) && \
!defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
- // In this case, llvm doesn't use the pthread api for std::thread.
- // We cannot use native_handle() with the pthread api, nor can we pass
- // it to Windows API functions.
- #define ceph_pthread_setname pthread_setname_noop_helper
-#elif defined(HAVE_PTHREAD_SETNAME_NP)
- #if defined(__APPLE__)
- #define ceph_pthread_setname(thread, name) ({ \
- int __result = 0; \
- if (thread == pthread_self()) \
- __result = pthread_setname_np(name); \
- __result; })
- #else
- #define ceph_pthread_setname pthread_setname_np
- #endif
-#elif defined(HAVE_PTHREAD_SET_NAME_NP)
- /* Fix a small name diff and return 0 */
- #define ceph_pthread_setname(thread, name) ({ \
- pthread_set_name_np(thread, name); \
- 0; })
-#else
- #define ceph_pthread_setname pthread_setname_noop_helper
-#endif
-
-#if defined(_WIN32) && defined(__clang__) && \
- !defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
- #define ceph_pthread_getname pthread_getname_noop_helper
-#elif defined(HAVE_PTHREAD_GETNAME_NP)
- #define ceph_pthread_getname pthread_getname_np
-#elif defined(HAVE_PTHREAD_GET_NAME_NP)
- #define ceph_pthread_getname(thread, name, len) ({ \
- pthread_get_name_np(thread, name, len); \
- 0; })
-#else
- #define ceph_pthread_getname pthread_getname_noop_helper
-#endif
-
-#if defined(_WIN32) && defined(__clang__) && \
- !defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
#define ceph_pthread_kill pthread_kill_unsupported_helper
#else
#define ceph_pthread_kill pthread_kill
@@ -244,6 +196,9 @@ int ceph_posix_fallocate(int fd, off_t offset, off_t len);
extern "C" {
#endif
+int ceph_pthread_getname(char* name, size_t size);
+int ceph_pthread_setname(const char* name);
+
int pipe_cloexec(int pipefd[2], int flags);
char *ceph_strerror_r(int errnum, char *buf, size_t buflen);
unsigned get_page_size();
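
Reviewer note: the platform-dependent macros are gone; ceph_pthread_getname/ceph_pthread_setname are now ordinary functions that act on the calling thread, with the implementation living elsewhere in the tree. A hedged sketch of what a glibc/Linux-only implementation of the pair could look like (the real implementation must also cover platforms without the *_np calls):

#include <cstddef>
#include <cstring>
#include <pthread.h>

// Possible Linux-only implementation of the new current-thread helpers.
// This assumes glibc's pthread_setname_np/pthread_getname_np are available.
extern "C" int ceph_pthread_setname(const char* name) {
  // Linux limits thread names to 15 chars plus NUL; truncate defensively.
  char buf[16];
  std::strncpy(buf, name, sizeof(buf) - 1);
  buf[sizeof(buf) - 1] = '\0';
  return pthread_setname_np(pthread_self(), buf);
}

extern "C" int ceph_pthread_getname(char* name, size_t size) {
  if (name == nullptr || size == 0)
    return 0;
  return pthread_getname_np(pthread_self(), name, size);
}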
diff --git a/src/include/elist.h b/src/include/elist.h
index edfb7955494..e777873b045 100644
--- a/src/include/elist.h
+++ b/src/include/elist.h
@@ -15,6 +15,10 @@
#ifndef CEPH_ELIST_H
#define CEPH_ELIST_H
+#include <cstddef> // for size_t
+
+#include "include/ceph_assert.h"
+
/*
* elist: embedded list.
*
diff --git a/src/include/str_list.h b/src/include/str_list.h
index cad76c1d6f5..a4c7432c6ef 100644
--- a/src/include/str_list.h
+++ b/src/include/str_list.h
@@ -2,7 +2,6 @@
#define CEPH_STRLIST_H
#include <list>
-#include <set>
#include <string>
#include <string_view>
#include <vector>
diff --git a/src/include/uuid.h b/src/include/uuid.h
index f6ef9878dae..a5d63c37297 100644
--- a/src/include/uuid.h
+++ b/src/include/uuid.h
@@ -60,7 +60,11 @@ struct uuid_d {
}
const char *bytes() const {
+#if BOOST_VERSION >= 108600
+ return (const char*)uuid.data();
+#else
return (const char*)uuid.data;
+#endif
}
void encode(::ceph::buffer::list::contiguous_appender& p) const {
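
Reviewer note: Boost 1.86 replaced the public `data` array of boost::uuids::uuid with a `data()` accessor, hence the version guard above. A small standalone check of the same conditional (the macro value comes from boost/version.hpp):

#include <boost/uuid/uuid.hpp>
#include <boost/version.hpp>
#include <cstdio>

// Return the raw 16-byte representation regardless of Boost version,
// mirroring the uuid_d::bytes() guard above.
const char* uuid_bytes(const boost::uuids::uuid& u) {
#if BOOST_VERSION >= 108600
  return reinterpret_cast<const char*>(u.data());   // accessor since Boost 1.86
#else
  return reinterpret_cast<const char*>(u.data);     // public member before 1.86
#endif
}

int main() {
  boost::uuids::uuid u{};                            // nil UUID
  std::printf("first byte: %d\n", uuid_bytes(u)[0]);
}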
diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h
index 9cfb4482706..858742d511e 100644
--- a/src/kv/KeyValueDB.h
+++ b/src/kv/KeyValueDB.h
@@ -24,6 +24,12 @@ class KeyValueDB {
public:
class TransactionImpl {
public:
+ // number of ops included
+ virtual size_t get_count() const = 0;
+
+ // total encoded data size
+ virtual size_t get_size_bytes() const = 0;
+
/// Set Keys
void set(
const std::string &prefix, ///< [in] Prefix for keys, or CF name
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
index a8468a25d4d..477b209854c 100644
--- a/src/kv/RocksDBStore.h
+++ b/src/kv/RocksDBStore.h
@@ -299,6 +299,12 @@ public:
const std::string &k,
const ceph::bufferlist &to_set_bl);
public:
+ size_t get_count() const override {
+ return bat.Count();
+ }
+ size_t get_size_bytes() const override {
+ return bat.GetDataSize();
+ }
void set(
const std::string &prefix,
const std::string &k,
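
Reviewer note: the new get_count()/get_size_bytes() overrides surface the underlying rocksdb write-batch counters (bat.Count() and bat.GetDataSize()) through the KeyValueDB::TransactionImpl interface, so callers can instrument or size-gate a transaction before submitting it. A minimal caller-side sketch against the abstract interface; the policy check and its limits are hypothetical, not Ceph code:

#include <cstddef>
#include <iostream>

// Stand-in for KeyValueDB::TransactionImpl with only the two new hooks.
struct Transaction {
  virtual ~Transaction() = default;
  virtual size_t get_count() const = 0;        // number of ops batched so far
  virtual size_t get_size_bytes() const = 0;   // total encoded payload size
};

// Hypothetical policy a caller could build on top of the new hooks:
// warn (or split) when a batch grows beyond the configured thresholds.
bool within_limits(const Transaction& t, size_t max_ops, size_t max_bytes) {
  if (t.get_count() > max_ops || t.get_size_bytes() > max_bytes) {
    std::cerr << "large txn: " << t.get_count() << " ops, "
              << t.get_size_bytes() << " bytes\n";
    return false;
  }
  return true;
}

struct FakeTxn : Transaction {
  size_t get_count() const override { return 12; }
  size_t get_size_bytes() const override { return 4096; }
};

int main() {
  FakeTxn t;
  std::cout << (within_limits(t, 1000, 64 << 20) ? "ok" : "too big") << "\n";
}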
diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc
index f9bc3b8fd04..2167eeade3c 100644
--- a/src/librados/librados_cxx.cc
+++ b/src/librados/librados_cxx.cc
@@ -978,128 +978,127 @@ uint32_t librados::NObjectIterator::get_pg_hash_position() const
const librados::NObjectIterator librados::NObjectIterator::__EndObjectIterator(NULL);
///////////////////////////// PoolAsyncCompletion //////////////////////////////
-librados::PoolAsyncCompletion::PoolAsyncCompletion::~PoolAsyncCompletion()
+librados::PoolAsyncCompletion::~PoolAsyncCompletion()
{
auto c = reinterpret_cast<PoolAsyncCompletionImpl *>(pc);
c->release();
}
-int librados::PoolAsyncCompletion::PoolAsyncCompletion::set_callback(void *cb_arg,
- rados_callback_t cb)
+int librados::PoolAsyncCompletion::set_callback(void *cb_arg, rados_callback_t cb)
{
PoolAsyncCompletionImpl *c = (PoolAsyncCompletionImpl *)pc;
return c->set_callback(cb_arg, cb);
}
-int librados::PoolAsyncCompletion::PoolAsyncCompletion::wait()
+int librados::PoolAsyncCompletion::wait()
{
PoolAsyncCompletionImpl *c = (PoolAsyncCompletionImpl *)pc;
return c->wait();
}
-bool librados::PoolAsyncCompletion::PoolAsyncCompletion::is_complete()
+bool librados::PoolAsyncCompletion::is_complete()
{
PoolAsyncCompletionImpl *c = (PoolAsyncCompletionImpl *)pc;
return c->is_complete();
}
-int librados::PoolAsyncCompletion::PoolAsyncCompletion::get_return_value()
+int librados::PoolAsyncCompletion::get_return_value()
{
PoolAsyncCompletionImpl *c = (PoolAsyncCompletionImpl *)pc;
return c->get_return_value();
}
-void librados::PoolAsyncCompletion::PoolAsyncCompletion::release()
+void librados::PoolAsyncCompletion::release()
{
delete this;
}
///////////////////////////// AioCompletion //////////////////////////////
-librados::AioCompletion::AioCompletion::~AioCompletion()
+librados::AioCompletion::~AioCompletion()
{
auto c = reinterpret_cast<AioCompletionImpl *>(pc);
c->release();
}
-int librados::AioCompletion::AioCompletion::set_complete_callback(void *cb_arg, rados_callback_t cb)
+int librados::AioCompletion::set_complete_callback(void *cb_arg, rados_callback_t cb)
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->set_complete_callback(cb_arg, cb);
}
-int librados::AioCompletion::AioCompletion::set_safe_callback(void *cb_arg, rados_callback_t cb)
+int librados::AioCompletion::set_safe_callback(void *cb_arg, rados_callback_t cb)
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->set_safe_callback(cb_arg, cb);
}
-int librados::AioCompletion::AioCompletion::wait_for_complete()
+int librados::AioCompletion::wait_for_complete()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->wait_for_complete();
}
-int librados::AioCompletion::AioCompletion::wait_for_safe()
+int librados::AioCompletion::wait_for_safe()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->wait_for_complete();
}
-bool librados::AioCompletion::AioCompletion::is_complete()
+bool librados::AioCompletion::is_complete()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->is_complete();
}
-bool librados::AioCompletion::AioCompletion::is_safe()
+bool librados::AioCompletion::is_safe()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->is_safe();
}
-int librados::AioCompletion::AioCompletion::wait_for_complete_and_cb()
+int librados::AioCompletion::wait_for_complete_and_cb()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->wait_for_complete_and_cb();
}
-int librados::AioCompletion::AioCompletion::wait_for_safe_and_cb()
+int librados::AioCompletion::wait_for_safe_and_cb()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->wait_for_safe_and_cb();
}
-bool librados::AioCompletion::AioCompletion::is_complete_and_cb()
+bool librados::AioCompletion::is_complete_and_cb()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->is_complete_and_cb();
}
-bool librados::AioCompletion::AioCompletion::is_safe_and_cb()
+bool librados::AioCompletion::is_safe_and_cb()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->is_safe_and_cb();
}
-int librados::AioCompletion::AioCompletion::get_return_value()
+int librados::AioCompletion::get_return_value()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->get_return_value();
}
-int librados::AioCompletion::AioCompletion::get_version()
+int librados::AioCompletion::get_version()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->get_version();
}
-uint64_t librados::AioCompletion::AioCompletion::get_version64()
+uint64_t librados::AioCompletion::get_version64()
{
AioCompletionImpl *c = (AioCompletionImpl *)pc;
return c->get_version();
}
-void librados::AioCompletion::AioCompletion::release()
+void librados::AioCompletion::release()
{
delete this;
}
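
Reviewer note: the whole librados hunk is a mechanical cleanup. Definitions such as `librados::AioCompletion::AioCompletion::wait_for_complete()` repeat the class name as an extra nested-name qualifier (legal via the injected-class-name, but noisy and flagged by some compilers); dropping one level leaves behaviour unchanged. A tiny illustration of the pattern:

#include <iostream>

namespace librados_like {
struct AioCompletion {
  int wait_for_complete();
};
}  // namespace librados_like

// Redundantly qualified definition (the form the patch removes):
//   int librados_like::AioCompletion::AioCompletion::wait_for_complete() { ... }
// The injected-class-name makes it valid, but the plain form below is
// equivalent and what the patch standardizes on.
int librados_like::AioCompletion::wait_for_complete() {
  return 0;
}

int main() {
  librados_like::AioCompletion c;
  std::cout << c.wait_for_complete() << "\n";
}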
diff --git a/src/log/Entry.h b/src/log/Entry.h
index db39eca0ef3..eeb25c5f593 100644
--- a/src/log/Entry.h
+++ b/src/log/Entry.h
@@ -24,6 +24,7 @@ namespace logging {
class Entry {
public:
using time = log_time;
+ using thread_name_t = std::array<char, 16>;
Entry() = delete;
Entry(short pr, short sub) :
@@ -32,8 +33,7 @@ public:
m_prio(pr),
m_subsys(sub)
{
- strncpy(m_thread_name, Thread::get_thread_name().data(), 16);
- m_thread_name[15] = '\0';
+ ceph_pthread_getname(m_thread_name.data(), m_thread_name.size());
}
Entry(const Entry &) = default;
Entry& operator=(const Entry &) = default;
@@ -47,7 +47,7 @@ public:
time m_stamp;
pthread_t m_thread;
short m_prio, m_subsys;
- char m_thread_name[16];
+ thread_name_t m_thread_name{};
static log_clock& clock() {
static log_clock clock;
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 49dd03c06c0..63d5205d9e2 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -31,6 +31,7 @@
#include <fmt/format.h>
#include <fmt/ostream.h>
+#include <fmt/ranges.h>
#define MAX_LOG_BUF 65536
@@ -372,6 +373,7 @@ void Log::_flush_logbuf()
void Log::_flush(EntryVector& t, bool crash)
{
+ auto now = mono_clock::now();
long len = 0;
if (t.empty()) {
assert(m_log_buf.empty());
@@ -443,10 +445,29 @@ void Log::_flush(EntryVector& t, bool crash)
m_journald->log_entry(e);
}
+ {
+ auto [it, _] = m_recent_thread_names.try_emplace(e.m_thread, now, DEFAULT_MAX_THREAD_NAMES);
+ auto& [t, names] = it->second;
+ if (names.size() == 0 || names.front() != e.m_thread_name.data()) {
+ names.push_front(e.m_thread_name.data());
+ }
+ t = now;
+ }
+
m_recent.push_back(std::move(e));
}
t.clear();
+ for (auto it = m_recent_thread_names.begin(); it != m_recent_thread_names.end(); ) {
+ auto t = it->second.first;
+ auto since = now - t;
+ if (since > std::chrono::seconds(60*60*24)) {
+ it = m_recent_thread_names.erase(it);
+ } else {
+ ++it;
+ }
+ }
+
_flush_logbuf();
}
@@ -493,14 +514,10 @@ void Log::dump_recent()
_flush(m_flush, false);
_log_message("--- begin dump of recent events ---", true);
- std::set<std::pair<pthread_t, const char *>> recent_pthread_ids;
{
EntryVector t;
t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end()));
m_recent.clear();
- for (const auto& e : t) {
- recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name));
- }
_flush(t, true);
}
@@ -515,11 +532,15 @@ void Log::dump_recent()
m_stderr_log, m_stderr_crash), true);
_log_message("--- pthread ID / name mapping for recent threads ---", true);
- for (auto& [pthread_id, pthread_name] : recent_pthread_ids)
+ for (const auto& [tid, t_names] : m_recent_thread_names)
{
+ [[maybe_unused]] auto [t, names] = t_names;
// we want the ID to be printed in the same format as we use for a log entry.
// The reason is easier grepping.
- _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true);
+ auto msg = fmt::format(" {:x} / {}",
+ tid_to_int(tid),
+ fmt::join(names, ", "));
+ _log_message(msg, true);
}
_log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true);
diff --git a/src/log/Log.h b/src/log/Log.h
index 3a60937af55..46d97734305 100644
--- a/src/log/Log.h
+++ b/src/log/Log.h
@@ -7,6 +7,7 @@
#include <boost/circular_buffer.hpp>
#include <condition_variable>
+#include <map>
#include <memory>
#include <mutex>
#include <queue>
@@ -14,6 +15,7 @@
#include <string_view>
#include "common/Thread.h"
+#include "common/ceph_time.h"
#include "common/likely.h"
#include "log/Entry.h"
@@ -86,9 +88,14 @@ protected:
private:
using EntryRing = boost::circular_buffer<ConcreteEntry>;
+ using mono_clock = ceph::coarse_mono_clock;
+ using mono_time = ceph::coarse_mono_time;
+
+ using RecentThreadNames = std::map<pthread_t, std::pair<mono_time, boost::circular_buffer<std::string> > >;
static const std::size_t DEFAULT_MAX_NEW = 100;
static const std::size_t DEFAULT_MAX_RECENT = 10000;
+ static constexpr std::size_t DEFAULT_MAX_THREAD_NAMES = 4;
Log **m_indirect_this;
@@ -102,6 +109,7 @@ private:
pthread_t m_queue_mutex_holder;
pthread_t m_flush_mutex_holder;
+ RecentThreadNames m_recent_thread_names; // protected by m_flush_mutex
EntryVector m_new; ///< new entries
EntryRing m_recent; ///< recent (less new) entries we've already written at low detail
EntryVector m_flush; ///< entries to be flushed (here to optimize heap allocations)
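
Reviewer note: taken together, the Entry.h/Log.cc/Log.h changes keep, per pthread_t, a timestamp plus a small circular buffer of the last few names the thread has used, refresh it on every flushed entry, and age out threads not seen for a day; dump_recent() then prints the id-to-names mapping. A standalone sketch of that bookkeeping using std::deque in place of boost::circular_buffer (the cap of 4 and the 24-hour horizon mirror the constants in the hunks; everything else is illustrative):

#include <chrono>
#include <deque>
#include <iostream>
#include <map>
#include <string>

using Clock = std::chrono::steady_clock;

struct ThreadNames {
  Clock::time_point last_seen;
  std::deque<std::string> names;          // most recent first, capped at 4
};

std::map<unsigned long, ThreadNames> recent;   // keyed by thread id

void note(unsigned long tid, const std::string& name) {
  auto& e = recent[tid];
  e.last_seen = Clock::now();
  if (e.names.empty() || e.names.front() != name) {
    e.names.push_front(name);
    if (e.names.size() > 4)                // DEFAULT_MAX_THREAD_NAMES analogue
      e.names.pop_back();
  }
}

void expire() {
  auto now = Clock::now();
  for (auto it = recent.begin(); it != recent.end(); ) {
    if (now - it->second.last_seen > std::chrono::hours(24))
      it = recent.erase(it);               // drop threads idle for a day
    else
      ++it;
  }
}

int main() {
  note(0x7f01, "mds-log-trim");
  note(0x7f01, "mds-log-replay");          // a renamed thread keeps its history
  expire();
  for (auto& [tid, e] : recent) {
    std::cout << std::hex << tid << " /";
    for (auto& n : e.names) std::cout << ' ' << n;
    std::cout << "\n";
  }
}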
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 642d3428a27..6fbfc79d416 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -61,6 +61,7 @@ void Beacon::shutdown()
std::unique_lock<std::mutex> lock(mutex);
if (!finished) {
finished = true;
+ cvar.notify_all();
lock.unlock();
if (sender.joinable())
sender.join();
@@ -74,7 +75,7 @@ void Beacon::init(const MDSMap &mdsmap)
_notify_mdsmap(mdsmap);
sender = std::thread([this]() {
- ceph_pthread_setname(pthread_self(), "beacon");
+ ceph_pthread_setname("mds-beacon");
std::unique_lock<std::mutex> lock(mutex);
bool sent;
while (!finished) {
@@ -320,16 +321,15 @@ void Beacon::notify_health(MDSRank const *mds)
// Detect MDS_HEALTH_TRIM condition
// Indicates MDS is not trimming promptly
{
- const auto log_max_segments = mds->mdlog->get_max_segments();
- const auto log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
- if (mds->mdlog->get_num_segments() > (size_t)(log_max_segments * log_warn_factor)) {
+ if (mds->mdlog->is_trim_slow()) {
+ auto num_segments = mds->mdlog->get_num_segments();
+ auto max_segments = mds->mdlog->get_max_segments();
CachedStackStringStream css;
- *css << "Behind on trimming (" << mds->mdlog->get_num_segments()
- << "/" << log_max_segments << ")";
+ *css << "Behind on trimming (" << num_segments << "/" << max_segments << ")";
MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, css->strv());
- m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
- m.metadata["max_segments"] = stringify(log_max_segments);
+ m.metadata["num_segments"] = stringify(num_segments);
+ m.metadata["max_segments"] = stringify(max_segments);
health.metrics.push_back(m);
}
}
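
Reviewer note: the one-line Beacon fix wakes the sender thread's condition variable before joining it, so shutdown() no longer has to wait out the remainder of the beacon interval. A generic sketch of the same stop/notify/join pattern:

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex mtx;
std::condition_variable cvar;
bool finished = false;

void sender_loop() {
  std::unique_lock<std::mutex> lock(mtx);
  while (!finished) {
    std::cout << "send beacon\n";
    // Wake early when finished flips; otherwise sleep for the interval.
    cvar.wait_for(lock, std::chrono::seconds(4), [] { return finished; });
  }
}

int main() {
  std::thread sender(sender_loop);
  {
    std::unique_lock<std::mutex> lock(mtx);
    finished = true;
    cvar.notify_all();     // without this, shutdown waits out the full interval
  }
  sender.join();
  std::cout << "clean shutdown\n";
}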
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 76e9fee68f8..af9f8edfffa 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -3808,10 +3808,13 @@ bool CDir::should_split_fast() const
const CDentry *dn = p.second;
if (!dn->get_projected_linkage()->is_null()) {
effective_size++;
+
+ if (effective_size > fast_limit) [[unlikely]]
+ return true;
}
}
- return effective_size > fast_limit;
+ return false;
}
bool CDir::should_merge() const
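
Reviewer note: should_split_fast() now returns as soon as the effective size exceeds the fast-split limit instead of counting every dentry first; for very large fragments that turns a full O(n) walk into an early exit. A generic sketch of the same early-exit counting pattern (the container and threshold are placeholders):

#include <iostream>
#include <vector>

// Return true as soon as more than `limit` elements satisfy the predicate,
// instead of counting the whole range first (mirrors the [[unlikely]] early
// return added to CDir::should_split_fast()).
template <typename It, typename Pred>
bool exceeds_limit(It first, It last, std::size_t limit, Pred pred) {
  std::size_t n = 0;
  for (; first != last; ++first) {
    if (pred(*first) && ++n > limit)
      return true;                 // stop scanning once the answer is known
  }
  return false;
}

int main() {
  std::vector<int> linkages(100000, 1);          // 1 = non-null dentry linkage
  bool split = exceeds_limit(linkages.begin(), linkages.end(), 10000,
                             [](int l) { return l != 0; });
  std::cout << (split ? "split fast" : "no split") << "\n";
}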
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 215375ca297..3e2b0adffb0 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -546,6 +546,16 @@ public:
void maybe_finish_freeze();
+ size_t count_unfreeze_tree_waiters() {
+ size_t n = count_unfreeze_dir_waiters();
+ _walk_tree([&n](CDir *dir) {
+ n += dir->count_unfreeze_dir_waiters();
+ return true;
+ });
+ return n;
+ }
+ inline size_t count_unfreeze_dir_waiters() const { return count_waiters(WAIT_UNFREEZE); }
+
std::pair<bool,bool> is_freezing_or_frozen_tree() const {
if (freeze_tree_state) {
if (freeze_tree_state->frozen)
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 9adcf3b25b9..0782464ad94 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -218,8 +218,6 @@ public:
void set_cap_id(uint64_t i) { cap_id = i; }
uint64_t get_cap_id() const { return cap_id; }
- //ceph_seq_t get_last_issue() { return last_issue; }
-
bool is_suppress() const { return suppress > 0; }
void inc_suppress() { suppress++; }
void dec_suppress() { suppress--; }
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 63608d48864..d64f176acb6 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2599,6 +2599,7 @@ int Locker::issue_caps(CInode *in, Capability *only_cap)
in->find_snaprealm()->inode->ino(),
cap->get_cap_id(), cap->get_last_seq(),
pending, wanted, 0, cap->get_mseq(),
+ cap->get_last_issue(),
mds->get_osd_epoch_barrier());
in->encode_cap_message(m, cap);
@@ -2649,6 +2650,7 @@ int Locker::issue_caps(CInode *in, Capability *only_cap)
in->find_snaprealm()->inode->ino(),
cap->get_cap_id(), cap->get_last_seq(),
after, wanted, 0, cap->get_mseq(),
+ cap->get_last_issue(),
mds->get_osd_epoch_barrier());
in->encode_cap_message(m, cap);
@@ -2675,6 +2677,7 @@ void Locker::issue_truncate(CInode *in)
cap->get_cap_id(), cap->get_last_seq(),
cap->pending(), cap->wanted(), 0,
cap->get_mseq(),
+ cap->get_last_issue(),
mds->get_osd_epoch_barrier());
in->encode_cap_message(m, cap);
mds->send_message_client_counted(m, cap->get_session());
@@ -3165,6 +3168,7 @@ void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
cap->pending(),
cap->wanted(), 0,
cap->get_mseq(),
+ cap->get_last_issue(),
mds->get_osd_epoch_barrier());
in->encode_cap_message(m, cap);
mds->send_message_client_counted(m, cap->get_session());
@@ -3375,10 +3379,10 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m)
ref_t<MClientCaps> ack;
if (op == CEPH_CAP_OP_FLUSHSNAP) {
if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
- ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+ ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, 0, mds->get_osd_epoch_barrier());
} else {
if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack);
- ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+ ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, 0, mds->get_osd_epoch_barrier());
}
ack->set_snap_follows(follows);
ack->set_client_tid(m->get_client_tid());
@@ -3500,7 +3504,7 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m)
// case we get a dup response, so whatever.)
ref_t<MClientCaps> ack;
if (dirty) {
- ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+ ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, 0, mds->get_osd_epoch_barrier());
ack->set_snap_follows(follows);
ack->set_client_tid(m->get_client_tid());
ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
@@ -3589,7 +3593,7 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m)
dout(7) << " flush client." << client << " dirty " << ccap_string(dirty)
<< " seq " << m->get_seq() << " on " << *in << dendl;
ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(),
- m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+ m->get_caps(), 0, dirty, 0, cap->get_last_issue(), mds->get_osd_epoch_barrier());
ack->set_client_tid(m->get_client_tid());
ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
}
@@ -4222,7 +4226,7 @@ void Locker::handle_client_cap_release(const cref_t<MClientCapRelease> &m)
Session *session = mds->get_session(m);
for (const auto &cap : m->caps) {
- _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.seq);
+ _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.issue_seq);
}
if (session) {
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index eb2b529dcfa..3af0d8c6b1e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5891,7 +5891,7 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
in->ino(), realm->inode->ino(), cap->get_cap_id(),
cap->get_last_seq(), cap->pending(), cap->wanted(),
- 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
+ 0, cap->get_mseq(), cap->get_last_issue(), mds->get_osd_epoch_barrier());
in->encode_cap_message(reap, cap);
reap->snapbl = mds->server->get_snap_trace(session, realm);
reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
@@ -14378,6 +14378,7 @@ bool MDCache::is_ready_to_trim_cache(void)
void MDCache::upkeep_main(void)
{
+ ceph_pthread_setname("mds-cache-trim");
std::unique_lock lock(upkeep_mutex);
// create a "memory model" for the upkeep thread. The object maintains
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 0be568433ef..4bbf2a1a141 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -53,11 +53,12 @@ MDLog::MDLog(MDSRank* m)
event_large_threshold = g_conf().get_val<uint64_t>("mds_log_event_large_threshold");
events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment");
pause = g_conf().get_val<bool>("mds_log_pause");
- major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio");
max_segments = g_conf().get_val<uint64_t>("mds_log_max_segments");
max_events = g_conf().get_val<int64_t>("mds_log_max_events");
skip_corrupt_events = g_conf().get_val<bool>("mds_log_skip_corrupt_events");
skip_unbounded_events = g_conf().get_val<bool>("mds_log_skip_unbounded_events");
+ log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
+ minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment");
upkeep_thread = std::thread(&MDLog::log_trim_upkeep, this);
}
@@ -258,7 +259,7 @@ void MDLog::create(MDSContext *c)
logger->set(l_mdl_expos, journaler->get_expire_pos());
logger->set(l_mdl_wrpos, journaler->get_write_pos());
- submit_thread.create("md_submit");
+ submit_thread.create("mds-log-submit");
}
void MDLog::open(MDSContext *c)
@@ -267,9 +268,9 @@ void MDLog::open(MDSContext *c)
ceph_assert(!recovery_thread.is_started());
recovery_thread.set_completion(c);
- recovery_thread.create("md_recov_open");
+ recovery_thread.create("mds-log-recvr");
- submit_thread.create("md_submit");
+ submit_thread.create("mds-log-submit");
// either append() or replay() will follow.
}
@@ -311,7 +312,7 @@ void MDLog::reopen(MDSContext *c)
recovery_thread.join();
recovery_thread.set_completion(new C_ReopenComplete(this, c));
- recovery_thread.create("md_recov_reopen");
+ recovery_thread.create("mds-log-reopen");
}
void MDLog::append()
@@ -357,14 +358,15 @@ void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase* c)
ceph_assert(!mds_is_shutting_down);
event_seq++;
- events_since_last_major_segment++;
if (auto sb = dynamic_cast<SegmentBoundary*>(le); sb) {
auto ls = _start_new_segment(sb);
if (sb->is_major_segment_boundary()) {
major_segments.insert(ls->seq);
logger->set(l_mdl_segmjr, major_segments.size());
- events_since_last_major_segment = 0;
+ minor_segments_since_last_major_segment = 0;
+ } else {
+ ++minor_segments_since_last_major_segment;
}
}
@@ -403,7 +405,7 @@ void MDLog::_segment_upkeep()
uint64_t period = journaler->get_layout_period();
auto ls = get_current_segment();
// start a new segment?
- if (events_since_last_major_segment > events_per_segment*major_segment_event_ratio) {
+ if (minor_segments_since_last_major_segment > minor_segments_per_major_segment) {
dout(10) << __func__ << ": starting new major segment, current " << *ls << dendl;
auto sle = mds->mdcache->create_subtree_map();
_submit_entry(sle, NULL);
@@ -656,7 +658,13 @@ void MDLog::try_to_commit_open_file_table(uint64_t last_seq)
}
}
+bool MDLog::is_trim_slow() const {
+ return (segments.size() > (size_t)(max_segments * log_warn_factor));
+}
+
void MDLog::log_trim_upkeep(void) {
+ ceph_pthread_setname("mds-log-trim");
+
dout(10) << dendl;
std::unique_lock mds_lock(mds->mds_lock);
@@ -1008,7 +1016,7 @@ void MDLog::replay(MDSContext *c)
}
already_replayed = true;
- replay_thread.create("md_log_replay");
+ replay_thread.create("mds-log-replay");
}
@@ -1474,7 +1482,6 @@ void MDLog::_replay_thread()
}
le->set_start_off(pos);
- events_since_last_major_segment++;
if (auto sb = dynamic_cast<SegmentBoundary*>(le.get()); sb) {
auto seq = sb->get_seq();
if (seq > 0) {
@@ -1487,7 +1494,9 @@ void MDLog::_replay_thread()
if (sb->is_major_segment_boundary()) {
major_segments.insert(event_seq);
logger->set(l_mdl_segmjr, major_segments.size());
- events_since_last_major_segment = 0;
+ minor_segments_since_last_major_segment = 0;
+ } else {
+ ++minor_segments_since_last_major_segment;
}
} else {
event_seq++;
@@ -1618,9 +1627,6 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa
if (changed.count("mds_log_events_per_segment")) {
events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment");
}
- if (changed.count("mds_log_major_segment_event_ratio")) {
- major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio");
- }
if (changed.count("mds_log_max_events")) {
max_events = g_conf().get_val<int64_t>("mds_log_max_events");
}
@@ -1642,4 +1648,10 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa
if (changed.count("mds_log_trim_decay_rate")){
log_trim_counter = DecayCounter(g_conf().get_val<double>("mds_log_trim_decay_rate"));
}
+ if (changed.count("mds_log_warn_factor")) {
+ log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
+ }
+ if (changed.count("mds_log_minor_segments_per_major_segment")) {
+ minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment");
+ }
}
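
Reviewer note: major segment boundaries are now driven by a count of minor segments (mds_log_minor_segments_per_major_segment) instead of an event ratio, and is_trim_slow() centralizes the "behind on trimming" test that Beacon reuses. A small standalone sketch of the two decisions; the option names come from the hunks above, while the numbers are illustrative, not necessarily Ceph's defaults:

#include <iostream>

struct LogState {
  unsigned minor_since_major = 0;   // minor segments since the last major one
  unsigned segments = 0;            // total live segments
};

// Start a new major segment once enough minor segments have accumulated,
// mirroring the mds_log_minor_segments_per_major_segment check.
bool need_major_segment(const LogState& s, unsigned minor_per_major) {
  return s.minor_since_major > minor_per_major;
}

// "Behind on trimming" test analogous to MDLog::is_trim_slow().
bool trim_slow(const LogState& s, unsigned max_segments, double warn_factor) {
  return s.segments > static_cast<unsigned>(max_segments * warn_factor);
}

int main() {
  LogState s{17, 300};
  std::cout << need_major_segment(s, 16) << " "      // 1: time for a major segment
            << trim_slow(s, 128, 2.0) << "\n";        // 1: 300 > 256, warn
}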
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
index e2ab4e686cd..a858b40fa03 100644
--- a/src/mds/MDLog.h
+++ b/src/mds/MDLog.h
@@ -173,6 +173,9 @@ public:
// replay state
std::map<inodeno_t, std::set<inodeno_t>> pending_exports;
+ // also used by Beacon to detect slow trimming
+ bool is_trim_slow() const;
+
protected:
struct PendingEvent {
PendingEvent(LogEvent *e, Context* c, bool f=false) : le(e), fin(c), flush(f) {}
@@ -302,9 +305,9 @@ private:
bool debug_subtrees;
std::atomic_uint64_t event_large_threshold; // accessed by submit thread
uint64_t events_per_segment;
- uint64_t major_segment_event_ratio;
int64_t max_events;
uint64_t max_segments;
+ uint64_t minor_segments_per_major_segment;
bool pause;
bool skip_corrupt_events;
bool skip_unbounded_events;
@@ -312,7 +315,8 @@ private:
std::set<uint64_t> major_segments;
std::set<LogSegment*> expired_segments;
std::set<LogSegment*> expiring_segments;
- uint64_t events_since_last_major_segment = 0;
+ uint64_t minor_segments_since_last_major_segment = 0;
+ double log_warn_factor;
// log trimming decay counter
DecayCounter log_trim_counter;
diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h
index be84d142e9a..99db3a13810 100644
--- a/src/mds/MDSCacheObject.h
+++ b/src/mds/MDSCacheObject.h
@@ -260,6 +260,8 @@ class MDSCacheObject {
}
bool is_waiter_for(waitmask_t mask);
+ inline size_t count_waiters(uint64_t mask) const { return waiting.count(mask); }
+
virtual void add_waiter(uint64_t mask, MDSContext *c) {
add_waiter(waitmask_t(mask), c);
}
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index 75b608ace77..9bb71756e2d 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -304,6 +304,10 @@ void MDSDaemon::set_up_admin_socket()
asok_hook,
"show recent ops, sorted by op duration");
ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_export_states",
+ asok_hook,
+ "dump export states");
+ ceph_assert(r == 0);
r = admin_socket->register_command("scrub_path name=path,type=CephString "
"name=scrubops,type=CephChoices,"
"strings=force|recursive|repair,n=N,req=false "
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index c2f3544f97b..52ed930d71b 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -496,7 +496,7 @@ MDSRank::MDSRank(
objecter->unset_honor_pool_full();
- finisher = new Finisher(cct, "MDSRank", "MR_Finisher");
+ finisher = new Finisher(cct, "MDSRank", "mds-rank-fin");
mdcache = new MDCache(this, purge_queue);
mdlog = new MDLog(this);
@@ -581,7 +581,7 @@ void MDSRankDispatcher::init()
// who is interested in it.
handle_osd_map();
- progress_thread.create("mds_rank_progr");
+ progress_thread.create("mds-rank-progr");
purge_queue.init();
@@ -2769,6 +2769,9 @@ void MDSRankDispatcher::handle_asok_command(
if (!op_tracker.dump_historic_ops(f, true)) {
*css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
}
+ } else if (command == "dump_export_states") {
+ std::lock_guard l(mds_lock);
+ mdcache->migrator->dump_export_states(f);
} else if (command == "osdmap barrier") {
int64_t target_epoch = 0;
bool got_val = cmd_getval(cmdmap, "target_epoch", target_epoch);
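
Reviewer note: dump_export_states follows the usual admin-socket pattern, registered by name in MDSDaemon and dispatched in handle_asok_command under mds_lock before calling Migrator::dump_export_states(). With the standard asok plumbing it should be reachable via `ceph daemon mds.<id> dump_export_states` (daemon name is a placeholder). A generic register/dispatch sketch, not the Ceph AdminSocket class:

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Minimal stand-in for an admin-socket command registry: commands are
// registered once and dispatched by name later.
class AdminSocket {
  std::map<std::string, std::function<void()>> commands_;
public:
  int register_command(const std::string& name, std::function<void()> fn) {
    return commands_.emplace(name, std::move(fn)).second ? 0 : -17;  // -EEXIST
  }
  void dispatch(const std::string& name) {
    if (auto it = commands_.find(name); it != commands_.end()) it->second();
    else std::cout << "unknown command: " << name << "\n";
  }
};

int main() {
  AdminSocket asok;
  asok.register_command("dump_export_states",
                        [] { std::cout << "{\"states\": []}\n"; });  // would call Migrator::dump_export_states
  asok.dispatch("dump_export_states");
}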
diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc
index 1d17bbf3e92..6cbd9a094c0 100644
--- a/src/mds/MetricAggregator.cc
+++ b/src/mds/MetricAggregator.cc
@@ -73,6 +73,7 @@ int MetricAggregator::init() {
m_cct->get_perfcounters_collection()->add(m_perf_counters);
pinger = std::thread([this]() {
+ ceph_pthread_setname("mds-ping");
std::unique_lock locker(lock);
while (!stopping) {
ping_all_active_ranks();
diff --git a/src/mds/MetricsHandler.cc b/src/mds/MetricsHandler.cc
index 9ad10b9d6e6..9fc4c6122a4 100644
--- a/src/mds/MetricsHandler.cc
+++ b/src/mds/MetricsHandler.cc
@@ -51,6 +51,7 @@ void MetricsHandler::init() {
dout(10) << dendl;
updater = std::thread([this]() {
+ ceph_pthread_setname("mds-metrics");
std::unique_lock locker(lock);
while (!stopping) {
double after = g_conf().get_val<std::chrono::seconds>("mds_metrics_update_interval").count();
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index cb77282e384..6b12f710db4 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -268,12 +268,12 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
case EXPORT_LOCKING:
dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
num_locking_exports--;
- it->second.state = EXPORT_CANCELLED;
+ it->second.set_state(EXPORT_CANCELLED);
dir->auth_unpin(this);
break;
case EXPORT_DISCOVERING:
dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
- it->second.state = EXPORT_CANCELLED;
+ it->second.set_state(EXPORT_CANCELLED);
dir->unfreeze_tree(); // cancel the freeze
dir->auth_unpin(this);
if (notify_peer &&
@@ -286,7 +286,7 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
case EXPORT_FREEZING:
dout(10) << "export state=freezing : canceling freeze" << dendl;
- it->second.state = EXPORT_CANCELLED;
+ it->second.set_state(EXPORT_CANCELLED);
dir->unfreeze_tree(); // cancel the freeze
if (dir->is_subtree_root())
mdcache->try_subtree_merge(dir);
@@ -301,13 +301,13 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
// NOTE: state order reversal, warning comes after prepping
case EXPORT_WARNING:
dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
- it->second.state = EXPORT_CANCELLING;
+ it->second.set_state(EXPORT_CANCELLING);
// fall-thru
case EXPORT_PREPPING:
if (state != EXPORT_WARNING) {
dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
- it->second.state = EXPORT_CANCELLED;
+ it->second.set_state(EXPORT_CANCELLED);
}
{
@@ -340,7 +340,7 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
case EXPORT_EXPORTING:
dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
- it->second.state = EXPORT_CANCELLING;
+ it->second.set_state(EXPORT_CANCELLING);
export_reverse(dir, it->second);
break;
@@ -865,7 +865,7 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
ceph_assert(export_state.count(dir) == 0);
export_state_t& stat = export_state[dir];
num_locking_exports++;
- stat.state = EXPORT_LOCKING;
+ stat.set_state(EXPORT_LOCKING);
stat.peer = dest;
stat.tid = mdr->reqid.tid;
stat.mut = mdr;
@@ -1140,7 +1140,7 @@ void Migrator::dispatch_export_dir(const MDRequestRef& mdr, int count)
if (results.size() == 1 && results.front().first == dir) {
num_locking_exports--;
- it->second.state = EXPORT_DISCOVERING;
+ it->second.set_state(EXPORT_DISCOVERING);
// send ExportDirDiscover (ask target)
filepath path;
dir->inode->make_path(path);
@@ -1191,7 +1191,7 @@ void Migrator::dispatch_export_dir(const MDRequestRef& mdr, int count)
ceph_assert(export_state.count(sub) == 0);
auto& stat = export_state[sub];
num_locking_exports++;
- stat.state = EXPORT_LOCKING;
+ stat.set_state(EXPORT_LOCKING);
stat.peer = dest;
stat.tid = _mdr->reqid.tid;
stat.mut = _mdr;
@@ -1244,7 +1244,7 @@ void Migrator::handle_export_discover_ack(const cref_t<MExportDirDiscoverAck> &m
if (m->is_success()) {
// move to freezing the subtree
- it->second.state = EXPORT_FREEZING;
+ it->second.set_state(EXPORT_FREEZING);
auto&& mdr = boost::static_pointer_cast<MDRequestImpl>(std::move(it->second.mut));
ceph_assert(!it->second.mut); // should have been moved out of
@@ -1427,18 +1427,18 @@ void Migrator::export_frozen(CDir *dir, uint64_t tid)
}
// send.
- it->second.state = EXPORT_PREPPING;
+ it->second.set_state(EXPORT_PREPPING);
mds->send_message_mds(prep, it->second.peer);
ceph_assert(g_conf()->mds_kill_export_at != 4);
// make sure any new instantiations of caps are flushed out
ceph_assert(it->second.warning_ack_waiting.empty());
- set<client_t> export_client_set;
- get_export_client_set(dir, export_client_set);
+ ceph_assert(it->second.export_client_set.empty());
+ get_export_client_set(dir, it->second.export_client_set);
MDSGatherBuilder gather(g_ceph_context);
- mds->server->flush_client_sessions(export_client_set, gather);
+ mds->server->flush_client_sessions(it->second.export_client_set, gather);
if (gather.has_subs()) {
it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
@@ -1537,7 +1537,7 @@ void Migrator::handle_export_prep_ack(const cref_t<MExportDirPrepAck> &m)
}
- it->second.state = EXPORT_WARNING;
+ it->second.set_state(EXPORT_WARNING);
ceph_assert(g_conf()->mds_kill_export_at != 6);
// nobody to warn?
@@ -1587,8 +1587,8 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
dout(7) << *dir << " to " << dest << dendl;
mdcache->show_subtrees();
-
- it->second.state = EXPORT_EXPORTING;
+
+ it->second.set_state(EXPORT_EXPORTING);
ceph_assert(g_conf()->mds_kill_export_at != 7);
ceph_assert(dir->is_frozen_tree_root());
@@ -1933,7 +1933,7 @@ void Migrator::handle_export_ack(const cref_t<MExportDirAck> &m)
auto bp = m->imported_caps.cbegin();
decode(it->second.peer_imported, bp);
- it->second.state = EXPORT_LOGGINGFINISH;
+ it->second.set_state(EXPORT_LOGGINGFINISH);
ceph_assert(g_conf()->mds_kill_export_at != 9);
set<CDir*> bounds;
mdcache->get_subtree_bounds(dir, bounds);
@@ -1957,10 +1957,10 @@ void Migrator::handle_export_ack(const cref_t<MExportDirAck> &m)
// this keeps authority().first in sync with subtree auth state in the journal.
mdcache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
+ ceph_assert(g_conf()->mds_kill_export_at != 10);
// log export completion, then finish (unfreeze, trigger finish context, etc.)
mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
mds->mdlog->flush();
- ceph_assert(g_conf()->mds_kill_export_at != 10);
}
void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
@@ -1970,7 +1970,7 @@ void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>&
ceph_assert(stat.state == EXPORT_CANCELLING);
if (stat.notify_ack_waiting.empty()) {
- stat.state = EXPORT_CANCELLED;
+ stat.set_state(EXPORT_CANCELLED);
return;
}
@@ -2095,7 +2095,7 @@ void Migrator::export_logged_finish(CDir *dir)
}
// wait for notifyacks
- stat.state = EXPORT_NOTIFYING;
+ stat.set_state(EXPORT_NOTIFYING);
ceph_assert(g_conf()->mds_kill_export_at != 11);
// no notifies to wait for?
@@ -2844,7 +2844,6 @@ void Migrator::import_reverse(CDir *dir)
dout(7) << *dir << dendl;
import_state_t& stat = import_state[dir->dirfrag()];
- stat.state = IMPORT_ABORTING;
set<CDir*> bounds;
mdcache->get_subtree_bounds(dir, bounds);
@@ -2950,10 +2949,14 @@ void Migrator::import_reverse(CDir *dir)
}
in->put(CInode::PIN_IMPORTINGCAPS);
}
+ }
+
+ if (stat.state == IMPORT_LOGGINGSTART || stat.state == IMPORT_ACKING) {
for (auto& p : stat.session_map) {
Session *session = p.second.first;
session->dec_importing();
}
+ mds->server->close_forced_opened_sessions(stat.session_map);
}
// log our failure
@@ -2962,6 +2965,7 @@ void Migrator::import_reverse(CDir *dir)
mdcache->trim(num_dentries); // try trimming dentries
// notify bystanders; wait in aborting state
+ stat.state = IMPORT_ABORTING;
import_notify_abort(dir, bounds);
}
@@ -3054,10 +3058,9 @@ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
dout(7) << *dir << dendl;
map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
- if (it == import_state.end() ||
- it->second.state != IMPORT_LOGGINGSTART) {
+ ceph_assert(it != import_state.end());
+ if (it->second.state != IMPORT_LOGGINGSTART) {
dout(7) << "import " << df << " must have aborted" << dendl;
- mds->server->finish_force_open_sessions(imported_session_map);
return;
}
@@ -3217,6 +3220,79 @@ void Migrator::import_finish(CDir *dir, bool notify, bool last)
}
}
+void Migrator::dump_export_states(Formatter *f)
+{
+ f->open_array_section("states");
+ for (const auto& [dir, state] : export_state) {
+ f->open_object_section("state");
+
+ f->dump_unsigned("tid", state.tid);
+
+ dir->dump(f, CDir::DUMP_PATH | CDir::DUMP_DIRFRAG);
+
+ f->dump_string("state", get_export_statename(state.state));
+
+ f->open_object_section("state_history");
+ for (const auto& [s, _1] : state.state_history) {
+ f->open_object_section(get_export_statename(s));
+ f->dump_stream("start_at") << state.get_start_time(s);
+ f->dump_float("time_spent", state.get_time_spent(s));
+ f->close_section();
+ }
+ f->close_section();
+
+ f->dump_int("peer", state.peer);
+
+ switch (state.state) {
+ case EXPORT_DISCOVERING:
+ case EXPORT_FREEZING:
+ f->dump_stream("last_cum_auth_pins_change") << state.last_cum_auth_pins_change;
+ f->dump_int("last_cum_auth_pins", state.last_cum_auth_pins);
+ f->dump_int("num_remote_waiters", state.num_remote_waiters);
+
+ break;
+
+ case EXPORT_PREPPING:
+ case EXPORT_WARNING:
+ f->open_array_section("flushed_clients");
+ for (const auto &client : state.export_client_set)
+ f->dump_int("client", client.v);
+ f->close_section();
+
+ f->open_array_section("warning_ack_waiting");
+ for (const auto &rank : state.warning_ack_waiting)
+ f->dump_int("rank", rank);
+ f->close_section();
+
+ if (state.state == EXPORT_PREPPING)
+ break;
+ // fall-thru
+
+ case EXPORT_EXPORTING:
+ case EXPORT_LOGGINGFINISH:
+ case EXPORT_NOTIFYING:
+ f->open_array_section("notify_ack_waiting");
+ for (const auto &rank : state.notify_ack_waiting)
+ f->dump_int("rank", rank);
+ f->close_section();
+
+ break;
+
+ default:
+ break;
+ }
+
+ if (state.state >= EXPORT_DISCOVERING) {
+ f->dump_unsigned("approx_size", state.approx_size);
+ f->dump_unsigned("unfreeze_tree_waiters", dir->count_unfreeze_tree_waiters());
+ f->dump_float("freeze_tree_time", state.get_freeze_tree_time());
+ }
+
+ f->close_section();
+ }
+ f->close_section();
+}
+
void Migrator::decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
mds_rank_t oldauth, LogSegment *ls,
map<CInode*, map<client_t,Capability::Export> >& peer_exports,
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index d6e599c06a9..f733dea76b3 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -252,6 +252,8 @@ public:
void import_finish(CDir *dir, bool notify, bool last=true);
+ void dump_export_states(Formatter *f);
+
protected:
struct export_base_t {
export_base_t(dirfrag_t df, mds_rank_t d, unsigned c, uint64_t g) :
@@ -267,7 +269,31 @@ protected:
struct export_state_t {
export_state_t() {}
- int state = 0;
+ void set_state(int s) {
+ ceph_assert(s != state);
+ if (state != EXPORT_CANCELLED) {
+ auto& t = state_history.at(state);
+ t.second = double(ceph_clock_now()) - double(t.first);
+ }
+ state = s;
+ state_history[state] = std::pair<utime_t, double>(ceph_clock_now(), 0.0);
+ }
+ utime_t get_start_time(int s) const {
+ ceph_assert(state_history.count(s) > 0);
+ return state_history.at(s).first;
+ }
+ double get_time_spent(int s) const {
+ ceph_assert(state_history.count(s) > 0);
+ const auto& t = state_history.at(s);
+ return s == state ? double(ceph_clock_now()) - double(t.first) : t.second;
+ }
+ double get_freeze_tree_time() const {
+ ceph_assert(state >= EXPORT_DISCOVERING);
+ ceph_assert(state_history.count((int)EXPORT_DISCOVERING) > 0);
+ return double(ceph_clock_now()) - double(state_history.at((int)EXPORT_DISCOVERING).first);
+ };
+
+ int state = EXPORT_CANCELLED;
mds_rank_t peer = MDS_RANK_NONE;
uint64_t tid = 0;
std::set<mds_rank_t> warning_ack_waiting;
@@ -275,6 +301,10 @@ protected:
std::map<inodeno_t,std::map<client_t,Capability::Import> > peer_imported;
MutationRef mut;
size_t approx_size = 0;
+ // record the start time and the time spent in each export state
+ std::map<int, std::pair<utime_t, double> > state_history;
+ // record the clients whose sessions need to be flushed
+ std::set<client_t> export_client_set;
// for freeze tree deadlock detection
utime_t last_cum_auth_pins_change;
int last_cum_auth_pins = 0;
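
Reviewer note: export_state_t now starts out in EXPORT_CANCELLED and records, per state, a start timestamp plus the time spent in it, which is exactly what dump_export_states prints. A standalone sketch of that state-history bookkeeping using std::chrono in place of utime_t (simplified; it omits the EXPORT_CANCELLED special-casing and assertions of the real set_state):

#include <chrono>
#include <iostream>
#include <map>
#include <thread>
#include <utility>

using Clock = std::chrono::steady_clock;

struct StateHistory {
  int state = 0;                                            // 0 plays the EXPORT_CANCELLED role
  std::map<int, std::pair<Clock::time_point, double>> hist; // state -> (start, seconds spent)

  void set_state(int s) {
    if (s == state) return;
    if (auto it = hist.find(state); it != hist.end())        // close out the previous state
      it->second.second =
          std::chrono::duration<double>(Clock::now() - it->second.first).count();
    state = s;
    hist[state] = {Clock::now(), 0.0};
  }

  double time_spent(int s) const {
    const auto& t = hist.at(s);
    return s == state
        ? std::chrono::duration<double>(Clock::now() - t.first).count()
        : t.second;
  }
};

int main() {
  enum { LOCKING = 1, DISCOVERING };
  StateHistory h;
  h.set_state(LOCKING);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  h.set_state(DISCOVERING);
  std::cout << "spent " << h.time_spent(LOCKING) << "s locking\n";
}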
diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
index 925bff16542..4426d3ca6fe 100644
--- a/src/mds/PurgeQueue.cc
+++ b/src/mds/PurgeQueue.cc
@@ -122,7 +122,7 @@ PurgeQueue::PurgeQueue(
cct(cct_),
rank(rank_),
metadata_pool(metadata_pool_),
- finisher(cct, "PurgeQueue", "PQ_Finisher"),
+ finisher(cct, "PurgeQueue", "mds-pq-fin"),
timer(cct, lock),
filer(objecter_, &finisher),
objecter(objecter_),
diff --git a/src/mds/QuiesceAgent.h b/src/mds/QuiesceAgent.h
index 5c07d6d8074..85900e8e71b 100644
--- a/src/mds/QuiesceAgent.h
+++ b/src/mds/QuiesceAgent.h
@@ -30,7 +30,7 @@ class QuiesceAgent {
: quiesce_control(quiesce_control)
, stop_agent_thread(false)
, agent_thread(this) {
- agent_thread.create("quiesce.agt");
+ agent_thread.create("mds-q-agt");
};
virtual ~QuiesceAgent() {
diff --git a/src/mds/QuiesceDbEncoding.h b/src/mds/QuiesceDbEncoding.h
index c76ed2d0c52..27c7e3ca2d0 100644
--- a/src/mds/QuiesceDbEncoding.h
+++ b/src/mds/QuiesceDbEncoding.h
@@ -15,7 +15,7 @@
#include "include/encoding.h"
#include <stdint.h>
-void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(v.epoch, bl, features);
@@ -23,7 +23,7 @@ void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0)
ENCODE_FINISH(bl);
}
-void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p)
+inline void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(v.epoch, p);
@@ -31,33 +31,33 @@ void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceState const & state, bufferlist& bl, uint64_t features=0)
+inline void encode(QuiesceState const & state, bufferlist& bl, uint64_t features=0)
{
static_assert(QuiesceState::QS__MAX <= UINT8_MAX);
uint8_t v = (uint8_t)state;
encode(v, bl, features);
}
-void decode(QuiesceState & state, bufferlist::const_iterator& p)
+inline void decode(QuiesceState & state, bufferlist::const_iterator& p)
{
uint8_t v = 0;
decode(v, p);
state = (QuiesceState)v;
}
-void encode(QuiesceTimeInterval const & interval, bufferlist& bl, uint64_t features=0)
+inline void encode(QuiesceTimeInterval const & interval, bufferlist& bl, uint64_t features=0)
{
encode(interval.count(), bl, features);
}
-void decode(QuiesceTimeInterval & interval, bufferlist::const_iterator& p)
+inline void decode(QuiesceTimeInterval & interval, bufferlist::const_iterator& p)
{
QuiesceClock::rep count;
decode(count, p);
interval = QuiesceTimeInterval { count };
}
-void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t features = 0)
+inline void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(rstate.state, bl, features);
@@ -65,7 +65,7 @@ void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t feature
ENCODE_FINISH(bl);
}
-void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p)
+inline void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(rstate.state, p);
@@ -73,7 +73,7 @@ void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(member.rstate, bl, features);
@@ -81,7 +81,7 @@ void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t featu
ENCODE_FINISH(bl);
}
-void decode(QuiesceSet::MemberInfo& member, bufferlist::const_iterator& p)
+inline void decode(QuiesceSet::MemberInfo& member, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(member.rstate, p);
@@ -89,7 +89,7 @@ void decode(QuiesceSet::MemberInfo& member, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(set.version, bl, features);
@@ -100,7 +100,7 @@ void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0)
ENCODE_FINISH(bl);
}
-void decode(QuiesceSet& set, bufferlist::const_iterator& p)
+inline void decode(QuiesceSet& set, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(set.version, p);
@@ -111,7 +111,7 @@ void decode(QuiesceSet& set, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(req.control.raw, bl, features);
@@ -124,7 +124,7 @@ void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0)
ENCODE_FINISH(bl);
}
-void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p)
+inline void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(req.control.raw, p);
@@ -137,7 +137,7 @@ void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(listing.db_version, bl, features);
@@ -146,7 +146,7 @@ void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features =
ENCODE_FINISH(bl);
}
-void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p)
+inline void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(listing.db_version, p);
@@ -155,7 +155,7 @@ void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(listing.origin, bl, features);
@@ -163,7 +163,7 @@ void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t featur
ENCODE_FINISH(bl);
}
-void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p)
+inline void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(listing.origin, p);
@@ -171,7 +171,7 @@ void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(root.state, bl, features);
@@ -179,7 +179,7 @@ void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features
ENCODE_FINISH(bl);
}
-void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p)
+inline void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(root.state, p);
@@ -187,7 +187,7 @@ void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(map.db_version, bl, features);
@@ -195,7 +195,7 @@ void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0)
ENCODE_FINISH(bl);
}
-void decode(QuiesceMap& map, bufferlist::const_iterator& p)
+inline void decode(QuiesceMap& map, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(map.db_version, p);
@@ -203,7 +203,7 @@ void decode(QuiesceMap& map, bufferlist::const_iterator& p)
DECODE_FINISH(p);
}
-void encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0)
+inline void encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0)
{
ENCODE_START(1, 1, bl);
encode(ack.origin, bl, features);
@@ -211,7 +211,7 @@ void encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0)
ENCODE_FINISH(bl);
}
-void decode(QuiesceDbPeerAck& ack, bufferlist::const_iterator& p)
+inline void decode(QuiesceDbPeerAck& ack, bufferlist::const_iterator& p)
{
DECODE_START(1, p);
decode(ack.origin, p);
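Editor's note: the effect of the hunks above is purely link-time. These free encode/decode helpers are defined in a header that several translation units include, so without the inline keyword each inclusion emits its own definition and linking fails with duplicate symbols. A minimal sketch of the same pattern, assuming the usual include/encoding.h macros and a hypothetical Foo type rather than the real QuiesceDb structures:

  // foo_types.h -- header-defined free functions must be inline to satisfy the ODR
  #include "include/encoding.h"

  struct Foo {
    uint64_t version = 0;
  };

  inline void encode(Foo const& f, ceph::buffer::list& bl, uint64_t features = 0)
  {
    ENCODE_START(1, 1, bl);
    encode(f.version, bl, features);
    ENCODE_FINISH(bl);
  }

  inline void decode(Foo& f, ceph::buffer::list::const_iterator& p)
  {
    DECODE_START(1, p);
    decode(f.version, p);
    DECODE_FINISH(p);
  }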
diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc
index 12c83634e54..bb3ae93e378 100644
--- a/src/mds/QuiesceDbManager.cc
+++ b/src/mds/QuiesceDbManager.cc
@@ -200,7 +200,7 @@ void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_mem
// start the thread
dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl;
db_thread_should_exit = false;
- quiesce_db_thread.create("quiesce_db_mgr");
+ quiesce_db_thread.create("mds-q-db");
} else if (quiesce_db_thread.is_started()) {
submit_condition.notify_all();
}
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 65ac6f17b43..5874a3dce56 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -615,6 +615,9 @@ void Server::handle_client_session(const cref_t<MClientSession> &m)
mds->send_message(reply, m->get_connection());
return;
}
+ if (!session->client_opened) {
+ session->client_opened = true;
+ }
if (session->is_opening() ||
session->is_open() ||
session->is_stale() ||
@@ -1054,7 +1057,7 @@ version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
return pv;
}
-void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+void Server::finish_force_open_sessions(map<client_t,pair<Session*,uint64_t> >& smap,
bool dec_import)
{
/*
@@ -1073,7 +1076,7 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_
dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
} else {
dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
- mds->sessionmap.set_state(session, Session::STATE_OPEN);
+ it.second.second = mds->sessionmap.set_state(session, Session::STATE_OPEN);
mds->sessionmap.touch_session(session);
metrics_handler->add_session(session);
@@ -1103,6 +1106,29 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_
dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}
+void Server::close_forced_opened_sessions(const map<client_t,pair<Session*,uint64_t> >& smap)
+{
+ dout(10) << __func__ << " on " << smap.size() << " clients" << dendl;
+
+ for (auto &it : smap) {
+ Session *session = it.second.first;
+ uint64_t sseq = it.second.second;
+ if (sseq == 0)
+ continue;
+ if (session->get_state_seq() != sseq) {
+ dout(10) << "skipping changed session (" << session->get_state_name() << ") "
+ << session->info.inst << dendl;
+ continue;
+ }
+ if (session->client_opened)
+ continue;
+ dout(10) << "closing forced opened session (" << session->get_state_name() << ") "
+ << session->info.inst << dendl;
+ ceph_assert(!session->is_importing());
+ journal_close_session(session, Session::STATE_CLOSING, NULL);
+ }
+}
+
class C_MDS_TerminatedSessions : public ServerContext {
void finish(int r) override {
server->terminating_sessions = false;
@@ -4464,7 +4490,6 @@ void Server::_lookup_ino_2(const MDRequestRef& mdr, int r)
}
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_open(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -4702,7 +4727,6 @@ bool Server::is_valid_layout(file_layout_t *layout)
return true;
}
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_openc(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -7169,7 +7193,6 @@ void Server::handle_client_mknod(const MDRequestRef& mdr)
// MKDIR
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mkdir(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -8767,8 +8790,6 @@ public:
* all other nodes have also replciated destdn and straydn. note that
* destdn replicas need not also replicate srci. this only works when
* destdn is leader.
- *
- * This function takes responsibility for the passed mdr.
*/
void Server::handle_client_rename(const MDRequestRef& mdr)
{
@@ -10913,7 +10934,6 @@ void Server::_peer_rename_sessions_flushed(const MDRequestRef& mdr)
}
// snaps
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_lssnap(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -11023,7 +11043,6 @@ struct C_MDS_mksnap_finish : public ServerLogContext {
}
};
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -11220,7 +11239,6 @@ struct C_MDS_rmsnap_finish : public ServerLogContext {
}
};
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_rmsnap(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
@@ -11350,7 +11368,6 @@ struct C_MDS_renamesnap_finish : public ServerLogContext {
}
};
-/* This function takes responsibility for the passed mdr*/
void Server::handle_client_renamesnap(const MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
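Editor's note: taken together, the new client_opened flag and close_forced_opened_sessions() give the import path a way to undo sessions it force-opened if the operation later aborts, while leaving sessions the client opened itself untouched. A rough sketch of the intended call sequence; the surrounding import-driver code and variable names (client_map, client_metadata_map) are hypothetical, only the Server methods come from this change:

  // Hypothetical import driver, for illustration only.
  std::map<client_t, std::pair<Session*, uint64_t>> smap;

  // Reserve/force-open sessions for the clients being imported.
  version_t pv = server->prepare_force_open_sessions(client_map, client_metadata_map, smap);

  // Once the import is journaled, open them for real; the state seq returned by
  // set_state() is now recorded back into smap for each opened session.
  server->finish_force_open_sessions(smap);

  // If the operation is later aborted, close only the sessions that were
  // force-opened and never opened by the client (client_opened == false).
  server->close_forced_opened_sessions(smap);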
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 68842ea01cb..5f9a763e550 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -129,8 +129,9 @@ public:
version_t prepare_force_open_sessions(std::map<client_t,entity_inst_t> &cm,
std::map<client_t,client_metadata_t>& cmm,
std::map<client_t,std::pair<Session*,uint64_t> >& smap);
- void finish_force_open_sessions(const std::map<client_t,std::pair<Session*,uint64_t> >& smap,
+ void finish_force_open_sessions(std::map<client_t,std::pair<Session*,uint64_t> >& smap,
bool dec_import=true);
+ void close_forced_opened_sessions(const std::map<client_t,std::pair<Session*,uint64_t> >& smap);
void flush_client_sessions(std::set<client_t>& client_set, MDSGatherBuilder& gather);
void finish_flush_session(Session *session, version_t seq);
void terminate_sessions();
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index ba0b0817738..0f6038eb82b 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -615,6 +615,7 @@ void Session::dump(Formatter *f, bool cap_dump) const
f->dump_unsigned("num_completed_requests", get_num_completed_requests());
f->dump_unsigned("num_completed_flushes", get_num_completed_flushes());
f->dump_bool("reconnecting", reconnecting);
+ f->dump_int("importing_count", importing_count);
f->dump_object("recall_caps", recall_caps);
f->dump_object("release_caps", release_caps);
f->dump_object("recall_caps_throttle", recall_caps_throttle);
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index 9e82f00a9bf..bfe7dcd4895 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -417,6 +417,10 @@ public:
session_info_t info; ///< durable bits
MDSAuthCaps auth_caps;
+  // True if the session was opened by the client.
+  // False if the session was force-opened; it stays false until the client opens it itself.
+ bool client_opened = false;
+
xlist<Session*>::item item_session_list;
std::list<ceph::ref_t<Message>> preopen_out_queue; ///< messages for client, queued before they connect
diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h
index 96b2cb7d8b8..b001032225e 100644
--- a/src/messages/MClientCaps.h
+++ b/src/messages/MClientCaps.h
@@ -117,9 +117,9 @@ private:
void set_ctime(const utime_t &t) { ctime = t; }
void set_atime(const utime_t &t) { atime = t; }
- void set_cap_peer(uint64_t id, ceph_seq_t seq, ceph_seq_t mseq, int mds, int flags) {
+ void set_cap_peer(uint64_t id, ceph_seq_t issue_seq, ceph_seq_t mseq, int mds, int flags) {
peer.cap_id = id;
- peer.seq = seq;
+ peer.issue_seq = issue_seq;
peer.mseq = mseq;
peer.mds = mds;
peer.flags = flags;
@@ -137,11 +137,12 @@ protected:
inodeno_t ino,
inodeno_t realm,
uint64_t id,
- long seq,
+ ceph_seq_t seq,
int caps,
int wanted,
int dirty,
- int mseq,
+ ceph_seq_t mseq,
+ ceph_seq_t issue_seq,
epoch_t oeb)
: SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION},
osd_epoch_barrier(oeb) {
@@ -155,11 +156,12 @@ protected:
head.wanted = wanted;
head.dirty = dirty;
head.migrate_seq = mseq;
+ head.issue_seq = issue_seq;
memset(&peer, 0, sizeof(peer));
}
MClientCaps(int op,
inodeno_t ino, inodeno_t realm,
- uint64_t id, int mseq, epoch_t oeb)
+ uint64_t id, ceph_seq_t mseq, epoch_t oeb)
: SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION},
osd_epoch_barrier(oeb) {
memset(&head, 0, sizeof(head));
@@ -181,7 +183,8 @@ public:
out << "client_caps(" << ceph_cap_op_name(head.op)
<< " ino " << inodeno_t(head.ino)
<< " " << head.cap_id
- << " seq " << head.seq;
+ << " seq " << head.seq
+ << " issue_seq " << head.issue_seq;
if (get_tid())
out << " tid " << get_tid();
out << " caps=" << ccap_string(head.caps)
diff --git a/src/messages/MOSDPGPCT.h b/src/messages/MOSDPGPCT.h
new file mode 100644
index 00000000000..b3f88314ec3
--- /dev/null
+++ b/src/messages/MOSDPGPCT.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2024 IBM, Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+
+#include "MOSDFastDispatchOp.h"
+
+class MOSDPGPCT final : public MOSDFastDispatchOp {
+private:
+ static constexpr int HEAD_VERSION = 1;
+ static constexpr int COMPAT_VERSION = 1;
+
+public:
+ /// epoch at which the message was sent
+ epoch_t map_epoch = 0;
+
+ /// start epoch of the interval in which the message was sent
+ epoch_t min_epoch = 0;
+
+ /// target pg
+ spg_t pgid;
+
+ /**
+ * pg_committed_to
+ *
+ * Propagates PeeringState::pg_committed_to to replicas as with
+ * MOSDRepOp, ECSubWrite, MOSDPGPCT.
+ */
+ eversion_t pg_committed_to;
+
+ epoch_t get_map_epoch() const override {
+ return map_epoch;
+ }
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
+ spg_t get_spg() const override {
+ return pgid;
+ }
+
+ MOSDPGPCT()
+ : MOSDFastDispatchOp{MSG_OSD_PG_PCT, HEAD_VERSION,
+ COMPAT_VERSION} {}
+ MOSDPGPCT(
+ spg_t pgid,
+ epoch_t epoch,
+ epoch_t min_epoch,
+ eversion_t pg_committed_to)
+ : MOSDFastDispatchOp{MSG_OSD_PG_PCT, HEAD_VERSION,
+ COMPAT_VERSION},
+ map_epoch(epoch),
+ min_epoch(min_epoch),
+ pgid(pgid),
+ pg_committed_to(pg_committed_to)
+ {}
+
+private:
+ ~MOSDPGPCT() final {}
+
+public:
+ std::string_view get_type_name() const override { return "PGPCT"; }
+ void print(std::ostream& out) const override {
+ out << "pg_pct(" << pgid << " epoch " << map_epoch
+ << "/" << min_epoch
+ << " pg_committed_to " << pg_committed_to
+ << ")";
+ }
+
+ void encode_payload(uint64_t features) override {
+ using ceph::encode;
+ encode(map_epoch, payload);
+ encode(min_epoch, payload);
+ encode(pgid, payload);
+ encode(pg_committed_to, payload);
+ }
+ void decode_payload() override {
+ using ceph::decode;
+ auto p = payload.cbegin();
+ decode(map_epoch, p);
+ decode(min_epoch, p);
+ decode(pgid, p);
+ decode(pg_committed_to, p);
+ }
+private:
+ template<class T, typename... Args>
+ friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
+};
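Editor's note: for context, a primary would build and send this message roughly as follows. The connection plumbing and variable names here are hypothetical; only the constructor arguments come from the header above:

  // Primary-side sketch: propagate the committed-to bound to a replica shard.
  auto m = ceph::make_message<MOSDPGPCT>(
    spg_t(pgid, shard),       // target pg
    osdmap_epoch,             // map_epoch: epoch at which the message is sent
    interval_start_epoch,     // min_epoch: start of the current interval
    pg_committed_to);         // eversion_t bound the replica may rely on
  replica_con->send_message2(std::move(m));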
diff --git a/src/messages/MOSDPGUpdateLogMissing.h b/src/messages/MOSDPGUpdateLogMissing.h
index 2a0011e8fb7..ebe678c6c31 100644
--- a/src/messages/MOSDPGUpdateLogMissing.h
+++ b/src/messages/MOSDPGUpdateLogMissing.h
@@ -31,7 +31,23 @@ public:
mempool::osd_pglog::list<pg_log_entry_t> entries;
// piggybacked osd/pg state
eversion_t pg_trim_to; // primary->replica: trim to here
- eversion_t pg_roll_forward_to; // primary->replica: trim rollback info to here
+
+ /**
+ * pg_committed_to
+ *
+ * Propagates PeeringState::pg_committed_to to replicas as with
+ * MOSDRepOp, ECSubWrite
+ *
+ * Historical Note: Prior to early 2024, this field was named
+ * pg_roll_forward_to. pg_committed_to is a safe value to roll forward to, as
+ * it is a conservative bound on versions that can become divergent. Switching
+ * it to be populated by pg_committed_to rather than mlcod mirrors MOSDRepOp,
+ * and upgrade cases in both directions should be safe as mlcod is <= pct
+ * and replicas (both ec and replicated) only actually rely on versions <= this
+ * field being non-divergent. This note may be removed in main after U is
+ * released.
+ */
+ eversion_t pg_committed_to;
epoch_t get_epoch() const { return map_epoch; }
spg_t get_pgid() const { return pgid; }
@@ -59,7 +75,7 @@ public:
epoch_t min_epoch,
ceph_tid_t rep_tid,
eversion_t pg_trim_to,
- eversion_t pg_roll_forward_to)
+ eversion_t pg_committed_to)
: MOSDFastDispatchOp{MSG_OSD_PG_UPDATE_LOG_MISSING, HEAD_VERSION,
COMPAT_VERSION},
map_epoch(epoch),
@@ -69,7 +85,7 @@ public:
rep_tid(rep_tid),
entries(entries),
pg_trim_to(pg_trim_to),
- pg_roll_forward_to(pg_roll_forward_to)
+ pg_committed_to(pg_committed_to)
{}
private:
@@ -83,7 +99,7 @@ public:
<< " rep_tid " << rep_tid
<< " entries " << entries
<< " trim_to " << pg_trim_to
- << " roll_forward_to " << pg_roll_forward_to
+ << " pg_committed_to " << pg_committed_to
<< ")";
}
@@ -96,7 +112,7 @@ public:
encode(entries, payload);
encode(min_epoch, payload);
encode(pg_trim_to, payload);
- encode(pg_roll_forward_to, payload);
+ encode(pg_committed_to, payload);
}
void decode_payload() override {
using ceph::decode;
@@ -113,7 +129,7 @@ public:
}
if (header.version >= 3) {
decode(pg_trim_to, p);
- decode(pg_roll_forward_to, p);
+ decode(pg_committed_to, p);
}
}
private:
diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h
index ecfe3294d1c..5e8b386ba0a 100644
--- a/src/messages/MOSDRepOp.h
+++ b/src/messages/MOSDRepOp.h
@@ -54,7 +54,30 @@ public:
// piggybacked osd/og state
eversion_t pg_trim_to; // primary->replica: trim to here
- eversion_t min_last_complete_ondisk; // lower bound on committed version
+
+ /**
+ * pg_committed_to
+ *
+ * Used by the primary to propagate pg_committed_to to replicas for use in
+ * serving replica reads.
+ *
+ * Because updates <= pg_committed_to cannot become divergent, replicas
+ * may safely serve reads on objects which do not have more recent updates.
+ *
+ * See PeeringState::pg_committed_to, PeeringState::can_serve_replica_read
+ *
+ * Historical note: Prior to early 2024, this field was named
+ * min_last_complete_ondisk. The replica, however, only actually relied on
+ * a single property of this field -- that any objects not modified since
+ * mlcod couldn't have uncommitted state. Weakening the field to the condition
+ * above is therefore safe -- mlcod is always <= pg_committed_to and
+ * sending pg_committed_to to a replica expecting mlcod will work correctly
+ * as it only actually uses mlcod to check replica reads. The primary difference
+ * between mlcod and pg_committed_to is simply that mlcod doesn't advance past
+ * objects missing on replicas, but we check for that anyway. This note may be
+ * removed in main after U is released.
+ */
+ eversion_t pg_committed_to;
hobject_t new_temp_oid; ///< new temp object that we must now start tracking
hobject_t discard_temp_oid; ///< previously used temp object that we can now stop tracking
@@ -110,14 +133,8 @@ public:
decode(from, p);
decode(updated_hit_set_history, p);
- if (header.version >= 3) {
- decode(min_last_complete_ondisk, p);
- } else {
- /* This field used to mean pg_roll_foward_to, but ReplicatedBackend
- * simply assumes that we're rolling foward to version. */
- eversion_t pg_roll_forward_to;
- decode(pg_roll_forward_to, p);
- }
+ ceph_assert(header.version >= 3);
+ decode(pg_committed_to, p);
final_decode_needed = false;
}
@@ -141,7 +158,7 @@ public:
encode(discard_temp_oid, payload);
encode(from, payload);
encode(updated_hit_set_history, payload);
- encode(min_last_complete_ondisk, payload);
+ encode(pg_committed_to, payload);
}
MOSDRepOp()
@@ -164,10 +181,6 @@ public:
set_tid(rtid);
}
- void set_rollback_to(const eversion_t &rollback_to) {
- header.version = 2;
- min_last_complete_ondisk = rollback_to;
- }
private:
~MOSDRepOp() final {}
@@ -180,11 +193,7 @@ public:
out << " " << poid << " v " << version;
if (updated_hit_set_history)
out << ", has_updated_hit_set_history";
- if (header.version < 3) {
- out << ", rollback_to(legacy)=" << min_last_complete_ondisk;
- } else {
- out << ", mlcod=" << min_last_complete_ondisk;
- }
+ out << ", pct=" << pg_committed_to;
}
out << ")";
}
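Editor's note: the long note above reduces to one invariant the replica actually depends on: an object whose most recent update is at or below pg_committed_to cannot carry divergent, uncommitted state, so a replica may serve a read for it. A hedged sketch of that check; the names are illustrative and the real logic lives in PeeringState::can_serve_replica_read:

  // Sketch only; not the PeeringState implementation.
  bool safe_for_replica_read(const eversion_t& object_last_update,
                             const eversion_t& pg_committed_to)
  {
    // Updates <= pg_committed_to can no longer become divergent, so any object
    // not modified past that bound has no uncommitted state to expose.
    return object_last_update <= pg_committed_to;
  }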
diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc
index 0eb304e7353..08501568a2c 100644
--- a/src/mgr/PyModuleRegistry.cc
+++ b/src/mgr/PyModuleRegistry.cc
@@ -151,7 +151,8 @@ bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_)
return false;
} else {
bool modules_changed = mgr_map_.modules != mgr_map.modules ||
- mgr_map_.always_on_modules != mgr_map.always_on_modules;
+ mgr_map_.always_on_modules != mgr_map.always_on_modules ||
+ mgr_map_.force_disabled_modules != mgr_map.force_disabled_modules;
mgr_map = mgr_map_;
if (standby_modules != nullptr) {
@@ -240,10 +241,20 @@ void PyModuleRegistry::active_start(
// Anything we're skipping because of !can_run will be flagged
// to the user separately via get_health_checks
if (!(i.second->is_enabled() && i.second->is_loaded())) {
+ dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
+            << "not enabled or not loaded" << dendl;
continue;
}
- dout(4) << "Starting " << i.first << dendl;
+ // These are always-on modules but user force-disabled them.
+ if (mgr_map.force_disabled_modules.find(i.first) !=
+ mgr_map.force_disabled_modules.end()) {
+ dout(8) << __func__ << " Not starting module '" << i.first << "', it is "
+ << "force-disabled" << dendl;
+ continue;
+ }
+
+ dout(4) << "Starting module '" << i.first << "'" << dendl;
active_modules->start_one(i.second);
}
}
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
index b935ace4aff..6220a357ff0 100644
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -385,16 +385,16 @@ public:
return -EINVAL;
}
- bool confirm = false;
- cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
- if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) {
- ss << "One or more file system health warnings are present. Modifying "
- << "the file system setting variable \"max_mds\" may not help "
- << "troubleshoot or recover from these warnings and may further "
- << "destabilize the system. If you really wish to proceed, run "
- << "again with --yes-i-really-mean-it";
- return -EPERM;
- }
+ bool confirm = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+ if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) {
+ ss << "One or more file system health warnings are present. Modifying "
+ << "the file system setting variable \"max_mds\" may not help "
+ << "troubleshoot or recover from these warnings and may further "
+ << "destabilize the system. If you really wish to proceed, run "
+ << "again with --yes-i-really-mean-it";
+ return -EPERM;
+ }
return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val);
}
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index d8cca4ceb61..f742303c6e9 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -758,6 +758,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
if (state == MDSMap::STATE_DNE) {
dout(1) << __func__ << ": DNE from " << info << dendl;
+
+ /* send a beacon reply so MDSDaemon::suicide() finishes the
+ Beacon::send_and_wait() call */
+ auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
+ m->get_global_id(), m->get_name(), get_fsmap().get_epoch(),
+ m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
+ mon.send_reply(op, beacon.detach());
+
goto evict;
}
diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h
index 82f6ea88046..1ab542a871f 100644
--- a/src/mon/MgrMap.h
+++ b/src/mon/MgrMap.h
@@ -297,6 +297,9 @@ public:
// active version.
std::map<uint32_t, std::set<std::string>> always_on_modules;
+ // Modules which are always-on but have been force-disabled by user.
+ std::set<std::string> force_disabled_modules;
+
// Modules which are reported to exist
std::vector<ModuleInfo> available_modules;
@@ -448,7 +451,7 @@ public:
ENCODE_FINISH(bl);
return;
}
- ENCODE_START(13, 6, bl);
+ ENCODE_START(14, 6, bl);
encode(epoch, bl);
encode(active_addrs, bl, features);
encode(active_gid, bl);
@@ -473,13 +476,14 @@ public:
encode(clients_addrs, bl, features);
encode(clients_names, bl, features);
encode(flags, bl);
+ encode(force_disabled_modules, bl);
ENCODE_FINISH(bl);
return;
}
void decode(ceph::buffer::list::const_iterator& p)
{
- DECODE_START(13, p);
+ DECODE_START(14, p);
decode(epoch, p);
decode(active_addrs, p);
decode(active_gid, p);
@@ -549,6 +553,11 @@ public:
if (struct_v >= 13) {
decode(flags, p);
}
+
+ if (struct_v >= 14) {
+ decode(force_disabled_modules, p);
+ }
+
DECODE_FINISH(p);
}
@@ -603,6 +612,13 @@ public:
f->close_section();
}
f->close_section(); // always_on_modules
+
+ f->open_object_section("force_disabled_modules");
+ for (auto& m : force_disabled_modules) {
+ f->dump_string("module", m);
+ }
+ f->close_section();
+
f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
f->open_array_section("active_clients");
for (const auto& i : clients) {
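Editor's note: the force_disabled_modules field follows the standard versioned-encoding recipe: bump the struct version (13 to 14), append the new field at the very end of encode(), and guard its decode on struct_v so maps written by older monitors still parse. Condensed, with all pre-existing fields elided, the pattern is:

  void encode(ceph::buffer::list& bl, uint64_t features) const {
    ENCODE_START(14, 6, bl);             // version bumped from 13 to 14
    // ... all pre-existing fields, unchanged and in their original order ...
    encode(force_disabled_modules, bl);  // new field always goes last
    ENCODE_FINISH(bl);
  }

  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(14, p);
    // ... pre-existing fields ...
    if (struct_v >= 14) {
      decode(force_disabled_modules, p); // absent in encodings from older mons
    }
    DECODE_FINISH(p);
  }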
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
index 30382b97052..b89878dddb7 100644
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -146,10 +146,12 @@ void MgrMonitor::create_initial()
}
pending_map.always_on_modules = always_on_modules();
pending_command_descs = mgr_commands;
- dout(10) << __func__ << " initial modules " << pending_map.modules
- << ", always on modules " << pending_map.get_always_on_modules()
- << ", " << pending_command_descs.size() << " commands"
+ dout(10) << __func__ << " initial enabled modules: " << pending_map.modules
<< dendl;
+  dout(10) << __func__ << " always on modules: " <<
+ pending_map.get_always_on_modules() << dendl;
+  dout(10) << __func__ << " total " << pending_command_descs.size() <<
+ " commands" << dendl;
}
void MgrMonitor::get_store_prefixes(std::set<string>& s) const
@@ -1019,6 +1021,13 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
f->dump_string("module", p);
}
f->close_section();
+
+ f->open_array_section("force_disabled_modules");
+ for (auto& p : map.force_disabled_modules) {
+ f->dump_string("module", p);
+ }
+ f->close_section();
+
f->open_array_section("enabled_modules");
for (auto& p : map.modules) {
if (map.get_always_on_modules().count(p) > 0)
@@ -1048,7 +1057,11 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
for (auto& p : map.get_always_on_modules()) {
tbl << p;
- tbl << "on (always on)";
+ if (map.force_disabled_modules.find(p) == map.force_disabled_modules.end()) {
+ tbl << "on (always on)";
+ } else {
+ tbl << "off (always on but force-disabled)";
+ }
tbl << TextTable::endrow;
}
for (auto& p : map.modules) {
@@ -1269,10 +1282,13 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
r = -EINVAL;
goto out;
}
- if (pending_map.get_always_on_modules().count(module) > 0) {
+
+ if (pending_map.get_always_on_modules().count(module) > 0 &&
+ !pending_map.force_disabled_modules.contains(module)) {
ss << "module '" << module << "' is already enabled (always-on)";
goto out;
}
+
bool force = false;
cmd_getval_compat_cephbool(cmdmap, "force", force);
if (!pending_map.all_support_module(module) &&
@@ -1296,7 +1312,12 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
ss << "module '" << module << "' is already enabled";
r = 0;
goto out;
+ } else if (pending_map.force_disabled_modules.contains(module)) {
+ pending_map.force_disabled_modules.erase(module);
+ r = 0;
+ goto out;
}
+
pending_map.modules.insert(module);
} else if (prefix == "mgr module disable") {
string module;
@@ -1306,8 +1327,9 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
goto out;
}
if (pending_map.get_always_on_modules().count(module) > 0) {
- ss << "module '" << module << "' cannot be disabled (always-on)";
- r = -EINVAL;
+      ss << "module '" << module << "' cannot be disabled (always-on); use the " <<
+ "'ceph mgr module force disable' command to disable an always-on module";
+ r = -EPERM;
goto out;
}
if (!pending_map.module_enabled(module)) {
@@ -1318,7 +1340,52 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
if (!pending_map.modules.count(module)) {
ss << "module '" << module << "' is not enabled";
}
+    dout(8) << __func__ << " disabling module " << module << " from the pending map" << dendl;
pending_map.modules.erase(module);
+ } else if (prefix == "mgr module force disable") {
+ string mod;
+ cmd_getval(cmdmap, "module", mod);
+
+ bool confirmation_flag = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirmation_flag);
+
+ if (mod.empty()) {
+ ss << "Module name wasn't passed!";
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (!pending_map.get_always_on_modules().contains(mod)) {
+ ss << "Always-on module named \"" << mod << "\" does not exist";
+ r = -EINVAL;
+ goto out;
+ } else if (pending_map.modules.contains(mod)) {
+ ss << "Module '" << mod << "' is not an always-on module, only always-on " <<
+ "modules can be disabled through this command.";
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (pending_map.force_disabled_modules.contains(mod)) {
+      ss << "Module \"" << mod << "\" is already disabled";
+ r = 0;
+ goto out;
+ }
+
+ if (!confirmation_flag) {
+ ss << "This command will disable operations and remove commands that "
+ << "other Ceph utilities expect to be available. Do not continue "
+ << "unless your cluster is already experiencing an event due to "
+ << "which it is advised to disable this module as part of "
+ << "troubleshooting. If you are sure that you wish to continue, "
+ << "run again with --yes-i-really-mean-it";
+ r = -EPERM;
+ goto out;
+ }
+
+ dout(8) << __func__ << " force-disabling module '" << mod << "'" << dendl;
+ pending_map.force_disabled_modules.insert(mod);
+ pending_map.modules.erase(mod);
} else {
ss << "Command '" << prefix << "' not implemented!";
r = -ENOSYS;
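Editor's note: in operator terms the new flow surfaces roughly as (the module name is only an example):

  ceph mgr module force disable telemetry --yes-i-really-mean-it

and a later 'ceph mgr module enable telemetry' simply clears the force-disabled flag rather than adding the module to the enabled set, since always-on modules are never listed there.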
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index b5de8837cb7..5564042eaf7 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -558,6 +558,11 @@ COMMAND("mon enable_stretch_mode " \
"as the tiebreaker and setting <dividing_bucket> locations "
"as the units for stretching across",
"mon", "rw")
+COMMAND("mon disable_stretch_mode " \
+ "name=crush_rule,type=CephString,req=false, "
+ "name=yes_i_really_mean_it,type=CephBool,req=false, ",
+ "disable stretch mode, reverting to normal peering rules",
+ "mon", "rw")
COMMAND("mon set_new_tiebreaker " \
"name=name,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
@@ -1161,11 +1166,11 @@ COMMAND("osd pool rename "
"rename <srcpool> to <destpool>", "osd", "rw")
COMMAND("osd pool get "
"name=pool,type=CephPoolname "
- "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio",
+ "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay",
"get pool parameter <var>", "osd", "r")
COMMAND("osd pool set "
"name=pool,type=CephPoolname "
- "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio "
+ "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay "
"name=val,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
"set pool parameter <var> to <val>", "osd", "rw")
@@ -1357,6 +1362,10 @@ COMMAND("mgr module enable "
COMMAND("mgr module disable "
"name=module,type=CephString",
"disable mgr module", "mgr", "rw")
+COMMAND("mgr module force disable "
+ "name=module,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+       "force disable an always-on mgr module", "mgr", "rw")
COMMAND("mgr metadata name=who,type=CephString,req=false",
"dump metadata for all daemons or a specific daemon",
"mgr", "r")
diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc
index 6eb37df171a..8d0540d71f2 100644
--- a/src/mon/MonMap.cc
+++ b/src/mon/MonMap.cc
@@ -196,7 +196,12 @@ void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const
if (!HAVE_FEATURE(con_features, MONENC) ||
!HAVE_FEATURE(con_features, SERVER_NAUTILUS)) {
for (auto& [name, info] : mon_info) {
- legacy_mon_addr[name] = info.public_addrs.legacy_addr();
+ // see note in mon_info_t::encode()
+ auto addr = info.public_addrs.legacy_addr();
+ if (addr == entity_addr_t()) {
+ addr = info.public_addrs.as_legacy_addr();
+ }
+ legacy_mon_addr[name] = addr;
}
}
@@ -431,10 +436,10 @@ void MonMap::dump(Formatter *f) const
f->dump_unsigned("min_mon_release", to_integer<unsigned>(min_mon_release));
f->dump_string("min_mon_release_name", to_string(min_mon_release));
f->dump_int ("election_strategy", strategy);
- f->dump_stream("disallowed_leaders: ") << disallowed_leaders;
+ f->dump_stream("disallowed_leaders") << disallowed_leaders;
f->dump_bool("stretch_mode", stretch_mode_enabled);
f->dump_string("tiebreaker_mon", tiebreaker_mon);
- f->dump_stream("removed_ranks: ") << removed_ranks;
+ f->dump_stream("removed_ranks") << removed_ranks;
f->open_object_section("features");
persistent_features.dump(f, "persistent");
optional_features.dump(f, "optional");
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 0d25c4b96ac..03826e3dd48 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -6688,6 +6688,8 @@ void Monitor::notify_new_monmap(bool can_change_external_state, bool remove_rank
if (monmap->stretch_mode_enabled) {
try_engage_stretch_mode();
+ } else {
+ try_disable_stretch_mode();
}
if (is_stretch_mode()) {
@@ -6746,6 +6748,32 @@ void Monitor::try_engage_stretch_mode()
disconnect_disallowed_stretch_sessions();
}
}
+struct CMonDisableStretchMode : public Context {
+ Monitor *m;
+ CMonDisableStretchMode(Monitor *mon) : m(mon) {}
+ void finish(int r) {
+ m->try_disable_stretch_mode();
+ }
+};
+void Monitor::try_disable_stretch_mode()
+{
+ dout(20) << __func__ << dendl;
+ if (!stretch_mode_engaged) return;
+ if (!osdmon()->is_readable()) {
+ dout(20) << "osdmon is not readable" << dendl;
+ osdmon()->wait_for_readable_ctx(new CMonDisableStretchMode(this));
+ return;
+ }
+ if (!osdmon()->osdmap.stretch_mode_enabled &&
+ !monmap->stretch_mode_enabled) {
+ dout(10) << "Disabling stretch mode!" << dendl;
+ stretch_mode_engaged = false;
+ stretch_bucket_divider.clear();
+ degraded_stretch_mode = false;
+ recovering_stretch_mode = false;
+ }
+
+}
void Monitor::do_stretch_mode_election_work()
{
@@ -6802,6 +6830,7 @@ struct CMonGoRecovery : public Context {
void Monitor::go_recovery_stretch_mode()
{
dout(20) << __func__ << dendl;
+ if (!is_stretch_mode()) return;
dout(20) << "is_leader(): " << is_leader() << dendl;
if (!is_leader()) return;
dout(20) << "is_degraded_stretch_mode(): " << is_degraded_stretch_mode() << dendl;
@@ -6832,6 +6861,7 @@ void Monitor::go_recovery_stretch_mode()
void Monitor::set_recovery_stretch_mode()
{
+ if (!is_stretch_mode()) return;
degraded_stretch_mode = true;
recovering_stretch_mode = true;
osdmon()->set_recovery_stretch_mode();
@@ -6840,6 +6870,7 @@ void Monitor::set_recovery_stretch_mode()
void Monitor::maybe_go_degraded_stretch_mode()
{
dout(20) << __func__ << dendl;
+ if (!is_stretch_mode()) return;
if (is_degraded_stretch_mode()) return;
if (!is_leader()) return;
if (dead_mon_buckets.empty()) return;
@@ -6878,6 +6909,7 @@ void Monitor::trigger_degraded_stretch_mode(const set<string>& dead_mons,
const set<int>& dead_buckets)
{
dout(20) << __func__ << dendl;
+ if (!is_stretch_mode()) return;
ceph_assert(osdmon()->is_writeable());
ceph_assert(monmon()->is_writeable());
@@ -6898,6 +6930,7 @@ void Monitor::trigger_degraded_stretch_mode(const set<string>& dead_mons,
void Monitor::set_degraded_stretch_mode()
{
dout(20) << __func__ << dendl;
+ if (!is_stretch_mode()) return;
degraded_stretch_mode = true;
recovering_stretch_mode = false;
osdmon()->set_degraded_stretch_mode();
@@ -6915,6 +6948,7 @@ struct CMonGoHealthy : public Context {
void Monitor::trigger_healthy_stretch_mode()
{
dout(20) << __func__ << dendl;
+ if (!is_stretch_mode()) return;
if (!is_degraded_stretch_mode()) return;
if (!is_leader()) return;
if (!osdmon()->is_writeable()) {
@@ -6935,6 +6969,7 @@ void Monitor::trigger_healthy_stretch_mode()
void Monitor::set_healthy_stretch_mode()
{
+ if (!is_stretch_mode()) return;
degraded_stretch_mode = false;
recovering_stretch_mode = false;
osdmon()->set_healthy_stretch_mode();
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 90fbc8f09c0..8c152fe108f 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -293,6 +293,7 @@ public:
* updates across the entire cluster.
*/
void try_engage_stretch_mode();
+ void try_disable_stretch_mode();
void maybe_go_degraded_stretch_mode();
void trigger_degraded_stretch_mode(const std::set<std::string>& dead_mons,
const std::set<int>& dead_buckets);
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 1226c8a8241..732238f4358 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -1187,6 +1187,42 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
ceph_assert(okay == true);
}
request_proposal(mon.osdmon());
+ } else if (prefix == "mon disable_stretch_mode") {
+ if (!mon.osdmon()->is_writeable()) {
+ dout(10) << __func__
+ << ": waiting for osdmon writeable for stretch mode" << dendl;
+ mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
+ return false; /* do not propose, yet */
+ }
+ bool sure = false;
+ bool okay = false;
+ int errcode = 0;
+ if (!pending_map.stretch_mode_enabled) {
+ ss << "stretch mode is already disabled";
+ err = -EINVAL;
+ goto reply_no_propose;
+ }
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+      ss << "This command will disable stretch mode, "
+            "which means all your pools will be reverted "
+ "to the default size, min_size and crush_rule. "
+ "Pass --yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply_no_propose;
+ }
+ string crush_rule = cmd_getval_or<string>(cmdmap, "crush_rule", string{});
+ mon.osdmon()->try_disable_stretch_mode(ss, &okay, &errcode, crush_rule);
+ if (!okay) {
+ err = errcode;
+ goto reply_no_propose;
+ }
+ pending_map.stretch_mode_enabled = false;
+ pending_map.tiebreaker_mon = "";
+ pending_map.disallowed_leaders.clear();
+ pending_map.stretch_marked_down_mons.clear();
+ pending_map.last_changed = ceph_clock_now();
+ request_proposal(mon.osdmon());
} else {
ss << "unknown command " << prefix;
err = -EINVAL;
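Editor's note: from the CLI this corresponds roughly to:

  ceph mon disable_stretch_mode [<crush_rule>] --yes-i-really-mean-it

where the optional crush rule, if given and valid, replaces the stretch rule on every pool; otherwise pools fall back to the default replicated rule (see OSDMonitor::try_disable_stretch_mode below).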
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index c01ea9e7103..719403925ad 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -16,7 +16,9 @@
#include "NVMeofGwMon.h"
#include "NVMeofGwMap.h"
#include "OSDMonitor.h"
+#include "mon/health_check.h"
+using std::list;
using std::map;
using std::make_pair;
using std::ostream;
@@ -665,6 +667,8 @@ void NVMeofGwMap::fsm_handle_gw_no_subsystems(
break;
case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
+ {
+ auto& gw_id_st = created_gws[group_key][gw_id];
cancel_timer(gw_id, group_key, grpid);
map_modified = true;
for (auto& gw_st: created_gws[group_key]) {
@@ -673,13 +677,18 @@ void NVMeofGwMap::fsm_handle_gw_no_subsystems(
if (st.sm_state[grpid] ==
gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
dout(4) << "Warning: Outgoing Failback when GW is without subsystems"
- << " - to rollback it" <<" GW " << gw_id << "for ANA Group "
+          << " Owner GW set to standby state " << gw_st.first << " for ANA Group "
<< grpid << dendl;
st.standby_state(grpid);
break;
}
}
- break;
+ dout(4) << "Warning: Outgoing Failback when GW is without subsystems"
+        << " Failback GW set to standby state " << gw_id << " for ANA Group "
+ << grpid << dendl;
+ gw_id_st.standby_state(grpid);
+ }
+ break;
case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
case gw_states_per_group_t::GW_ACTIVE_STATE:
@@ -716,6 +725,8 @@ void NVMeofGwMap::fsm_handle_gw_down(
break;
case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
+ {
+ auto& gw_id_st = created_gws[group_key][gw_id];
cancel_timer(gw_id, group_key, grpid);
map_modified = true;
for (auto& gw_st: created_gws[group_key]) {
@@ -724,13 +735,18 @@ void NVMeofGwMap::fsm_handle_gw_down(
if (st.sm_state[grpid] ==
gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
dout(4) << "Warning: Outgoing Failback when GW is down back"
- << " - to rollback it" <<" GW " << gw_id << "for ANA Group "
+          << " Owner GW set to standby state " << gw_id << " for ANA Group "
<< grpid << dendl;
st.standby_state(grpid);
break;
}
}
- break;
+ dout(4) << "Warning: Outgoing Failback when GW is down back"
+        << " Failback GW set to standby state " << gw_id << " for ANA Group "
+ << grpid << dendl;
+ gw_id_st.standby_state(grpid);
+ }
+ break;
case gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED:
// nothing to do - let failback timer expire
@@ -879,6 +895,47 @@ struct CMonRequestProposal : public Context {
}
};
+void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
+{
+ list<string> singleGatewayDetail;
+ list<string> gatewayDownDetail;
+ for (const auto& created_map_pair: created_gws) {
+ const auto& group_key = created_map_pair.first;
+ auto& group = group_key.second;
+ const NvmeGwMonStates& gw_created_map = created_map_pair.second;
+ if ( gw_created_map.size() == 1) {
+ ostringstream ss;
+ ss << "NVMeoF Gateway Group '" << group << "' has 1 gateway." ;
+ singleGatewayDetail.push_back(ss.str());
+ }
+ for (const auto& gw_created_pair: gw_created_map) {
+ const auto& gw_id = gw_created_pair.first;
+ const auto& gw_created = gw_created_pair.second;
+ if (gw_created.availability == gw_availability_t::GW_UNAVAILABLE) {
+ ostringstream ss;
+ ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ;
+ gatewayDownDetail.push_back(ss.str());
+ }
+ }
+ }
+ if (!singleGatewayDetail.empty()) {
+ ostringstream ss;
+ ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway"
+       << "; HA is not possible with a single gateway.";
+ auto& d = checks->add("NVMEOF_SINGLE_GATEWAY", HEALTH_WARN,
+ ss.str(), singleGatewayDetail.size());
+ d.detail.swap(singleGatewayDetail);
+ }
+ if (!gatewayDownDetail.empty()) {
+ ostringstream ss;
+    ss << gatewayDownDetail.size() << " gateway(s) are in an unavailable state"
+ << "; gateway might be down, try to redeploy.";
+ auto& d = checks->add("NVMEOF_GATEWAY_DOWN", HEALTH_WARN,
+ ss.str(), gatewayDownDetail.size());
+ d.detail.swap(gatewayDownDetail);
+ }
+}
+
int NVMeofGwMap::blocklist_gw(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 267d85b10f9..5f657733012 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -27,6 +27,9 @@
#include "NVMeofGwTypes.h"
using ceph::coarse_mono_clock;
+
+class health_check_map_t;
+
class Monitor;
/*-------------------*/
class NVMeofGwMap
@@ -140,6 +143,8 @@ public:
decode(fsm_timers, bl);
DECODE_FINISH(bl);
}
+
+ void get_health_checks(health_check_map_t *checks) const;
};
#include "NVMeofGwSerialize.h"
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index d9e936e27df..0fe5c3e655f 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -66,6 +66,11 @@ void NVMeofGwMon::on_shutdown()
void NVMeofGwMon::tick()
{
+ if (++tick_ratio == 10) {
+ global_rebalance_index++;
+ dout(20) << "rebalance index " << global_rebalance_index << dendl;
+ tick_ratio = 0;
+ }
if (!is_active() || !mon.is_leader()) {
dout(10) << "NVMeofGwMon leader : " << mon.is_leader()
<< "active : " << is_active() << dendl;
@@ -176,6 +181,11 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t)
<< HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl;
put_version(t, pending_map.epoch, bl);
put_last_committed(t, pending_map.epoch);
+
+ //health
+ health_check_map_t checks;
+ pending_map.get_health_checks(&checks);
+ encode_health(checks, t);
}
void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
@@ -188,6 +198,7 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
bufferlist bl;
int err = get_version(version, bl);
ceph_assert(err == 0);
+ load_health();
auto p = bl.cbegin();
map.decode(p);
@@ -317,6 +328,11 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
f->dump_string("group", group);
if (HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA)) {
f->dump_string("features", "LB");
+ if (map.created_gws[group_key].size()) {
+ uint32_t index = (global_rebalance_index %
+ map.created_gws[group_key].size()) + 1;
+ f->dump_unsigned("rebalance_ana_group", index);
+ }
}
f->dump_unsigned("num gws", map.created_gws[group_key].size());
if (map.created_gws[group_key].size() == 0) {
@@ -607,11 +623,13 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
if (sub.size() == 0) {
avail = gw_availability_t::GW_CREATED;
+ dout(20) << "No-subsystems condition detected for GW " << gw_id <<dendl;
} else {
- bool listener_found = false;
+ bool listener_found = true;
for (auto &subs: sub) {
- if (subs.listeners.size()) {
- listener_found = true;
+ if (subs.listeners.size() == 0) {
+ listener_found = false;
+ dout(10) << "No-listeners condition detected for GW " << gw_id << " for nqn " << subs.nqn << dendl;
break;
}
}
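Editor's note: concretely, tick() advances global_rebalance_index once every ten ticks, and the reported rebalance_ana_group is that counter modulo the number of gateways in the group, plus one; with four gateways in a group the value cycles through 1 to 4, changing each time the counter advances.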
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h
index 7fae8b766a5..2d13e153bd2 100644
--- a/src/mon/NVMeofGwMon.h
+++ b/src/mon/NVMeofGwMon.h
@@ -83,6 +83,9 @@ public:
void check_sub(Subscription *sub);
private:
+  // used to calculate the pool & group GW responsible for rebalance
+ uint32_t global_rebalance_index = 1;
+ uint8_t tick_ratio = 0;
void synchronize_last_beacon();
void process_gw_down(const NvmeGwId &gw_id,
const NvmeGroupKey& group_key, bool &propose_pending,
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index ecde838a74c..69be79b3a8f 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -983,6 +983,8 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
mon.maybe_go_degraded_stretch_mode();
}
+ } else {
+ mon.try_disable_stretch_mode();
}
}
@@ -15079,6 +15081,65 @@ void OSDMonitor::convert_pool_priorities(void)
}
}
+void OSDMonitor::try_disable_stretch_mode(stringstream& ss,
+ bool *okay,
+ int *errcode,
+ const string& crush_rule)
+{
+ dout(20) << __func__ << dendl;
+ *okay = false;
+ if (!osdmap.stretch_mode_enabled) {
+ ss << "stretch mode is already disabled";
+ *errcode = -EINVAL;
+ return;
+ }
+ if (osdmap.recovering_stretch_mode) {
+ ss << "stretch mode is currently recovering and cannot be disabled";
+ *errcode = -EBUSY;
+ return;
+ }
+ for (const auto& pi : osdmap.get_pools()) {
+ pg_pool_t *pool = pending_inc.get_new_pool(pi.first, &pi.second);
+ pool->peering_crush_bucket_count = 0;
+ pool->peering_crush_bucket_target = 0;
+ pool->peering_crush_bucket_barrier = 0;
+ pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+ pool->size = g_conf().get_val<uint64_t>("osd_pool_default_size");
+ pool->min_size = g_conf().get_osd_pool_default_min_size(pool->size);
+ // if crush rule is supplied, use it if it exists in crush map
+ if (!crush_rule.empty()) {
+ int crush_rule_id = osdmap.crush->get_rule_id(crush_rule);
+ if (crush_rule_id < 0) {
+ ss << "unrecognized crush rule " << crush_rule;
+ *errcode = -EINVAL;
+ return;
+ }
+ if (!osdmap.crush->rule_valid_for_pool_type(crush_rule_id, pool->get_type())) {
+ ss << "crush rule " << crush_rule << " type does not match pool type";
+ *errcode = -EINVAL;
+ return;
+ }
+ if (crush_rule_id == pool->crush_rule) {
+ ss << "You can't disable stretch mode with the same crush rule you are using";
+ *errcode = -EINVAL;
+ return;
+ }
+ pool->crush_rule = crush_rule_id;
+ } else {
+ // otherwise, use the default rule
+ pool->crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
+ }
+ }
+ pending_inc.change_stretch_mode = true;
+ pending_inc.stretch_mode_enabled = false;
+ pending_inc.new_stretch_bucket_count = 0;
+ pending_inc.new_degraded_stretch_mode = 0;
+ pending_inc.new_stretch_mode_bucket = 0;
+ pending_inc.new_recovering_stretch_mode = 0;
+ *okay = true;
+ return;
+}
+
void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
int *errcode,
set<pg_pool_t*>* pools,
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index ccd11be8a83..c82373c634d 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -845,6 +845,20 @@ public:
const std::set<pg_pool_t*>& pools,
const std::string& new_crush_rule);
/**
+ *
+ * Set all stretch mode values of all pools back to pre-stretch mode values.
+ * Set all stretch mode values of OSDMap back to pre-stretch mode values.
+ * If crush_rule is not empty, set the crush rule to that value, else use
+ * the default replicated crush rule.
+ * @param ss: a stringstream to write errors into
+   * @param okay: set to true on success, false on failure
+   * @param errcode: filled with -errno if there's a problem
+   * @param crush_rule: the crush rule that will be used after disabling stretch mode
+ */
+ void try_disable_stretch_mode(std::stringstream& ss,
+ bool *okay,
+ int *errcode,
+ const std::string& crush_rule);
+ /**
* Check the input dead_buckets mapping (buckets->dead monitors) to see
* if the OSDs are also down. If so, fill in really_down_buckets and
* really_down_mons and return true; else return false.
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index f649e0f3d3e..50af00db28d 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -219,6 +219,8 @@
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDPGPCT.h"
+
#include "messages/MNVMeofGwBeacon.h"
#include "messages/MNVMeofGwMap.h"
@@ -549,6 +551,9 @@ Message *decode_message(CephContext *cct,
case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
m = make_message<MOSDPGUpdateLogMissingReply>();
break;
+ case MSG_OSD_PG_PCT:
+ m = make_message<MOSDPGPCT>();
+ break;
case CEPH_MSG_OSD_BACKOFF:
m = make_message<MOSDBackoff>();
break;
diff --git a/src/msg/Message.h b/src/msg/Message.h
index bb67ff3eef5..80d2295c89f 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -136,6 +136,8 @@
#define MSG_OSD_PG_UPDATE_LOG_MISSING 114
#define MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY 115
+#define MSG_OSD_PG_PCT 136
+
#define MSG_OSD_PG_CREATED 116
#define MSG_OSD_REP_SCRUBMAP 117
#define MSG_OSD_PG_RECOVERY_DELETE 118
diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h
index 6739968f4e2..5f8bbc172df 100644
--- a/src/msg/async/Stack.h
+++ b/src/msg/async/Stack.h
@@ -352,7 +352,7 @@ class NetworkStack {
static constexpr int TASK_COMM_LEN = 16;
char tp_name[TASK_COMM_LEN];
sprintf(tp_name, "msgr-worker-%u", id);
- ceph_pthread_setname(pthread_self(), tp_name);
+ ceph_pthread_setname(tp_name);
}
protected:
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
index 12db599d684..789a624cf90 100644
--- a/src/msg/async/rdma/RDMAStack.cc
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -92,7 +92,6 @@ void RDMADispatcher::polling_start()
ceph_assert(rx_cq);
t = std::thread(&RDMADispatcher::polling, this);
- ceph_pthread_setname(t.native_handle(), "rdma-polling");
}
void RDMADispatcher::polling_stop()
@@ -263,6 +262,7 @@ int RDMADispatcher::post_chunks_to_rq(int num, QueuePair *qp)
void RDMADispatcher::polling()
{
+ ceph_pthread_setname("rdma-polling");
static int MAX_COMPLETIONS = 32;
ibv_wc wc[MAX_COMPLETIONS];
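Editor's note: the two hunks above reflect the signature change of ceph_pthread_setname: it no longer takes a pthread_t and instead names the calling thread, so the call moves from the spawning code into the thread body. A minimal sketch of the resulting pattern, assuming the compat helper truncates to the kernel's 15-character comm limit (TASK_COMM_LEN == 16, as noted in Stack.h):

  #include <thread>

  void polling_loop()
  {
    // Name the thread from inside its own body; only the calling thread
    // can be named with the new one-argument helper.
    ceph_pthread_setname("rdma-polling");
    // ... poll completion queues ...
  }

  std::thread t(polling_loop);  // no native_handle() needed just for naming
  t.join();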
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 535cf166f0a..a024a0c2105 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -14129,6 +14129,7 @@ void BlueStore::_txc_state_proc(TransContext *txc)
if (txc->had_ios)
kv_ios++;
kv_throttle_costs += txc->cost;
+ ++kv_throttle_txcs;
}
return;
case TransContext::STATE_KV_SUBMITTED:
@@ -14375,7 +14376,18 @@ void BlueStore::_txc_committed_kv(TransContext *txc)
mono_clock::now() - txc->start,
cct->_conf->bluestore_log_op_age,
[&](auto lat) {
- return ", txc = " + stringify(txc);
+ return ", txc = " + stringify(txc) +
+ ", txc bytes = " + stringify(txc->bytes) +
+ ", txc ios = " + stringify(txc->ios) +
+ ", txc cost = " + stringify(txc->cost) +
+ ", txc onodes = " + stringify(txc->onodes.size()) +
+ ", DB updates = " + stringify(txc->t->get_count()) +
+ ", DB bytes = " + stringify(txc->t->get_size_bytes()) +
+ ", cost max = " + stringify(throttle.bytes_observed_max) +
+ " on " + stringify(throttle.bytes_max_ts) +
+ ", txc max = " + stringify(throttle.transactions_observed_max) +
+ " on " + stringify(throttle.transactions_max_ts)
+ ;
},
l_bluestore_slow_committed_kv_count
);
@@ -14725,7 +14737,7 @@ void BlueStore::_kv_sync_thread()
} else {
deque<TransContext*> kv_submitting;
deque<DeferredBatch*> deferred_done, deferred_stable;
- uint64_t aios = 0, costs = 0;
+ uint64_t aios = 0, costs = 0, txcs = 0;
dout(20) << __func__ << " committing " << kv_queue.size()
<< " submitting " << kv_queue_unsubmitted.size()
@@ -14738,8 +14750,10 @@ void BlueStore::_kv_sync_thread()
deferred_stable.swap(deferred_stable_queue);
aios = kv_ios;
costs = kv_throttle_costs;
+ txcs = kv_throttle_txcs;
kv_ios = 0;
kv_throttle_costs = 0;
+ kv_throttle_txcs = 0;
l.unlock();
dout(30) << __func__ << " committing " << kv_committing << dendl;
@@ -14835,7 +14849,7 @@ void BlueStore::_kv_sync_thread()
// iteration there will already be ops awake. otherwise, we
// end up going to sleep, and then wake up when the very first
// transaction is ready for commit.
- throttle.release_kv_throttle(costs);
+ throttle.release_kv_throttle(costs, txcs);
// cleanup sync deferred keys
for (auto b : deferred_stable) {
@@ -18637,6 +18651,20 @@ bool BlueStore::BlueStoreThrottle::try_start_transaction(
TransContext &txc,
mono_clock::time_point start_throttle_acquire)
{
+ {
+ std::lock_guard l(lock);
+ auto cost0 = throttle_bytes.get_current();
+ if (cost0 + txc.cost > bytes_observed_max) {
+ bytes_observed_max = cost0 + txc.cost;
+ bytes_max_ts = ceph_clock_now();
+ }
+ auto txcs = ++transactions;
+ if (txcs > transactions_observed_max) {
+ transactions_observed_max = txcs;
+ transactions_max_ts = ceph_clock_now();
+ }
+ }
+
throttle_bytes.get(txc.cost);
if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 207ae2ec7a2..99f8d057cf0 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -2096,6 +2096,20 @@ public:
Throttle throttle_deferred_bytes; ///< submit to deferred complete
public:
+ ceph::mutex lock = ceph::make_mutex("BlueStoreThrottle::max_lock");
+
+ std::atomic<uint64_t> transactions = 0;
+
+ int64_t bytes_observed_max = 0;
+ utime_t bytes_max_ts;
+ uint64_t transactions_observed_max = 0;
+ utime_t transactions_max_ts;
+
+ uint64_t get_current() {
+ return throttle_bytes.get_current();
+ }
+
+ public:
BlueStoreThrottle(CephContext *cct) :
throttle_bytes(cct, "bluestore_throttle_bytes", 0),
throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
@@ -2121,8 +2135,9 @@ public:
KeyValueDB &db,
TransContext &txc,
ceph::mono_clock::time_point);
- void release_kv_throttle(uint64_t cost) {
+ void release_kv_throttle(uint64_t cost, uint64_t txcs) {
throttle_bytes.put(cost);
+ transactions -= txcs;
}
void release_deferred_throttle(uint64_t cost) {
throttle_deferred_bytes.put(cost);
@@ -2485,6 +2500,7 @@ private:
uint64_t kv_ios = 0;
uint64_t kv_throttle_costs = 0;
+ uint64_t kv_throttle_txcs = 0;
// cache trim control
uint64_t cache_size = 0; ///< total cache size
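Editor's note: the throttle bookkeeping above follows a simple observed-maximum pattern: on each transaction start the would-be level is compared against the recorded peak under a small mutex, and the timestamp of any new peak is kept so the slow-commit log line can later report how close the throttle came to saturation. A stripped-down sketch of the same idea, using the primitives already visible in the hunks above (names are illustrative):

  struct PeakTracker {
    ceph::mutex lock = ceph::make_mutex("PeakTracker::lock");
    int64_t observed_max = 0;
    utime_t max_ts;

    // Record a candidate level; remember the peak and when it happened.
    void observe(int64_t level) {
      std::lock_guard l(lock);
      if (level > observed_max) {
        observed_max = level;
        max_ts = ceph_clock_now();
      }
    }
  };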
diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc
index c24bf2161aa..d62721b4366 100644
--- a/src/os/bluestore/bluestore_tool.cc
+++ b/src/os/bluestore/bluestore_tool.cc
@@ -746,20 +746,25 @@ int main(int argc, char **argv)
else if (action == "show-label") {
JSONFormatter jf(true);
jf.open_object_section("devices");
+ bool any_success = false;
for (auto& i : devs) {
+ jf.open_object_section(i.c_str());
bluestore_bdev_label_t label;
int r = BlueStore::read_bdev_label(cct.get(), i, &label);
if (r < 0) {
- cerr << "unable to read label for " << i << ": "
- << cpp_strerror(r) << std::endl;
- exit(EXIT_FAILURE);
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ any_success = true;
+ label.dump(&jf);
}
- jf.open_object_section(i.c_str());
- label.dump(&jf);
jf.close_section();
}
jf.close_section();
jf.flush(cout);
+ if (!any_success) {
+ exit(EXIT_FAILURE);
+ }
}
else if (action == "set-label-key") {
bluestore_bdev_label_t label;
diff --git a/src/os/fs/btrfs_ioctl.h b/src/os/fs/btrfs_ioctl.h
deleted file mode 100644
index 277498ca8dc..00000000000
--- a/src/os/fs/btrfs_ioctl.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __IOCTL_
-#define __IOCTL_
-
-#if defined(__linux__)
-#include <linux/ioctl.h>
-#elif defined(__FreeBSD__)
-#include <sys/ioctl.h>
-#endif
-
-#define BTRFS_IOCTL_MAGIC 0x94
-#define BTRFS_VOL_NAME_MAX 255
-
-/* this should be 4k */
-#define BTRFS_PATH_NAME_MAX 4087
-struct btrfs_ioctl_vol_args {
- __s64 fd;
- char name[BTRFS_PATH_NAME_MAX + 1];
-};
-
-#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
-
-#define BTRFS_SUBVOL_NAME_MAX 4039
-struct btrfs_ioctl_vol_args_v2 {
- __s64 fd;
- __u64 transid;
- __u64 flags;
- __u64 unused[4];
- char name[BTRFS_SUBVOL_NAME_MAX + 1];
-};
-
-#define BTRFS_INO_LOOKUP_PATH_MAX 4080
-struct btrfs_ioctl_ino_lookup_args {
- __u64 treeid;
- __u64 objectid;
- char name[BTRFS_INO_LOOKUP_PATH_MAX];
-};
-
-struct btrfs_ioctl_search_key {
- /* which root are we searching. 0 is the tree of tree roots */
- __u64 tree_id;
-
- /* keys returned will be >= min and <= max */
- __u64 min_objectid;
- __u64 max_objectid;
-
- /* keys returned will be >= min and <= max */
- __u64 min_offset;
- __u64 max_offset;
-
- /* max and min transids to search for */
- __u64 min_transid;
- __u64 max_transid;
-
- /* keys returned will be >= min and <= max */
- __u32 min_type;
- __u32 max_type;
-
- /*
- * how many items did userland ask for, and how many are we
- * returning
- */
- __u32 nr_items;
-
- /* align to 64 bits */
- __u32 unused;
-
- /* some extra for later */
- __u64 unused1;
- __u64 unused2;
- __u64 unused3;
- __u64 unused4;
-};
-
-struct btrfs_ioctl_search_header {
- __u64 transid;
- __u64 objectid;
- __u64 offset;
- __u32 type;
- __u32 len;
-};
-
-#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
-/*
- * the buf is an array of search headers where
- * each header is followed by the actual item
- * the type field is expanded to 32 bits for alignment
- */
-struct btrfs_ioctl_search_args {
- struct btrfs_ioctl_search_key key;
- char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
-};
-
-struct btrfs_ioctl_clone_range_args {
- __s64 src_fd;
- __u64 src_offset, src_length;
- __u64 dest_offset;
-};
-
-/* flags for the defrag range ioctl */
-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
-#define BTRFS_DEFRAG_RANGE_START_IO 2
-
-struct btrfs_ioctl_defrag_range_args {
- /* start of the defrag operation */
- __u64 start;
-
- /* number of bytes to defrag, use (u64)-1 to say all */
- __u64 len;
-
- /*
- * flags for the operation, which can include turning
- * on compression for this one defrag
- */
- __u64 flags;
-
- /*
- * any extent bigger than this will be considered
- * already defragged. Use 0 to take the kernel default
- * Use 1 to say every single extent must be rewritten
- */
- __u32 extent_thresh;
-
- /* spare for later */
- __u32 unused[5];
-};
-
-struct btrfs_ioctl_space_info {
- __u64 flags;
- __u64 total_bytes;
- __u64 used_bytes;
-};
-
-struct btrfs_ioctl_space_args {
- __u64 space_slots;
- __u64 total_spaces;
- struct btrfs_ioctl_space_info spaces[0];
-};
-
-#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
- struct btrfs_ioctl_vol_args)
-/* trans start and trans end are dangerous, and only for
- * use by applications that know how to avoid the
- * resulting deadlocks
- */
-#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
-#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
-#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
-
-#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
- struct btrfs_ioctl_vol_args)
-
-#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
- struct btrfs_ioctl_clone_range_args)
-
-#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
- struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
- struct btrfs_ioctl_defrag_range_args)
-#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
- struct btrfs_ioctl_search_args)
-#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
- struct btrfs_ioctl_ino_lookup_args)
-#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
-#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
- struct btrfs_ioctl_space_args)
-#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
-#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
- struct btrfs_ioctl_vol_args_v2)
-#endif
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index beb9eacfd2a..fa2570aba42 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -983,8 +983,8 @@ void ECBackend::handle_sub_write(
std::move(op.log_entries),
op.updated_hit_set_history,
op.trim_to,
- op.roll_forward_to,
- op.roll_forward_to,
+ op.pg_committed_to,
+ op.pg_committed_to,
!op.backfill_or_async_recovery,
localt,
async);
@@ -1470,7 +1470,7 @@ void ECBackend::submit_transaction(
const eversion_t &at_version,
PGTransactionUPtr &&t,
const eversion_t &trim_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
vector<pg_log_entry_t>&& log_entries,
std::optional<pg_hit_set_history_t> &hset_history,
Context *on_all_commit,
@@ -1485,7 +1485,15 @@ void ECBackend::submit_transaction(
op->delta_stats = delta_stats;
op->version = at_version;
op->trim_to = trim_to;
- op->roll_forward_to = std::max(min_last_complete_ondisk, rmw_pipeline.committed_to);
+ /* We update PeeringState::pg_committed_to via the callback
+ * invoked from ECBackend::handle_sub_write_reply immediately
+ * before updating rmw_pipeline.committed_to via
+ * rmw_pipeline.check_ops()->try_finish_rmw(), so these will
+ * *usually* match. However, the PrimaryLogPG::submit_log_entries
+ * pathway can perform an out-of-band log update which updates
+ * PeeringState::pg_committed_to independently. Thus, the value
+ * passed in is the right one to use. */
+ op->pg_committed_to = pg_committed_to;
op->log_entries = log_entries;
std::swap(op->updated_hit_set_history, hset_history);
op->on_all_commit = on_all_commit;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 910cdc064e4..46317b60832 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -106,7 +106,7 @@ public:
const eversion_t &at_version,
PGTransactionUPtr &&t,
const eversion_t &trim_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
std::vector<pg_log_entry_t>&& log_entries,
std::optional<pg_hit_set_history_t> &hset_history,
Context *on_all_commit,
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 1fc87610502..609ac3141ae 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -158,7 +158,7 @@ ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::Op &rhs)
rhs.client_op->get_req()->print(lhs);
}
#endif
- lhs << " roll_forward_to=" << rhs.roll_forward_to
+ lhs << " pg_committed_to=" << rhs.pg_committed_to
<< " temp_added=" << rhs.temp_added
<< " temp_cleared=" << rhs.temp_cleared
<< " pending_read=" << rhs.pending_read
@@ -895,7 +895,7 @@ bool ECCommon::RMWPipeline::try_reads_to_commit()
should_send ? iter->second : empty,
op->version,
op->trim_to,
- op->roll_forward_to,
+ op->pg_committed_to,
op->log_entries,
op->updated_hit_set_history,
op->temp_added,
@@ -970,8 +970,8 @@ bool ECCommon::RMWPipeline::try_finish_rmw()
dout(10) << __func__ << ": " << *op << dendl;
dout(20) << __func__ << ": " << cache << dendl;
- if (op->roll_forward_to > completed_to)
- completed_to = op->roll_forward_to;
+ if (op->pg_committed_to > completed_to)
+ completed_to = op->pg_committed_to;
if (op->version > committed_to)
committed_to = op->version;
@@ -984,7 +984,7 @@ bool ECCommon::RMWPipeline::try_finish_rmw()
auto nop = std::make_unique<ECDummyOp>();
nop->hoid = op->hoid;
nop->trim_to = op->trim_to;
- nop->roll_forward_to = op->version;
+ nop->pg_committed_to = op->version;
nop->tid = tid;
nop->reqid = op->reqid;
waiting_reads.push_back(*nop);
diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h
index 88f2940111e..7ff9cae7646 100644
--- a/src/osd/ECCommon.h
+++ b/src/osd/ECCommon.h
@@ -200,7 +200,7 @@ struct ECListener {
const std::optional<pg_hit_set_history_t> &hset_history,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ceph::os::Transaction &t,
bool async = false) = 0;
@@ -522,7 +522,17 @@ struct ECCommon {
osd_reqid_t reqid;
ZTracer::Trace trace;
- eversion_t roll_forward_to; /// Soon to be generated internally
+ /**
+ * pg_committed_to
+ *
+ * Represents a version v such that all v' < v handled by RMWPipeline
+ * have fully committed. This may actually lag
+ * PeeringState::pg_committed_to if PrimaryLogPG::submit_log_entries
+ * submits an out-of-band log update.
+ *
+ * Soon to be generated internally.
+ */
+ eversion_t pg_committed_to;
/// Ancillary also provided from submit_transaction caller
std::map<hobject_t, ObjectContextRef> obc_map;
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
index a656766432f..ae0636f7d49 100644
--- a/src/osd/ECMsgTypes.cc
+++ b/src/osd/ECMsgTypes.cc
@@ -37,7 +37,7 @@ void ECSubWrite::encode(bufferlist &bl) const
encode(temp_added, bl);
encode(temp_removed, bl);
encode(updated_hit_set_history, bl);
- encode(roll_forward_to, bl);
+ encode(pg_committed_to, bl);
encode(backfill_or_async_recovery, bl);
ENCODE_FINISH(bl);
}
@@ -60,9 +60,9 @@ void ECSubWrite::decode(bufferlist::const_iterator &bl)
decode(updated_hit_set_history, bl);
}
if (struct_v >= 3) {
- decode(roll_forward_to, bl);
+ decode(pg_committed_to, bl);
} else {
- roll_forward_to = trim_to;
+ pg_committed_to = trim_to;
}
if (struct_v >= 4) {
decode(backfill_or_async_recovery, bl);
@@ -80,7 +80,7 @@ std::ostream &operator<<(
<< ", reqid=" << rhs.reqid
<< ", at_version=" << rhs.at_version
<< ", trim_to=" << rhs.trim_to
- << ", roll_forward_to=" << rhs.roll_forward_to;
+ << ", pg_committed_to=" << rhs.pg_committed_to;
if (rhs.updated_hit_set_history)
lhs << ", has_updated_hit_set_history";
if (rhs.backfill_or_async_recovery)
@@ -94,7 +94,7 @@ void ECSubWrite::dump(Formatter *f) const
f->dump_stream("reqid") << reqid;
f->dump_stream("at_version") << at_version;
f->dump_stream("trim_to") << trim_to;
- f->dump_stream("roll_forward_to") << roll_forward_to;
+ f->dump_stream("pg_committed_to") << pg_committed_to;
f->dump_bool("has_updated_hit_set_history",
static_cast<bool>(updated_hit_set_history));
f->dump_bool("backfill_or_async_recovery", backfill_or_async_recovery);
@@ -116,7 +116,7 @@ void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
o.back()->at_version = eversion_t(10, 300);
o.back()->trim_to = eversion_t(5, 42);
- o.back()->roll_forward_to = eversion_t(8, 250);
+ o.back()->pg_committed_to = eversion_t(8, 250);
}
void ECSubWriteReply::encode(bufferlist &bl) const
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
index 2d0bc5c1221..d0df1ad6fa1 100644
--- a/src/osd/ECMsgTypes.h
+++ b/src/osd/ECMsgTypes.h
@@ -31,7 +31,7 @@ struct ECSubWrite {
ObjectStore::Transaction t;
eversion_t at_version;
eversion_t trim_to;
- eversion_t roll_forward_to;
+ eversion_t pg_committed_to;
std::vector<pg_log_entry_t> log_entries;
std::set<hobject_t> temp_added;
std::set<hobject_t> temp_removed;
@@ -47,7 +47,7 @@ struct ECSubWrite {
const ObjectStore::Transaction &t,
eversion_t at_version,
eversion_t trim_to,
- eversion_t roll_forward_to,
+ eversion_t pg_committed_to,
std::vector<pg_log_entry_t> log_entries,
std::optional<pg_hit_set_history_t> updated_hit_set_history,
const std::set<hobject_t> &temp_added,
@@ -56,7 +56,7 @@ struct ECSubWrite {
: from(from), tid(tid), reqid(reqid),
soid(soid), stats(stats), t(t),
at_version(at_version),
- trim_to(trim_to), roll_forward_to(roll_forward_to),
+ trim_to(trim_to), pg_committed_to(pg_committed_to),
log_entries(log_entries),
temp_added(temp_added),
temp_removed(temp_removed),
@@ -72,7 +72,7 @@ struct ECSubWrite {
t.swap(other.t);
at_version = other.at_version;
trim_to = other.trim_to;
- roll_forward_to = other.roll_forward_to;
+ pg_committed_to = other.pg_committed_to;
log_entries.swap(other.log_entries);
temp_added.swap(other.temp_added);
temp_removed.swap(other.temp_removed);
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index ce46bb245ea..97fefc5e54a 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -504,6 +504,8 @@ void OSDService::shutdown_reserver()
void OSDService::shutdown()
{
+ pg_timer.stop();
+
mono_timer.suspend();
{
@@ -1064,7 +1066,12 @@ void OSDService::inc_osd_stat_repaired()
{
std::lock_guard l(stat_lock);
osd_stat.num_shards_repaired++;
- return;
+}
+
+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+ std::lock_guard l(stat_lock);
+ osd_stat.num_shards_repaired = count;
}
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
@@ -3219,6 +3226,11 @@ will start to track new ops received afterwards.";
scrub_purged_snaps();
}
+ else if (prefix == "clear_shards_repaired") {
+ int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 0);
+ service.set_osd_stat_repaired(count);
+ }
+
else if (prefix == "reset_purged_snaps_last") {
lock_guard l(osd_lock);
superblock.purged_snaps_last = 0;
@@ -4440,6 +4452,12 @@ void OSD::final_init()
asok_hook,
"debug the scrubber");
ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "clear_shards_repaired "
+ "name=count,type=CephInt,req=false,range=0",
+ asok_hook,
+ "clear num_shards_repaired to clear health warning");
+ ceph_assert(r == 0);
// -- pg commands --
// old form: ceph pg <pgid> command ...
@@ -9485,7 +9503,8 @@ void OSD::handle_pg_query_nopg(const MQuery& q)
q.query.epoch_sent,
osdmap->get_epoch(),
empty,
- PastIntervals()};
+ PastIntervals(),
+ PG_FEATURE_CLASSIC_ALL};
m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
std::move(notify));
}
@@ -10154,22 +10173,28 @@ void OSD::maybe_override_max_osd_capacity_for_qos()
<< dendl;
// Get the threshold IOPS set for the underlying hdd/ssd.
- double threshold_iops = 0.0;
+ double hi_threshold_iops = 0.0;
+ double lo_threshold_iops = 0.0;
if (store_is_rotational) {
- threshold_iops = cct->_conf.get_val<double>(
+ hi_threshold_iops = cct->_conf.get_val<double>(
"osd_mclock_iops_capacity_threshold_hdd");
+ lo_threshold_iops = cct->_conf.get_val<double>(
+ "osd_mclock_iops_capacity_low_threshold_hdd");
} else {
- threshold_iops = cct->_conf.get_val<double>(
+ hi_threshold_iops = cct->_conf.get_val<double>(
"osd_mclock_iops_capacity_threshold_ssd");
+ lo_threshold_iops = cct->_conf.get_val<double>(
+ "osd_mclock_iops_capacity_low_threshold_ssd");
}
// Persist the iops value to the MON store or throw cluster warning
- // if the measured iops exceeds the set threshold. If the iops exceed
- // the threshold, the default value is used.
- if (iops > threshold_iops) {
+ // if the measured iops falls outside the threshold range. In that case,
+ // the current/default value is retained.
+ if (iops < lo_threshold_iops || iops > hi_threshold_iops) {
clog->warn() << "OSD bench result of " << std::to_string(iops)
- << " IOPS exceeded the threshold limit of "
- << std::to_string(threshold_iops) << " IOPS for osd."
+ << " IOPS is not within the threshold limit range of "
+ << std::to_string(lo_threshold_iops) << " IOPS and "
+ << std::to_string(hi_threshold_iops) << " IOPS for osd."
<< std::to_string(whoami) << ". IOPS capacity is unchanged"
<< " at " << std::to_string(cur_iops) << " IOPS. The"
<< " recommendation is to establish the osd's IOPS capacity"
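The clear_shards_repaired handler above overwrites osd_stat.num_shards_repaired with the optional count argument (default 0). Assuming the usual admin-socket path for commands registered via register_command, it could be invoked as, for example, ceph daemon osd.2 clear_shards_repaired, or with an explicit value such as ceph daemon osd.2 clear_shards_repaired 0 (the osd id is illustrative), to clear the associated health warning.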
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 7c9aed7c6ba..25ca7236808 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -48,6 +48,7 @@
#include "include/unordered_map.h"
+#include "common/intrusive_timer.h"
#include "common/shared_cache.hpp"
#include "common/simple_cache.hpp"
#include "messages/MOSDOp.h"
@@ -731,6 +732,7 @@ public:
osd_alert_list_t& alerts);
osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs);
void inc_osd_stat_repaired(void);
+ void set_osd_stat_repaired(int64_t count);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);
@@ -877,6 +879,8 @@ public:
bool prepare_to_stop();
void got_stop_ack();
+ // -- PG timer --
+ common::intrusive_timer pg_timer;
#ifdef PG_DEBUG_REFS
ceph::mutex pgid_lock = ceph::make_mutex("OSDService::pgid_lock");
@@ -1941,6 +1945,7 @@ private:
case MSG_OSD_REP_SCRUBMAP:
case MSG_OSD_PG_UPDATE_LOG_MISSING:
case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ case MSG_OSD_PG_PCT:
case MSG_OSD_PG_RECOVERY_DELETE:
case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
case MSG_OSD_PG_LEASE:
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 71b9b713385..307651fd627 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -43,6 +43,7 @@
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDPGPCT.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepOp.h"
@@ -212,6 +213,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
p,
_pool,
curmap,
+ PG_FEATURE_CLASSIC_ALL,
this,
this),
pool(recovery_state.get_pgpool()),
@@ -2091,6 +2093,9 @@ bool PG::can_discard_request(OpRequestRef& op)
case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
return can_discard_replica_op<
MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
+ case MSG_OSD_PG_PCT:
+ return can_discard_replica_op<
+ MOSDPGPCT, MSG_OSD_PG_PCT>(op);
case MSG_OSD_PG_SCAN:
return can_discard_scan(op);
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 9cbb5e8e97c..b87aa1da677 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -20,6 +20,8 @@
#include "ECCommon.h"
#include "osd_types.h"
+#include "pg_features.h"
+#include "common/intrusive_timer.h"
#include "common/WorkQueue.h"
#include "include/Context.h"
#include "os/ObjectStore.h"
@@ -137,6 +139,17 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
Context *on_complete) = 0;
/**
+ * pg_lock, pg_unlock, pg_add_ref, pg_dec_ref
+ *
+ * Utilities for locking and manipulating refcounts on
+ * implementation.
+ */
+ virtual void pg_lock() = 0;
+ virtual void pg_unlock() = 0;
+ virtual void pg_add_ref() = 0;
+ virtual void pg_dec_ref() = 0;
+
+ /**
* Bless a context
*
* Wraps a context in whatever outer layers the parent usually
@@ -193,6 +206,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
virtual epoch_t pgb_get_osdmap_epoch() const = 0;
virtual const pg_info_t &get_info() const = 0;
virtual const pg_pool_t &get_pool() const = 0;
+ virtual eversion_t get_pg_committed_to() const = 0;
virtual ObjectContextRef get_obc(
const hobject_t &hoid,
@@ -219,7 +233,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
const std::optional<pg_hit_set_history_t> &hset_history,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &t,
bool async = false) = 0;
@@ -240,6 +254,9 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
virtual void update_last_complete_ondisk(
eversion_t lcod) = 0;
+ virtual void update_pct(
+ eversion_t pct) = 0;
+
virtual void update_stats(
const pg_stat_t &stat) = 0;
@@ -247,6 +264,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
GenContext<ThreadPool::TPHandle&> *c,
uint64_t cost) = 0;
+ virtual common::intrusive_timer &get_pg_timer() = 0;
+
virtual pg_shard_t whoami_shard() const = 0;
int whoami() const {
return whoami_shard().osd;
@@ -259,6 +278,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
virtual pg_shard_t primary_shard() const = 0;
virtual uint64_t min_peer_features() const = 0;
virtual uint64_t min_upacting_features() const = 0;
+ virtual pg_feature_vec_t get_pg_acting_features() const = 0;
virtual hobject_t get_temp_recovery_object(const hobject_t& target,
eversion_t version) = 0;
@@ -435,8 +455,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
const eversion_t &at_version, ///< [in] version
PGTransactionUPtr &&t, ///< [in] trans to execute (move)
const eversion_t &trim_to, ///< [in] trim log to here
- const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on
- /// committed version
+ const eversion_t &pg_committed_to, ///< [in] lower bound on
+ /// committed version
std::vector<pg_log_entry_t>&& log_entries, ///< [in] log entries for t
/// [in] hitset history (if updated with this transaction)
std::optional<pg_hit_set_history_t> &hset_history,
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 8d768ec4a66..334d202d207 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -109,6 +109,7 @@ PeeringState::PeeringState(
spg_t spgid,
const PGPool &_pool,
OSDMapRef curmap,
+ pg_feature_vec_t supported_pg_acting_features,
DoutPrefixProvider *dpp,
PeeringListener *pl)
: state_history(*pl),
@@ -122,6 +123,8 @@ PeeringState::PeeringState(
pg_whoami(pg_whoami),
info(spgid),
pg_log(cct),
+ local_pg_acting_features(supported_pg_acting_features),
+ pg_acting_features(local_pg_acting_features),
last_require_osd_release(curmap->require_osd_release),
missing_loc(spgid, this, dpp, cct),
machine(this, cct, spgid, dpp, pl, &state_history)
@@ -314,9 +317,11 @@ void PeeringState::query_unfound(Formatter *f, string state)
return;
}
-bool PeeringState::proc_replica_info(
- pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
+bool PeeringState::proc_replica_notify(const pg_shard_t &from, const pg_notify_t &notify)
{
+ const pg_info_t &oinfo = notify.info;
+ const epoch_t send_epoch = notify.epoch_sent;
+
auto p = peer_info.find(from);
if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
psdout(10) << " got dup osd." << from << " info "
@@ -346,6 +351,10 @@ bool PeeringState::proc_replica_info(
}
}
+ if (is_acting(from)) {
+ pg_acting_features &= notify.pg_features;
+ }
+
// was this a new info? if so, update peers!
if (p == peer_info.end())
update_heartbeat_peers();
@@ -746,6 +755,7 @@ void PeeringState::on_new_interval()
// initialize features
acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ pg_acting_features = local_pg_acting_features;
for (auto p = acting.begin(); p != acting.end(); ++p) {
if (*p == CRUSH_ITEM_NONE)
continue;
@@ -900,7 +910,7 @@ void PeeringState::clear_primary_state()
clear_recovery_state();
- last_update_ondisk = eversion_t();
+ pg_committed_to = eversion_t();
missing_loc.clear();
pl->clear_primary_state();
}
@@ -1404,9 +1414,8 @@ bool PeeringState::needs_backfill() const
bool PeeringState::can_serve_replica_read(const hobject_t &hoid)
{
ceph_assert(!is_primary());
- eversion_t min_last_complete_ondisk = get_min_last_complete_ondisk();
if (!pg_log.get_log().has_write_since(
- hoid, min_last_complete_ondisk)) {
+ hoid, pg_committed_to)) {
psdout(20) << "can be safely read on this replica" << dendl;
return true;
} else {
@@ -2663,6 +2672,10 @@ void PeeringState::activate(
info.last_epoch_started <= activation_epoch);
info.last_epoch_started = activation_epoch;
info.last_interval_started = info.history.same_interval_since;
+
+ // updating last_epoch_started ensures that last_update will not
+ // become divergent after activation completes.
+ pg_committed_to = info.last_update;
}
} else if (is_acting(pg_whoami)) {
/* update last_epoch_started on acting replica to whatever the primary sent
@@ -2671,15 +2684,16 @@ void PeeringState::activate(
if (info.last_epoch_started < activation_epoch) {
info.last_epoch_started = activation_epoch;
info.last_interval_started = info.history.same_interval_since;
+
+ // updating last_epoch_started ensures that last_update will not
+ // become divergent after activation completes.
+ pg_committed_to = info.last_update;
}
}
auto &missing = pg_log.get_missing();
min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
- if (is_primary()) {
- last_update_ondisk = info.last_update;
- }
last_update_applied = info.last_update;
last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
@@ -3033,9 +3047,7 @@ void PeeringState::proc_primary_info(
ceph_assert(!is_primary());
update_history(oinfo.history);
- bool has_scrub_error = (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors);
- info.stats = oinfo.stats;
- if (has_scrub_error) {
+ if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
info.stats.stats.sum.num_scrub_errors = 0;
info.stats.stats.sum.num_shallow_scrub_errors = 0;
info.stats.stats.sum.num_deep_scrub_errors = 0;
@@ -3203,7 +3215,8 @@ void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
query.query_epoch,
get_osdmap_epoch(),
notify_info.second,
- past_intervals));
+ past_intervals,
+ local_pg_acting_features));
} else {
update_history(query.query.history);
fulfill_log(query.from, query.query, query.query_epoch);
@@ -4072,7 +4085,7 @@ void PeeringState::update_stats_wo_resched(
bool PeeringState::append_log_entries_update_missing(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
- std::optional<eversion_t> roll_forward_to)
+ std::optional<eversion_t> pg_committed_to)
{
ceph_assert(!entries.empty());
ceph_assert(entries.begin()->version > info.last_update);
@@ -4084,12 +4097,12 @@ bool PeeringState::append_log_entries_update_missing(
entries,
rollbacker.get());
- if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
+ if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) {
pg_log.roll_forward(rollbacker.get());
}
- if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
- pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
- last_rollback_info_trimmed_to_applied = *roll_forward_to;
+ if (pg_committed_to && *pg_committed_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(*pg_committed_to, rollbacker.get());
+ last_rollback_info_trimmed_to_applied = *pg_committed_to;
}
info.last_update = pg_log.get_head();
@@ -4113,12 +4126,13 @@ void PeeringState::merge_new_log_entries(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
ObjectStore::Transaction &t,
std::optional<eversion_t> trim_to,
- std::optional<eversion_t> roll_forward_to)
+ std::optional<eversion_t> pg_committed_to)
{
psdout(10) << entries << dendl;
ceph_assert(is_primary());
- bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
+ bool rebuild_missing = append_log_entries_update_missing(
+ entries, t, trim_to, pg_committed_to);
for (auto i = acting_recovery_backfill.begin();
i != acting_recovery_backfill.end();
++i) {
@@ -4186,7 +4200,7 @@ void PeeringState::append_log(
vector<pg_log_entry_t>&& logv,
eversion_t trim_to,
eversion_t roll_forward_to,
- eversion_t mlcod,
+ eversion_t pct,
ObjectStore::Transaction &t,
bool transaction_applied,
bool async)
@@ -4252,7 +4266,7 @@ void PeeringState::append_log(
write_if_dirty(t);
if (!is_primary())
- min_last_complete_ondisk = mlcod;
+ pg_committed_to = pct;
}
void PeeringState::recover_got(
@@ -4439,7 +4453,7 @@ void PeeringState::recovery_committed_to(eversion_t version)
void PeeringState::complete_write(eversion_t v, eversion_t lc)
{
- last_update_ondisk = v;
+ pg_committed_to = v;
last_complete_ondisk = lc;
calc_min_last_complete_ondisk();
}
@@ -4486,7 +4500,7 @@ void PeeringState::calc_trim_to_aggressive()
eversion_t limit = std::min({
pg_log.get_head(),
pg_log.get_can_rollback_to(),
- last_update_ondisk});
+ pg_committed_to});
psdout(10) << "limit = " << limit << dendl;
if (limit != eversion_t() &&
@@ -4646,8 +4660,7 @@ PeeringState::Initial::Initial(my_context ctx)
boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
{
DECLARE_LOCALS;
- ps->proc_replica_info(
- notify.from, notify.notify.info, notify.notify.epoch_sent);
+ ps->proc_replica_notify(notify.from, notify.notify);
ps->set_last_peering_reset();
return transit< Primary >();
}
@@ -4800,7 +4813,8 @@ boost::statechart::result PeeringState::Reset::react(const ActMap&)
ps->get_osdmap_epoch(),
ps->get_osdmap_epoch(),
ps->info,
- ps->past_intervals));
+ ps->past_intervals,
+ ps->local_pg_acting_features));
}
ps->update_heartbeat_peers();
@@ -4890,8 +4904,7 @@ boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
{
DECLARE_LOCALS;
psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
- ps->proc_replica_info(
- notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ ps->proc_replica_notify(notevt.from, notevt.notify);
return discard_event();
}
@@ -6106,10 +6119,9 @@ boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
<< dendl;
} else {
psdout(10) << "Active: got notify from " << notevt.from
- << ", calling proc_replica_info and discover_all_missing"
+ << ", calling proc_replica_notify and discover_all_missing"
<< dendl;
- ps->proc_replica_info(
- notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ ps->proc_replica_notify(notevt.from, notevt.notify);
if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
ps->discover_all_missing(
context<PeeringMachine>().get_recovery_ctx().msgs);
@@ -6532,7 +6544,8 @@ boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
ps->get_osdmap_epoch(),
ps->get_osdmap_epoch(),
ps->info,
- ps->past_intervals));
+ ps->past_intervals,
+ ps->local_pg_acting_features));
}
return discard_event();
}
@@ -6669,7 +6682,8 @@ boost::statechart::result PeeringState::Stray::react(const ActMap&)
ps->get_osdmap_epoch(),
ps->get_osdmap_epoch(),
ps->info,
- ps->past_intervals));
+ ps->past_intervals,
+ ps->local_pg_acting_features));
}
return discard_event();
}
@@ -6868,8 +6882,7 @@ boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt
}
epoch_t old_start = ps->info.history.last_epoch_started;
- if (ps->proc_replica_info(
- infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
+ if (ps->proc_replica_notify(infoevt.from, infoevt.notify)) {
// we got something new ...
PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
if (old_start < ps->info.history.last_epoch_started) {
@@ -6900,6 +6913,7 @@ boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt
psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
+ psdout(20) << "Common pg_acting_features: " << hex << ps->get_pg_acting_features() << dec << dendl;
post_event(GotInfo());
}
}
@@ -7262,8 +7276,7 @@ boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap)
boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
DECLARE_LOCALS;
psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
- if (ps->proc_replica_info(
- notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
+ if (ps->proc_replica_notify(notevt.from, notevt.notify)) {
// We got something new, try again!
return transit< GetLog >();
} else {
@@ -7553,8 +7566,8 @@ ostream &operator<<(ostream &out, const PeeringState &ps) {
}
if (ps.is_peered()) {
- if (ps.last_update_ondisk != ps.info.last_update)
- out << " luod=" << ps.last_update_ondisk;
+ if (ps.pg_committed_to != ps.info.last_update)
+ out << " pct=" << ps.pg_committed_to;
if (ps.last_update_applied != ps.info.last_update)
out << " lua=" << ps.last_update_applied;
}
@@ -7577,7 +7590,8 @@ ostream &operator<<(ostream &out, const PeeringState &ps) {
if (ps.last_complete_ondisk != ps.info.last_complete)
out << " lcod " << ps.last_complete_ondisk;
- out << " mlcod " << ps.min_last_complete_ondisk;
+ if (ps.is_primary())
+ out << " mlcod " << ps.min_last_complete_ondisk;
out << " " << pg_state_string(ps.get_state());
if (ps.should_send_notify())
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index 11ac084a054..4b5285b1878 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -1470,8 +1470,24 @@ public:
epoch_t last_peering_reset = 0; ///< epoch of last peering reset
- /// last_update that has committed; ONLY DEFINED WHEN is_active()
- eversion_t last_update_ondisk;
+ /**
+ * pg_committed_to
+ *
+ * Maintained on the primary while pg is active (and not merely peered).
+ *
+ * Forall e <= pg_committed_to, e has been committed on all replicas.
+ *
+ * As a consequence:
+ * - No version e <= pg_committed_to can become divergent
+ * - It is safe for replicas to read any object whose most recent update is
+ * <= pg_committed_to
+ *
+ * Note that if the PG is only peered, pg_committed_to will not be set
+ * and will remain eversion_t{} as we cannot guarantee that last_update
+ * at activation will not later become divergent.
+ */
+ eversion_t pg_committed_to;
+
eversion_t last_complete_ondisk; ///< last_complete that has committed.
eversion_t last_update_applied; ///< last_update readable
/// last version to which rollback_info trimming has been applied
@@ -1491,6 +1507,18 @@ public:
std::set<pg_shard_t> peer_log_requested; ///< logs i've requested (and start stamps)
std::set<pg_shard_t> peer_missing_requested; ///< missing sets requested
+ /// not constexpr because classic/crimson might differ
+ const pg_feature_vec_t local_pg_acting_features;
+
+ /**
+ * pg_acting_features
+ *
+ * PG specific features common to entire acting set. Valid only on primary
+ * after activation.
+ */
+ pg_feature_vec_t pg_acting_features;
+
+
/// features supported by all peers
uint64_t peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
/// features supported by acting set
@@ -1541,8 +1569,7 @@ public:
void update_heartbeat_peers();
void query_unfound(Formatter *f, std::string state);
- bool proc_replica_info(
- pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch);
+ bool proc_replica_notify(const pg_shard_t &from, const pg_notify_t &notify);
void remove_down_peer_info(const OSDMapRef &osdmap);
void check_recovery_sources(const OSDMapRef& map);
void set_last_peering_reset();
@@ -1750,6 +1777,7 @@ public:
spg_t spgid,
const PGPool &pool,
OSDMapRef curmap,
+ pg_feature_vec_t supported_pg_acting_features,
DoutPrefixProvider *dpp,
PeeringListener *pl);
@@ -1899,18 +1927,7 @@ public:
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
ObjectStore::Transaction &t,
std::optional<eversion_t> trim_to,
- std::optional<eversion_t> roll_forward_to);
-
- void append_log_with_trim_to_updated(
- std::vector<pg_log_entry_t>&& log_entries,
- eversion_t roll_forward_to,
- ObjectStore::Transaction &t,
- bool transaction_applied,
- bool async) {
- update_trim_to();
- append_log(std::move(log_entries), pg_trim_to, roll_forward_to,
- min_last_complete_ondisk, t, transaction_applied, async);
- }
+ std::optional<eversion_t> pg_committed_to);
/**
* Updates local log to reflect new write from primary.
@@ -1919,12 +1936,22 @@ public:
std::vector<pg_log_entry_t>&& logv,
eversion_t trim_to,
eversion_t roll_forward_to,
- eversion_t min_last_complete_ondisk,
+ eversion_t pg_committed_to,
ObjectStore::Transaction &t,
bool transaction_applied,
bool async);
/**
+ * update_pct
+ *
+ * Updates pg_committed_to. Generally invoked on replica on
+ * receipt of MOSDPGPCT from the primary.
+ */
+ void update_pct(eversion_t pct) {
+ pg_committed_to = pct;
+ }
+
+ /**
* retrieve the min last_backfill among backfill targets
*/
hobject_t earliest_backfill() const;
@@ -1937,7 +1964,7 @@ public:
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
ObjectStore::Transaction &t,
std::optional<eversion_t> trim_to,
- std::optional<eversion_t> roll_forward_to);
+ std::optional<eversion_t> pg_committed_to);
/// Update missing set to reflect e (TODOSAM: not sure why this is needed)
void add_local_next_event(const pg_log_entry_t& e) {
@@ -2412,10 +2439,6 @@ public:
return missing_loc.get_missing_by_count();
}
- eversion_t get_min_last_complete_ondisk() const {
- return min_last_complete_ondisk;
- }
-
eversion_t get_pg_trim_to() const {
return pg_trim_to;
}
@@ -2424,8 +2447,8 @@ public:
return last_update_applied;
}
- eversion_t get_last_update_ondisk() const {
- return last_update_ondisk;
+ eversion_t get_pg_committed_to() const {
+ return pg_committed_to;
}
bool debug_has_dirty_state() const {
@@ -2467,6 +2490,8 @@ public:
/// Get feature vector common to up/acting set
uint64_t get_min_upacting_features() const { return upacting_features; }
+ /// Get pg features common to acting set
+ pg_feature_vec_t get_pg_acting_features() const { return pg_acting_features; }
// Flush control interface
private:
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 2f2ae4a22db..44f8e85b5ef 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -543,6 +543,11 @@ void PrimaryLogPG::schedule_recovery_work(
recovery_state.get_recovery_op_priority());
}
+common::intrusive_timer &PrimaryLogPG::get_pg_timer()
+{
+ return osd->pg_timer;
+}
+
void PrimaryLogPG::replica_clear_repop_obc(
const vector<pg_log_entry_t> &logv,
ObjectStore::Transaction &t)
@@ -2053,6 +2058,10 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
}
}
+ if (!is_primary()) {
+ osd->logger->inc(l_osd_replica_read);
+ }
+
if (!check_laggy(op)) {
return;
}
@@ -2183,6 +2192,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
// missing object?
if (is_unreadable_object(head)) {
if (!is_primary()) {
+ osd->logger->inc(l_osd_replica_read_redirect_missing);
osd->reply_op_error(op, -EAGAIN);
return;
}
@@ -2314,11 +2324,13 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
dout(20) << __func__
<< ": unstable write on replica, bouncing to primary "
<< *m << dendl;
+ osd->logger->inc(l_osd_replica_read_redirect_conflict);
osd->reply_op_error(op, -EAGAIN);
return;
}
dout(20) << __func__ << ": serving replica read on oid " << oid
<< dendl;
+ osd->logger->inc(l_osd_replica_read_served);
}
int r = find_object_context(
@@ -5994,7 +6006,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
object_info_t& oi = obs.oi;
const hobject_t& soid = oi.soid;
const bool skip_data_digest = osd->store->has_builtin_csum() &&
- osd->osd_skip_data_digest;
+ *osd->osd_skip_data_digest;
PGTransaction* t = ctx->op_t.get();
@@ -6057,9 +6069,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
// munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
if (op.op == CEPH_OSD_OP_ZERO &&
obs.exists &&
- op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.offset < *osd->osd_max_object_size &&
op.extent.length >= 1 &&
- op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.length <= *osd->osd_max_object_size &&
op.extent.offset + op.extent.length >= oi.size) {
if (op.extent.offset >= oi.size) {
// no-op
@@ -6769,7 +6781,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
result = check_offset_and_length(
op.extent.offset, op.extent.length,
- static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ *osd->osd_max_object_size, get_dpp());
if (result < 0)
break;
@@ -6826,7 +6838,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
result = check_offset_and_length(
0, op.extent.length,
- static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ *osd->osd_max_object_size, get_dpp());
if (result < 0)
break;
@@ -6876,7 +6888,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
{ // zero
result = check_offset_and_length(
op.extent.offset, op.extent.length,
- static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ *osd->osd_max_object_size, get_dpp());
if (result < 0)
break;
@@ -6941,7 +6953,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = check_offset_and_length(
op.extent.offset, op.extent.length,
- static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ *osd->osd_max_object_size, get_dpp());
if (result < 0)
break;
@@ -11491,7 +11503,7 @@ void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
ctx->at_version,
std::move(ctx->op_t),
recovery_state.get_pg_trim_to(),
- recovery_state.get_min_last_complete_ondisk(),
+ recovery_state.get_pg_committed_to(),
std::move(ctx->log),
ctx->updated_hset_history,
on_all_commit,
@@ -11623,7 +11635,7 @@ void PrimaryLogPG::submit_log_entries(
eversion_t old_last_update = info.last_update;
recovery_state.merge_new_log_entries(
entries, t, recovery_state.get_pg_trim_to(),
- recovery_state.get_min_last_complete_ondisk());
+ recovery_state.get_pg_committed_to());
set<pg_shard_t> waiting_on;
for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
@@ -11643,7 +11655,7 @@ void PrimaryLogPG::submit_log_entries(
get_last_peering_reset(),
repop->rep_tid,
recovery_state.get_pg_trim_to(),
- recovery_state.get_min_last_complete_ondisk());
+ recovery_state.get_pg_committed_to());
osd->send_message_osd_cluster(
peer.osd, m, get_osdmap_epoch());
waiting_on.insert(peer);
@@ -12644,17 +12656,18 @@ void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
op->get_req());
ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
ObjectStore::Transaction t;
- std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ std::optional<eversion_t> op_trim_to, op_pg_committed_to;
if (m->pg_trim_to != eversion_t())
op_trim_to = m->pg_trim_to;
- if (m->pg_roll_forward_to != eversion_t())
- op_roll_forward_to = m->pg_roll_forward_to;
+ if (m->pg_committed_to != eversion_t())
+ op_pg_committed_to = m->pg_committed_to;
dout(20) << __func__
- << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
+ << " op_trim_to = " << op_trim_to << " op_pg_committed_to = "
+ << op_pg_committed_to << dendl;
recovery_state.append_log_entries_update_missing(
- m->entries, t, op_trim_to, op_roll_forward_to);
+ m->entries, t, op_trim_to, op_pg_committed_to);
eversion_t new_lcod = info.last_complete;
Context *complete = new LambdaContext(
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 323b66e02a7..f66b5c6e16a 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -27,6 +27,7 @@
#include "messages/MOSDOpReply.h"
#include "common/admin_finisher.h"
#include "common/Checksummer.h"
+#include "common/intrusive_timer.h"
#include "common/sharedptr_registry.hpp"
#include "common/shared_cache.hpp"
#include "ReplicatedBackend.h"
@@ -349,6 +350,19 @@ public:
eversion_t v,
Context *on_complete) override;
+ void pg_lock() override {
+ lock();
+ }
+ void pg_unlock() override {
+ unlock();
+ }
+ void pg_add_ref() override {
+ intrusive_ptr_add_ref(this);
+ }
+ void pg_dec_ref() override {
+ intrusive_ptr_release(this);
+ }
+
template<class T> class BlessedGenContext;
template<class T> class UnlockedBlessedGenContext;
class BlessedContext;
@@ -439,6 +453,9 @@ public:
const pg_pool_t &get_pool() const override {
return pool.info;
}
+ eversion_t get_pg_committed_to() const override {
+ return recovery_state.get_pg_committed_to();
+ }
ObjectContextRef get_obc(
const hobject_t &hoid,
@@ -497,12 +514,12 @@ public:
const std::optional<pg_hit_set_history_t> &hset_history,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &t,
bool async = false) override {
if (is_primary()) {
- ceph_assert(trim_to <= recovery_state.get_last_update_ondisk());
+ ceph_assert(trim_to <= pg_committed_to);
}
if (hset_history) {
recovery_state.update_hset(*hset_history);
@@ -519,7 +536,7 @@ public:
replica_clear_repop_obc(logv, t);
}
recovery_state.append_log(
- std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk,
+ std::move(logv), trim_to, roll_forward_to, pg_committed_to,
t, transaction_applied, async);
}
@@ -552,6 +569,10 @@ public:
recovery_state.update_last_complete_ondisk(lcod);
}
+ void update_pct(eversion_t pct) override {
+ recovery_state.update_pct(pct);
+ }
+
void update_stats(
const pg_stat_t &stat) override {
recovery_state.update_stats(
@@ -565,6 +586,8 @@ public:
GenContext<ThreadPool::TPHandle&> *c,
uint64_t cost) override;
+ common::intrusive_timer &get_pg_timer() override;
+
pg_shard_t whoami_shard() const override {
return pg_whoami;
}
@@ -580,6 +603,9 @@ public:
uint64_t min_upacting_features() const override {
return recovery_state.get_min_upacting_features();
}
+ pg_feature_vec_t get_pg_acting_features() const override {
+ return recovery_state.get_pg_acting_features();
+ }
void send_message_osd_cluster(
int peer, Message *m, epoch_t from_epoch) override {
osd->send_message_osd_cluster(peer, m, from_epoch);
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 3702490fb61..7ce8fbcd210 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -14,6 +14,7 @@
#include "common/errno.h"
#include "ReplicatedBackend.h"
#include "messages/MOSDOp.h"
+#include "messages/MOSDPGPCT.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDPGPush.h"
@@ -124,7 +125,9 @@ ReplicatedBackend::ReplicatedBackend(
ObjectStore::CollectionHandle &c,
ObjectStore *store,
CephContext *cct) :
- PGBackend(cct, pg, store, coll, c) {}
+ PGBackend(cct, pg, store, coll, c),
+ pct_callback(this)
+{}
void ReplicatedBackend::run_recovery_op(
PGBackend::RecoveryHandle *_h,
@@ -229,6 +232,10 @@ bool ReplicatedBackend::_handle_message(
return true;
}
+ case MSG_OSD_PG_PCT:
+ do_pct(op);
+ return true;
+
default:
break;
}
@@ -261,6 +268,7 @@ void ReplicatedBackend::on_change()
}
in_progress_ops.clear();
clear_recovery_state();
+ cancel_pct_update();
}
int ReplicatedBackend::objects_read_sync(
@@ -462,13 +470,86 @@ void generate_transaction(
});
}
+void ReplicatedBackend::do_pct(OpRequestRef op)
+{
+ const MOSDPGPCT *m = static_cast<const MOSDPGPCT*>(op->get_req());
+ dout(10) << __func__ << ": received pct update to "
+ << m->pg_committed_to << dendl;
+ parent->update_pct(m->pg_committed_to);
+}
+
+void ReplicatedBackend::send_pct_update()
+{
+ dout(10) << __func__ << ": sending pct update" << dendl;
+ ceph_assert(
+ PG_HAVE_FEATURE(parent->get_pg_acting_features(), PCT));
+ for (const auto &i: parent->get_acting_shards()) {
+ if (i == parent->whoami_shard()) continue;
+
+ auto *pct_update = new MOSDPGPCT(
+ spg_t(parent->whoami_spg_t().pgid, i.shard),
+ get_osdmap_epoch(), parent->get_interval_start_epoch(),
+ parent->get_pg_committed_to()
+ );
+
+ dout(10) << __func__ << ": sending pct update to i " << i
+ << ", i.osd " << i.osd << dendl;
+ parent->send_message_osd_cluster(
+ i.osd, pct_update, get_osdmap_epoch());
+ }
+ dout(10) << __func__ << ": sending pct update complete" << dendl;
+}
+
+void ReplicatedBackend::maybe_kick_pct_update()
+{
+ if (!in_progress_ops.empty()) {
+ dout(20) << __func__ << ": not scheduling pct update, "
+ << in_progress_ops.size() << " ops pending" << dendl;
+ return;
+ }
+
+ if (!PG_HAVE_FEATURE(parent->get_pg_acting_features(), PCT)) {
+ dout(20) << __func__ << ": not scheduling pct update, PCT feature not"
+ << " supported" << dendl;
+ return;
+ }
+
+ if (pct_callback.is_scheduled()) {
+ derr << __func__
+ << ": pct_callback is already scheduled, this should be impossible"
+ << dendl;
+ return;
+ }
+
+ int64_t pct_delay;
+ if (!parent->get_pool().opts.get(
+ pool_opts_t::PCT_UPDATE_DELAY, &pct_delay)) {
+ dout(20) << __func__ << ": not scheduling pct update, PCT_UPDATE_DELAY not"
+ << " set" << dendl;
+ return;
+ }
+
+ dout(10) << __func__ << ": scheduling pct update after "
+ << pct_delay << " seconds" << dendl;
+ parent->get_pg_timer().schedule_after(
+ pct_callback, std::chrono::seconds(pct_delay));
+}
+
+void ReplicatedBackend::cancel_pct_update()
+{
+ if (pct_callback.is_scheduled()) {
+ dout(10) << __func__ << ": canceling pct update" << dendl;
+ parent->get_pg_timer().cancel(pct_callback);
+ }
+}
+
void ReplicatedBackend::submit_transaction(
const hobject_t &soid,
const object_stat_sum_t &delta_stats,
const eversion_t &at_version,
PGTransactionUPtr &&_t,
const eversion_t &trim_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
vector<pg_log_entry_t>&& _log_entries,
std::optional<pg_hit_set_history_t> &hset_history,
Context *on_all_commit,
@@ -476,6 +557,8 @@ void ReplicatedBackend::submit_transaction(
osd_reqid_t reqid,
OpRequestRef orig_op)
{
+ cancel_pct_update();
+
parent->apply_stats(
soid,
delta_stats);
@@ -517,7 +600,7 @@ void ReplicatedBackend::submit_transaction(
tid,
reqid,
trim_to,
- min_last_complete_ondisk,
+ pg_committed_to,
added.size() ? *(added.begin()) : hobject_t(),
removed.size() ? *(removed.begin()) : hobject_t(),
log_entries,
@@ -533,7 +616,7 @@ void ReplicatedBackend::submit_transaction(
hset_history,
trim_to,
at_version,
- min_last_complete_ondisk,
+ pg_committed_to,
true,
op_t);
@@ -572,6 +655,7 @@ void ReplicatedBackend::op_commit(const ceph::ref_t<InProgressOp>& op)
op->on_commit = 0;
in_progress_ops.erase(op->tid);
}
+ maybe_kick_pct_update();
}
void ReplicatedBackend::do_repop_reply(OpRequestRef op)
@@ -628,6 +712,7 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
in_progress_ops.erase(iter);
}
}
+ maybe_kick_pct_update();
}
int ReplicatedBackend::be_deep_scrub(
@@ -953,7 +1038,7 @@ Message * ReplicatedBackend::generate_subop(
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
- eversion_t min_last_complete_ondisk,
+ eversion_t pg_committed_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
const bufferlist &log_entries,
@@ -990,13 +1075,9 @@ Message * ReplicatedBackend::generate_subop(
wr->pg_trim_to = pg_trim_to;
- if (HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD)) {
- wr->min_last_complete_ondisk = min_last_complete_ondisk;
- } else {
- /* Some replicas need this field to be at_version. New replicas
- * will ignore it */
- wr->set_rollback_to(at_version);
- }
+ // this feature is from 2019 (6f12bf27cb91), assume present
+ ceph_assert(HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD));
+ wr->pg_committed_to = pg_committed_to;
wr->new_temp_oid = new_temp_oid;
wr->discard_temp_oid = discard_temp_oid;
@@ -1010,7 +1091,7 @@ void ReplicatedBackend::issue_op(
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
- eversion_t min_last_complete_ondisk,
+ eversion_t pg_committed_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
const vector<pg_log_entry_t> &log_entries,
@@ -1043,7 +1124,7 @@ void ReplicatedBackend::issue_op(
tid,
reqid,
pg_trim_to,
- min_last_complete_ondisk,
+ pg_committed_to,
new_temp_oid,
discard_temp_oid,
logs,
@@ -1145,7 +1226,7 @@ void ReplicatedBackend::do_repop(OpRequestRef op)
m->updated_hit_set_history,
m->pg_trim_to,
m->version, /* Replicated PGs don't have rollback info */
- m->min_last_complete_ondisk,
+ m->pg_committed_to,
update_snaps,
rm->localt,
async);
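Taken together, the hooks above mean a pct update is only scheduled from op_commit/do_repop_reply once in_progress_ops has drained, the whole acting set advertises the PCT feature, and the pool sets pct_update_delay; any new write entering submit_transaction cancels a pending callback. In effect, the MOSDPGPCT message is sent once per pause in write activity rather than once per write.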
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index aab75d21c73..3dcae206059 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -341,6 +341,40 @@ private:
op(op), v(v) {}
};
std::map<ceph_tid_t, ceph::ref_t<InProgressOp>> in_progress_ops;
+
+ /// Invoked by pct_callback to update PCT after a pause in IO
+ void send_pct_update();
+
+ /// Handle MOSDPGPCT message
+ void do_pct(OpRequestRef op);
+
+ /// Kick pct timer if repop_queue is empty
+ void maybe_kick_pct_update();
+
+ /// Cancel a scheduled pct update, if any
+ void cancel_pct_update();
+
+ struct pct_callback_t final : public common::intrusive_timer::callback_t {
+ ReplicatedBackend *backend;
+
+ pct_callback_t(ReplicatedBackend *backend) : backend(backend) {}
+
+ void lock() override {
+ return backend->parent->pg_lock();
+ }
+ void unlock() override {
+ return backend->parent->pg_unlock();
+ }
+ void add_ref() override {
+ return backend->parent->pg_add_ref();
+ }
+ void dec_ref() override {
+ return backend->parent->pg_dec_ref();
+ }
+ void invoke() override {
+ return backend->send_pct_update();
+ }
+ } pct_callback;
public:
friend class C_OSD_OnOpCommit;
@@ -356,7 +390,7 @@ public:
const eversion_t &at_version,
PGTransactionUPtr &&t,
const eversion_t &trim_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
std::vector<pg_log_entry_t>&& log_entries,
std::optional<pg_hit_set_history_t> &hset_history,
Context *on_all_commit,
@@ -372,7 +406,7 @@ private:
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
- eversion_t min_last_complete_ondisk,
+ eversion_t pg_committed_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
const ceph::buffer::list &log_entries,
@@ -386,7 +420,7 @@ private:
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
- eversion_t min_last_complete_ondisk,
+ eversion_t pg_committed_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
const std::vector<pg_log_entry_t> &log_entries,
diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc
index d585159649f..def85209c4e 100644
--- a/src/osd/osd_perf_counters.cc
+++ b/src/osd/osd_perf_counters.cc
@@ -133,6 +133,22 @@ PerfCounters *build_osd_logger(CephContext *cct) {
osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
"Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
+
+ osd_plb.add_u64_counter(
+ l_osd_replica_read, "replica_read", "Count of replica reads received");
+ osd_plb.add_u64_counter(
+ l_osd_replica_read_redirect_missing,
+ "replica_read_redirect_missing",
+ "Count of replica reads redirected to primary due to missing object");
+ osd_plb.add_u64_counter(
+ l_osd_replica_read_redirect_conflict,
+ "replica_read_redirect_conflict",
+ "Count of replica reads redirected to primary due to unstable write");
+ osd_plb.add_u64_counter(
+ l_osd_replica_read_served,
+ "replica_read_served",
+ "Count of replica reads served");
+
osd_plb.add_u64_counter(
l_osd_sop, "subop", "Suboperations");
osd_plb.add_u64_counter(
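The four replica-read counters registered above are plain u64 counters, so an OSD running this code should expose them in its perf counter dump; for example (osd id illustrative, and assuming they land in the usual osd section), ceph daemon osd.5 perf dump would report replica_read, replica_read_redirect_missing, replica_read_redirect_conflict and replica_read_served.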
diff --git a/src/osd/osd_perf_counters.h b/src/osd/osd_perf_counters.h
index 367da1712fb..cccdb87a538 100644
--- a/src/osd/osd_perf_counters.h
+++ b/src/osd/osd_perf_counters.h
@@ -43,6 +43,11 @@ enum {
l_osd_op_before_queue_op_lat,
l_osd_op_before_dequeue_op_lat,
+ l_osd_replica_read,
+ l_osd_replica_read_redirect_missing,
+ l_osd_replica_read_redirect_conflict,
+ l_osd_replica_read_served,
+
l_osd_sop,
l_osd_sop_inb,
l_osd_sop_lat,
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c9f3f7d1464..5c2cf8b16b0 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1378,7 +1378,9 @@ static opt_mapping_t opt_mapping = boost::assign::map_list_of
("pg_num_max", pool_opts_t::opt_desc_t(
pool_opts_t::PG_NUM_MAX, pool_opts_t::INT))
("read_ratio", pool_opts_t::opt_desc_t(
- pool_opts_t::READ_RATIO, pool_opts_t::INT));
+ pool_opts_t::READ_RATIO, pool_opts_t::INT))
+ ("pct_update_delay", pool_opts_t::opt_desc_t(
+ pool_opts_t::PCT_UPDATE_DELAY, pool_opts_t::INT));
bool pool_opts_t::is_opt_name(const std::string& name)
{
@@ -3677,13 +3679,14 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
// -- pg_notify_t --
void pg_notify_t::encode(ceph::buffer::list &bl) const
{
- ENCODE_START(3, 2, bl);
+ ENCODE_START(4, 2, bl);
encode(query_epoch, bl);
encode(epoch_sent, bl);
encode(info, bl);
encode(to, bl);
encode(from, bl);
encode(past_intervals, bl);
+ encode(pg_features, bl);
ENCODE_FINISH(bl);
}
@@ -3698,6 +3701,9 @@ void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
if (struct_v >= 3) {
decode(past_intervals, bl);
}
+ if (struct_v >= 4) {
+ decode(pg_features, bl);
+ }
DECODE_FINISH(bl);
}
@@ -3719,9 +3725,11 @@ void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
{
o.push_back(new pg_notify_t);
o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
- pg_info_t(spg_t(pg_t(0,10), shard_id_t(-1))), PastIntervals()));
+ pg_info_t(spg_t(pg_t(0,10), shard_id_t(-1))), PastIntervals(),
+ PG_FEATURE_CLASSIC_ALL));
o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(2), 3, 10,
- pg_info_t(spg_t(pg_t(10,10), shard_id_t(2))), PastIntervals()));
+ pg_info_t(spg_t(pg_t(10,10), shard_id_t(2))), PastIntervals(),
+ PG_FEATURE_CLASSIC_ALL));
}
ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index e2edaa39dfc..b6f5335a0f5 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -51,6 +51,7 @@
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
#include "osd_perf_counters.h"
+#include "pg_features.h"
#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
@@ -1106,6 +1107,21 @@ public:
DEDUP_CDC_CHUNK_SIZE,
PG_NUM_MAX, // max pg_num
READ_RATIO, // read ratio for the read balancer work [0-100]
+ /**
+ * PCT_UPDATE_DELAY
+ *
+ * Time to wait (seconds) after there are no in progress writes before
+ * updating pg_committed_to on replicas. If the period between writes on
+ * a PG is usually longer than this value, most writes will trigger an
+ * extra message.
+ *
+ * The primary reason to enable this feature would be to limit the time
+ * between a write and when that write is available to be read on replicas.
+ *
+ * A value <= 0 will cause the update to be sent immediately upon write
+ * completion if there are no other in progress writes.
+ */
+ PCT_UPDATE_DELAY,
};
enum type_t {
@@ -3790,6 +3806,7 @@ struct pg_notify_t {
shard_id_t to;
shard_id_t from;
PastIntervals past_intervals;
+ pg_feature_vec_t pg_features = PG_FEATURE_NONE;
pg_notify_t() :
query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
from(shard_id_t::NO_SHARD) {}
@@ -3799,11 +3816,12 @@ struct pg_notify_t {
epoch_t query_epoch,
epoch_t epoch_sent,
const pg_info_t &info,
- const PastIntervals& pi)
+ const PastIntervals& pi,
+ pg_feature_vec_t pg_features)
: query_epoch(query_epoch),
epoch_sent(epoch_sent),
info(info), to(to), from(from),
- past_intervals(pi) {
+ past_intervals(pi), pg_features(pg_features) {
ceph_assert(from == info.pgid.shard);
}
void encode(ceph::buffer::list &bl) const;
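
The PCT_UPDATE_DELAY comment in the hunk above describes when the primary pushes pg_committed_to updates to replicas. A minimal, purely illustrative Python sketch of those semantics follows; the function and callback names are hypothetical and not part of the OSD code:

    def maybe_schedule_pct_update(delay_s, in_progress_writes, send_now, send_later):
        # Nothing to send while writes are still in flight; the completion of
        # the last write re-evaluates this decision.
        if in_progress_writes > 0:
            return
        if delay_s <= 0:
            # delay <= 0: push pg_committed_to immediately on write completion
            send_now()
        else:
            # otherwise arm a timer; a new write arriving before it fires
            # starts the wait over
            send_later(delay_s)
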
diff --git a/src/osd/pg_features.h b/src/osd/pg_features.h
new file mode 100644
index 00000000000..e601c84ee68
--- /dev/null
+++ b/src/osd/pg_features.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+/* This defines the set of features supported by OSDs once a PG has
+ * gone active.
+ * Mechanically, pretty much the same as include/ceph_features.h */
+
+using pg_feature_vec_t = uint64_t;
+static constexpr pg_feature_vec_t PG_FEATURE_INCARNATION_1 = 0ull;
+
+#define DEFINE_PG_FEATURE(bit, incarnation, name) \
+ static constexpr pg_feature_vec_t PG_FEATURE_##name = (1ull << bit); \
+ static constexpr pg_feature_vec_t PG_FEATUREMASK_##name = \
+ (1ull << bit) | PG_FEATURE_INCARNATION_##incarnation;
+
+#define PG_HAVE_FEATURE(x, name) \
+ (((x) & (PG_FEATUREMASK_##name)) == (PG_FEATUREMASK_##name))
+
+DEFINE_PG_FEATURE(0, 1, PCT)
+
+static constexpr pg_feature_vec_t PG_FEATURE_NONE = 0ull;
+static constexpr pg_feature_vec_t PG_FEATURE_CRIMSON_ALL = 0ull;
+static constexpr pg_feature_vec_t PG_FEATURE_CLASSIC_ALL =
+ PG_FEATURE_PCT;
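
The DEFINE_PG_FEATURE / PG_HAVE_FEATURE macros above reduce to plain bit arithmetic: a peer supports a feature only when the full mask (feature bit plus its incarnation bits) is present. An illustrative Python rendering of that check, mirroring the constants in this header:

    PG_FEATURE_INCARNATION_1 = 0
    PG_FEATURE_PCT = 1 << 0
    PG_FEATUREMASK_PCT = PG_FEATURE_PCT | PG_FEATURE_INCARNATION_1
    PG_FEATURE_NONE = 0
    PG_FEATURE_CLASSIC_ALL = PG_FEATURE_PCT

    def pg_have_feature(features: int, mask: int) -> bool:
        # every bit of the mask must be set in the advertised feature vector
        return (features & mask) == mask

    assert pg_have_feature(PG_FEATURE_CLASSIC_ALL, PG_FEATUREMASK_PCT)
    assert not pg_have_feature(PG_FEATURE_NONE, PG_FEATUREMASK_PCT)
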
diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc
index c67d2fca5fc..110c2c7d266 100644
--- a/src/osd/scrubber/osd_scrub.cc
+++ b/src/osd/scrubber/osd_scrub.cc
@@ -65,7 +65,7 @@ void OsdScrub::dump_scrubs(ceph::Formatter* f) const
void OsdScrub::dump_scrub_reservations(ceph::Formatter* f) const
{
m_resource_bookkeeper.dump_scrub_reservations(f);
- f->open_array_section("remote_scrub_reservations");
+ f->open_object_section("remote_scrub_reservations");
m_osd_svc.get_scrub_reserver().dump(f);
f->close_section();
}
@@ -220,8 +220,6 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing(
env_conditions.restricted_time = !scrub_time_permit(scrub_clock_now);
env_conditions.cpu_overloaded =
!m_load_tracker.scrub_load_below_threshold();
- env_conditions.only_deadlined =
- env_conditions.restricted_time || env_conditions.cpu_overloaded;
}
return env_conditions;
diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc
index 8ff0d1ff7d8..c116bcbb4c2 100644
--- a/src/osd/scrubber/osd_scrub_sched.cc
+++ b/src/osd/scrubber/osd_scrub_sched.cc
@@ -86,8 +86,6 @@ std::optional<Scrub::SchedEntry> ScrubQueue::pop_ready_entry(
OSDRestrictions restrictions,
utime_t time_now)
{
- /// \todo must handle 'only_deadlined'!
-
auto eligible_filtr = [&, rst = restrictions](
const SchedEntry& e) -> bool {
return eligibility_pred(e, rst, time_now);
@@ -142,9 +140,10 @@ bool ScrubQueue::remove_entry_unlocked(spg_t pgid, scrub_level_t s_or_d)
void ScrubQueue::dump_scrubs(ceph::Formatter* f) const
{
ceph_assert(f != nullptr);
+ const auto query_time = ceph_clock_now();
f->open_array_section("scrubs");
for_each_job(
- [&f](const Scrub::SchedEntry& e) {
+ [&f, query_time](const Scrub::SchedEntry& e) {
f->open_object_section("scrub");
f->dump_stream("pgid") << e.pgid;
f->dump_stream("sched_time") << e.schedule.not_before;
@@ -153,6 +152,15 @@ void ScrubQueue::dump_scrubs(ceph::Formatter* f) const
f->dump_bool(
"forced",
e.schedule.scheduled_at == PgScrubber::scrub_must_stamp());
+
+ f->dump_stream("level") << (e.level == scrub_level_t::shallow
+ ? "shallow"
+ : "deep");
+ f->dump_stream("urgency") << fmt::format("{}", e.urgency);
+ f->dump_bool("eligible", e.schedule.not_before <= query_time);
+ f->dump_bool("overdue", e.schedule.deadline < query_time);
+ f->dump_stream("last_issue") << fmt::format("{}", e.last_issue);
+
f->close_section();
},
std::numeric_limits<int>::max());
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 594ffb15e2b..aa53df5ae8a 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -3,13 +3,13 @@
#include "./pg_scrubber.h" // '.' notation used to affect clang-format order
+#include <fmt/ranges.h>
+
#include <cmath>
#include <iostream>
#include <span>
#include <vector>
-#include <fmt/ranges.h>
-
#include "debug.h"
#include "common/ceph_time.h"
@@ -766,8 +766,13 @@ void PgScrubber::on_operator_periodic_cmd(
asok_response_section(f, true, scrub_level, stamp);
if (scrub_level == scrub_level_t::deep) {
+ const auto saved_shallow_stamp = m_pg->info.history.last_scrub_stamp;
// this call sets both stamps
m_pg->set_last_deep_scrub_stamp(stamp);
+ // restore the shallow stamp, as otherwise it will be scheduled before
+ // the deep, failing whatever test code called us (this is a test-only
+ // interface).
+ m_pg->set_last_scrub_stamp(saved_shallow_stamp);
} else {
m_pg->set_last_scrub_stamp(stamp);
}
@@ -819,21 +824,21 @@ namespace {
* an aux function to be used in select_range() below, to
* select the correct chunk size based on the type of scrub
*/
-int size_from_conf(
+int64_t size_from_conf(
bool is_deep,
const ceph::common::ConfigProxy& conf,
- std::string_view deep_opt,
- std::string_view shallow_opt)
+ const md_config_cacher_t<int64_t>& deep_opt,
+ const md_config_cacher_t<int64_t>& shallow_opt)
{
if (!is_deep) {
- auto sz = conf.get_val<int64_t>(shallow_opt);
+ auto sz = *shallow_opt;
if (sz != 0) {
// assuming '0' means that no distinction was yet configured between
// deep and shallow scrubbing
- return static_cast<int>(sz);
+ return sz;
}
}
- return static_cast<int>(conf.get_val<int64_t>(deep_opt));
+ return *deep_opt;
}
} // anonymous namespace
@@ -912,16 +917,16 @@ std::optional<uint64_t> PgScrubber::select_range()
dout(20) << fmt::format(
"{} {} mins: {}d {}s, max: {}d {}s", __func__,
(m_is_deep ? "D" : "S"),
- conf.get_val<int64_t>("osd_scrub_chunk_min"),
- conf.get_val<int64_t>("osd_shallow_scrub_chunk_min"),
- conf.get_val<int64_t>("osd_scrub_chunk_max"),
- conf.get_val<int64_t>("osd_shallow_scrub_chunk_max"))
+ *osd_scrub_chunk_min,
+ *osd_shallow_scrub_chunk_min,
+ *osd_scrub_chunk_max,
+ *osd_shallow_scrub_chunk_max)
<< dendl;
- const int min_from_conf = size_from_conf(
- m_is_deep, conf, "osd_scrub_chunk_min", "osd_shallow_scrub_chunk_min");
- const int max_from_conf = size_from_conf(
- m_is_deep, conf, "osd_scrub_chunk_max", "osd_shallow_scrub_chunk_max");
+ const int min_from_conf = static_cast<int>(size_from_conf(
+ m_is_deep, conf, osd_scrub_chunk_min, osd_shallow_scrub_chunk_min));
+ const int max_from_conf = static_cast<int>(size_from_conf(
+ m_is_deep, conf, osd_scrub_chunk_max, osd_shallow_scrub_chunk_max));
const int divisor = static_cast<int>(preemption_data.chunk_divisor());
const int min_chunk_sz = std::max(3, min_from_conf / divisor);
@@ -1635,7 +1640,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
advance_token();
const auto& conf = m_pg->get_cct()->_conf;
const int max_from_conf = size_from_conf(
- m_is_deep, conf, "osd_scrub_chunk_max", "osd_shallow_scrub_chunk_max");
+ m_is_deep, conf, osd_scrub_chunk_max, osd_shallow_scrub_chunk_max);
auto cost = get_scrub_cost(max_from_conf);
m_osds->queue_for_rep_scrub(m_pg,
m_replica_request_priority,
@@ -2541,6 +2546,16 @@ PgScrubber::PgScrubber(PG* pg)
, m_pg_id{pg->pg_id}
, m_osds{m_pg->osd}
, m_pg_whoami{pg->pg_whoami}
+ , osd_scrub_chunk_max{m_osds->cct->_conf, "osd_scrub_chunk_max"}
+ , osd_shallow_scrub_chunk_max{m_osds->cct->_conf,
+ "osd_shallow_scrub_chunk_max"}
+ , osd_scrub_chunk_min{m_osds->cct->_conf, "osd_scrub_chunk_min"}
+ , osd_shallow_scrub_chunk_min{m_osds->cct->_conf,
+ "osd_shallow_scrub_chunk_min"}
+ , osd_stats_update_period_scrubbing{
+ m_osds->cct->_conf, "osd_stats_update_period_scrubbing"}
+ , osd_stats_update_period_not_scrubbing{
+ m_osds->cct->_conf, "osd_stats_update_period_not_scrubbing"}
, preemption_data{pg}
{
m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
@@ -2669,7 +2684,8 @@ const OSDMapRef& PgScrubber::get_osdmap() const
LoggerSinkSet& PgScrubber::get_logger() const { return *m_osds->clog.get(); }
-ostream &operator<<(ostream &out, const PgScrubber &scrubber) {
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
return out << scrubber.m_flags;
}
@@ -2687,9 +2703,53 @@ void PgScrubber::log_cluster_warning(const std::string& warning) const
m_osds->clog->do_log(CLOG_WARN, warning);
}
-ostream& PgScrubber::show(ostream& out) const
+
+ostream& PgScrubber::show_concise(ostream& out) const
{
- return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
+ /*
+ * 'show_concise()' is only used when calling operator<< through the ScrubPgIF,
+ * i.e. only by the PG when creating a standard log entry.
+ *
+ * desired outcome (only relevant for Primaries):
+ *
+ * if scrubbing:
+ * (urgency,flags)
+ * or (if blocked)
+ * (*blocked*,urgency,flags)
+ *
+ * if not scrubbing:
+ * either nothing (if only periodic scrubs are scheduled)
+ * or [next-scrub: effective-lvl, urgency]
+ */
+ if (!is_primary()) {
+ return out;
+ }
+
+ if (m_active) {
+ const auto flags_txt = fmt::format("{}", m_flags);
+ const std::string sep = (flags_txt.empty() ? "" : ",");
+ if (m_active_target) {
+ return out << fmt::format(
+ "({}{}{}{})", (m_scrub_job->blocked ? "*blocked*," : ""),
+ m_active_target->urgency(), sep, flags_txt);
+ } else {
+ // only expected in a couple of messages during scrub termination
+ return out << fmt::format(
+ "(teardown{}{}{})", (m_scrub_job->blocked ? "-*blocked*" : ""),
+ sep, flags_txt);
+ }
+ }
+
+ // not actively scrubbing now. Show some info about the next scrub
+ const auto now_is = ceph_clock_now();
+ const auto& next_scrub = m_scrub_job->earliest_target(now_is);
+ if (!next_scrub.is_high_priority()) {
+ // no interesting flags to report
+ return out;
+ }
+ return out << fmt::format(
+ "[next-scrub:{},{:10.10}]", (next_scrub.is_deep() ? "dp" : "sh"),
+ next_scrub.urgency());
}
int PgScrubber::asok_debug(std::string_view cmd,
@@ -2739,16 +2799,14 @@ void PgScrubber::update_scrub_stats(ceph::coarse_real_clock::time_point now_is)
using clock = ceph::coarse_real_clock;
using namespace std::chrono;
- const seconds period_active = seconds(m_pg->get_cct()->_conf.get_val<int64_t>(
- "osd_stats_update_period_scrubbing"));
+ const seconds period_active = seconds(*osd_stats_update_period_scrubbing);
if (!period_active.count()) {
// a way for the operator to disable these stats updates
return;
}
- const seconds period_inactive =
- seconds(m_pg->get_cct()->_conf.get_val<int64_t>(
- "osd_stats_update_period_not_scrubbing") +
- m_pg_id.pgid.m_seed % 30);
+ const seconds period_inactive = seconds(
+ *osd_stats_update_period_not_scrubbing +
+ m_pg_id.pgid.m_seed % 30);
// determine the required update period, based on our current state
auto period{period_inactive};
@@ -2782,10 +2840,10 @@ void PgScrubber::update_scrub_stats(ceph::coarse_real_clock::time_point now_is)
// ///////////////////// preemption_data_t //////////////////////////////////
-PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
+PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg},
+ osd_scrub_max_preemptions{pg->cct->_conf, "osd_scrub_max_preemptions"}
{
- m_left = static_cast<int>(
- m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ m_left = *osd_scrub_max_preemptions;
}
void PgScrubber::preemption_data_t::reset()
@@ -2794,8 +2852,7 @@ void PgScrubber::preemption_data_t::reset()
m_preemptable = false;
m_preempted = false;
- m_left = static_cast<int>(
- m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ m_left = *osd_scrub_max_preemptions;
m_size_divisor = 1;
}
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index 1a5813bd923..0d9e8c1e9f6 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -75,6 +75,8 @@ Main Scrubber interfaces:
#include <string_view>
#include <vector>
+#include "common/config_proxy.h"
+#include "common/config_cacher.h"
#include "osd/PG.h"
#include "osd/scrubber_common.h"
@@ -164,7 +166,7 @@ template <>
struct formatter<scrub_flags_t> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
template <typename FormatContext>
- auto format(scrub_flags_t& sf, FormatContext& ctx) const
+ auto format(const scrub_flags_t& sf, FormatContext& ctx) const
{
std::string txt;
bool sep{false};
@@ -528,7 +530,7 @@ class PgScrubber : public ScrubPgIF,
/// to complete (in order to perform an 'after-repair' scrub)
bool m_after_repair_scrub_required{false};
- ostream& show(ostream& out) const override;
+ ostream& show_concise(ostream& out) const override;
public:
// ------------------ the I/F used by the ScrubBackend (ScrubBeListener)
@@ -741,6 +743,12 @@ class PgScrubber : public ScrubPgIF,
bool m_publish_sessions{false}; //< will the counter be part of 'query'
//output?
+ /**
+ * the scrub operation flags.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ * Note: replicas only use the 'priority' field.
+ */
scrub_flags_t m_flags;
bool m_active{false};
@@ -889,6 +897,24 @@ class PgScrubber : public ScrubPgIF,
// scrub state.
ceph::coarse_real_clock::time_point m_last_stat_upd{};
+ // ------------------ cached (frequently used) configuration values
+
+ /// initial (& max) number of objects to scrub in one pass - deep scrub
+ md_config_cacher_t<int64_t> osd_scrub_chunk_max;
+ /// initial (& max) number of objects to scrub in one pass - shallow
+ md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_max;
+
+ /// chunk size won't be reduced (when preempted) below this
+ /// value (deep scrub)
+ md_config_cacher_t<int64_t> osd_scrub_chunk_min;
+ /// chunk size won't be reduced below this value (shallow scrub)
+ md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_min;
+
+ /// stats update (publish_stats_to_osd()) interval while scrubbing
+ md_config_cacher_t<int64_t> osd_stats_update_period_scrubbing;
+ /// stats update interval while not scrubbing
+ md_config_cacher_t<int64_t> osd_stats_update_period_not_scrubbing;
+
// ------------ members used if we are a replica
epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
@@ -985,6 +1011,9 @@ class PgScrubber : public ScrubPgIF,
mutable ceph::mutex m_preemption_lock = ceph::make_mutex("preemption_lock");
bool m_preemptable{false};
bool m_preempted{false};
+
+ /// the number of preemptions allowed before we start blocking
+ md_config_cacher_t<uint64_t> osd_scrub_max_preemptions;
int m_left;
size_t m_size_divisor{1};
bool are_preemptions_left() const { return m_left > 0; }
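
The md_config_cacher_t members declared above replace repeated conf.get_val<int64_t>("...") string lookups in hot scrub paths with values that are fetched once, refreshed when the option changes, and read via operator*. A rough Python sketch of that caching pattern, assuming a config object with a change-notification hook; this illustrates the idea only, not the actual C++ class:

    class ConfigCacher:
        """Cache one typed config value: cheap reads, refreshed on change."""

        def __init__(self, conf, key):
            self._value = conf.get_val(key)
            # hypothetical hook; the real mechanism is a config observer
            conf.on_change(key, self._refresh)

        def _refresh(self, new_value):
            self._value = new_value

        def get(self):
            # stands in for the C++ operator* dereference
            return self._value
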
diff --git a/src/osd/scrubber/scrub_queue_entry.h b/src/osd/scrubber/scrub_queue_entry.h
index 03d959769b2..aeb76c104fe 100644
--- a/src/osd/scrubber/scrub_queue_entry.h
+++ b/src/osd/scrubber/scrub_queue_entry.h
@@ -98,11 +98,6 @@ static inline std::weak_ordering cmp_ripe_entries(
if (auto cmp = r.urgency <=> l.urgency; cmp != 0) {
return cmp;
}
- // if we are comparing the two targets of the same PG, once both are
- // ripe - the 'deep' scrub is considered 'higher' than the 'shallow' one.
- if (l.pgid == r.pgid && r.level < l.level) {
- return std::weak_ordering::less;
- }
// the 'utime_t' operator<=> is 'partial_ordering', it seems.
if (auto cmp = std::weak_order(
double(l.schedule.scheduled_at), double(r.schedule.scheduled_at));
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index d1a0fbdccb5..809107e593b 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -92,10 +92,10 @@ struct OSDRestrictions {
/// the OSD is performing recovery & osd_repair_during_recovery is 'true'
bool allow_requested_repair_only:1{false};
- /// the load is high, or the time is not right. For periodic scrubs,
- /// only the overdue ones are allowed.
- bool only_deadlined:1{false};
+ /// the CPU load is high. No regular scrubs are allowed.
bool cpu_overloaded:1{false};
+
+ /// outside of allowed scrubbing hours/days
bool restricted_time:1{false};
/// the OSD is performing a recovery, osd_scrub_during_recovery is 'false',
@@ -299,12 +299,11 @@ struct ScrubPgIF {
virtual ~ScrubPgIF() = default;
- friend std::ostream& operator<<(std::ostream& out, const ScrubPgIF& s)
- {
- return s.show(out);
+ friend std::ostream& operator<<(std::ostream& out, const ScrubPgIF& s) {
+ return s.show_concise(out);
}
- virtual std::ostream& show(std::ostream& out) const = 0;
+ virtual std::ostream& show_concise(std::ostream& out) const = 0;
// --------------- triggering state-machine events:
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index d881c6e1dc3..087b623333b 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -180,6 +180,10 @@ enum {
l_osdc_osdop_omap_rd,
l_osdc_osdop_omap_del,
+ l_osdc_replica_read_sent,
+ l_osdc_replica_read_bounced,
+ l_osdc_replica_read_completed,
+
l_osdc_last,
};
@@ -378,6 +382,13 @@ void Objecter::init()
pcb.add_u64_counter(l_osdc_osdop_omap_del, "omap_del",
"OSD OMAP delete operations");
+ pcb.add_u64_counter(l_osdc_replica_read_sent, "replica_read_sent",
+ "Operations sent to replica");
+ pcb.add_u64_counter(l_osdc_replica_read_bounced, "replica_read_bounced",
+ "Operations bounced by replica to be resent to primary");
+ pcb.add_u64_counter(l_osdc_replica_read_completed, "replica_read_completed",
+ "Operations completed by replica");
+
logger = pcb.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
@@ -2328,6 +2339,10 @@ void Objecter::_send_op_account(Op *op)
ldout(cct, 20) << " note: not requesting reply" << dendl;
}
+ if (op->target.used_replica) {
+ logger->inc(l_osdc_replica_read_sent);
+ }
+
logger->inc(l_osdc_op_active);
logger->inc(l_osdc_op);
logger->inc(l_osdc_oplen_avg, op->ops.size());
@@ -3477,6 +3492,15 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
return;
}
+ if (op->target.flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) {
+ if (rc == -EAGAIN) {
+ logger->inc(l_osdc_replica_read_bounced);
+ } else {
+ logger->inc(l_osdc_replica_read_completed);
+ }
+ }
+
if (rc == -EAGAIN) {
ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
if (op->has_completion())
diff --git a/src/pybind/cephfs/cephfs.pyx b/src/pybind/cephfs/cephfs.pyx
index 793d88b9850..798ea3f902a 100644
--- a/src/pybind/cephfs/cephfs.pyx
+++ b/src/pybind/cephfs/cephfs.pyx
@@ -923,12 +923,12 @@ cdef class LibCephFS(object):
:param fd: the file descriptor of the file to fallocate.
:param mode: the flags determines the operation to be performed on the given
- range. default operation (0) allocate and initialize to zero
- the file in the byte range, and the file size will be changed
- if offset + length is greater than the file size. if the
- FALLOC_FL_KEEP_SIZE flag is specified in the mode, the file size
- will not be changed. if the FALLOC_FL_PUNCH_HOLE flag is specified
- in the mode, the operation is deallocate space and zero the byte range.
+ range. default operation (0) is to return -EOPNOTSUPP since
+ cephfs does not allocate disk blocks to provide write guarantees.
+ if the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ the file size will not be changed. if the FALLOC_FL_PUNCH_HOLE
+ flag is specified in the mode, the operation is to deallocate
+ space and zero the byte range.
:param offset: the byte range starting.
:param length: the length of the range.
"""
diff --git a/src/pybind/mgr/CMakeLists.txt b/src/pybind/mgr/CMakeLists.txt
index b2a8ac9a325..9e900f859d7 100644
--- a/src/pybind/mgr/CMakeLists.txt
+++ b/src/pybind/mgr/CMakeLists.txt
@@ -42,7 +42,6 @@ set(mgr_modules
progress
prometheus
rbd_support
- restful
rgw
# rook (optional)
selftest
@@ -54,8 +53,7 @@ set(mgr_modules
telemetry
# tests (for testing purpose only)
test_orchestrator
- volumes
- zabbix)
+ volumes)
install(DIRECTORY ${mgr_modules}
DESTINATION ${CEPH_INSTALL_DATADIR}/mgr
diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py
index c98ce9aec41..476304275c1 100644
--- a/src/pybind/mgr/balancer/module.py
+++ b/src/pybind/mgr/balancer/module.py
@@ -325,6 +325,11 @@ class Module(MgrModule):
type='str',
default='',
desc='pools which the automatic balancing will be limited to',
+ runtime=True),
+ Option(name='update_pg_upmap_activity',
+ type='bool',
+ default=False,
+ desc='Updates pg_upmap activity stats to be used in `balancer status detail`',
runtime=True)
]
@@ -339,12 +344,10 @@ class Module(MgrModule):
no_optimization_needed = False
success_string = 'Optimization plan created successfully'
in_progress_string = 'in progress'
- last_pg_upmap: List[Dict[str, Any]] = []
pg_upmap_items_added: List[Dict[str, Any]] = []
pg_upmap_items_removed: List[Dict[str, Any]] = []
- last_pg_upmap_primaries: List[Dict[str, Any]] = []
pg_upmap_primaries_added: List[Dict[str, Any]] = []
- pg_upmap_activity_initalized = False
+ pg_upmap_primaries_removed: List[Dict[str, Any]] = []
def __init__(self, *args: Any, **kwargs: Any) -> None:
super(Module, self).__init__(*args, **kwargs)
@@ -371,6 +374,11 @@ class Module(MgrModule):
"""
Show balancer status (detailed)
"""
+ pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity'))
+ if not pg_upmap_activity:
+ msg = 'This command is disabled.\n' \
+ 'To enable, run `ceph config set mgr mgr/balancer/update_pg_upmap_activity True`.\n'
+ return 0, msg, ''
s = {
'plans': list(self.plans.keys()),
'active': self.active,
@@ -665,7 +673,9 @@ class Module(MgrModule):
if not plan_:
return (-errno.ENOENT, '', f'plan {plan} not found')
r, detail = self.execute(plan_)
- self.update_pg_upmap_activity() # update pg activity in `balancer status detail`
+ pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity'))
+ if pg_upmap_activity:
+ self.update_pg_upmap_activity(plan_) # update pg activity in `balancer status detail`
self.plan_rm(plan)
return (r, '', detail)
@@ -757,7 +767,9 @@ class Module(MgrModule):
self.execute(plan)
else:
self.optimize_result = detail
- self.update_pg_upmap_activity() # update pg activity in `balancer status detail`
+ pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity'))
+ if pg_upmap_activity:
+ self.update_pg_upmap_activity(plan) # update pg activity in `balancer status detail`
self.optimizing = False
self.log.debug('Sleeping for %d', sleep_interval)
self.event.wait(sleep_interval)
@@ -1582,22 +1594,16 @@ class Module(MgrModule):
'mode': self.mode,
}
- def update_pg_upmap_activity(self) -> None:
- osdmap = self.get_osdmap()
- if not self.pg_upmap_activity_initalized:
- self.last_pg_upmap = osdmap.dump().get('pg_upmap_items', '')
- self.last_pg_upmap_primaries = osdmap.dump().get('pg_upmap_primaries', '')
- self.pg_upmap_activity_initalized = True
+ def update_pg_upmap_activity(self, plan: Plan) -> None:
+ incdump = plan.inc.dump()
# update pg_upmap_items
- self.pg_upmap_items_added = [pg for pg in osdmap.dump().get('pg_upmap_items', '') if pg not in self.last_pg_upmap]
- self.pg_upmap_items_removed = [pg for pg in self.last_pg_upmap if pg not in osdmap.dump().get('pg_upmap_items', '')]
- self.last_pg_upmap = osdmap.dump().get('pg_upmap_items', '')
+ self.pg_upmap_items_added = incdump.get('new_pg_upmap_items', [])
+ self.pg_upmap_items_removed = incdump.get('old_pg_upmap_items', [])
# update pg_upmap_primaries
- self.pg_upmap_primaries_added = [pg for pg in osdmap.dump().get('pg_upmap_primaries', '') if pg not in self.last_pg_upmap_primaries]
- self.pg_upmap_primaries_removed = [pg for pg in self.last_pg_upmap_primaries if pg not in osdmap.dump().get('pg_upmap_primaries', '')]
- self.last_pg_upmap_primaries = osdmap.dump().get('pg_upmap_primaries', '')
+ self.pg_upmap_primaries_added = incdump.get('new_pg_upmap_primaries', [])
+ self.pg_upmap_primaries_removed = incdump.get('old_pg_upmap_primaries', [])
def self_test(self) -> None:
# turn balancer on
diff --git a/src/pybind/mgr/cephadm/cert_mgr.py b/src/pybind/mgr/cephadm/cert_mgr.py
index 9b68e85ca44..0c56c704788 100644
--- a/src/pybind/mgr/cephadm/cert_mgr.py
+++ b/src/pybind/mgr/cephadm/cert_mgr.py
@@ -1,6 +1,6 @@
from cephadm.ssl_cert_utils import SSLCerts, SSLConfigException
-from typing import TYPE_CHECKING, Tuple, Union, List
+from typing import TYPE_CHECKING, Tuple, Union, List, Optional
if TYPE_CHECKING:
from cephadm.module import CephadmOrchestrator
@@ -28,5 +28,10 @@ class CertMgr:
def get_root_ca(self) -> str:
return self.ssl_certs.get_root_cert()
- def generate_cert(self, host_fqdn: Union[str, List[str]], node_ip: Union[str, List[str]]) -> Tuple[str, str]:
- return self.ssl_certs.generate_cert(host_fqdn, node_ip)
+ def generate_cert(
+ self,
+ host_fqdn: Union[str, List[str]],
+ node_ip: Union[str, List[str]],
+ custom_san_list: Optional[List[str]] = None,
+ ) -> Tuple[str, str]:
+ return self.ssl_certs.generate_cert(host_fqdn, node_ip, custom_san_list=custom_san_list)
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index 8a16ef8ae80..c3051240713 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -637,6 +637,9 @@ class TunedProfileStore():
logger.error(
f'Attempted to set setting "{setting}" for nonexistent os tuning profile "{profile}"')
+ def add_settings(self, profile: str, settings: dict) -> None:
+ self.process_settings(profile, settings, action='add')
+
def rm_setting(self, profile: str, setting: str) -> None:
if profile in self.profiles:
if setting in self.profiles[profile].settings:
@@ -650,6 +653,39 @@ class TunedProfileStore():
logger.error(
f'Attempted to remove setting "{setting}" from nonexistent os tuning profile "{profile}"')
+ def rm_settings(self, profile: str, settings: List[str]) -> None:
+ self.process_settings(profile, settings, action='remove')
+
+ def process_settings(self, profile: str, settings: Union[dict, list], action: str) -> None:
+ """
+ Process settings by either adding or removing them based on the action specified.
+ """
+ if profile not in self.profiles:
+ logger.error(f'Attempted to {action} settings for nonexistent os tuning profile "{profile}"')
+ return
+ profile_settings = self.profiles[profile].settings
+ if action == 'remove' and isinstance(settings, list):
+ invalid_settings = [s for s in settings if '=' in s or s not in profile_settings]
+ if invalid_settings:
+ raise OrchestratorError(
+ f"Invalid settings: {', '.join(invalid_settings)}. "
+ "Ensure settings are specified without '=' and exist in the profile. Correct format: key1,key2"
+ )
+ if action == 'add' and isinstance(settings, dict):
+ for setting, value in settings.items():
+ self.profiles[profile].settings[setting] = value
+ elif action == 'remove' and isinstance(settings, list):
+ for setting in settings:
+ self.profiles[profile].settings.pop(setting, '')
+ else:
+ logger.error(
+ f'Invalid action "{action}" for settings modification for tuned profile '
+ f'"{profile}". Valid actions are "add" and "remove"'
+ )
+ return
+ self.profiles[profile]._last_updated = datetime_to_str(datetime_now())
+ self.save()
+
def add_profile(self, spec: TunedProfileSpec) -> None:
spec._last_updated = datetime_to_str(datetime_now())
self.profiles[spec.profile_name] = spec
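
A hedged usage sketch of the new bulk helpers, given an existing TunedProfileStore instance named store and a hypothetical profile called 'latency-profile' (keys and values below are made up):

    # add_settings takes a dict of key/value pairs and bumps _last_updated once
    store.add_settings('latency-profile', {
        'vm.swappiness': '10',
        'net.core.somaxconn': '1024',
    })

    # rm_settings takes bare keys; passing 'key=value' or an unknown key
    # raises OrchestratorError per the validation in process_settings()
    store.rm_settings('latency-profile', ['net.core.somaxconn'])
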
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 7bf65b532fa..e851f1ee3fc 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -30,6 +30,7 @@ import multiprocessing.pool
import subprocess
from prettytable import PrettyTable
+import ceph.cephadm.images as default_images
from ceph.deployment import inventory
from ceph.deployment.drive_group import DriveGroupSpec
from ceph.deployment.service_spec import \
@@ -130,28 +131,7 @@ def os_exit_noop(status: int) -> None:
os._exit = os_exit_noop # type: ignore
-
-# Default container images -----------------------------------------------------
DEFAULT_IMAGE = 'quay.io/ceph/ceph'
-DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
-DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
-DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
-DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
-DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
-DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
-DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
-DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
-DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
-DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
-DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
-DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
-DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29'
-DEFAULT_SAMBA_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
-DEFAULT_SAMBA_METRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
-# ------------------------------------------------------------------------------
def host_exists(hostname_position: int = 1) -> Callable:
@@ -239,92 +219,92 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
),
Option(
'container_image_prometheus',
- default=DEFAULT_PROMETHEUS_IMAGE,
+ default=default_images.DEFAULT_PROMETHEUS_IMAGE,
desc='Prometheus container image',
),
Option(
'container_image_nvmeof',
- default=DEFAULT_NVMEOF_IMAGE,
+ default=default_images.DEFAULT_NVMEOF_IMAGE,
desc='Nvme-of container image',
),
Option(
'container_image_grafana',
- default=DEFAULT_GRAFANA_IMAGE,
+ default=default_images.DEFAULT_GRAFANA_IMAGE,
desc='Prometheus container image',
),
Option(
'container_image_alertmanager',
- default=DEFAULT_ALERT_MANAGER_IMAGE,
+ default=default_images.DEFAULT_ALERTMANAGER_IMAGE,
desc='Prometheus container image',
),
Option(
'container_image_node_exporter',
- default=DEFAULT_NODE_EXPORTER_IMAGE,
+ default=default_images.DEFAULT_NODE_EXPORTER_IMAGE,
desc='Prometheus container image',
),
Option(
'container_image_loki',
- default=DEFAULT_LOKI_IMAGE,
+ default=default_images.DEFAULT_LOKI_IMAGE,
desc='Loki container image',
),
Option(
'container_image_promtail',
- default=DEFAULT_PROMTAIL_IMAGE,
+ default=default_images.DEFAULT_PROMTAIL_IMAGE,
desc='Promtail container image',
),
Option(
'container_image_haproxy',
- default=DEFAULT_HAPROXY_IMAGE,
+ default=default_images.DEFAULT_HAPROXY_IMAGE,
desc='HAproxy container image',
),
Option(
'container_image_keepalived',
- default=DEFAULT_KEEPALIVED_IMAGE,
+ default=default_images.DEFAULT_KEEPALIVED_IMAGE,
desc='Keepalived container image',
),
Option(
'container_image_snmp_gateway',
- default=DEFAULT_SNMP_GATEWAY_IMAGE,
+ default=default_images.DEFAULT_SNMP_GATEWAY_IMAGE,
desc='SNMP Gateway container image',
),
Option(
'container_image_nginx',
- default=DEFAULT_NGINX_IMAGE,
+ default=default_images.DEFAULT_NGINX_IMAGE,
desc='Nginx container image',
),
Option(
'container_image_oauth2_proxy',
- default=DEFAULT_OAUTH2_PROXY_IMAGE,
+ default=default_images.DEFAULT_OAUTH2_PROXY_IMAGE,
desc='oauth2-proxy container image',
),
Option(
'container_image_elasticsearch',
- default=DEFAULT_ELASTICSEARCH_IMAGE,
+ default=default_images.DEFAULT_ELASTICSEARCH_IMAGE,
desc='elasticsearch container image',
),
Option(
'container_image_jaeger_agent',
- default=DEFAULT_JAEGER_AGENT_IMAGE,
+ default=default_images.DEFAULT_JAEGER_AGENT_IMAGE,
desc='Jaeger agent container image',
),
Option(
'container_image_jaeger_collector',
- default=DEFAULT_JAEGER_COLLECTOR_IMAGE,
+ default=default_images.DEFAULT_JAEGER_COLLECTOR_IMAGE,
desc='Jaeger collector container image',
),
Option(
'container_image_jaeger_query',
- default=DEFAULT_JAEGER_QUERY_IMAGE,
+ default=default_images.DEFAULT_JAEGER_QUERY_IMAGE,
desc='Jaeger query container image',
),
Option(
'container_image_samba',
- default=DEFAULT_SAMBA_IMAGE,
+ default=default_images.DEFAULT_SAMBA_IMAGE,
desc='Samba/SMB container image',
),
Option(
'container_image_samba_metrics',
- default=DEFAULT_SAMBA_METRICS_IMAGE,
+ default=default_images.DEFAULT_SAMBA_METRICS_IMAGE,
desc='Samba/SMB metrics exporter container image',
),
Option(
@@ -822,30 +802,33 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
security_enabled = self.secure_monitoring_stack or mgmt_gw_enabled
return security_enabled, mgmt_gw_enabled, oauth2_proxy_enabled
- def get_mgmt_gw_internal_endpoint(self) -> Optional[str]:
+ def _get_mgmt_gw_endpoint(self, is_internal: bool) -> Optional[str]:
mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway')
if not mgmt_gw_daemons:
return None
dd = mgmt_gw_daemons[0]
assert dd.hostname is not None
- mgmt_gw_addr = self.get_fqdn(dd.hostname)
- mgmt_gw_internal_endpoint = build_url(scheme='https', host=mgmt_gw_addr, port=MgmtGatewayService.INTERNAL_SERVICE_PORT)
- return f'{mgmt_gw_internal_endpoint}/internal'
+ mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec)
+ mgmt_gw_addr = mgmt_gw_spec.virtual_ip if mgmt_gw_spec.virtual_ip is not None else self.get_fqdn(dd.hostname)
- def get_mgmt_gw_external_endpoint(self) -> Optional[str]:
- mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway')
- if not mgmt_gw_daemons:
- return None
+ if is_internal:
+ mgmt_gw_port: Optional[int] = MgmtGatewayService.INTERNAL_SERVICE_PORT
+ protocol = 'https'
+ endpoint_suffix = '/internal'
+ else:
+ mgmt_gw_port = dd.ports[0] if dd.ports else None
+ protocol = 'http' if mgmt_gw_spec.disable_https else 'https'
+ endpoint_suffix = ''
- dd = mgmt_gw_daemons[0]
- assert dd.hostname is not None
- mgmt_gw_port = dd.ports[0] if dd.ports else None
- mgmt_gw_addr = self.get_fqdn(dd.hostname)
- mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec)
- protocol = 'http' if mgmt_gw_spec.disable_https else 'https'
- mgmt_gw_external_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port)
- return mgmt_gw_external_endpoint
+ mgmt_gw_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port)
+ return f'{mgmt_gw_endpoint}{endpoint_suffix}'
+
+ def get_mgmt_gw_internal_endpoint(self) -> Optional[str]:
+ return self._get_mgmt_gw_endpoint(is_internal=True)
+
+ def get_mgmt_gw_external_endpoint(self) -> Optional[str]:
+ return self._get_mgmt_gw_endpoint(is_internal=False)
def _get_cephadm_binary_path(self) -> str:
import hashlib
@@ -1900,7 +1883,7 @@ Then run the following:
self.inventory.add_host(spec)
self.offline_hosts_remove(spec.hostname)
if spec.status == 'maintenance':
- self._set_maintenance_healthcheck()
+ self.set_maintenance_healthcheck()
self.event.set() # refresh stray health check
self.log.info('Added host %s' % spec.hostname)
return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr)
@@ -2071,6 +2054,7 @@ Then run the following:
self.ssh.reset_con(host)
# if host was in offline host list, we should remove it now.
self.offline_hosts_remove(host)
+ self.set_maintenance_healthcheck()
self.event.set() # refresh stray health check
self.log.info('Removed host %s' % host)
return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2185,7 +2169,7 @@ Then run the following:
self.log.info(msg)
return msg
- def _set_maintenance_healthcheck(self) -> None:
+ def set_maintenance_healthcheck(self) -> None:
"""Raise/update or clear the maintenance health check as needed"""
in_maintenance = self.inventory.get_host_with_state("maintenance")
@@ -2269,12 +2253,12 @@ Then run the following:
self.inventory._inventory[hostname] = tgt_host
self.inventory.save()
- self._set_maintenance_healthcheck()
+ self.set_maintenance_healthcheck()
return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode'
@handle_orch_error
@host_exists()
- def exit_host_maintenance(self, hostname: str) -> str:
+ def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> str:
"""Exit maintenance mode and return a host to an operational state
Returning from maintenance will enable the clusters systemd target and
@@ -2282,6 +2266,8 @@ Then run the following:
host has osd daemons
:param hostname: (str) host name
+ :param force: (bool) force removal of the host from maintenance mode
+ :param offline: (bool) to remove hosts that are offline from maintenance mode
:raises OrchestratorError: Unable to return from maintenance, or unset the
noout flag
@@ -2290,37 +2276,74 @@ Then run the following:
if tgt_host['status'] != "maintenance":
raise OrchestratorError(f"Host {hostname} is not in maintenance mode")
- with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
- outs, errs, _code = self.wait_async(
- CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
- 'host-maintenance', ['exit'], error_ok=True))
- returned_msg = errs[0].split('\n')[-1]
- if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
- raise OrchestratorError(
- f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
-
- if "osd" in self.cache.get_daemon_types(hostname):
- crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
- rc, _out, _err = self.mon_command({
- 'prefix': 'osd unset-group',
- 'flags': 'noout',
- 'who': [crush_node],
- 'format': 'json'
- })
- if rc:
+ # Given we do not regularly check maintenance mode hosts for being offline,
+ # we have no idea at this point whether the host is online or not.
+ # Keep in mind this goes both ways, as users could have run
+ # "ceph cephadm check-host <hostname>" when the host was in maintenance
+ # mode and offline and the host could have since come online. The following
+ # "cephadm check-host" command is being run purely so we know if the host
+ # is online or offline, as those should be handled differently
+ try:
+ with self.async_timeout_handler(hostname, 'cephadm check-host'):
+ outs, errs, _code = self.wait_async(
+ CephadmServe(self)._run_cephadm(
+ hostname, cephadmNoImage,
+ 'check-host', [], error_ok=False
+ )
+ )
+ except OrchestratorError:
+ pass
+
+ host_offline = hostname in self.offline_hosts
+
+ if host_offline and not offline:
+ raise OrchestratorValidationError(
+ f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+
+ if not host_offline and offline:
+ raise OrchestratorValidationError(
+ f'{hostname} is online, please take host out of maintenance mode without --offline.')
+
+ if offline and not force:
+ raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+
+ # no point trying these parts if we know the host is offline
+ if not host_offline:
+ with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
+ outs, errs, _code = self.wait_async(
+ CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
+ 'host-maintenance', ['exit'], error_ok=True))
+ returned_msg = errs[0].split('\n')[-1]
+ if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')):
self.log.warning(
- f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
- raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
- else:
- self.log.info(
- f"exit maintenance request has UNSET for the noout group on host {hostname}")
+ f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+ if not force:
+ raise OrchestratorError(
+ f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+
+ if "osd" in self.cache.get_daemon_types(hostname):
+ crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
+ rc, _out, _err = self.mon_command({
+ 'prefix': 'osd unset-group',
+ 'flags': 'noout',
+ 'who': [crush_node],
+ 'format': 'json'
+ })
+ if rc:
+ self.log.warning(
+ f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
+ if not force:
+ raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+ else:
+ self.log.info(
+ f"exit maintenance request has UNSET for the noout group on host {hostname}")
# update the host record status
tgt_host['status'] = ""
self.inventory._inventory[hostname] = tgt_host
self.inventory.save()
- self._set_maintenance_healthcheck()
+ self.set_maintenance_healthcheck()
return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode"
@@ -3004,8 +3027,16 @@ Then run the following:
daemon_names.append(dd.name())
return daemon_names
- alertmanager_user, alertmanager_password = self._get_alertmanager_credentials()
- prometheus_user, prometheus_password = self._get_prometheus_credentials()
+ prom_cred_hash = None
+ alertmgr_cred_hash = None
+ security_enabled, mgmt_gw_enabled, _ = self._get_security_config()
+ if security_enabled:
+ alertmanager_user, alertmanager_password = self._get_alertmanager_credentials()
+ prometheus_user, prometheus_password = self._get_prometheus_credentials()
+ if prometheus_user and prometheus_password:
+ prom_cred_hash = f'{utils.md5_hash(prometheus_user + prometheus_password)}'
+ if alertmanager_user and alertmanager_password:
+ alertmgr_cred_hash = f'{utils.md5_hash(alertmanager_user + alertmanager_password)}'
deps = []
if daemon_type == 'haproxy':
@@ -3052,9 +3083,10 @@ Then run the following:
else:
deps = [self.get_mgr_ip()]
elif daemon_type == 'prometheus':
- # for prometheus we add the active mgr as an explicit dependency,
- # this way we force a redeploy after a mgr failover
- deps.append(self.get_active_mgr().name())
+ if not mgmt_gw_enabled:
+ # for prometheus we add the active mgr as an explicit dependency,
+ # this way we force a redeploy after a mgr failover
+ deps.append(self.get_active_mgr().name())
deps.append(str(self.get_module_option_ex('prometheus', 'server_port', 9283)))
deps.append(str(self.service_discovery_port))
# prometheus yaml configuration file (generated by prometheus.yml.j2) contains
@@ -3071,22 +3103,20 @@ Then run the following:
deps += [d.name() for d in self.cache.get_daemons_by_service('ceph-exporter')]
deps += [d.name() for d in self.cache.get_daemons_by_service('mgmt-gateway')]
deps += [d.name() for d in self.cache.get_daemons_by_service('oauth2-proxy')]
- security_enabled, _, _ = self._get_security_config()
- if security_enabled:
- if prometheus_user and prometheus_password:
- deps.append(f'{hash(prometheus_user + prometheus_password)}')
- if alertmanager_user and alertmanager_password:
- deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
+ if prom_cred_hash is not None:
+ deps.append(prom_cred_hash)
+ if alertmgr_cred_hash is not None:
+ deps.append(alertmgr_cred_hash)
elif daemon_type == 'grafana':
deps += get_daemon_names(['prometheus', 'loki', 'mgmt-gateway', 'oauth2-proxy'])
- security_enabled, _, _ = self._get_security_config()
- if security_enabled and prometheus_user and prometheus_password:
- deps.append(f'{hash(prometheus_user + prometheus_password)}')
+ if prom_cred_hash is not None:
+ deps.append(prom_cred_hash)
elif daemon_type == 'alertmanager':
- deps += get_daemon_names(['mgr', 'alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy'])
- security_enabled, _, _ = self._get_security_config()
- if security_enabled and alertmanager_user and alertmanager_password:
- deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
+ deps += get_daemon_names(['alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy'])
+ if not mgmt_gw_enabled:
+ deps += get_daemon_names(['mgr'])
+ if alertmgr_cred_hash is not None:
+ deps.append(alertmgr_cred_hash)
elif daemon_type == 'promtail':
deps += get_daemon_names(['loki'])
elif daemon_type in ['ceph-exporter', 'node-exporter']:
@@ -3098,9 +3128,7 @@ Then run the following:
deps.append(build_url(host=dd.hostname, port=port).lstrip('/'))
deps = sorted(deps)
elif daemon_type == 'mgmt-gateway':
- # url_prefix for monitoring daemons depends on the presence of mgmt-gateway
- # while dashboard urls depend on the mgr daemons
- deps += get_daemon_names(['mgr', 'grafana', 'prometheus', 'alertmanager', 'oauth2-proxy'])
+ deps = MgmtGatewayService.get_dependencies(self)
else:
# this daemon type doesn't need deps mgmt
pass
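
Switching the credential fingerprint from Python's built-in hash() to utils.md5_hash (computed once in prom_cred_hash / alertmgr_cred_hash above) presumably matters because hash() of a string is salted per interpreter process, so the dependency string would differ after every mgr restart and needlessly force redeploys. A hedged sketch of the idea, with a local stand-in for utils.md5_hash and made-up credentials:

    import hashlib

    def md5_hash(s: str) -> str:
        # deterministic across processes, unlike the salted built-in hash()
        return hashlib.md5(s.encode()).hexdigest()

    deps = []
    prometheus_user, prometheus_password = 'admin', 's3cr3t'   # made-up values
    deps.append(md5_hash(prometheus_user + prometheus_password))
    # changing either credential changes the dep string, triggering a reconfig
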
@@ -3471,6 +3499,33 @@ Then run the following:
return f'Added setting {setting} with value {value} to tuned profile {profile_name}'
@handle_orch_error
+ def tuned_profile_add_settings(self, profile_name: str, settings: dict) -> str:
+ if profile_name not in self.tuned_profiles:
+ raise OrchestratorError(
+ f"Tuned profile {profile_name} does not exist. Cannot add setting."
+ )
+ self.tuned_profiles.add_settings(profile_name, settings)
+ results = [
+ f"Added setting {key} with value {value} to tuned profile {profile_name}"
+ for key, value in settings.items()
+ ]
+ self._kick_serve_loop()
+ return "\n".join(results)
+
+ @handle_orch_error
+ def tuned_profile_rm_settings(self, profile_name: str, settings: List[str]) -> str:
+ if profile_name not in self.tuned_profiles:
+ raise OrchestratorError(
+ f"Tuned profile {profile_name} does not exist. Cannot remove setting."
+ )
+ self.tuned_profiles.rm_settings(profile_name, settings)
+ results = [
f'Removed settings {settings} from tuned profile {profile_name}'
+ ]
+ self._kick_serve_loop()
+ return "\n".join(results)
+
+ @handle_orch_error
def tuned_profile_rm_setting(self, profile_name: str, setting: str) -> str:
if profile_name not in self.tuned_profiles:
raise OrchestratorError(
@@ -3958,6 +4013,7 @@ Then run the following:
return self.to_remove_osds.all_osds()
@handle_orch_error
+ @host_exists()
def drain_host(self, hostname: str, force: bool = False, keep_conf_keyring: bool = False, zap_osd_devices: bool = False) -> str:
"""
Drain all daemons from a host.
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 4a7959ae045..8e9cd00fa81 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -1436,8 +1436,24 @@ class CephadmServe:
config_blobs=daemon_spec.final_config,
).dump_json_str(),
use_current_daemon_image=reconfig,
+ error_ok=True
)
+ # return number corresponding to DAEMON_FAILED_ERROR
+ # in src/cephadm/cephadmlib/constants.
+ # TODO: link these together so one cannot be changed without the other
+ if code == 17:
+ # daemon failed on the systemctl start command, meaning that although
+ # deployment failed, the daemon is present; we should handle this as if
+ # the deploy command "succeeded" and mark the daemon as failed later,
+ # when we fetch its status
+ self.mgr.log.error(f'Deployment of {daemon_spec.name()} failed during "systemctl start" command')
+ elif code:
+ # some other failure earlier in the deploy process. Just raise an exception
+ # the same as we would in _run_cephadm on a nonzero rc
+ raise OrchestratorError(
+ f'cephadm exited with an error code: {code}, stderr: {err}')
+
if daemon_spec.daemon_type == 'agent':
self.mgr.agent_cache.agent_timestamp[daemon_spec.host] = datetime_now()
self.mgr.agent_cache.agent_counter[daemon_spec.host] = 1
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 9043577bc5a..04f5af28a9b 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -1015,12 +1015,6 @@ class RgwService(CephService):
# set rgw_realm rgw_zonegroup and rgw_zone, if present
self.set_realm_zg_zone(spec)
- if spec.generate_cert and not spec.rgw_frontend_ssl_certificate:
- # generate a self-signed cert for the rgw service
- cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames)
- spec.rgw_frontend_ssl_certificate = ''.join([key, cert])
- self.mgr.spec_store.save(spec)
-
if spec.rgw_frontend_ssl_certificate:
if isinstance(spec.rgw_frontend_ssl_certificate, list):
cert_data = '\n'.join(spec.rgw_frontend_ssl_certificate)
@@ -1068,6 +1062,19 @@ class RgwService(CephService):
# and it matches the spec.
port = spec.get_port()
+ if spec.generate_cert:
+ cert, key = self.mgr.cert_mgr.generate_cert(
+ daemon_spec.host,
+ self.mgr.inventory.get_addr(daemon_spec.host),
+ custom_san_list=spec.zonegroup_hostnames
+ )
+ pem = ''.join([key, cert])
+ ret, out, err = self.mgr.check_mon_command({
+ 'prefix': 'config-key set',
+ 'key': f'rgw/cert/{daemon_spec.name()}',
+ 'val': pem,
+ })
+
# configure frontend
args = []
ftype = spec.rgw_frontend_type or "beast"
@@ -1078,7 +1085,10 @@ class RgwService(CephService):
f"ssl_endpoint={build_url(host=daemon_spec.ip, port=port).lstrip('/')}")
else:
args.append(f"ssl_port={port}")
- args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}")
+ if spec.generate_cert:
+ args.append(f"ssl_certificate=config://rgw/cert/{daemon_spec.name()}")
+ else:
+ args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}")
else:
if daemon_spec.ip:
args.append(f"endpoint={build_url(host=daemon_spec.ip, port=port).lstrip('/')}")
@@ -1091,7 +1101,10 @@ class RgwService(CephService):
args.append(f"port={build_url(host=daemon_spec.ip, port=port).lstrip('/')}s")
else:
args.append(f"port={port}s") # note the 's' suffix on port
- args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}")
+ if spec.generate_cert:
+ args.append(f"ssl_certificate=config://rgw/cert/{daemon_spec.name()}")
+ else:
+ args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}")
else:
if daemon_spec.ip:
args.append(f"port={build_url(host=daemon_spec.ip, port=port).lstrip('/')}")
@@ -1180,6 +1193,10 @@ class RgwService(CephService):
'who': utils.name_to_config_section(daemon.name()),
'name': 'rgw_frontends',
})
+ self.mgr.check_mon_command({
+ 'prefix': 'config-key rm',
+ 'key': f'rgw/cert/{daemon.name()}',
+ })
def ok_to_stop(
self,
diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py
index a17000cd632..7381ef67d7e 100644
--- a/src/pybind/mgr/cephadm/services/ingress.py
+++ b/src/pybind/mgr/cephadm/services/ingress.py
@@ -241,7 +241,12 @@ class IngressService(CephService):
if spec.keepalived_password:
password = spec.keepalived_password
- daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
+ if spec.keepalive_only:
+ # when keepalive_only is used instead of haproxy, we have to monitor the backend service daemons
+ if spec.backend_service is not None:
+ daemons = self.mgr.cache.get_daemons_by_service(spec.backend_service)
+ else:
+ daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
if not daemons and not spec.keepalive_only:
raise OrchestratorError(
@@ -297,6 +302,10 @@ class IngressService(CephService):
port = d.ports[1] # monitoring port
host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname)
script = f'/usr/bin/curl {build_url(scheme="http", host=host_ip, port=port)}/health'
+ elif d.daemon_type == 'mgmt-gateway':
+ mgmt_gw_port = d.ports[0] if d.ports else None
+ host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname)
+ script = f'/usr/bin/curl -k {build_url(scheme="https", host=host_ip, port=mgmt_gw_port)}/health'
assert script
states = []
diff --git a/src/pybind/mgr/cephadm/services/mgmt_gateway.py b/src/pybind/mgr/cephadm/services/mgmt_gateway.py
index 1943264025e..0897ce99ff7 100644
--- a/src/pybind/mgr/cephadm/services/mgmt_gateway.py
+++ b/src/pybind/mgr/cephadm/services/mgmt_gateway.py
@@ -1,10 +1,12 @@
import logging
-from typing import List, Any, Tuple, Dict, cast, Optional
+from typing import List, Any, Tuple, Dict, cast, TYPE_CHECKING
from orchestrator import DaemonDescription
from ceph.deployment.service_spec import MgmtGatewaySpec, GrafanaSpec
from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec, get_dashboard_endpoints
+if TYPE_CHECKING:
+ from ..module import CephadmOrchestrator
logger = logging.getLogger(__name__)
@@ -36,10 +38,11 @@ class MgmtGatewayService(CephadmService):
# if empty list provided, return empty Daemon Desc
return DaemonDescription()
- def get_oauth2_service_url(self) -> Optional[str]:
- # TODO(redo): check how can we create several servers for HA
- oauth2_servers = self.get_service_endpoints('oauth2-proxy')
- return f'https://{oauth2_servers[0]}' if oauth2_servers else None
+ def get_mgmt_gw_ips(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> List[str]:
+ mgmt_gw_ips = [self.mgr.inventory.get_addr(daemon_spec.host)]
+ if svc_spec.virtual_ip is not None:
+ mgmt_gw_ips.append(svc_spec.virtual_ip)
+ return mgmt_gw_ips
def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
# we adjust the standby behaviour so rev-proxy can pick correctly the active instance
@@ -56,9 +59,9 @@ class MgmtGatewayService(CephadmService):
key = svc_spec.ssl_certificate_key
else:
# not provided on the spec, let's generate self-signed certificates
- addr = self.mgr.inventory.get_addr(daemon_spec.host)
+ ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec)
host_fqdn = self.mgr.get_fqdn(daemon_spec.host)
- cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, addr)
+ cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, ips)
# save certificates
if cert and key:
self.mgr.cert_key_store.save_cert('mgmt_gw_cert', cert)
@@ -67,23 +70,33 @@ class MgmtGatewayService(CephadmService):
logger.error("Failed to obtain certificate and key from mgmt-gateway.")
return cert, key
- def get_internal_certificates(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]:
- node_ip = self.mgr.inventory.get_addr(daemon_spec.host)
+ def get_internal_certificates(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]:
+ ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec)
host_fqdn = self.mgr.get_fqdn(daemon_spec.host)
- return self.mgr.cert_mgr.generate_cert(host_fqdn, node_ip)
+ return self.mgr.cert_mgr.generate_cert(host_fqdn, ips)
- def get_mgmt_gateway_deps(self) -> List[str]:
- # url_prefix for the following services depends on the presence of mgmt-gateway
- deps: List[str] = []
- deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('prometheus')]
- deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('alertmanager')]
- deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('grafana')]
- deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')]
+ def get_service_discovery_endpoints(self) -> List[str]:
+ sd_endpoints = []
for dd in self.mgr.cache.get_daemons_by_service('mgr'):
- # we consider mgr a dep even if the dashboard is disabled
- # in order to be consistent with _calc_daemon_deps().
- deps.append(dd.name())
+ assert dd.hostname is not None
+ addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
+ sd_endpoints.append(f"{addr}:{self.mgr.service_discovery_port}")
+ return sd_endpoints
+ @staticmethod
+ def get_dependencies(mgr: "CephadmOrchestrator") -> List[str]:
+ # url_prefix for the following services depends on the presence of mgmt-gateway
+ deps = [
+ f'{d.name()}:{d.ports[0]}' if d.ports else d.name()
+ for service in ['prometheus', 'alertmanager', 'grafana', 'oauth2-proxy']
+ for d in mgr.cache.get_daemons_by_service(service)
+ ]
+ # dashboard and service discovery urls depend on the mgr daemons
+ deps += [
+ f'{d.name()}'
+ for service in ['mgr']
+ for d in mgr.cache.get_daemons_by_service(service)
+ ]
return deps
def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
@@ -94,6 +107,8 @@ class MgmtGatewayService(CephadmService):
prometheus_endpoints = self.get_service_endpoints('prometheus')
alertmanager_endpoints = self.get_service_endpoints('alertmanager')
grafana_endpoints = self.get_service_endpoints('grafana')
+ oauth2_proxy_endpoints = self.get_service_endpoints('oauth2-proxy')
+ service_discovery_endpoints = self.get_service_discovery_endpoints()
try:
grafana_spec = cast(GrafanaSpec, self.mgr.spec_store['grafana'].spec)
grafana_protocol = grafana_spec.protocol
@@ -104,7 +119,9 @@ class MgmtGatewayService(CephadmService):
'dashboard_endpoints': dashboard_endpoints,
'prometheus_endpoints': prometheus_endpoints,
'alertmanager_endpoints': alertmanager_endpoints,
- 'grafana_endpoints': grafana_endpoints
+ 'grafana_endpoints': grafana_endpoints,
+ 'oauth2_proxy_endpoints': oauth2_proxy_endpoints,
+ 'service_discovery_endpoints': service_discovery_endpoints
}
server_context = {
'spec': svc_spec,
@@ -117,11 +134,12 @@ class MgmtGatewayService(CephadmService):
'prometheus_endpoints': prometheus_endpoints,
'alertmanager_endpoints': alertmanager_endpoints,
'grafana_endpoints': grafana_endpoints,
- 'oauth2_proxy_url': self.get_oauth2_service_url(),
+ 'service_discovery_endpoints': service_discovery_endpoints,
+ 'enable_oauth2_proxy': bool(oauth2_proxy_endpoints),
}
cert, key = self.get_external_certificates(svc_spec, daemon_spec)
- internal_cert, internal_pkey = self.get_internal_certificates(daemon_spec)
+ internal_cert, internal_pkey = self.get_internal_certificates(svc_spec, daemon_spec)
daemon_config = {
"files": {
"nginx.conf": self.mgr.template.render(self.SVC_TEMPLATE_PATH, main_context),
@@ -136,7 +154,7 @@ class MgmtGatewayService(CephadmService):
daemon_config["files"]["nginx.crt"] = cert
daemon_config["files"]["nginx.key"] = key
- return daemon_config, sorted(self.get_mgmt_gateway_deps())
+ return daemon_config, sorted(MgmtGatewayService.get_dependencies(self.mgr))
def pre_remove(self, daemon: DaemonDescription) -> None:
"""
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 6a57e3b31ef..1b9cf618570 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -8,10 +8,11 @@ from mgr_module import HandleCommandResult
from orchestrator import DaemonDescription
from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, \
- SNMPGatewaySpec, PrometheusSpec
+ SNMPGatewaySpec, PrometheusSpec, MgmtGatewaySpec
from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec, get_dashboard_urls
from mgr_util import verify_tls, ServerConfigException, build_url, get_cert_issuer_info, password_hash
from ceph.deployment.utils import wrap_ipv6
+from .. import utils
logger = logging.getLogger(__name__)
@@ -57,15 +58,17 @@ class GrafanaService(CephadmService):
daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to}
grafana_ip = ip_to_bind_to
- mgmt_gw_ip = None
domain = self.mgr.get_fqdn(daemon_spec.host)
+ mgmt_gw_ips = []
if mgmt_gw_enabled:
mgmt_gw_daemons = self.mgr.cache.get_daemons_by_service('mgmt-gateway')
if mgmt_gw_daemons:
dd = mgmt_gw_daemons[0]
assert dd.hostname
- domain = self.mgr.get_fqdn(dd.hostname)
- mgmt_gw_ip = self.mgr.inventory.get_addr(dd.hostname)
+ mgmt_gw_spec = cast(MgmtGatewaySpec, self.mgr.spec_store['mgmt-gateway'].spec)
+ # TODO(redo): should we resolve the virtual_ip to a name if possible?
+                domain = mgmt_gw_spec.virtual_ip or self.mgr.get_fqdn(dd.hostname)  # give priority to the VIP if configured
+ mgmt_gw_ips = [self.mgr.inventory.get_addr(dd.hostname) for dd in mgmt_gw_daemons] # type: ignore
return self.mgr.template.render('services/grafana/grafana.ini.j2', {
'anonymous_access': spec.anonymous_access,
@@ -76,7 +79,7 @@ class GrafanaService(CephadmService):
'domain': domain,
'mgmt_gw_enabled': mgmt_gw_enabled,
'oauth2_enabled': oauth2_enabled,
- 'mgmt_gw_ip': mgmt_gw_ip,
+ 'mgmt_gw_ips': ','.join(mgmt_gw_ips),
})
def calculate_grafana_deps(self, security_enabled: bool) -> List[str]:
@@ -87,7 +90,7 @@ class GrafanaService(CephadmService):
# in case security is enabled we have to reconfig when prom user/pass changes
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
if security_enabled and prometheus_user and prometheus_password:
- deps.append(f'{hash(prometheus_user + prometheus_password)}')
+ deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}')
# adding a dependency for mgmt-gateway because the usage of url_prefix relies on its presence.
# another dependency is added for oauth-proxy as Grafana login is delegated to this service when enabled.
@@ -311,17 +314,18 @@ class AlertmanagerService(CephadmService):
        # add a dependency since enabling basic-auth (or not) depends on the existence of 'oauth2-proxy'
deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')]
- # scan all mgrs to generate deps and to get standbys too.
- for dd in self.mgr.cache.get_daemons_by_service('mgr'):
- # we consider mgr a dep even if the dashboard is disabled
- # in order to be consistent with _calc_daemon_deps().
- deps.append(dd.name())
-
security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config()
if mgmt_gw_enabled:
dashboard_urls = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}/dashboard']
else:
dashboard_urls = get_dashboard_urls(self)
+ # scan all mgrs to generate deps and to get standbys too.
+ for dd in self.mgr.cache.get_daemons_by_service('mgr'):
+ # we consider mgr a dep even if the dashboard is disabled
+ # in order to be consistent with _calc_daemon_deps().
+            # when mgmt_gw is enabled there's no need for the mgr dep as
+            # mgmt-gw will route to the active mgr automatically
+ deps.append(dd.name())
snmp_gateway_urls: List[str] = []
for dd in self.mgr.cache.get_daemons_by_service('snmp-gateway'):
@@ -354,7 +358,7 @@ class AlertmanagerService(CephadmService):
if security_enabled:
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
if alertmanager_user and alertmanager_password:
- deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
+ deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}')
cert, key = self.get_alertmanager_certificates(daemon_spec)
context = {
'enable_mtls': mgmt_gw_enabled,
@@ -489,8 +493,14 @@ class PrometheusService(CephadmService):
security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config()
port = self.mgr.service_discovery_port
mgr_addr = wrap_ipv6(self.mgr.get_mgr_ip())
+
protocol = 'https' if security_enabled else 'http'
- srv_end_point = f'{protocol}://{mgr_addr}:{port}/sd/prometheus/sd-config?'
+ if mgmt_gw_enabled:
+ service_discovery_url_prefix = f'{self.mgr.get_mgmt_gw_internal_endpoint()}'
+ else:
+ service_discovery_url_prefix = f'{protocol}://{mgr_addr}:{port}'
+ srv_end_point = f'{service_discovery_url_prefix}/sd/prometheus/sd-config?'
node_exporter_cnt = len(self.mgr.cache.get_daemons_by_service('node-exporter'))
alertmgr_cnt = len(self.mgr.cache.get_daemons_by_service('alertmanager'))
@@ -617,18 +627,23 @@ class PrometheusService(CephadmService):
port = cast(int, self.mgr.get_module_option_ex('prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT))
deps.append(str(port))
deps.append(str(self.mgr.service_discovery_port))
- # add an explicit dependency on the active manager. This will force to
- # re-deploy prometheus if the mgr has changed (due to a fail-over i.e).
- deps.append(self.mgr.get_active_mgr().name())
deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}')
- security_enabled, _, _ = self.mgr._get_security_config()
+ security_enabled, mgmt_gw_enabled, _ = self.mgr._get_security_config()
+
+ if not mgmt_gw_enabled:
+            # add an explicit dependency on the active manager. This will force a
+            # re-deploy of prometheus if the mgr has changed (e.g. due to a fail-over).
+            # when mgmt_gw is enabled there's no need for such a dep as mgmt-gw will
+ # route to the active mgr automatically
+ deps.append(self.mgr.get_active_mgr().name())
+
if security_enabled:
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
if prometheus_user and prometheus_password:
- deps.append(f'{hash(prometheus_user + prometheus_password)}')
+ deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}')
if alertmanager_user and alertmanager_password:
- deps.append(f'{hash(alertmanager_user + alertmanager_password)}')
+ deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}')
# add a dependency since url_prefix depends on the existence of mgmt-gateway
deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('mgmt-gateway')]
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 162815da24c..418be93b6af 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -55,7 +55,7 @@ class NvmeofService(CephService):
'addr': addr,
'discovery_addr': discovery_addr,
'port': spec.port,
- 'spdk_log_level': 'WARNING',
+ 'spdk_log_level': '',
'rpc_socket_dir': '/var/tmp/',
'rpc_socket_name': 'spdk.sock',
'transport_tcp_options': transport_tcp_options,
@@ -66,6 +66,10 @@ class NvmeofService(CephService):
daemon_spec.keyring = keyring
daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf}
+ # Indicate to the daemon whether to utilize huge pages
+ if spec.spdk_mem_size:
+ daemon_spec.extra_files['spdk_mem_size'] = str(spec.spdk_mem_size)
+
if spec.enable_auth:
if (
not spec.client_cert
@@ -185,19 +189,21 @@ class NvmeofService(CephService):
# to clean the keyring up
super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
service_name = daemon.service_name()
+ daemon_name = daemon.name()
# remove config for dashboard nvmeof gateways if any
- ret, out, err = self.mgr.mon_command({
+ ret, _, err = self.mgr.mon_command({
'prefix': 'dashboard nvmeof-gateway-rm',
'name': service_name,
+ 'daemon_name': daemon_name
})
if not ret:
- logger.info(f'{daemon.hostname} removed from nvmeof gateways dashboard config')
+ logger.info(f'{daemon_name} removed from nvmeof gateways dashboard config')
spec = cast(NvmeofServiceSpec,
self.mgr.spec_store.all_specs.get(daemon.service_name(), None))
if not spec:
- self.mgr.log.error(f'Failed to find spec for {daemon.name()}')
+ self.mgr.log.error(f'Failed to find spec for {daemon_name}')
return
pool = spec.pool
group = spec.group
diff --git a/src/pybind/mgr/cephadm/ssl_cert_utils.py b/src/pybind/mgr/cephadm/ssl_cert_utils.py
index 930b276c8de..467b32a4df0 100644
--- a/src/pybind/mgr/cephadm/ssl_cert_utils.py
+++ b/src/pybind/mgr/cephadm/ssl_cert_utils.py
@@ -70,7 +70,12 @@ class SSLCerts:
return (cert_str, key_str)
- def generate_cert(self, _hosts: Union[str, List[str]], _addrs: Union[str, List[str]]) -> Tuple[str, str]:
+ def generate_cert(
+ self,
+ _hosts: Union[str, List[str]],
+ _addrs: Union[str, List[str]],
+ custom_san_list: Optional[List[str]] = None,
+ ) -> Tuple[str, str]:
addrs = [_addrs] if isinstance(_addrs, str) else _addrs
hosts = [_hosts] if isinstance(_hosts, str) else _hosts
@@ -97,6 +102,8 @@ class SSLCerts:
san_list: List[x509.GeneralName] = [x509.DNSName(host) for host in hosts]
if valid_ips:
san_list.extend(ips)
+ if custom_san_list:
+ san_list.extend([x509.DNSName(n) for n in custom_san_list])
builder = builder.add_extension(
x509.SubjectAlternativeName(
diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
index 967f1355af1..c767baddbb7 100644
--- a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
+++ b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
@@ -39,7 +39,7 @@
header_property = username
auto_sign_up = true
sync_ttl = 15
- whitelist = {{ mgmt_gw_ip }}
+ whitelist = {{ mgmt_gw_ips }}
headers_encoded = false
enable_login_token = false
headers = Role:X-WEBAUTH-ROLE
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
index 594582e7ee4..50a61f843d1 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
@@ -46,15 +46,15 @@ server {
# add_header Content-Security-Policy "default-src 'self'; script-src 'self'; object-src 'none'; base-uri 'none'; require-trusted-types-for 'script'; frame-ancestors 'self';";
{% endif %}
-{% if spec.enable_health_check_endpoint %}
+{% if spec.enable_health_check_endpoint or spec.virtual_ip %}
location /health {
return 200 'OK';
add_header Content-Type text/plain;
}
{% endif %}
-{% if oauth2_proxy_url %}
+{% if enable_oauth2_proxy %}
location /oauth2/ {
- proxy_pass {{ oauth2_proxy_url }};
+ proxy_pass https://oauth2_proxy_servers;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Scheme $scheme;
@@ -64,7 +64,7 @@ server {
location = /oauth2/auth {
internal;
- proxy_pass {{ oauth2_proxy_url }};
+ proxy_pass https://oauth2_proxy_servers;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Scheme $scheme;
@@ -78,7 +78,7 @@ server {
location / {
proxy_pass {{ dashboard_scheme }}://dashboard_servers;
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
- {% if oauth2_proxy_url %}
+ {% if enable_oauth2_proxy %}
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
@@ -120,7 +120,7 @@ server {
# will send this header if Grafana is running on the same node as one of those services
proxy_set_header Authorization "";
proxy_buffering off;
- {% if oauth2_proxy_url %}
+ {% if enable_oauth2_proxy %}
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
@@ -156,7 +156,7 @@ server {
proxy_ssl_trusted_certificate /etc/nginx/ssl/ca.crt;
proxy_ssl_verify on;
proxy_ssl_verify_depth 2;
- {% if oauth2_proxy_url %}
+ {% if enable_oauth2_proxy %}
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
@@ -180,7 +180,7 @@ server {
proxy_ssl_trusted_certificate /etc/nginx/ssl/ca.crt;
proxy_ssl_verify on;
proxy_ssl_verify_depth 2;
- {% if oauth2_proxy_url %}
+ {% if enable_oauth2_proxy %}
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
index 9148ddc4a14..2abb24b2eba 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
@@ -12,12 +12,20 @@ server {
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305;
ssl_prefer_server_ciphers on;
-{% if spec.enable_health_check_endpoint %}
+{% if spec.enable_health_check_endpoint or spec.virtual_ip %}
location /health {
return 200 'OK';
add_header Content-Type text/plain;
}
{% endif %}
+{% if service_discovery_endpoints %}
+ location /internal/sd {
+ rewrite ^/internal/(.*) /$1 break;
+ proxy_pass https://service_discovery_servers;
+ proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
+ }
+{% endif %}
+
{% if dashboard_endpoints %}
location /internal/dashboard {
rewrite ^/internal/dashboard/(.*) /$1 break;
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
index 0c2a6b98c3b..b9773ceeeb3 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
@@ -8,6 +8,7 @@ events {
http {
+ #access_log /dev/stdout;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
@@ -16,6 +17,22 @@ http {
proxy_headers_hash_max_size 1024;
proxy_headers_hash_bucket_size 128;
+{% if oauth2_proxy_endpoints %}
+ upstream oauth2_proxy_servers {
+ {% for ep in oauth2_proxy_endpoints %}
+ server {{ ep }};
+ {% endfor %}
+ }
+{% endif %}
+
+{% if service_discovery_endpoints %}
+ upstream service_discovery_servers {
+ {% for ep in service_discovery_endpoints %}
+ server {{ ep }};
+ {% endfor %}
+ }
+{% endif %}
+
{% if dashboard_endpoints %}
upstream dashboard_servers {
{% for ep in dashboard_endpoints %}
diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
index 03ff8a32ca2..b85ccd7b7fb 100644
--- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
@@ -2,7 +2,9 @@
NFS_CORE_PARAM {
Enable_NLM = {{ enable_nlm }};
Enable_RQUOTA = false;
- Protocols = 4;
+ Protocols = 3, 4;
+ mount_path_pseudo = true;
+ Enable_UDP = false;
NFS_Port = {{ port }};
allow_set_io_flusher_fail = true;
{% if bind_addr %}
diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
index dbe29004771..fed2a1be82b 100644
--- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
@@ -20,6 +20,12 @@ allowed_consecutive_spdk_ping_failures = {{ spec.allowed_consecutive_spdk_ping_f
spdk_ping_interval_in_seconds = {{ spec.spdk_ping_interval_in_seconds }}
ping_spdk_under_lock = {{ spec.ping_spdk_under_lock }}
enable_monitor_client = {{ spec.enable_monitor_client }}
+max_hosts_per_namespace = {{ spec.max_hosts_per_namespace }}
+max_namespaces_with_netmask = {{ spec.max_namespaces_with_netmask }}
+max_subsystems = {{ spec.max_subsystems }}
+max_namespaces = {{ spec.max_namespaces }}
+max_namespaces_per_subsystem = {{ spec.max_namespaces_per_subsystem }}
+max_hosts_per_subsystem = {{ spec.max_hosts_per_subsystem }}
[gateway-logs]
log_level = {{ spec.log_level }}
@@ -53,8 +59,19 @@ rpc_socket_dir = {{ spec.rpc_socket_dir }}
rpc_socket_name = {{ spec.rpc_socket_name }}
timeout = {{ spec.spdk_timeout }}
bdevs_per_cluster = {{ spec.bdevs_per_cluster }}
+{% if spec.spdk_log_level %}
log_level = {{ spec.spdk_log_level }}
+{% endif %}
+{% if spec.spdk_protocol_log_level %}
+protocol_log_level = {{ spec.spdk_protocol_log_level }}
+{% endif %}
+{% if spec.spdk_log_file_dir %}
+log_file_dir = {{ spec.spdk_log_file_dir }}
+{% endif %}
conn_retries = {{ spec.conn_retries }}
+{% if spec.spdk_mem_size %}
+mem_size = {{ spec.spdk_mem_size }}
+{% endif %}
transports = {{ spec.transports }}
{% if transport_tcp_options %}
transport_tcp_options = {{ transport_tcp_options }}
@@ -65,4 +82,7 @@ tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }}
[monitor]
timeout = {{ spec.monitor_timeout }}
+{% if spec.monitor_client_log_file_dir %}
+log_file_dir = {{ spec.monitor_client_log_file_dir }}
+{% endif %}
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 975c125225d..3c647476e44 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -136,7 +136,7 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str
mock.call(host, 'osd', 'ceph-volume',
['--', 'lvm', 'list', '--format', 'json'],
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
- mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False),
+ mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, error_ok=True, use_current_daemon_image=False),
mock.call(host, 'osd', 'ceph-volume',
['--', 'raw', 'list', '--format', 'json'],
no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False),
@@ -499,7 +499,7 @@ class TestCephadm(object):
CephadmServe(cephadm_module)._check_daemons()
- assert _save_host.called_with('test')
+ _save_host.assert_called_with('test')
assert cephadm_module.cache.get_scheduled_daemon_action('test', daemon_name) is None
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -563,6 +563,7 @@ class TestCephadm(object):
},
},
}),
+ error_ok=True,
use_current_daemon_image=True,
)
@@ -618,6 +619,7 @@ class TestCephadm(object):
"crush_location": "datacenter=a",
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -660,6 +662,7 @@ class TestCephadm(object):
"keyring": "[client.crash.test]\nkey = None\n",
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -702,6 +705,7 @@ class TestCephadm(object):
},
"config_blobs": {},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -752,6 +756,7 @@ class TestCephadm(object):
},
"config_blobs": {},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -806,6 +811,7 @@ class TestCephadm(object):
},
"config_blobs": {},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2720,6 +2726,7 @@ Traceback (most recent call last):
cephadm_module.cache.facts = facts
assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value
+ @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None)
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
def test_tuned_profiles_validation(self, cephadm_module):
with with_host(cephadm_module, 'test'):
@@ -2840,16 +2847,23 @@ Traceback (most recent call last):
with cephadm_module.async_timeout_handler('hostC', 'very slow', 999):
cephadm_module.wait_async(_timeout())
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
@mock.patch("cephadm.CephadmOrchestrator.remove_osds")
@mock.patch("cephadm.CephadmOrchestrator.add_host_label", lambda *a, **kw: None)
@mock.patch("cephadm.inventory.HostCache.get_daemons_by_host", lambda *a, **kw: [])
def test_host_drain_zap(self, _rm_osds, cephadm_module):
# pass force=true in these tests to bypass _admin label check
- cephadm_module.drain_host('host1', force=True, zap_osd_devices=False)
- assert _rm_osds.called_with([], zap=False)
+ with with_host(cephadm_module, 'test', refresh_hosts=False, rm_with_force=True):
+ cephadm_module.drain_host('test', force=True, zap_osd_devices=False)
+ _rm_osds.assert_called_with([], zap=False)
+
+ with with_host(cephadm_module, 'test', refresh_hosts=False, rm_with_force=True):
+ cephadm_module.drain_host('test', force=True, zap_osd_devices=True)
+ _rm_osds.assert_called_with([], zap=True)
- cephadm_module.drain_host('host1', force=True, zap_osd_devices=True)
- assert _rm_osds.called_with([], zap=True)
+ with pytest.raises(OrchestratorError, match=r"Cannot find host 'host1' in the inventory."):
+ cephadm_module.drain_host('host1', force=True, zap_osd_devices=True)
+ _rm_osds.assert_called_with([], zap=True)
def test_process_ls_output(self, cephadm_module):
sample_ls_output = """[
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 4b11a588ad3..83da1fa4232 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -349,6 +349,7 @@ log_to_file = False"""
},
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -409,6 +410,12 @@ allowed_consecutive_spdk_ping_failures = 1
spdk_ping_interval_in_seconds = 2.0
ping_spdk_under_lock = False
enable_monitor_client = True
+max_hosts_per_namespace = 1
+max_namespaces_with_netmask = 1000
+max_subsystems = 128
+max_namespaces = 1024
+max_namespaces_per_subsystem = 256
+max_hosts_per_subsystem = 32
[gateway-logs]
log_level = INFO
@@ -442,7 +449,7 @@ rpc_socket_dir = /var/tmp/
rpc_socket_name = spdk.sock
timeout = 60.0
bdevs_per_cluster = 32
-log_level = WARNING
+protocol_log_level = WARNING
conn_retries = 10
transports = tcp
transport_tcp_options = {{"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}}
@@ -487,6 +494,7 @@ timeout = 1.0\n"""
}
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -599,6 +607,7 @@ class TestMonitoring:
"use_url_prefix": False,
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -694,6 +703,7 @@ class TestMonitoring:
"use_url_prefix": True,
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -786,6 +796,7 @@ class TestMonitoring:
"use_url_prefix": False,
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -831,6 +842,7 @@ class TestMonitoring:
"files": {
"ceph-exporter.crt": "mycert",
"ceph-exporter.key": "mykey"}}}),
+ error_ok=True,
use_current_daemon_image=False)
@patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -874,6 +886,7 @@ class TestMonitoring:
},
"config_blobs": {}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -934,6 +947,7 @@ class TestMonitoring:
'web_config': '/etc/node-exporter/web.yml',
}
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1069,6 +1083,7 @@ class TestMonitoring:
"use_url_prefix": False
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1301,6 +1316,7 @@ class TestMonitoring:
"use_url_prefix": False
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1377,6 +1393,7 @@ class TestMonitoring:
},
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1436,6 +1453,7 @@ class TestMonitoring:
},
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1586,6 +1604,7 @@ class TestMonitoring:
"files": files,
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1716,6 +1735,7 @@ class TestMonitoring:
"files": files,
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -1829,6 +1849,7 @@ class TestMonitoring:
"files": files,
},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2003,6 +2024,7 @@ spec:
},
"config_blobs": {},
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2110,6 +2132,7 @@ class TestSNMPGateway:
},
"config_blobs": config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2158,6 +2181,7 @@ class TestSNMPGateway:
},
"config_blobs": config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2210,6 +2234,7 @@ class TestSNMPGateway:
},
"config_blobs": config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -2267,6 +2292,7 @@ class TestSNMPGateway:
},
"config_blobs": config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3187,7 +3213,9 @@ class TestIngressService:
'NFS_CORE_PARAM {\n'
' Enable_NLM = true;\n'
' Enable_RQUOTA = false;\n'
- ' Protocols = 4;\n'
+ ' Protocols = 3, 4;\n'
+ ' mount_path_pseudo = true;\n'
+ ' Enable_UDP = false;\n'
' NFS_Port = 2049;\n'
' allow_set_io_flusher_fail = true;\n'
' HAProxy_Hosts = 192.168.122.111, 10.10.2.20, 192.168.122.222;\n'
@@ -3357,6 +3385,7 @@ class TestJaeger:
},
"config_blobs": config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3397,6 +3426,7 @@ class TestJaeger:
},
"config_blobs": es_config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
with with_service(cephadm_module, collector_spec):
@@ -3425,6 +3455,7 @@ class TestJaeger:
},
"config_blobs": collector_config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3465,6 +3496,7 @@ class TestJaeger:
},
"config_blobs": collector_config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
with with_service(cephadm_module, agent_spec):
@@ -3493,6 +3525,7 @@ class TestJaeger:
},
"config_blobs": agent_config,
}),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3550,6 +3583,7 @@ class TestCustomContainer:
},
}
),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3637,6 +3671,7 @@ class TestCustomContainer:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3690,6 +3725,7 @@ class TestSMB:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3762,6 +3798,7 @@ class TestSMB:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -3769,14 +3806,19 @@ class TestSMB:
class TestMgmtGateway:
@patch("cephadm.serve.CephadmServe._run_cephadm")
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints")
+ @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_discovery_endpoints")
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates",
lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates",
- lambda instance, dspec: (ceph_generated_cert, ceph_generated_key))
+ lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
@patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
- def test_mgmt_gw_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
+ def test_mgmt_gateway_config_no_auth(self,
+ get_service_discovery_endpoints_mock: List[str],
+ get_service_endpoints_mock: List[str],
+ _run_cephadm,
+ cephadm_module: CephadmOrchestrator):
def get_services_endpoints(name):
if name == 'prometheus':
@@ -3789,6 +3831,7 @@ class TestMgmtGateway:
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
get_service_endpoints_mock.side_effect = get_services_endpoints
+ get_service_discovery_endpoints_mock.side_effect = lambda: ["ceph-node-0:8765", "ceph-node-2:8765"]
server_port = 5555
spec = MgmtGatewaySpec(port=server_port,
@@ -3823,6 +3866,7 @@ class TestMgmtGateway:
http {
+ #access_log /dev/stdout;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
@@ -3831,6 +3875,12 @@ class TestMgmtGateway:
proxy_headers_hash_max_size 1024;
proxy_headers_hash_bucket_size 128;
+
+ upstream service_discovery_servers {
+ server ceph-node-0:8765;
+ server ceph-node-2:8765;
+ }
+
upstream dashboard_servers {
server ceph-node-2:8443;
server ceph-node-2:8443;
@@ -3938,6 +3988,12 @@ class TestMgmtGateway:
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305;
ssl_prefer_server_ciphers on;
+ location /internal/sd {
+ rewrite ^/internal/(.*) /$1 break;
+ proxy_pass https://service_discovery_servers;
+ proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
+ }
+
location /internal/dashboard {
rewrite ^/internal/dashboard/(.*) /$1 break;
proxy_pass https://dashboard_servers;
@@ -3988,20 +4044,25 @@ class TestMgmtGateway:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
@patch("cephadm.serve.CephadmServe._run_cephadm")
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints")
+ @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_discovery_endpoints")
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates",
lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates",
- lambda instance, dspec: (ceph_generated_cert, ceph_generated_key))
+ lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
@patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
- @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_oauth2_service_url", lambda _: "https://192.168.100.102:4180")
- def test_mgmt_gw_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
+ def test_mgmt_gateway_config_with_auth(self,
+ get_service_discovery_endpoints_mock: List[str],
+ get_service_endpoints_mock: List[str],
+ _run_cephadm,
+ cephadm_module: CephadmOrchestrator):
def get_services_endpoints(name):
if name == 'prometheus':
@@ -4010,10 +4071,13 @@ class TestMgmtGateway:
return ["ceph-node-2:3000", "ceph-node-2:3000"]
elif name == 'alertmanager':
return ["192.168.100.100:9093", "192.168.100.102:9093"]
+ elif name == 'oauth2-proxy':
+ return ["192.168.100.101:4180", "192.168.100.102:4180"]
return []
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
get_service_endpoints_mock.side_effect = get_services_endpoints
+ get_service_discovery_endpoints_mock.side_effect = lambda: ["ceph-node-0:8765", "ceph-node-2:8765"]
server_port = 5555
spec = MgmtGatewaySpec(port=server_port,
@@ -4049,6 +4113,7 @@ class TestMgmtGateway:
http {
+ #access_log /dev/stdout;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
@@ -4057,6 +4122,16 @@ class TestMgmtGateway:
proxy_headers_hash_max_size 1024;
proxy_headers_hash_bucket_size 128;
+ upstream oauth2_proxy_servers {
+ server 192.168.100.101:4180;
+ server 192.168.100.102:4180;
+ }
+
+ upstream service_discovery_servers {
+ server ceph-node-0:8765;
+ server ceph-node-2:8765;
+ }
+
upstream dashboard_servers {
server ceph-node-2:8443;
server ceph-node-2:8443;
@@ -4117,7 +4192,7 @@ class TestMgmtGateway:
# add_header Content-Security-Policy "default-src 'self'; script-src 'self'; object-src 'none'; base-uri 'none'; require-trusted-types-for 'script'; frame-ancestors 'self';";
location /oauth2/ {
- proxy_pass https://192.168.100.102:4180;
+ proxy_pass https://oauth2_proxy_servers;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Scheme $scheme;
@@ -4127,7 +4202,7 @@ class TestMgmtGateway:
location = /oauth2/auth {
internal;
- proxy_pass https://192.168.100.102:4180;
+ proxy_pass https://oauth2_proxy_servers;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Scheme $scheme;
@@ -4255,6 +4330,12 @@ class TestMgmtGateway:
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305;
ssl_prefer_server_ciphers on;
+ location /internal/sd {
+ rewrite ^/internal/(.*) /$1 break;
+ proxy_pass https://service_discovery_servers;
+ proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
+ }
+
location /internal/dashboard {
rewrite ^/internal/dashboard/(.*) /$1 break;
proxy_pass https://dashboard_servers;
@@ -4305,6 +4386,7 @@ class TestMgmtGateway:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
@@ -4313,12 +4395,26 @@ class TestMgmtGateway:
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates",
lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates",
- lambda instance, dspec: (ceph_generated_cert, ceph_generated_key))
+ lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
@patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
- def test_oauth2_proxy_service(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
+ def test_oauth2_proxy_service(self, get_service_endpoints_mock, _run_cephadm, cephadm_module):
+ self.oauth2_proxy_service_common(get_service_endpoints_mock, _run_cephadm, cephadm_module, virtual_ip=None)
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints")
+ @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates",
+ lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
+ @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates",
+ lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key))
+ @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
+ @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
+ @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
+ def test_oauth2_proxy_service_with_ha(self, get_service_endpoints_mock, _run_cephadm, cephadm_module):
+ self.oauth2_proxy_service_common(get_service_endpoints_mock, _run_cephadm, cephadm_module, virtual_ip="192.168.100.200")
+
+ def oauth2_proxy_service_common(self, get_service_endpoints_mock, _run_cephadm, cephadm_module: CephadmOrchestrator, virtual_ip=None):
def get_services_endpoints(name):
if name == 'prometheus':
return ["192.168.100.100:9095", "192.168.100.101:9095"]
@@ -4335,7 +4431,8 @@ class TestMgmtGateway:
mgmt_gw_spec = MgmtGatewaySpec(port=server_port,
ssl_certificate=ceph_generated_cert,
ssl_certificate_key=ceph_generated_key,
- enable_auth=True)
+ enable_auth=True,
+ virtual_ip=virtual_ip)
oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider',
client_id='my_client_id',
@@ -4344,6 +4441,8 @@ class TestMgmtGateway:
cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=',
ssl_certificate=ceph_generated_cert,
ssl_certificate_key=ceph_generated_key)
+
+ redirect_url = f"https://{virtual_ip if virtual_ip else 'host_fqdn'}:5555/oauth2/callback"
expected = {
"fsid": "fsid",
"name": "oauth2-proxy.ceph-node",
@@ -4362,7 +4461,7 @@ class TestMgmtGateway:
},
"config_blobs": {
"files": {
- "oauth2-proxy.conf": dedent("""
+ "oauth2-proxy.conf": dedent(f"""
# Listen on port 4180 for incoming HTTP traffic.
https_address= "0.0.0.0:4180"
@@ -4375,7 +4474,7 @@ class TestMgmtGateway:
client_id= "my_client_id"
client_secret= "my_client_secret"
oidc_issuer_url= "http://192.168.10.10:8888/dex"
- redirect_url= "https://host_fqdn:5555/oauth2/callback"
+ redirect_url= "{redirect_url}"
ssl_insecure_skip_verify=true
@@ -4411,5 +4510,6 @@ class TestMgmtGateway:
['_orch', 'deploy'],
[],
stdin=json.dumps(expected),
+ error_ok=True,
use_current_daemon_image=False,
)
diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py
index 3673fbf621c..edd775aa178 100644
--- a/src/pybind/mgr/cephadm/utils.py
+++ b/src/pybind/mgr/cephadm/utils.py
@@ -5,6 +5,7 @@ from enum import Enum
from functools import wraps
from typing import Optional, Callable, TypeVar, List, NewType, TYPE_CHECKING, Any, NamedTuple
from orchestrator import OrchestratorError
+import hashlib
if TYPE_CHECKING:
from cephadm import CephadmOrchestrator
@@ -154,3 +155,9 @@ def file_mode_to_str(mode: int) -> str:
f'{"x" if (mode >> shift) & 1 else "-"}'
) + r
return r
+
+
+def md5_hash(input_value: str) -> str:
+ input_str = str(input_value).encode('utf-8')
+ hash_object = hashlib.md5(input_str)
+ return hash_object.hexdigest()
diff --git a/src/pybind/mgr/dashboard/cherrypy_backports.py b/src/pybind/mgr/dashboard/cherrypy_backports.py
deleted file mode 100644
index 8871004fed2..00000000000
--- a/src/pybind/mgr/dashboard/cherrypy_backports.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Copyright © 2004-2019, CherryPy Team (team@cherrypy.org)
-
-All rights reserved.
-
-* * *
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-* Neither the name of CherryPy nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-
-from pkg_resources import parse_version
-
-# The SSL code in CherryPy 3.5.0 is buggy. It was fixed long ago,
-# but 3.5.0 is still shipping in major linux distributions
-# (Fedora 27, Ubuntu Xenial), so we must monkey patch it to get SSL working.
-
-
-def patch_http_connection_init(v):
- # It was fixed in 3.7.0. Exact lower bound version is probably earlier,
- # but 3.5.0 is what this monkey patch is tested on.
- if parse_version("3.5.0") <= v < parse_version("3.7.0"):
- from cherrypy.wsgiserver.wsgiserver2 import CP_fileobject, HTTPConnection
-
- def fixed_init(hc_self, server, sock, makefile=CP_fileobject):
- hc_self.server = server
- hc_self.socket = sock
- hc_self.rfile = makefile(sock, "rb", hc_self.rbufsize)
- hc_self.wfile = makefile(sock, "wb", hc_self.wbufsize)
- hc_self.requests_seen = 0
-
- HTTPConnection.__init__ = fixed_init
-
-
-# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
-# that the ports its listening on are in fact bound. When using the any address
-# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
-# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
-# exception.
-def skip_wait_for_occupied_port(v):
- # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
- # centos:7) and back to at least 3.0.0.
- if parse_version("3.1.2") <= v < parse_version("3.2.3"):
- # https://github.com/cherrypy/cherrypy/issues/1100
- from cherrypy.process import servers
- servers.wait_for_occupied_port = lambda host, port: None
-
-
-# cherrypy.wsgiserver was extracted wsgiserver into cheroot in cherrypy v9.0.0
-def patch_builtin_ssl_wrap(v, new_wrap):
- if v < parse_version("9.0.0"):
- from cherrypy.wsgiserver.ssl_builtin import BuiltinSSLAdapter as builtin_ssl
- else:
- from cheroot.ssl.builtin import BuiltinSSLAdapter as builtin_ssl # type: ignore
- builtin_ssl.wrap = new_wrap(builtin_ssl.wrap)
-
-
-def accept_exceptions_from_builtin_ssl(v):
- # the fix was included by cheroot v5.2.0, which was included by cherrypy
- # 10.2.0.
- if v < parse_version("10.2.0"):
- # see https://github.com/cherrypy/cheroot/pull/4
- import ssl
-
- def accept_ssl_errors(func):
- def wrapper(self, sock):
- try:
- return func(self, sock)
- except ssl.SSLError as e:
- if e.errno == ssl.SSL_ERROR_SSL:
- # Check if it's one of the known errors
- # Errors that are caught by PyOpenSSL, but thrown by
- # built-in ssl
- _block_errors = ('unknown protocol', 'unknown ca', 'unknown_ca',
- 'unknown error',
- 'https proxy request', 'inappropriate fallback',
- 'wrong version number',
- 'no shared cipher', 'certificate unknown',
- 'ccs received early',
- 'certificate verify failed', # client cert w/o trusted CA
- 'version too low', # caused by SSL3 connections
- 'unsupported protocol', # caused by TLS1 connections
- 'sslv3 alert bad certificate')
- for error_text in _block_errors:
- if error_text in e.args[1].lower():
- # Accepted error, let's pass
- return None, {}
- raise
- return wrapper
- patch_builtin_ssl_wrap(v, accept_ssl_errors)
-
-
-def accept_socket_error_0(v):
- # see https://github.com/cherrypy/cherrypy/issues/1618
- try:
- import cheroot
- cheroot_version = parse_version(cheroot.__version__)
- except ImportError:
- pass
-
- if v < parse_version("9.0.0") or cheroot_version < parse_version("6.5.5"):
- generic_socket_error = OSError
-
- def accept_socket_error_0(func):
- def wrapper(self, sock):
- try:
- return func(self, sock)
- except generic_socket_error as e:
- """It is unclear why exactly this happens.
-
- It's reproducible only with openssl>1.0 and stdlib ``ssl`` wrapper.
- In CherryPy it's triggered by Checker plugin, which connects
- to the app listening to the socket port in TLS mode via plain
- HTTP during startup (from the same process).
-
- Ref: https://github.com/cherrypy/cherrypy/issues/1618
- """
- import ssl
- is_error0 = e.args == (0, 'Error')
- IS_ABOVE_OPENSSL10 = ssl.OPENSSL_VERSION_INFO >= (1, 1)
- del ssl
- if is_error0 and IS_ABOVE_OPENSSL10:
- return None, {}
- raise
- return wrapper
- patch_builtin_ssl_wrap(v, accept_socket_error_0)
-
-
-def patch_request_unique_id(v):
- """
- Older versions of cherrypy don't include request.unique_id field (a lazily
- calculated UUID4).
-
- Monkey-patching is preferred over alternatives as inheritance, as it'd break
- type checks (cherrypy/lib/cgtools.py: `isinstance(obj, _cprequest.Request)`)
- """
- if v < parse_version('11.1.0'):
- import uuid
- from functools import update_wrapper
-
- from cherrypy._cprequest import Request
-
- class LazyUUID4(object):
- def __str__(self):
- """Return UUID4 and keep it for future calls."""
- return str(self.uuid4)
-
- @property
- def uuid4(self):
- """Provide unique id on per-request basis using UUID4.
- It's evaluated lazily on render.
- """
- try:
- self._uuid4 # type: ignore
- except AttributeError:
- # evaluate on first access
- self._uuid4 = uuid.uuid4()
-
- return self._uuid4
-
- old_init = Request.__init__
-
- def init_with_unique_id(self, *args, **kwargs):
- old_init(self, *args, **kwargs)
- self.unique_id = LazyUUID4()
-
- Request.__init__ = update_wrapper(init_with_unique_id, old_init)
-
-
-def patch_cherrypy(v):
- ver = parse_version(v)
- patch_http_connection_init(ver)
- skip_wait_for_occupied_port(ver)
- accept_exceptions_from_builtin_ssl(ver)
- accept_socket_error_0(ver)
- patch_request_unique_id(ver)
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
index ae720e6d49b..08ce7618114 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
@@ -23,12 +23,25 @@ bootstrap_extra_options='--allow-fqdn-hostname --dashboard-password-noupdate'
# {% if expanded_cluster is not defined %}
# bootstrap_extra_options+=" ${bootstrap_extra_options_not_expanded}"
# {% endif %}
+quick_install_options=''
+{% if quick_install is defined %}
+ quick_install_options="--image localhost:5000/ceph"
+{% endif %}
+
+{% if nodes < 3 %}
+ bootstrap_extra_options+=" --config /root/initial-ceph.conf"
+{% endif %}
-$CEPHADM bootstrap --mon-ip $mon_ip --initial-dashboard-password {{ admin_password }} --shared_ceph_folder /mnt/{{ ceph_dev_folder }} ${bootstrap_extra_options}
+{% if ceph_dev_folder is defined %}
+ bootstrap_extra_options+=" --shared_ceph_folder /mnt/{{ ceph_dev_folder }}"
+{% endif %}
+
+$CEPHADM ${quick_install_options} bootstrap --mon-ip $mon_ip --initial-dashboard-password {{ admin_password }} ${bootstrap_extra_options}
fsid=$(cat /etc/ceph/ceph.conf | grep fsid | awk '{ print $3}')
cephadm_shell="$CEPHADM shell --fsid ${fsid} -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring"
+
{% for number in range(1, nodes) %}
ssh-copy-id -f -i /etc/ceph/ceph.pub -o StrictHostKeyChecking=no root@192.168.100.10{{ number }}
{% if expanded_cluster is defined %}
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/ceph_cluster.yml b/src/pybind/mgr/dashboard/ci/cephadm/ceph_cluster.yml
index a334fbad5f6..3273cbc41eb 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/ceph_cluster.yml
+++ b/src/pybind/mgr/dashboard/ci/cephadm/ceph_cluster.yml
@@ -8,7 +8,7 @@ parameters:
prefix: ceph
numcpus: 1
memory: 2048
- image: fedora36
+ image: fedora40
notify: false
admin_password: password
disks:
@@ -35,8 +35,17 @@ parameters:
sharedfolders: [{{ ceph_dev_folder }}]
files:
- bootstrap-cluster.sh
+ - dnf.conf.tpl
+ - load-podman-image.sh
+ - initial-ceph.conf
cmds:
+    # update dnf.conf to make dnf faster
+ - cp /root/dnf.conf.tpl /etc/dnf/dnf.conf
- dnf -y install python3 chrony lvm2 podman
+    # set up an insecure podman registry, then load the ceph image on all hosts
+ {% if quick_install is defined %}
+ - /root/load-podman-image.sh
+ {% endif %}
- sed -i "s/SELINUX=enforcing/SELINUX=permissive/" /etc/selinux/config
- setenforce 0
{% if number == 0 %}
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/dnf.conf.tpl b/src/pybind/mgr/dashboard/ci/cephadm/dnf.conf.tpl
new file mode 100644
index 00000000000..a53a68fd2a3
--- /dev/null
+++ b/src/pybind/mgr/dashboard/ci/cephadm/dnf.conf.tpl
@@ -0,0 +1,10 @@
+[main]
+fastestmirror=true
+max_parallel_downloads=10
+metadata_expire=1h
+clean_requirements_on_remove=true
+assumeyes=true
+gpgcheck=1
+keepcache=0
+plugins=1
+installonly_limit=3
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/initial-ceph.conf b/src/pybind/mgr/dashboard/ci/cephadm/initial-ceph.conf
new file mode 100644
index 00000000000..397d01489d0
--- /dev/null
+++ b/src/pybind/mgr/dashboard/ci/cephadm/initial-ceph.conf
@@ -0,0 +1,9 @@
+[global]
+osd_pool_default_min_size=1
+osd_pool_default_size=1
+
+[mon]
+mon_allow_pool_size_one=true
+mon_allow_pool_delete=true
+mon_data_avail_crit=1
+mon_data_avail_warn=1
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/load-podman-image.sh b/src/pybind/mgr/dashboard/ci/cephadm/load-podman-image.sh
new file mode 100755
index 00000000000..41ab402bca0
--- /dev/null
+++ b/src/pybind/mgr/dashboard/ci/cephadm/load-podman-image.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+echo -e "[registries.insecure]\n\
+registries = ['localhost:5000']" | sudo tee /etc/containers/registries.conf
+
+podman run -d -p 5000:5000 --name my-registry registry:2
+# Load the image and capture the output
+output=$(podman load -i /root/ceph_image.tar)
+
+# Extract image name from output
+image_name=$(echo "$output" | grep -oP '(?<=^Loaded image: ).*')
+
+if [[ -n "$image_name" ]]; then
+ echo "Image loaded: $image_name"
+ podman tag "$image_name" localhost:5000/ceph
+ echo "Tagged image as localhost:5000/ceph"
+else
+ echo "Failed to load image or extract image name."
+ exit 1
+fi
+
+podman push localhost:5000/ceph
+rm -f /root/ceph_image.tar
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/quick-bootstrap.sh b/src/pybind/mgr/dashboard/ci/cephadm/quick-bootstrap.sh
new file mode 100755
index 00000000000..759747415f2
--- /dev/null
+++ b/src/pybind/mgr/dashboard/ci/cephadm/quick-bootstrap.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+source bootstrap-cluster.sh > /dev/null 2>&1
+
+set +x
+
+show_help() {
+ echo "Usage: ./quick-bootstrap.sh [OPTIONS]"
+ echo ""
+ echo "Options:"
+    echo "  -u, --use-cached-image     Uses the existing local podman image. Only use this if such an image is present."
+    echo "  -dir, --ceph-dir           Use this to provide the local ceph directory, e.g. --ceph-dir=/path/to/ceph"
+    echo "  -e, --expanded-cluster     Add all the hosts and deploy OSDs on them."
+ echo " -h, --help Display this help message."
+ echo ""
+ echo "Example:"
+ echo " ./quick-bootstrap.sh --use-cached-image"
+}
+
+use_cached_image=false
+extra_args="-P quick_install=True"
+
+for arg in "$@"; do
+ case "$arg" in
+ -u|--use-cached-image)
+ use_cached_image=true
+ ;;
+ -dir=*|--ceph-dir=*)
+ extra_args+=" -P ceph_dev_folder=${arg#*=}"
+ ;;
+ -e|--expanded-cluster)
+ extra_args+=" -P expanded_cluster=True"
+ ;;
+ -h|--help)
+ show_help
+ exit 0
+ ;;
+ *)
+ echo "Unknown option: $arg"
+ show_help
+ exit 1
+ ;;
+ esac
+done
+
+image_name=$(echo "$CEPHADM_IMAGE")
+ceph_cluster_yml='ceph_cluster.yml'
+node_count=$(awk '/nodes:/ {print $2}' "${ceph_cluster_yml}")
+
+if [[ ${use_cached_image} == false ]]; then
+ printf "Pulling the image: %s\n" "$image_name"
+ podman pull "${image_name}"
+fi
+
+rm -f ceph_image.tar
+
+printf "Saving the image: %s\n" "$image_name"
+podman save -o ceph_image.tar quay.ceph.io/ceph-ci/ceph:main
+
+printf "Creating the plan\n"
+kcli create plan -f ceph_cluster.yml ${extra_args} ceph
+
+attempt=0
+
+MAX_ATTEMPTS=10
+SLEEP_INTERVAL=5
+
+printf "Waiting for the host to be reachable\n"
+while [[ ${attempt} -lt ${MAX_ATTEMPTS} ]]; do
+ if ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=10 root@192.168.100.100 exit; then
+ break
+ else
+ echo "Waiting for ssh connection to be available..., attempt: ${attempt}"
+ ((attempt++))
+ sleep ${SLEEP_INTERVAL}
+ fi
+done
+
+printf "Copying the image to the hosts\n"
+
+for node in $(seq 0 $((node_count - 1))); do
+ scp -o StrictHostKeyChecking=no ceph_image.tar root@192.168.100.10"${node}":/root/
+done
+
+rm -f ceph_image.tar
+kcli ssh -u root -- ceph-node-00 'journalctl -n all -ft cloud-init'
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
index cda0635bc08..16151f39153 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
@@ -59,8 +59,8 @@ fi
npm run build ${FRONTEND_BUILD_OPTS} &
cd ${CEPH_DEV_FOLDER}
-: ${VM_IMAGE:='fedora36'}
-: ${VM_IMAGE_URL:='https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/x86_64/images/Fedora-Cloud-Base-36-1.5.x86_64.qcow2'}
+: ${VM_IMAGE:='fedora40'}
+: ${VM_IMAGE_URL:='https://download.fedoraproject.org/pub/fedora/linux/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2'}
kcli download image -p ceph-dashboard -u ${VM_IMAGE_URL} ${VM_IMAGE}
kcli delete plan -y ceph || true
# Compile cephadm locally for the shared_ceph_folder to pick it up
diff --git a/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py b/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
index 317dc45ce2e..95e1c3ffe03 100644
--- a/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
+++ b/src/pybind/mgr/dashboard/ci/check_grafana_dashboards.py
@@ -103,8 +103,6 @@ def get_grafana_dashboards(base_dir):
title = dashboard_config['title']
assert len(title) > 0, \
"Title not found in '{}'".format(json_file)
- assert len(dashboard_config.get('links', [])) == 0, \
- "Links found in '{}'".format(json_file)
if not uid:
continue
if uid in dashboards:
diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py
index 2e6e466f97b..9d257674794 100755
--- a/src/pybind/mgr/dashboard/controllers/rgw.py
+++ b/src/pybind/mgr/dashboard/controllers/rgw.py
@@ -16,6 +16,7 @@ from ..services.auth import AuthManager, JwtManager
from ..services.ceph_service import CephService
from ..services.rgw_client import _SYNC_GROUP_ID, NoRgwDaemonsException, \
RgwClient, RgwMultisite, RgwMultisiteAutomation
+from ..services.rgw_iam import RgwAccounts
from ..services.service import RgwServiceManager, wait_for_daemon_to_start
from ..tools import json_str_to_object, str_to_bool
from . import APIDoc, APIRouter, BaseController, CreatePermission, \
@@ -399,6 +400,15 @@ class RgwBucket(RgwRESTController):
if bucket['tenant'] else bucket['bucket']
return bucket
+ def _get_owner(self, owner):
+ accounts = RgwAccounts().get_accounts()
+
+        # if the owner is present in the accounts list, the bucket is
+        # owned by an account; in that case use the dashboard user to
+        # fetch the bucket info
+ return owner if owner not in accounts else RgwServiceManager.user
+
def _get_versioning(self, owner, daemon_name, bucket_name):
rgw_client = RgwClient.instance(owner, daemon_name)
return rgw_client.get_bucket_versioning(bucket_name)
@@ -542,19 +552,20 @@ class RgwBucket(RgwRESTController):
bucket_name = RgwBucket.get_s3_bucket_name(result['bucket'],
result['tenant'])
+ owner = self._get_owner(result['owner'])
# Append the versioning configuration.
- versioning = self._get_versioning(result['owner'], daemon_name, bucket_name)
- encryption = self._get_encryption(bucket_name, daemon_name, result['owner'])
+ versioning = self._get_versioning(owner, daemon_name, bucket_name)
+ encryption = self._get_encryption(bucket_name, daemon_name, owner)
result['encryption'] = encryption['Status']
result['versioning'] = versioning['Status']
result['mfa_delete'] = versioning['MfaDelete']
- result['bucket_policy'] = self._get_policy(bucket_name, daemon_name, result['owner'])
- result['acl'] = self._get_acl(bucket_name, daemon_name, result['owner'])
- result['replication'] = self._get_replication(bucket_name, result['owner'], daemon_name)
- result['lifecycle'] = self._get_lifecycle(bucket_name, daemon_name, result['owner'])
+ result['bucket_policy'] = self._get_policy(bucket_name, daemon_name, owner)
+ result['acl'] = self._get_acl(bucket_name, daemon_name, owner)
+ result['replication'] = self._get_replication(bucket_name, owner, daemon_name)
+ result['lifecycle'] = self._get_lifecycle(bucket_name, daemon_name, owner)
# Append the locking configuration.
- locking = self._get_locking(result['owner'], daemon_name, bucket_name)
+ locking = self._get_locking(owner, daemon_name, bucket_name)
result.update(locking)
return self._append_bid(result)
@@ -599,7 +610,7 @@ class RgwBucket(RgwRESTController):
raise DashboardException(e, http_status_code=500, component='rgw')
@allow_empty_body
- def set(self, bucket, bucket_id, uid, versioning_state=None,
+ def set(self, bucket, bucket_id, uid=None, versioning_state=None,
encryption_state='false', encryption_type=None, key_id=None,
mfa_delete=None, mfa_token_serial=None, mfa_token_pin=None,
lock_mode=None, lock_retention_period_days=None,
@@ -609,23 +620,27 @@ class RgwBucket(RgwRESTController):
encryption_state = str_to_bool(encryption_state)
if replication is not None:
replication = str_to_bool(replication)
- # When linking a non-tenant-user owned bucket to a tenanted user, we
- # need to prefix bucket name with '/'. e.g. photos -> /photos
- if '$' in uid and '/' not in bucket:
- bucket = '/{}'.format(bucket)
-
- # Link bucket to new user:
- result = self.proxy(daemon_name,
- 'PUT',
- 'bucket', {
- 'bucket': bucket,
- 'bucket-id': bucket_id,
- 'uid': uid
- },
- json_response=False)
+
+ result = None
+ if uid:
+ # When linking a non-tenant-user owned bucket to a tenanted user, we
+ # need to prefix bucket name with '/'. e.g. photos -> /photos
+ if '$' in uid and '/' not in bucket:
+ bucket = '/{}'.format(bucket)
+
+ # Link bucket to new user:
+ result = self.proxy(daemon_name,
+ 'PUT',
+ 'bucket', {
+ 'bucket': bucket,
+ 'bucket-id': bucket_id,
+ 'uid': uid
+ },
+ json_response=False)
uid_tenant = uid[:uid.find('$')] if uid.find('$') >= 0 else None
bucket_name = RgwBucket.get_s3_bucket_name(bucket, uid_tenant)
+ uid = self._get_owner(uid)
locking = self._get_locking(uid, daemon_name, bucket_name)
if versioning_state:
@@ -659,7 +674,7 @@ class RgwBucket(RgwRESTController):
self._set_lifecycle(bucket_name, lifecycle, daemon_name, uid)
else:
self._delete_lifecycle(bucket_name, daemon_name, uid)
- return self._append_bid(result)
+ return self._append_bid(result) if result else None
def delete(self, bucket, purge_objects='true', daemon_name=None):
return self.proxy(daemon_name, 'DELETE', 'bucket', {
diff --git a/src/pybind/mgr/dashboard/frontend/package-lock.json b/src/pybind/mgr/dashboard/frontend/package-lock.json
index f2d4bbf06fa..e03e5945916 100644
--- a/src/pybind/mgr/dashboard/frontend/package-lock.json
+++ b/src/pybind/mgr/dashboard/frontend/package-lock.json
@@ -30,7 +30,7 @@
"@types/file-saver": "2.0.1",
"async-mutex": "0.2.4",
"bootstrap": "5.2.3",
- "carbon-components-angular": "5.25.1",
+ "carbon-components-angular": "5.48.0",
"chart.js": "4.4.0",
"chartjs-adapter-moment": "1.0.1",
"detect-browser": "5.2.0",
@@ -11223,9 +11223,9 @@
]
},
"node_modules/carbon-components-angular": {
- "version": "5.25.1",
- "resolved": "https://registry.npmjs.org/carbon-components-angular/-/carbon-components-angular-5.25.1.tgz",
- "integrity": "sha512-v49djZmcHs47G7wzaS+SQUTqp+vErlHDc4ohbsx29Q+Jq1m6IJSaTUCN9GuQG/lLa7W1se0vS23TOToKwjIbcw==",
+ "version": "5.48.0",
+ "resolved": "https://registry.npmjs.org/carbon-components-angular/-/carbon-components-angular-5.48.0.tgz",
+ "integrity": "sha512-NZwpKBKgkgaR51S0Pm16MvashO4g8B+dzGeNE8l/RYRWXpVDLShR6YnBc80t2iMtTolxGiPwHmzpyMieVtIGLg==",
"hasInstallScript": true,
"dependencies": {
"@carbon/icon-helpers": "10.37.0",
@@ -11234,6 +11234,7 @@
"@floating-ui/dom": "1.6.3",
"@ibm/telemetry-js": "^1.5.0",
"flatpickr": "4.6.13",
+ "lodash-es": "4.17.21",
"tslib": "2.3.0"
},
"peerDependencies": {
diff --git a/src/pybind/mgr/dashboard/frontend/package.json b/src/pybind/mgr/dashboard/frontend/package.json
index 7443f42ea6a..b95a84df2b1 100644
--- a/src/pybind/mgr/dashboard/frontend/package.json
+++ b/src/pybind/mgr/dashboard/frontend/package.json
@@ -64,7 +64,7 @@
"@types/file-saver": "2.0.1",
"async-mutex": "0.2.4",
"bootstrap": "5.2.3",
- "carbon-components-angular": "5.25.1",
+ "carbon-components-angular": "5.48.0",
"chart.js": "4.4.0",
"chartjs-adapter-moment": "1.0.1",
"detect-browser": "5.2.0",
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/mirroring/bootstrap-create-modal/bootstrap-create-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/mirroring/bootstrap-create-modal/bootstrap-create-modal.component.ts
index ddd0a5dfecd..e5b55258d41 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/mirroring/bootstrap-create-modal/bootstrap-create-modal.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/mirroring/bootstrap-create-modal/bootstrap-create-modal.component.ts
@@ -1,4 +1,12 @@
-import { Component, Inject, OnDestroy, OnInit, Optional } from '@angular/core';
+import {
+ AfterViewInit,
+ ChangeDetectorRef,
+ Component,
+ Inject,
+ OnDestroy,
+ OnInit,
+ Optional
+} from '@angular/core';
import { UntypedFormControl, UntypedFormGroup, ValidatorFn, Validators } from '@angular/forms';
import { BaseModal } from 'carbon-components-angular';
@@ -17,7 +25,9 @@ import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
templateUrl: './bootstrap-create-modal.component.html',
styleUrls: ['./bootstrap-create-modal.component.scss']
})
-export class BootstrapCreateModalComponent extends BaseModal implements OnDestroy, OnInit {
+export class BootstrapCreateModalComponent
+ extends BaseModal
+ implements OnDestroy, OnInit, AfterViewInit {
pools: any[] = [];
token: string;
@@ -28,6 +38,7 @@ export class BootstrapCreateModalComponent extends BaseModal implements OnDestro
constructor(
private rbdMirroringService: RbdMirroringService,
private taskWrapper: TaskWrapperService,
+ private changeDetectorRef: ChangeDetectorRef,
@Inject('siteName') @Optional() public siteName?: string
) {
@@ -35,6 +46,10 @@ export class BootstrapCreateModalComponent extends BaseModal implements OnDestro
this.createForm();
}
+ ngAfterViewInit(): void {
+ this.changeDetectorRef.detectChanges();
+ }
+
createForm() {
this.createBootstrapForm = new CdFormGroup({
siteName: new UntypedFormControl('', {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-auth-modal/cephfs-auth-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-auth-modal/cephfs-auth-modal.component.ts
index 8af55cd2dec..435cdb9644f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-auth-modal/cephfs-auth-modal.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-auth-modal/cephfs-auth-modal.component.ts
@@ -1,4 +1,11 @@
-import { Component, Inject, OnInit, Optional } from '@angular/core';
+import {
+ AfterViewInit,
+ ChangeDetectorRef,
+ Component,
+ Inject,
+ OnInit,
+ Optional
+} from '@angular/core';
import { FormControl, Validators } from '@angular/forms';
import { OperatorFunction, Observable, of } from 'rxjs';
import { debounceTime, distinctUntilChanged, switchMap, catchError } from 'rxjs/operators';
@@ -19,7 +26,7 @@ const DEBOUNCE_TIMER = 300;
templateUrl: './cephfs-auth-modal.component.html',
styleUrls: ['./cephfs-auth-modal.component.scss']
})
-export class CephfsAuthModalComponent extends CdForm implements OnInit {
+export class CephfsAuthModalComponent extends CdForm implements OnInit, AfterViewInit {
subvolumeGroup: string;
subvolume: string;
isDefaultSubvolumeGroup = false;
@@ -58,6 +65,7 @@ export class CephfsAuthModalComponent extends CdForm implements OnInit {
private cephfsService: CephfsService,
private taskWrapper: TaskWrapperService,
private modalService: ModalCdsService,
+ private changeDetectorRef: ChangeDetectorRef,
@Optional() @Inject('fsName') public fsName: string,
@Optional() @Inject('id') public id: number
@@ -67,6 +75,10 @@ export class CephfsAuthModalComponent extends CdForm implements OnInit {
this.resource = $localize`access`;
}
+ ngAfterViewInit(): void {
+ this.changeDetectorRef.detectChanges();
+ }
+
ngOnInit() {
this.directoryStore.loadDirectories(this.id, '/', 3);
this.createForm();
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts
index dc61dc5ab67..b6ae76a66be 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts
@@ -9,7 +9,9 @@ import {
CheckboxModule,
ButtonModule,
GridModule,
- ProgressIndicatorModule
+ ProgressIndicatorModule,
+ InputModule,
+ ModalModule
} from 'carbon-components-angular';
import { TreeModule } from '@circlon/angular-tree-component';
@@ -102,7 +104,9 @@ import { MultiClusterDetailsComponent } from './multi-cluster/multi-cluster-deta
CheckboxModule,
GridModule,
ProgressIndicatorModule,
- ButtonModule
+ ButtonModule,
+ InputModule,
+ ModalModule
],
declarations: [
HostsComponent,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html
index af09b9a4fef..9b751d69c5a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html
@@ -1,108 +1,104 @@
-<cd-modal [pageURL]="pageURL"
- [modalRef]="activeModal">
- <span class="modal-title"
- i18n>{{ action | titlecase }} {{ resource | upperFirst }}</span>
-
- <ng-container class="modal-content">
-
- <div *cdFormLoading="loading">
- <form name="hostForm"
- #formDir="ngForm"
- [formGroup]="hostForm"
- novalidate>
-
- <div class="modal-body">
-
- <!-- Hostname -->
- <div class="form-group row">
- <label class="cd-col-form-label required"
- for="hostname">
- <ng-container i18n>Hostname</ng-container>
- <cd-helper>
- <p i18n>To add multiple hosts at once, you can enter:</p>
- <ul>
- <li i18n>a comma-separated list of hostnames <samp>(e.g.: example-01,example-02,example-03)</samp>,</li>
- <li i18n>a range expression <samp>(e.g.: example-[01-03].ceph)</samp>,</li>
- <li i18n>a comma separated range expression <samp>(e.g.: example-[01-05].lab.com,example2-[1-4].lab.com,example3-[001-006].lab.com)</samp></li>
- </ul>
- </cd-helper>
- </label>
- <div class="cd-col-form-input">
- <input class="form-control"
- type="text"
- placeholder="mon-123"
- id="hostname"
- name="hostname"
- formControlName="hostname"
- autofocus
- (keyup)="checkHostNameValue()">
- <span class="invalid-feedback"
- *ngIf="hostForm.showError('hostname', formDir, 'required')"
- i18n>This field is required.</span>
- <span class="invalid-feedback"
- *ngIf="hostForm.showError('hostname', formDir, 'uniqueName')"
- i18n>The chosen hostname is already in use.</span>
- </div>
- </div>
-
- <!-- Address -->
- <div class="form-group row"
- *ngIf="!hostPattern">
- <label class="cd-col-form-label"
- for="addr"
- i18n>Network address</label>
- <div class="cd-col-form-input">
- <input class="form-control"
- type="text"
- placeholder="192.168.0.1"
- id="addr"
- name="addr"
- formControlName="addr">
- <span class="invalid-feedback"
- *ngIf="hostForm.showError('addr', formDir, 'pattern')"
- i18n>The value is not a valid IP address.</span>
- </div>
- </div>
-
- <!-- Labels -->
- <div class="form-group row">
- <label i18n
- for="labels"
- class="cd-col-form-label">Labels</label>
- <div class="cd-col-form-input">
- <cd-select-badges id="labels"
- [data]="hostForm.controls.labels.value"
- [options]="labelsOption"
- [customBadges]="true"
- [messages]="messages">
- </cd-select-badges>
- </div>
- </div>
-
- <!-- Maintenance Mode -->
- <div class="form-group row"
- *ngIf="!hideMaintenance">
- <div class="cd-col-form-offset">
- <div class="custom-control custom-checkbox">
- <input class="custom-control-input"
- id="maintenance"
- type="checkbox"
- formControlName="maintenance">
- <label class="custom-control-label"
- for="maintenance"
- i18n>Maintenance Mode</label>
- </div>
- </div>
- </div>
- </div>
-
- <div class="modal-footer">
- <cd-form-button-panel (submitActionEvent)="submit()"
- [form]="hostForm"
- [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)"
- wrappingClass="text-right"></cd-form-button-panel>
- </div>
- </form>
+<cds-modal size="md"
+ [open]="open"
+ [hasScrollingContent]="true"
+ (overlaySelected)="closeModal()">
+ <cds-modal-header (closeSelect)="closeModal()">
+ <h3 cdsModalHeaderHeading
+ i18n>{{ action | titlecase }} {{ resource | upperFirst }}</h3>
+ </cds-modal-header>
+ <ng-container *cdFormLoading="loading">
+ <form name="hostForm"
+ #formDir="ngForm"
+ [formGroup]="hostForm"
+ novalidate>
+ <div cdsModalContent>
+ <!-- Hostname -->
+ <div class="form-item">
+ <cds-text-label label="Hostname"
+ for="hostname"
+ cdRequiredField="Hostname"
+ [invalid]="!hostForm.controls.hostname.valid && hostForm.controls.hostname.dirty"
+ [invalidText]="hostnameError"
+ i18n>Hostname
+ <input cdsText
+ type="text"
+ placeholder="mon-123"
+ id="hostname"
+ name="hostname"
+ formControlName="hostname"
+ autofocus
+ (keyup)="checkHostNameValue()">
+ </cds-text-label>
+ <ng-template #hostnameError>
+ <span *ngIf="hostForm.showError('hostname', formDir, 'required')"
+ class="invalid-feedback">
+ <ng-container i18n> This field is required. </ng-container>
+ </span>
+ <span *ngIf="hostForm.showError('hostname', formDir, 'uniqueName')"
+ class="invalid-feedback">
+ <ng-container i18n> The chosen hostname is already in use. </ng-container>
+ </span>
+ </ng-template>
+ <cd-help-text>
+ To add multiple hosts at once, you can enter:
+ <ul>
+ <li>a comma-separated list of hostnames <samp>(e.g.: example-01,example-02,example-03)</samp>,</li>
+ <li>a range expression <samp>(e.g.: example-[01-03].ceph)</samp>,</li>
+ <li>a comma-separated range expression <samp>(e.g.: example-[01-05].lab.com,example2-[1-4].lab.com,example3-[001-006].lab.com)</samp></li>
+ </ul>
+ </cd-help-text>
+ </div>
+ <!-- Address -->
+ <div class="form-item"
+ *ngIf="!hostPattern">
+ <cds-text-label label="Network address"
+ for="addr"
+ i18n>Network address
+ <input cdsText
+ type="text"
+ placeholder="192.168.0.1"
+ id="addr"
+ name="addr"
+ formControlName="addr"/>
+ </cds-text-label>
+ <ng-template #hostaddrError>
+ <span *ngIf="hostForm.showError('addr', formDir, 'pattern')">
+ <ng-container i18n> The value is not a valid IP address. </ng-container>
+ </span>
+ </ng-template>
+ </div>
+ <!-- Labels -->
+ <div class="form-item">
+ <cds-combo-box label="Labels"
+ type="multi"
+ selectionFeedback="top-after-reopen"
+ for="labels"
+ name="labels"
+ formControlName="labels"
+ placeholder="Select Labels..."
+ i18n-placeholder
+ [appendInline]="true"
+ [items]="labelsOption"
+ itemValueKey="value"
+ id="labels"
+ i18n>
+ <cds-dropdown-list></cds-dropdown-list>
+ </cds-combo-box>
+ </div>
+ <!-- Maintenance Mode -->
+ <div *ngIf="!hideMaintenance">
+ <cds-checkbox id="maintenance"
+ type="checkbox"
+ formControlName="maintenance"
+ i18n>Maintenance Mode
+ </cds-checkbox>
+ </div>
</div>
- </ng-container>
-</cd-modal>
+ <cd-form-button-panel (submitActionEvent)="submit()"
+ [form]="hostForm"
+ [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)"
+ [modalForm]="true">
+ </cd-form-button-panel>
+ </form>
+</ng-container>
+</cds-modal>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts
index ed85d96cb1b..8097bb26018 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts
@@ -10,6 +10,7 @@ import { LoadingPanelComponent } from '~/app/shared/components/loading-panel/loa
import { SharedModule } from '~/app/shared/shared.module';
import { configureTestBed, FormHelper } from '~/testing/unit-test-helper';
import { HostFormComponent } from './host-form.component';
+import { InputModule, ModalModule } from 'carbon-components-angular';
describe('HostFormComponent', () => {
let component: HostFormComponent;
@@ -23,7 +24,9 @@ describe('HostFormComponent', () => {
HttpClientTestingModule,
RouterTestingModule,
ReactiveFormsModule,
- ToastrModule.forRoot()
+ ToastrModule.forRoot(),
+ InputModule,
+ ModalModule
],
declarations: [HostFormComponent],
providers: [NgbActiveModal]
@@ -45,7 +48,7 @@ describe('HostFormComponent', () => {
it('should open the form in a modal', () => {
const nativeEl = fixture.debugElement.nativeElement;
- expect(nativeEl.querySelector('cd-modal')).not.toBe(null);
+ expect(nativeEl.querySelector('cds-modal')).not.toBe(null);
});
it('should validate the network address is valid', fakeAsync(() => {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts
index 240a0a7bebb..166ab013e73 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts
@@ -1,8 +1,6 @@
import { Component, OnInit } from '@angular/core';
import { UntypedFormControl, Validators } from '@angular/forms';
-import { Router } from '@angular/router';
-
-import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap';
+import { ActivatedRoute, Router } from '@angular/router';
import expand from 'brace-expansion';
import { HostService } from '~/app/shared/api/host.service';
@@ -15,6 +13,7 @@ import { CdValidators } from '~/app/shared/forms/cd-validators';
import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
import { FinishedTask } from '~/app/shared/models/finished-task';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
+import { Location } from '@angular/common';
@Component({
selector: 'cd-host-form',
@@ -22,6 +21,7 @@ import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
styleUrls: ['./host-form.component.scss']
})
export class HostFormComponent extends CdForm implements OnInit {
+ open: boolean = false;
hostForm: CdFormGroup;
action: string;
resource: string;
@@ -46,7 +46,8 @@ export class HostFormComponent extends CdForm implements OnInit {
private actionLabels: ActionLabelsI18n,
private hostService: HostService,
private taskWrapper: TaskWrapperService,
- public activeModal: NgbActiveModal
+ private route: ActivatedRoute,
+ private location: Location
) {
super();
this.resource = $localize`host`;
@@ -54,9 +55,7 @@ export class HostFormComponent extends CdForm implements OnInit {
}
ngOnInit() {
- if (this.router.url.includes('hosts')) {
- this.pageURL = 'hosts';
- }
+ this.open = this.route.outlet === 'modal';
this.createForm();
const hostContext = new CdTableFetchDataContext(() => undefined);
this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: any[]) => {
@@ -69,7 +68,7 @@ export class HostFormComponent extends CdForm implements OnInit {
this.hostService.getLabels().subscribe((resp: string[]) => {
const uniqueLabels = new Set(resp.concat(this.hostService.predefinedLabels));
this.labelsOption = Array.from(uniqueLabels).map((label) => {
- return { enabled: true, name: label, selected: false, description: null };
+ return { enabled: true, name: label, content: label, selected: false, description: null };
});
});
}
@@ -94,7 +93,7 @@ export class HostFormComponent extends CdForm implements OnInit {
validators: [CdValidators.ip()]
}),
labels: new UntypedFormControl([]),
- maintenance: new UntypedFormControl(false)
+ maintenance: new UntypedFormControl()
});
}
@@ -166,9 +165,13 @@ export class HostFormComponent extends CdForm implements OnInit {
complete: () => {
this.pageURL === 'hosts'
? this.router.navigate([this.pageURL, { outlets: { modal: null } }])
- : this.activeModal.close();
+ : this.location.back();
}
});
});
}
+
+ closeModal(): void {
+ this.location.back();
+ }
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts
index adb89e6cd5c..c26d24177fd 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts
@@ -29,7 +29,6 @@ import { Permissions } from '~/app/shared/models/permissions';
import { EmptyPipe } from '~/app/shared/pipes/empty.pipe';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { CdTableServerSideService } from '~/app/shared/services/cd-table-server-side.service';
-import { ModalService } from '~/app/shared/services/modal.service';
import { NotificationService } from '~/app/shared/services/notification.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { URLBuilderService } from '~/app/shared/services/url-builder.service';
@@ -125,7 +124,6 @@ export class HostsComponent extends ListWithDetails implements OnDestroy, OnInit
private emptyPipe: EmptyPipe,
private hostService: HostService,
private actionLabels: ActionLabelsI18n,
- private modalService: ModalService,
private taskWrapper: TaskWrapperService,
private router: Router,
private notificationService: NotificationService,
@@ -153,7 +151,7 @@ export class HostsComponent extends ListWithDetails implements OnDestroy, OnInit
click: () =>
this.router.url.includes('/hosts')
? this.router.navigate([BASE_URL, { outlets: { modal: [URLVerbs.ADD] } }])
- : (this.bsModalRef = this.modalService.show(HostFormComponent, {
+ : (this.bsModalRef = this.cdsModalService.show(HostFormComponent, {
hideMaintenance: this.hideMaintenance
})),
disable: (selection: CdTableSelection) => this.getDisable('add', selection)
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.spec.ts
index 3609467db1e..47793509747 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.spec.ts
@@ -13,7 +13,7 @@ import { TableActionsComponent } from '~/app/shared/datatable/table-actions/tabl
import { NotificationType } from '~/app/shared/enum/notification-type.enum';
import { Permission } from '~/app/shared/models/permissions';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { NotificationService } from '~/app/shared/services/notification.service';
import { SharedModule } from '~/app/shared/shared.module';
import { configureTestBed, PermissionHelper } from '~/testing/unit-test-helper';
@@ -159,7 +159,7 @@ describe('SilenceListComponent', () => {
const mockObservable = () => of([]);
spyOn(component, 'refresh').and.callFake(mockObservable);
spyOn(prometheusService, 'expireSilence').and.callFake(mockObservable);
- spyOn(TestBed.inject(ModalService), 'show').and.callFake((deletionClass, config) => {
+ spyOn(TestBed.inject(ModalCdsService), 'show').and.callFake((deletionClass, config) => {
return {
componentInstance: Object.assign(new deletionClass(), config)
};
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.ts
index c5734236e5f..7098e002797 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/prometheus/silence-list/silence-list.component.ts
@@ -19,7 +19,7 @@ import { Permission } from '~/app/shared/models/permissions';
import { PrometheusRule } from '~/app/shared/models/prometheus-alerts';
import { CdDatePipe } from '~/app/shared/pipes/cd-date.pipe';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { NotificationService } from '~/app/shared/services/notification.service';
import { PrometheusSilenceMatcherService } from '~/app/shared/services/prometheus-silence-matcher.service';
import { URLBuilderService } from '~/app/shared/services/url-builder.service';
@@ -56,7 +56,7 @@ export class SilenceListComponent extends PrometheusListHelper {
constructor(
private authStorageService: AuthStorageService,
private cdDatePipe: CdDatePipe,
- private modalService: ModalService,
+ private modalService: ModalCdsService,
private notificationService: NotificationService,
private urlBuilder: URLBuilderService,
private actionLabels: ActionLabelsI18n,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/upgrade/upgrade-progress/upgrade-progress.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/upgrade/upgrade-progress/upgrade-progress.component.ts
index 03bb6ed084d..a04b4577365 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/upgrade/upgrade-progress/upgrade-progress.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/upgrade/upgrade-progress/upgrade-progress.component.ts
@@ -5,7 +5,7 @@ import { NgbModalRef } from '@ng-bootstrap/ng-bootstrap';
import { Icons } from '~/app/shared/enum/icons.enum';
import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { Permission } from '~/app/shared/models/permissions';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { UpgradeService } from '~/app/shared/api/upgrade.service';
@@ -37,7 +37,7 @@ export class UpgradeProgressComponent implements OnInit, OnDestroy {
private authStorageService: AuthStorageService,
private upgradeService: UpgradeService,
private notificationService: NotificationService,
- private modalService: ModalService,
+ private modalService: ModalCdsService,
private summaryService: SummaryService,
private router: Router,
private refreshIntervalService: RefreshIntervalService
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.spec.ts
index caf8c0b6a71..811371329db 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.spec.ts
@@ -28,7 +28,7 @@ import { ErasureCodeProfile } from '~/app/shared/models/erasure-code-profile';
import { Permission } from '~/app/shared/models/permissions';
import { PoolFormInfo } from '~/app/shared/models/pool-form-info';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { SharedModule } from '~/app/shared/shared.module';
import {
@@ -765,7 +765,7 @@ describe('PoolFormComponent', () => {
it('should select the newly created rule', () => {
expect(form.getValue('crushRule').rule_name).toBe('rep1');
const name = 'awesomeRule';
- spyOn(TestBed.inject(ModalService), 'show').and.callFake(() => {
+ spyOn(TestBed.inject(ModalCdsService), 'show').and.callFake(() => {
return {
componentInstance: {
submitAction: of({ name })
@@ -828,7 +828,7 @@ describe('PoolFormComponent', () => {
};
beforeEach(() => {
- modalSpy = spyOn(TestBed.inject(ModalService), 'show').and.callFake(
+ modalSpy = spyOn(TestBed.inject(ModalCdsService), 'show').and.callFake(
(deletionClass: any, initialState: any) => {
deletion = Object.assign(new deletionClass(), initialState);
return {
@@ -933,7 +933,7 @@ describe('PoolFormComponent', () => {
spyOn(ecpService, 'list').and.callFake(() => of(infoReturn.erasure_code_profiles));
expect(form.getValue('erasureProfile').name).toBe('ecp1');
const name = 'awesomeProfile';
- spyOn(TestBed.inject(ModalService), 'show').and.callFake(() => {
+ spyOn(TestBed.inject(ModalCdsService), 'show').and.callFake(() => {
return {
componentInstance: {
submitAction: of({ name })
@@ -977,7 +977,7 @@ describe('PoolFormComponent', () => {
beforeEach(() => {
deletion = undefined;
- modalSpy = spyOn(TestBed.inject(ModalService), 'show').and.callFake(
+ modalSpy = spyOn(TestBed.inject(ModalCdsService), 'show').and.callFake(
(comp: any, init: any) => {
modal = modalServiceShow(comp, init);
return modal;
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.ts
index c1a54223dcc..ff5e20c6d5d 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/pool/pool-form/pool-form.component.ts
@@ -31,7 +31,7 @@ import { PoolFormInfo } from '~/app/shared/models/pool-form-info';
import { DimlessBinaryPipe } from '~/app/shared/pipes/dimless-binary.pipe';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { FormatterService } from '~/app/shared/services/formatter.service';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { CrushRuleFormModalComponent } from '../crush-rule-form-modal/crush-rule-form-modal.component';
import { ErasureCodeProfileFormModalComponent } from '../erasure-code-profile-form/erasure-code-profile-form-modal.component';
@@ -95,7 +95,7 @@ export class PoolFormComponent extends CdForm implements OnInit {
private dimlessBinaryPipe: DimlessBinaryPipe,
private route: ActivatedRoute,
private router: Router,
- private modalService: ModalService,
+ private modalService: ModalCdsService,
private poolService: PoolService,
private authStorageService: AuthStorageService,
private formatter: FormatterService,
@@ -167,7 +167,7 @@ export class PoolFormComponent extends CdForm implements OnInit {
CdValidators.custom(
'required',
(rule: CrushRule) =>
- this.isReplicated && this.info.crush_rules_replicated.length > 0 && !rule
+ this.isReplicated && this.info?.crush_rules_replicated?.length > 0 && !rule
)
]
}),
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html
index f77526be779..9c07182a0e5 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html
@@ -91,6 +91,14 @@
<span class="invalid-feedback"
*ngIf="bucketForm.showError('owner', frm, 'required')"
i18n>This field is required.</span>
+ <cd-alert-panel
+ type="info"
+ *ngIf="bucketForm.get('owner').disabled"
+ spacingClass="me-1 mt-1"
+ i18n>
+ The bucket is owned by an account. The UI does not support changing
+ the ownership of a bucket owned by an account.
+ </cd-alert-panel>
</div>
</div>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts
index d82c71e3cf7..53a1ac442c5 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts
@@ -269,6 +269,14 @@ export class RgwBucketFormComponent extends CdForm implements OnInit, AfterViewC
}
this.bucketForm.setValue(value);
if (this.editing) {
+ // Temporary fix until S3 account management is implemented in
+ // the frontend. Disable changing the owner of the bucket in case
+ // it's owned by an account.
+ // @TODO: Introduce account selection for a bucket.
+ if (!this.owners.includes(value['owner'])) {
+ this.owners.push(value['owner']);
+ this.bucketForm.get('owner').disable();
+ }
this.isVersioningAlreadyEnabled = this.isVersioningEnabled;
this.isMfaDeleteAlreadyEnabled = this.isMfaDeleteEnabled;
this.setMfaDeleteValidators();
@@ -327,11 +335,15 @@ export class RgwBucketFormComponent extends CdForm implements OnInit, AfterViewC
// Edit
const versioning = this.getVersioningStatus();
const mfaDelete = this.getMfaDeleteStatus();
+ // Make the owner empty if the field is disabled.
+ // This ensures the bucket's owner doesn't get updated when
+ // the bucket is owned by an account.
+ const owner = this.bucketForm.get('owner').disabled === true ? '' : values['owner'];
this.rgwBucketService
.update(
values['bid'],
values['id'],
- values['owner'],
+ owner,
versioning,
values['encryption_enabled'],
values['encryption_type'],
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
index 22da2851d5a..67c98b0a59f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
@@ -139,9 +139,7 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit {
multisiteInfo: this.multisiteInfo
};
if (entityName === 'realm') {
- this.bsModalRef = this.modalService.show(RgwMultisiteRealmFormComponent, initialState, {
- size: 'lg'
- });
+ this.bsModalRef = this.cdsModalService.show(RgwMultisiteRealmFormComponent, initialState);
} else if (entityName === 'zonegroup') {
this.bsModalRef = this.modalService.show(RgwMultisiteZonegroupFormComponent, initialState, {
size: 'lg'
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html
index 1fa5b08f60d..5ca36f4bd2f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html
@@ -1,58 +1,91 @@
-<cd-modal [modalRef]="activeModal">
- <ng-container i18n="form title"
- class="modal-title">{{ action | titlecase }} {{ resource | upperFirst }}</ng-container>
+<cds-modal size="sm"
+ [open]="open"
+ [hasScrollingContent]="false"
+ (overlaySelected)="closeModal()">
+ <cds-modal-header (closeSelect)="closeModal()">
+ <h3 cdsModalHeaderHeading
+ i18n>{{ action | titlecase }} {{ resource | upperFirst }}</h3>
+ </cds-modal-header>
- <ng-container class="modal-content">
+ <div cdsModalContent>
<form name="multisiteRealmForm"
#formDir="ngForm"
[formGroup]="multisiteRealmForm"
novalidate>
- <div class="modal-body">
- <div class="form-group row">
- <label class="cd-col-form-label required"
- for="realmName"
- i18n>Realm Name</label>
- <div class="cd-col-form-input">
- <input class="form-control"
- type="text"
- placeholder="Realm name..."
- id="realmName"
- name="realmName"
- formControlName="realmName">
- <span class="invalid-feedback"
- *ngIf="multisiteRealmForm.showError('realmName', formDir, 'required')"
- i18n>This field is required.</span>
- <span class="invalid-feedback"
- *ngIf="multisiteRealmForm.showError('realmName', formDir, 'uniqueName')"
- i18n>The chosen realm name is already in use.</span>
- <div class="custom-control custom-checkbox">
- <input class="form-check-input"
- id="default_realm"
- name="default_realm"
- formControlName="default_realm"
- [attr.disabled]="action === 'edit' ? true: null"
- type="checkbox">
- <label class="form-check-label"
- for="default_realm"
- i18n>Default</label>
- <cd-helper *ngIf="action === 'edit' && info.data.is_default">
- <span i18n>You cannot unset the default flag.</span>
- </cd-helper>
- <cd-helper *ngIf="action === 'edit' && !info.data.is_default">
- <span i18n>Please consult the&nbsp;<a href="{{ docUrl }}">documentation</a>&nbsp;to follow the failover mechanism</span>
- </cd-helper>
- <cd-helper *ngIf="defaultRealmDisabled && action === 'create'">
- <span i18n>Default realm already exists.</span>
- </cd-helper>
- </div>
- </div>
+ <div class="form-item">
+ <cds-text-label
+ labelInputID="realmName"
+ [invalid]="
+ multisiteRealmForm.controls.realmName.invalid &&
+ (multisiteRealmForm.controls.realmName.touched ||
+ multisiteRealmForm.controls.realmName.dirty)
+ "
+ [invalidText]="realmNameError"
+ cdRequiredField="Realm Name"
+ i18n
+ >Realm Name
+ <input
+ cdsText
+ type="text"
+ placeholder="Realm name..."
+ id="realmName"
+ name="realmName"
+ formControlName="realmName"
+ [invalid]="
+ multisiteRealmForm.controls.realmName.invalid &&
+ (multisiteRealmForm.controls.realmName.touched ||
+ multisiteRealmForm.controls.realmName.dirty)
+ "
+ [autofocus]="true"
+ modal-primary-focus
+ />
+ </cds-text-label>
+ <ng-template #realmNameError>
+ <span
+ class="invalid-feedback"
+ *ngIf="multisiteRealmForm.showError('realmName', formDir, 'required')"
+ i18n
+ >This field is required.</span
+ >
+ <span
+ class="invalid-feedback"
+ *ngIf="multisiteRealmForm.showError('realmName', formDir, 'uniqueName')"
+ i18n
+ >The chosen realm name is already in use.</span
+ >
+ </ng-template>
+ </div>
+
+ <div class="form-item">
+ <cds-checkbox
+ label="Default"
+ for="default_realm"
+ formControlName="default_realm"
+ name="default_realm"
+ [disabled]="action === actionLabels.EDIT"
+ i18n
+ >Default
+ <cd-help-text *ngIf="action === actionLabels.EDIT && info.data.is_default">
+ <span>You cannot unset the default flag.</span>
+ </cd-help-text>
+ <cd-help-text *ngIf="action === actionLabels.EDIT && !info.data.is_default">
+ <span
+ >Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover
+ mechanism</span
+ >
+ </cd-help-text>
+ <cd-help-text *ngIf="defaultRealmDisabled && action === actionLabels.CREATE">
+ <span>Default realm already exists.</span>
+ </cd-help-text>
+ </cds-checkbox>
</div>
- </div>
- <div class="modal-footer">
- <cd-form-button-panel (submitActionEvent)="submit()"
- [form]="multisiteRealmForm"
- [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)"></cd-form-button-panel>
- </div>
</form>
- </ng-container>
-</cd-modal>
+ </div>
+ <cd-form-button-panel
+ (submitActionEvent)="submit()"
+ [form]="multisiteRealmForm"
+ [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)"
+ [modalForm]="true"
+ >
+ </cd-form-button-panel>
+</cds-modal>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts
index becb1569ad6..f68619fe9ff 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts
@@ -14,6 +14,7 @@ import { SharedModule } from '~/app/shared/shared.module';
import { RgwMultisiteRealmFormComponent } from './rgw-multisite-realm-form.component';
import { configureTestBed } from '~/testing/unit-test-helper';
+import { CheckboxModule, InputModule, ModalModule } from 'carbon-components-angular';
describe('RgwMultisiteRealmFormComponent', () => {
let component: RgwMultisiteRealmFormComponent;
@@ -26,9 +27,16 @@ describe('RgwMultisiteRealmFormComponent', () => {
ReactiveFormsModule,
RouterTestingModule,
HttpClientTestingModule,
- ToastrModule.forRoot()
+ ToastrModule.forRoot(),
+ ModalModule,
+ InputModule,
+ CheckboxModule
+ ],
+ providers: [
+ NgbActiveModal,
+ { provide: 'multisiteInfo', useValue: [[]] },
+ { provide: 'info', useValue: { data: { name: 'null' } } }
],
- providers: [NgbActiveModal],
declarations: [RgwMultisiteRealmFormComponent]
});
@@ -68,7 +76,6 @@ describe('RgwMultisiteRealmFormComponent', () => {
it('tests create success notification', () => {
spyOn(rgwRealmService, 'create').and.returnValue(observableOf([]));
- component.action = 'create';
component.multisiteRealmForm.markAsDirty();
component.submit();
expect(notificationService.show).toHaveBeenCalledWith(
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts
index 20cd2032faf..1e18598b0db 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts
@@ -1,4 +1,4 @@
-import { Component, OnInit } from '@angular/core';
+import { Component, Inject, OnInit, Optional } from '@angular/core';
import { UntypedFormControl, Validators } from '@angular/forms';
import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap';
import { RgwRealmService } from '~/app/shared/api/rgw-realm.service';
@@ -9,26 +9,21 @@ import { CdValidators } from '~/app/shared/forms/cd-validators';
import { NotificationService } from '~/app/shared/services/notification.service';
import { RgwRealm } from '../models/rgw-multisite';
import { DocService } from '~/app/shared/services/doc.service';
+import { BaseModal } from 'carbon-components-angular';
@Component({
selector: 'cd-rgw-multisite-realm-form',
templateUrl: './rgw-multisite-realm-form.component.html',
styleUrls: ['./rgw-multisite-realm-form.component.scss']
})
-export class RgwMultisiteRealmFormComponent implements OnInit {
- action: string;
+export class RgwMultisiteRealmFormComponent extends BaseModal implements OnInit {
multisiteRealmForm: CdFormGroup;
- info: any;
- editing = false;
- resource: string;
- multisiteInfo: object[] = [];
realm: RgwRealm;
realmList: RgwRealm[] = [];
zonegroupList: RgwRealm[] = [];
realmNames: string[];
newRealmName: string;
isMaster: boolean;
- defaultsInfo: string[];
defaultRealmDisabled = false;
docUrl: string;
@@ -37,11 +32,17 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
public actionLabels: ActionLabelsI18n,
public rgwRealmService: RgwRealmService,
public notificationService: NotificationService,
- public docService: DocService
+ public docService: DocService,
+ @Optional() @Inject('action') public action: string,
+ @Optional() @Inject('resource') public resource: string,
+ @Optional() @Inject('info') public info: any,
+ @Optional() @Inject('multisiteInfo') public multisiteInfo: object[],
+ @Optional() @Inject('defaultsInfo') public defaultsInfo: string[],
+ @Optional() @Inject('editing') public editing: boolean
) {
- this.action = this.editing
- ? this.actionLabels.EDIT + this.resource
- : this.actionLabels.CREATE + this.resource;
+ super();
+
+ this.action = this.editing ? this.actionLabels.EDIT : this.actionLabels.CREATE;
this.createForm();
}
@@ -52,7 +53,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
Validators.required,
CdValidators.custom('uniqueName', (realmName: string) => {
return (
- this.action === 'create' &&
+ this.action === this.actionLabels.CREATE &&
this.realmNames &&
this.realmNames.indexOf(realmName) !== -1
);
@@ -71,7 +72,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
this.realmNames = this.realmList.map((realm) => {
return realm['name'];
});
- if (this.action === 'edit') {
+ if (this.action === this.actionLabels.EDIT) {
this.zonegroupList =
this.multisiteInfo[1] !== undefined && this.multisiteInfo[1].hasOwnProperty('zonegroups')
? this.multisiteInfo[1]['zonegroups']
@@ -97,7 +98,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
submit() {
const values = this.multisiteRealmForm.getRawValue();
this.realm = new RgwRealm();
- if (this.action === 'create') {
+ if (this.action === this.actionLabels.CREATE) {
this.realm.name = values['realmName'];
this.rgwRealmService.create(this.realm, values['default_realm']).subscribe(
() => {
@@ -105,13 +106,13 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
NotificationType.success,
$localize`Realm: '${values['realmName']}' created successfully`
);
- this.activeModal.close();
+ this.closeModal();
},
() => {
this.multisiteRealmForm.setErrors({ cdSubmitButton: true });
}
);
- } else if (this.action === 'edit') {
+ } else {
this.realm.name = this.info.data.name;
this.newRealmName = values['realmName'];
this.rgwRealmService.update(this.realm, values['default_realm'], this.newRealmName).subscribe(
@@ -120,7 +121,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit {
NotificationType.success,
$localize`Realm: '${values['realmName']}' updated successfully`
);
- this.activeModal.close();
+ this.closeModal();
},
() => {
this.multisiteRealmForm.setErrors({ cdSubmitButton: true });
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
index 5f8c6f50135..a55cb179778 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
@@ -71,7 +71,9 @@ import {
LoadingModule,
ModalModule,
ProgressIndicatorModule,
- CodeSnippetModule
+ CodeSnippetModule,
+ InputModule,
+ CheckboxModule
} from 'carbon-components-angular';
import { CephSharedModule } from '../shared/ceph-shared.module';
@@ -99,7 +101,9 @@ import { CephSharedModule } from '../shared/ceph-shared.module';
ButtonModule,
LoadingModule,
IconModule,
- NgbProgressbar
+ NgbProgressbar,
+ InputModule,
+ CheckboxModule
],
exports: [
RgwDaemonListComponent,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts
index 244a7861b27..8b2c9f1eca3 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts
@@ -31,28 +31,28 @@ export class RoleDetailsComponent implements OnChanges, OnInit {
prop: 'read',
name: $localize`Read`,
flexGrow: 1,
- cellClass: 'text-center',
+ cellClass: 'text-left',
cellTransformation: CellTemplate.checkIcon
},
{
prop: 'create',
name: $localize`Create`,
flexGrow: 1,
- cellClass: 'text-center',
+ cellClass: 'text-left',
cellTransformation: CellTemplate.checkIcon
},
{
prop: 'update',
name: $localize`Update`,
flexGrow: 1,
- cellClass: 'text-center',
+ cellClass: 'text-left',
cellTransformation: CellTemplate.checkIcon
},
{
prop: 'delete',
name: $localize`Delete`,
flexGrow: 1,
- cellClass: 'text-center',
+ cellClass: 'text-left',
cellTransformation: CellTemplate.checkIcon
}
];
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts
index c9640e4ffab..8e7e12b3692 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts
@@ -99,7 +99,7 @@ export class RoleListComponent extends ListWithDetails implements OnInit {
{
name: $localize`System Role`,
prop: 'system',
- cellClass: 'text-center',
+ cellClass: 'text-left',
flexGrow: 1,
cellTransformation: CellTemplate.checkIcon
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.spec.ts
index 4f95ac1e26c..943fc033ede 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.spec.ts
@@ -16,13 +16,13 @@ import { ComponentsModule } from '~/app/shared/components/components.module';
import { LoadingPanelComponent } from '~/app/shared/components/loading-panel/loading-panel.component';
import { CdFormGroup } from '~/app/shared/forms/cd-form-group';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
-import { ModalService } from '~/app/shared/services/modal.service';
import { NotificationService } from '~/app/shared/services/notification.service';
import { PasswordPolicyService } from '~/app/shared/services/password-policy.service';
import { SharedModule } from '~/app/shared/shared.module';
import { configureTestBed, FormHelper } from '~/testing/unit-test-helper';
import { UserFormComponent } from './user-form.component';
import { UserFormModel } from './user-form.model';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
describe('UserFormComponent', () => {
let component: UserFormComponent;
@@ -30,7 +30,7 @@ describe('UserFormComponent', () => {
let fixture: ComponentFixture<UserFormComponent>;
let httpTesting: HttpTestingController;
let userService: UserService;
- let modalService: ModalService;
+ let modalService: ModalCdsService;
let router: Router;
let formHelper: FormHelper;
@@ -67,7 +67,7 @@ describe('UserFormComponent', () => {
form = component.userForm;
httpTesting = TestBed.inject(HttpTestingController);
userService = TestBed.inject(UserService);
- modalService = TestBed.inject(ModalService);
+ modalService = TestBed.inject(ModalCdsService);
router = TestBed.inject(Router);
spyOn(router, 'navigate');
fixture.detectChanges();
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.ts
index 1a0ddf35cc9..7c02b86eae0 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/user-form/user-form.component.ts
@@ -22,7 +22,7 @@ import { CdFormGroup } from '~/app/shared/forms/cd-form-group';
import { CdValidators } from '~/app/shared/forms/cd-validators';
import { CdPwdExpirationSettings } from '~/app/shared/models/cd-pwd-expiration-settings';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
-import { ModalService } from '~/app/shared/services/modal.service';
+import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
import { NotificationService } from '~/app/shared/services/notification.service';
import { PasswordPolicyService } from '~/app/shared/services/password-policy.service';
import { UserFormMode } from './user-form-mode.enum';
@@ -61,7 +61,7 @@ export class UserFormComponent extends CdForm implements OnInit {
private authStorageService: AuthStorageService,
private route: ActivatedRoute,
public router: Router,
- private modalService: ModalService,
+ private modalService: ModalCdsService,
private roleService: RoleService,
private userService: UserService,
private notificationService: NotificationService,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
index a5c84e60b6f..92eee852d88 100644..100755
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
@@ -2,13 +2,15 @@ import { TestBed } from '@angular/core/testing';
import { HttpClientTestingModule, HttpTestingController } from '@angular/common/http/testing';
import { configureTestBed } from '~/testing/unit-test-helper';
import { NvmeofService } from '../../shared/api/nvmeof.service';
+import { throwError } from 'rxjs';
describe('NvmeofService', () => {
let service: NvmeofService;
let httpTesting: HttpTestingController;
const mockGroupName = 'default';
const mockNQN = 'nqn.2001-07.com.ceph:1721041732363';
-
+ const UI_API_PATH = 'ui-api/nvmeof';
+ const API_PATH = 'api/nvmeof';
configureTestBed({
providers: [NvmeofService],
imports: [HttpClientTestingModule]
@@ -27,56 +29,155 @@ describe('NvmeofService', () => {
expect(service).toBeTruthy();
});
- // gateways
- it('should call listGatewayGroups', () => {
- service.listGatewayGroups().subscribe();
- const req = httpTesting.expectOne('api/nvmeof/gateway/group');
- expect(req.request.method).toBe('GET');
- });
+ describe('test gateway APIs', () => {
+ it('should call listGatewayGroups', () => {
+ service.listGatewayGroups().subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/gateway/group`);
+ expect(req.request.method).toBe('GET');
+ });
- it('should call listGateways', () => {
- service.listGateways().subscribe();
- const req = httpTesting.expectOne('api/nvmeof/gateway');
- expect(req.request.method).toBe('GET');
+ it('should call listGateways', () => {
+ service.listGateways().subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/gateway`);
+ expect(req.request.method).toBe('GET');
+ });
});
- // subsystems
- it('should call listSubsystems', () => {
- service.listSubsystems(mockGroupName).subscribe();
- const req = httpTesting.expectOne(`api/nvmeof/subsystem?gw_group=${mockGroupName}`);
- expect(req.request.method).toBe('GET');
- });
+ describe('test subsystems APIs', () => {
+ it('should call listSubsystems', () => {
+ service.listSubsystems(mockGroupName).subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/subsystem?gw_group=${mockGroupName}`);
+ expect(req.request.method).toBe('GET');
+ });
+
+ it('should call getSubsystem', () => {
+ service.getSubsystem(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('GET');
+ });
+
+ it('should call createSubsystem', () => {
+ const request = {
+ nqn: mockNQN,
+ enable_ha: true,
+ initiators: '*',
+ gw_group: mockGroupName
+ };
+ service.createSubsystem(request).subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/subsystem`);
+ expect(req.request.method).toBe('POST');
+ });
- it('should call getSubsystem', () => {
- service.getSubsystem(mockNQN, mockGroupName).subscribe();
- const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}?gw_group=${mockGroupName}`);
- expect(req.request.method).toBe('GET');
+ it('should call deleteSubsystem', () => {
+ service.deleteSubsystem(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('DELETE');
+ });
+ it('should call isSubsystemPresent', () => {
+ spyOn(service, 'getSubsystem').and.returnValue(throwError('test'));
+ service.isSubsystemPresent(mockNQN, mockGroupName).subscribe((res) => {
+ expect(res).toBe(false);
+ });
+ });
});
- it('should call createSubsystem', () => {
- const request = {
- nqn: mockNQN,
- enable_ha: true,
- initiators: '*',
- gw_group: mockGroupName
- };
- service.createSubsystem(request).subscribe();
- const req = httpTesting.expectOne('api/nvmeof/subsystem');
- expect(req.request.method).toBe('POST');
+ describe('test initiators APIs', () => {
+ let request = { host_nqn: '', gw_group: mockGroupName };
+ it('should call getInitiators', () => {
+ service.getInitiators(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/host?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('GET');
+ });
+ it('should call addInitiators', () => {
+ service.addInitiators(mockNQN, request).subscribe();
+ const req = httpTesting.expectOne(`${UI_API_PATH}/subsystem/${mockNQN}/host`);
+ expect(req.request.method).toBe('POST');
+ });
+ it('should call removeInitiators', () => {
+ service.removeInitiators(mockNQN, request).subscribe();
+ const req = httpTesting.expectOne(
+ `${UI_API_PATH}/subsystem/${mockNQN}/host/${request.host_nqn}/${mockGroupName}`
+ );
+ expect(req.request.method).toBe('DELETE');
+ });
});
- it('should call deleteSubsystem', () => {
- service.deleteSubsystem(mockNQN, mockGroupName).subscribe();
- const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}?gw_group=${mockGroupName}`);
- expect(req.request.method).toBe('DELETE');
+ describe('test listener APIs', () => {
+ it('should call listListeners', () => {
+ service.listListeners(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/listener?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('GET');
+ });
+ it('should call createListener', () => {
+ const request = {
+ gw_group: mockGroupName,
+ host_name: 'ceph-node-02',
+ traddr: '192.168.100.102',
+ trsvcid: 4421
+ };
+ service.createListener(mockNQN, request).subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/listener`);
+ expect(req.request.method).toBe('POST');
+ });
+ it('should call deleteListener', () => {
+ const request = { host_name: 'ceph-node-02', traddr: '192.168.100.102', trsvcid: '4421' };
+ service
+ .deleteListener(mockNQN, request.host_name, request.traddr, request.trsvcid)
+ .subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/listener/${request.host_name}/${request.traddr}?trsvcid=${request.trsvcid}`
+ );
+ expect(req.request.method).toBe('DELETE');
+ });
});
- // initiators
- it('should call getInitiators', () => {
- service.getInitiators(mockNQN, mockGroupName).subscribe();
- const req = httpTesting.expectOne(
- `api/nvmeof/subsystem/${mockNQN}/host?gw_group=${mockGroupName}`
- );
- expect(req.request.method).toBe('GET');
+ describe('test namespace APIs', () => {
+ const mockNsid = '1';
+ it('should call listNamespaces', () => {
+ service.listNamespaces(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/namespace?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('GET');
+ });
+ it('should call getNamespace', () => {
+ service.getNamespace(mockNQN, mockNsid, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('GET');
+ });
+ it('should call createNamespace', () => {
+ const mockNamespaceObj = {
+ rbd_image_name: 'nvme_ns_image:12345678',
+ rbd_pool: 'rbd',
+ size: 1024,
+ gw_group: mockGroupName
+ };
+ service.createNamespace(mockNQN, mockNamespaceObj).subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/namespace`);
+ expect(req.request.method).toBe('POST');
+ });
+ it('should call updateNamespace', () => {
+ const request = { rbd_image_size: 1024, gw_group: mockGroupName };
+ service.updateNamespace(mockNQN, mockNsid, request).subscribe();
+ const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}`);
+ expect(req.request.method).toBe('PATCH');
+ });
+ it('should call deleteNamespace', () => {
+ service.deleteNamespace(mockNQN, mockNsid, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}?gw_group=${mockGroupName}`
+ );
+ expect(req.request.method).toBe('DELETE');
+ });
});
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html
index b022f1551e8..72ca4e47990 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-key-value/table-key-value.component.html
@@ -1,6 +1,6 @@
<div class="table-scroller">
<cd-table #table
- [data]="tableData"
+ [data]="tableData || []"
[columns]="columns"
columnMode="flex"
[toolHeader]="false"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
index c8aa3f47e2f..6ca4378b126 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
@@ -897,7 +897,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
if (this.limit === 0) {
this.model.currentPage = 1;
- this.model.pageLength = filteredData.length;
+ this.model.pageLength = filteredData.length || 1;
this._dataset.next(filteredData);
return;
}
diff --git a/src/pybind/mgr/dashboard/module.py b/src/pybind/mgr/dashboard/module.py
index 57be3f9ec0d..ac6e094a4aa 100644
--- a/src/pybind/mgr/dashboard/module.py
+++ b/src/pybind/mgr/dashboard/module.py
@@ -49,10 +49,6 @@ except ImportError:
from .services.sso import load_sso_db
-if cherrypy is not None:
- from .cherrypy_backports import patch_cherrypy
- patch_cherrypy(cherrypy.__version__)
-
# pylint: disable=wrong-import-position
from .plugins import PLUGIN_MANAGER, debug, feature_toggles, motd # isort:skip # noqa E501 # pylint: disable=unused-import
diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml
index aedee7e493d..b464344e27a 100644
--- a/src/pybind/mgr/dashboard/openapi.yaml
+++ b/src/pybind/mgr/dashboard/openapi.yaml
@@ -11193,7 +11193,6 @@ paths:
type: string
required:
- bucket_id
- - uid
type: object
responses:
'200':
diff --git a/src/pybind/mgr/dashboard/run-backend-api-tests.sh b/src/pybind/mgr/dashboard/run-backend-api-tests.sh
index e7d441f44bb..981b331df19 100755
--- a/src/pybind/mgr/dashboard/run-backend-api-tests.sh
+++ b/src/pybind/mgr/dashboard/run-backend-api-tests.sh
@@ -134,7 +134,7 @@ run_teuthology_tests() {
export CEPH_OUT_CLIENT_DIR=${LOCAL_BUILD_DIR}/out/client
find . -iname "*${COVERAGE_FILE}*" -type f -delete
- python ../qa/tasks/vstart_runner.py --ignore-missing-binaries --no-verbose $OPTIONS $(echo $TEST_CASES) ||
+ python ../qa/tasks/vstart_runner.py --ignore-missing-binaries --no-verbose --debug $OPTIONS $(echo $TEST_CASES) ||
on_tests_error
deactivate
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_cli.py b/src/pybind/mgr/dashboard/services/nvmeof_cli.py
index 11a95237053..bd9de350448 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_cli.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_cli.py
@@ -36,12 +36,12 @@ def add_nvmeof_gateway(_, inbuf, name: str, group: str, daemon_name: str):
@CLIWriteCommand('dashboard nvmeof-gateway-rm')
-def remove_nvmeof_gateway(_, name: str):
+def remove_nvmeof_gateway(_, name: str, daemon_name: str = ''):
'''
Remove NVMe-oF gateway configuration
'''
try:
- NvmeofGatewaysConfig.remove_gateway(name)
+ NvmeofGatewaysConfig.remove_gateway(name, daemon_name)
return 0, 'Success', ''
except ManagedByOrchestratorException as ex:
return -errno.EINVAL, '', str(ex)
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
index 2426c599078..170f98c70d1 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
@@ -77,11 +77,22 @@ class NvmeofGatewaysConfig(object):
cls._save_config(config)
@classmethod
- def remove_gateway(cls, name):
+ def remove_gateway(cls, name, daemon_name=None):
config = cls.get_gateways_config()
if name not in config['gateways']:
raise NvmeofGatewayDoesNotExist(name)
- del config['gateways'][name]
+
+ if not daemon_name:
+ del config['gateways'][name]
+ else:
+ # remove the daemon from the list of gateways
+ config['gateways'][name] = [daemon for daemon in config['gateways'][name]
+ if daemon['daemon_name'] != daemon_name]
+
+ # if there are no more daemons in the list, remove the gateway
+ if not config['gateways'][name]:
+ del config['gateways'][name]
+
cls._save_config(config)
@classmethod
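The remove_gateway() change above makes daemon-scoped removal possible: with a daemon_name, only that daemon's entry is dropped, and the gateway key itself disappears once its daemon list is empty. A minimal standalone sketch of that logic, using a hypothetical in-memory config dict rather than the dashboard's stored configuration:

    def remove_gateway(config, name, daemon_name=None):
        # config maps gateway names to lists of daemon dicts, e.g.
        # {'gw1': [{'daemon_name': 'nvmeof.a'}, {'daemon_name': 'nvmeof.b'}]}
        gateways = config['gateways']
        if name not in gateways:
            raise KeyError(name)
        if not daemon_name:
            del gateways[name]          # no daemon given: drop the whole gateway
            return config
        gateways[name] = [d for d in gateways[name]
                          if d['daemon_name'] != daemon_name]
        if not gateways[name]:          # last daemon removed: drop the gateway too
            del gateways[name]
        return config

    config = {'gateways': {'gw1': [{'daemon_name': 'nvmeof.a'},
                                   {'daemon_name': 'nvmeof.b'}]}}
    remove_gateway(config, 'gw1', daemon_name='nvmeof.a')
    assert [d['daemon_name'] for d in config['gateways']['gw1']] == ['nvmeof.b']
    remove_gateway(config, 'gw1', daemon_name='nvmeof.b')
    assert 'gw1' not in config['gateways']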
diff --git a/src/pybind/mgr/dashboard/services/rgw_iam.py b/src/pybind/mgr/dashboard/services/rgw_iam.py
new file mode 100644
index 00000000000..dbf00df25e0
--- /dev/null
+++ b/src/pybind/mgr/dashboard/services/rgw_iam.py
@@ -0,0 +1,24 @@
+from subprocess import SubprocessError
+from typing import List
+
+from .. import mgr
+from ..exceptions import DashboardException
+
+
+class RgwAccounts:
+ def send_rgw_cmd(self, command: List[str]):
+ try:
+ exit_code, out, err = mgr.send_rgwadmin_command(command)
+
+ if exit_code != 0:
+ raise DashboardException(msg=err,
+ http_status_code=500,
+ component='rgw')
+ return out
+
+ except SubprocessError as e:
+ raise DashboardException(e, component='rgw')
+
+ def get_accounts(self):
+ get_accounts_cmd = ['account', 'list']
+ return self.send_rgw_cmd(get_accounts_cmd)
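The new RgwAccounts helper simply routes the request through mgr.send_rgwadmin_command and surfaces non-zero exit codes as DashboardException. A rough usage sketch; the calling code and its error handling are illustrative assumptions, not part of this change:

    # Illustrative only: how a dashboard caller might consume RgwAccounts.
    # RgwAccounts is the class added above; the try/except shape is assumed.
    accounts = RgwAccounts()
    try:
        account_ids = accounts.get_accounts()   # wraps `radosgw-admin account list`
    except Exception as exc:                    # DashboardException on failure
        account_ids = []
        print(f'failed to list RGW accounts: {exc}')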
diff --git a/src/pybind/mgr/dashboard/services/service.py b/src/pybind/mgr/dashboard/services/service.py
index 41fcc4c4446..9b789c0c859 100644
--- a/src/pybind/mgr/dashboard/services/service.py
+++ b/src/pybind/mgr/dashboard/services/service.py
@@ -101,6 +101,8 @@ def wait_for_daemon_to_start(service_name, timeout=30):
class RgwServiceManager:
+ user = 'dashboard'
+
def find_available_port(self, starting_port=80):
orch = OrchClient.instance()
daemons = [d.to_dict() for d in orch.services.list_daemons(daemon_type='rgw')]
@@ -172,7 +174,6 @@ class RgwServiceManager:
def configure_rgw_credentials(self):
logger.info('Configuring dashboard RGW credentials')
- user = 'dashboard'
realms = []
access_key = ''
secret_key = ''
@@ -186,7 +187,7 @@ class RgwServiceManager:
realm_access_keys = {}
realm_secret_keys = {}
for realm in realms:
- realm_access_key, realm_secret_key = self._get_user_keys(user, realm)
+ realm_access_key, realm_secret_key = self._get_user_keys(self.user, realm)
if realm_access_key:
realm_access_keys[realm] = realm_access_key
realm_secret_keys[realm] = realm_secret_key
@@ -194,7 +195,7 @@ class RgwServiceManager:
access_key = json.dumps(realm_access_keys)
secret_key = json.dumps(realm_secret_keys)
else:
- access_key, secret_key = self._get_user_keys(user)
+ access_key, secret_key = self._get_user_keys(self.user)
assert access_key and secret_key
Settings.RGW_API_ACCESS_KEY = access_key
diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py
index 29ddff2ffc2..76ad8d9d0ce 100644
--- a/src/pybind/mgr/mgr_module.py
+++ b/src/pybind/mgr/mgr_module.py
@@ -1285,7 +1285,7 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
if latest < version:
raise RuntimeError(f"main.db version is newer ({version}) than module ({latest})")
for i in range(version, latest):
- self.log.info(f"upgrading main.db for {self.module_name} from {i-1}:{i}")
+ self.log.info(f"upgrading main.db for {self.module_name} from {i - 1}:{i}")
for sql in self.SCHEMA_VERSIONED[i]:
db.execute(sql)
if version < latest:
diff --git a/src/pybind/mgr/nfs/ganesha_conf.py b/src/pybind/mgr/nfs/ganesha_conf.py
index 56c56b434bb..5108222eef3 100644
--- a/src/pybind/mgr/nfs/ganesha_conf.py
+++ b/src/pybind/mgr/nfs/ganesha_conf.py
@@ -459,7 +459,7 @@ class Export:
ex_dict.get('access_type', 'RO'),
ex_dict.get('squash', 'no_root_squash'),
ex_dict.get('security_label', True),
- ex_dict.get('protocols', [4]),
+ ex_dict.get('protocols', [3, 4]),
ex_dict.get('transports', ['TCP']),
FSAL.from_dict(ex_dict.get('fsal', {})),
[Client.from_dict(client) for client in ex_dict.get('clients', [])],
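The only behavioural change here is the fallback for exports defined without an explicit protocols list: they now advertise NFSv3 alongside NFSv4. A one-line illustration of the new default (toy dict, not a real export definition):

    ex_dict = {'path': '/vol1', 'pseudo': '/vol1'}     # no 'protocols' key supplied
    assert ex_dict.get('protocols', [3, 4]) == [3, 4]  # previously defaulted to [4]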
diff --git a/src/pybind/mgr/nfs/tests/test_nfs.py b/src/pybind/mgr/nfs/tests/test_nfs.py
index edf8bab37a1..c0c1a73e269 100644
--- a/src/pybind/mgr/nfs/tests/test_nfs.py
+++ b/src/pybind/mgr/nfs/tests/test_nfs.py
@@ -1017,7 +1017,7 @@ NFS_CORE_PARAM {
assert export.pseudo == "/mybucket"
assert export.access_type == "none"
assert export.squash == "none"
- assert export.protocols == [4]
+ assert export.protocols == [3, 4]
assert export.transports == ["TCP"]
assert export.fsal.name == "RGW"
assert export.fsal.user_id == "bucket_owner_user"
@@ -1060,7 +1060,7 @@ NFS_CORE_PARAM {
assert export.pseudo == "/mybucket"
assert export.access_type == "none"
assert export.squash == "none"
- assert export.protocols == [4]
+ assert export.protocols == [3, 4]
assert export.transports == ["TCP"]
assert export.fsal.name == "RGW"
assert export.fsal.access_key_id == "the_access_key"
@@ -1102,7 +1102,7 @@ NFS_CORE_PARAM {
assert export.pseudo == "/mybucket"
assert export.access_type == "none"
assert export.squash == "none"
- assert export.protocols == [4]
+ assert export.protocols == [3, 4]
assert export.transports == ["TCP"]
assert export.fsal.name == "RGW"
assert export.fsal.access_key_id == "the_access_key"
@@ -1151,7 +1151,7 @@ NFS_CORE_PARAM {
assert export.pseudo == "/cephfs2"
assert export.access_type == "none"
assert export.squash == "none"
- assert export.protocols == [4]
+ assert export.protocols == [3, 4]
assert export.transports == ["TCP"]
assert export.fsal.name == "CEPH"
assert export.fsal.user_id == "nfs.foo.myfs.86ca58ef"
@@ -1190,7 +1190,7 @@ NFS_CORE_PARAM {
assert export.pseudo == "/cephfs3"
assert export.access_type == "RW"
assert export.squash == "root"
- assert export.protocols == [4]
+ assert export.protocols == [3, 4]
assert export.fsal.name == "CEPH"
assert export.fsal.user_id == "nfs.foo.myfs.86ca58ef"
assert export.fsal.cephx_key == "thekeyforclientabc"
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index d5c351fda7e..a505801eea5 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -503,7 +503,7 @@ class Orchestrator(object):
"""
raise NotImplementedError()
- def exit_host_maintenance(self, hostname: str) -> OrchResult:
+ def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult:
"""
Return a host from maintenance, restarting the clusters systemd target
"""
@@ -901,10 +901,18 @@ class Orchestrator(object):
"""Change/Add a specific setting for a tuned profile"""
raise NotImplementedError()
+ def tuned_profile_add_settings(self, profile_name: str, setting: dict) -> OrchResult[str]:
+ """Change/Add multiple settings for a tuned profile"""
+ raise NotImplementedError()
+
def tuned_profile_rm_setting(self, profile_name: str, setting: str) -> OrchResult[str]:
"""Remove a specific setting for a tuned profile"""
raise NotImplementedError()
+ def tuned_profile_rm_settings(self, profile_name: str, settings: List[str]) -> OrchResult[str]:
+ """Remove multiple settings from a tuned profile"""
+ raise NotImplementedError()
+
def upgrade_check(self, image: Optional[str], version: Optional[str]) -> OrchResult[str]:
raise NotImplementedError()
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index dbfa10fb720..332bc75d862 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -799,11 +799,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
return HandleCommandResult(stdout=completion.result_str())
@_cli_write_command('orch host maintenance exit')
- def _host_maintenance_exit(self, hostname: str) -> HandleCommandResult:
+ def _host_maintenance_exit(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult:
"""
Return a host from maintenance, restarting all Ceph daemons (cephadm only)
"""
- completion = self.exit_host_maintenance(hostname)
+ completion = self.exit_host_maintenance(hostname, force, offline)
raise_if_exception(completion)
return HandleCommandResult(stdout=completion.result_str())
@@ -2250,6 +2250,39 @@ Usage:
res = raise_if_exception(completion)
return HandleCommandResult(stdout=res)
+ @_cli_write_command("orch tuned-profile add-settings")
+ def _tuned_profile_add_settings(self, profile_name: str, settings: str) -> HandleCommandResult:
+ try:
+ setting_pairs = settings.split(",")
+ parsed_setting = {}
+ parsed_setting = {key.strip(): value.strip() for key, value in (s.split('=', 1) for s in setting_pairs)}
+ completion = self.tuned_profile_add_settings(profile_name, parsed_setting)
+ res = raise_if_exception(completion)
+ return HandleCommandResult(stdout=res)
+ except ValueError:
+ error_message = (
+ "Error: Invalid format detected. "
+ "The correct format is key=value pairs separated by commas,"
+ "e.g., 'vm.swappiness=11,vm.user_reserve_kbytes=116851'"
+ )
+ return HandleCommandResult(stderr=error_message)
+
+ @_cli_write_command("orch tuned-profile rm-settings")
+ def _tuned_profile_rm_settings(self, profile_name: str, settings: str) -> HandleCommandResult:
+ try:
+ setting = [s.strip() for s in settings.split(",") if s.strip()]
+ if not setting:
+ raise ValueError(
+ "Error: Invalid format."
+ "The correct format is key1,key2"
+ "e.g., vm.swappiness,vm.user_reserve_kbytes"
+ )
+ completion = self.tuned_profile_rm_settings(profile_name, setting)
+ res = raise_if_exception(completion)
+ return HandleCommandResult(stdout=res)
+ except ValueError as e:
+ return HandleCommandResult(stderr=str(e))
+
def self_test(self) -> None:
old_orch = self._select_orchestrator()
self._set_backend('')
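The two new CLI handlers accept a single comma-separated string and parse it before handing off to the orchestrator backend; a malformed pair (a missing '=') falls through as ValueError and is reported to the user. A standalone sketch of that parsing, using the example values from the error messages:

    def parse_add_settings(settings: str) -> dict:
        # "vm.swappiness=11,vm.user_reserve_kbytes=116851" -> {key: value}
        pairs = settings.split(',')
        return {k.strip(): v.strip() for k, v in (s.split('=', 1) for s in pairs)}

    def parse_rm_settings(settings: str) -> list:
        # "vm.swappiness,vm.user_reserve_kbytes" -> [key, ...]; empty input rejected
        keys = [s.strip() for s in settings.split(',') if s.strip()]
        if not keys:
            raise ValueError('expected a comma-separated list of setting names')
        return keys

    assert parse_add_settings('vm.swappiness=11,vm.user_reserve_kbytes=116851') == {
        'vm.swappiness': '11', 'vm.user_reserve_kbytes': '116851'}
    assert parse_rm_settings('vm.swappiness,vm.user_reserve_kbytes') == [
        'vm.swappiness', 'vm.user_reserve_kbytes']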
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 8b1c0921896..381f7e460c5 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -8,7 +8,6 @@ import re
import threading
import time
import enum
-from packaging import version # type: ignore
from collections import namedtuple
import tempfile
@@ -29,21 +28,6 @@ MetricValue = Dict[LabelValues, Number]
DEFAULT_PORT = 9283
-# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify
-# that the ports its listening on are in fact bound. When using the any address
-# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes)
-# ipv6 isn't yet configured / supported and CherryPy throws an uncaught
-# exception.
-if cherrypy is not None:
- Version = version.Version
- v = Version(cherrypy.__version__)
- # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on
- # centos:7) and back to at least 3.0.0.
- if Version("3.1.2") <= v < Version("3.2.3"):
- # https://github.com/cherrypy/cherrypy/issues/1100
- from cherrypy.process import servers
- servers.wait_for_occupied_port = lambda host, port: None
-
# cherrypy likes to sys.exit on error. don't let it take us down too!
def os_exit_noop(status: int) -> None:
diff --git a/src/pybind/mgr/restful/__init__.py b/src/pybind/mgr/restful/__init__.py
deleted file mode 100644
index 8f210ac9247..00000000000
--- a/src/pybind/mgr/restful/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .module import Module
diff --git a/src/pybind/mgr/restful/api/__init__.py b/src/pybind/mgr/restful/api/__init__.py
deleted file mode 100644
index a105dfe87f8..00000000000
--- a/src/pybind/mgr/restful/api/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from pecan import expose
-from pecan.rest import RestController
-
-from .config import Config
-from .crush import Crush
-from .doc import Doc
-from .mon import Mon
-from .osd import Osd
-from .pool import Pool
-from .perf import Perf
-from .request import Request
-from .server import Server
-
-
-class Root(RestController):
- config = Config()
- crush = Crush()
- doc = Doc()
- mon = Mon()
- osd = Osd()
- perf = Perf()
- pool = Pool()
- request = Request()
- server = Server()
-
- @expose(template='json')
- def get(self, **kwargs):
- """
- Show the basic information for the REST API
- This includes values like api version or auth method
- """
- return {
- 'api_version': 1,
- 'auth':
- 'Use "ceph restful create-key <key>" to create a key pair, '
- 'pass it as HTTP Basic auth to authenticate',
- 'doc': 'See /doc endpoint',
- 'info': "Ceph Manager RESTful API server",
- }
diff --git a/src/pybind/mgr/restful/api/config.py b/src/pybind/mgr/restful/api/config.py
deleted file mode 100644
index 5b0e0af96c2..00000000000
--- a/src/pybind/mgr/restful/api/config.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from pecan import expose, request
-from pecan.rest import RestController
-
-from restful import common, context
-from restful.decorators import auth
-
-
-class ConfigOsd(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show OSD configuration options
- """
- flags = context.instance.get("osd_map")['flags']
-
- # pause is a valid osd config command that sets pauserd,pausewr
- flags = flags.replace('pauserd,pausewr', 'pause')
-
- return flags.split(',')
-
-
- @expose(template='json')
- @auth
- def patch(self, **kwargs):
- """
- Modify OSD configuration options
- """
- args = request.json
-
- commands = []
-
- valid_flags = set(args.keys()) & set(common.OSD_FLAGS)
- invalid_flags = list(set(args.keys()) - valid_flags)
- if invalid_flags:
- context.instance.log.warning("%s not valid to set/unset", invalid_flags)
-
- for flag in list(valid_flags):
- if args[flag]:
- mode = 'set'
- else:
- mode = 'unset'
-
- commands.append({
- 'prefix': 'osd ' + mode,
- 'key': flag,
- })
-
- return context.instance.submit_request([commands], **kwargs)
-
-
-
-class ConfigClusterKey(RestController):
- def __init__(self, key):
- self.key = key
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show specific configuration option
- """
- return context.instance.get("config").get(self.key, None)
-
-
-
-class ConfigCluster(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show all cluster configuration options
- """
- return context.instance.get("config")
-
-
- @expose()
- def _lookup(self, key, *remainder):
- return ConfigClusterKey(key), remainder
-
-
-
-class Config(RestController):
- cluster = ConfigCluster()
- osd = ConfigOsd()
diff --git a/src/pybind/mgr/restful/api/crush.py b/src/pybind/mgr/restful/api/crush.py
deleted file mode 100644
index 79f9007b6fd..00000000000
--- a/src/pybind/mgr/restful/api/crush.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from pecan import expose
-from pecan.rest import RestController
-
-from restful import common, context
-
-from restful.decorators import auth
-
-
-class CrushRule(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show crush rules
- """
- crush = context.instance.get('osd_map_crush')
- rules = crush['rules']
-
- for rule in rules:
- rule['osd_count'] = len(common.crush_rule_osds(crush['buckets'], rule))
-
- return rules
-
-class Crush(RestController):
- rule = CrushRule()
diff --git a/src/pybind/mgr/restful/api/doc.py b/src/pybind/mgr/restful/api/doc.py
deleted file mode 100644
index f1038c21b16..00000000000
--- a/src/pybind/mgr/restful/api/doc.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pecan import expose
-from pecan.rest import RestController
-
-from restful import context
-
-import restful
-
-
-class Doc(RestController):
- @expose(template='json')
- def get(self, **kwargs):
- """
- Show documentation information
- """
- return context.instance.get_doc_api(restful.api.Root)
diff --git a/src/pybind/mgr/restful/api/mon.py b/src/pybind/mgr/restful/api/mon.py
deleted file mode 100644
index 20d0336059a..00000000000
--- a/src/pybind/mgr/restful/api/mon.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from pecan import expose, response
-from pecan.rest import RestController
-
-from restful import context
-from restful.decorators import auth
-
-
-class MonName(RestController):
- def __init__(self, name):
- self.name = name
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for the monitor name
- """
- mon = [x for x in context.instance.get_mons()
- if x['name'] == self.name]
- if len(mon) != 1:
- response.status = 500
- return {'message': 'Failed to identify the monitor node "{}"'.format(self.name)}
- return mon[0]
-
-
-
-class Mon(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for all the monitors
- """
- return context.instance.get_mons()
-
-
- @expose()
- def _lookup(self, name, *remainder):
- return MonName(name), remainder
diff --git a/src/pybind/mgr/restful/api/osd.py b/src/pybind/mgr/restful/api/osd.py
deleted file mode 100644
index 8577fae98eb..00000000000
--- a/src/pybind/mgr/restful/api/osd.py
+++ /dev/null
@@ -1,135 +0,0 @@
-from pecan import expose, request, response
-from pecan.rest import RestController
-
-from restful import common, context
-from restful.decorators import auth
-
-
-class OsdIdCommand(RestController):
- def __init__(self, osd_id):
- self.osd_id = osd_id
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show implemented commands for the OSD id
- """
- osd = context.instance.get_osd_by_id(self.osd_id)
-
- if not osd:
- response.status = 500
- return {'message': 'Failed to identify the OSD id "{}"'.format(self.osd_id)}
-
- if osd['up']:
- return common.OSD_IMPLEMENTED_COMMANDS
- else:
- return []
-
-
- @expose(template='json')
- @auth
- def post(self, **kwargs):
- """
- Run the implemented command for the OSD id
- """
- command = request.json.get('command', None)
-
- osd = context.instance.get_osd_by_id(self.osd_id)
-
- if not osd:
- response.status = 500
- return {'message': 'Failed to identify the OSD id "{}"'.format(self.osd_id)}
-
- if not osd['up'] or command not in common.OSD_IMPLEMENTED_COMMANDS:
- response.status = 500
- return {'message': 'Command "{}" not available'.format(command)}
-
- return context.instance.submit_request([[{
- 'prefix': 'osd ' + command,
- 'who': str(self.osd_id)
- }]], **kwargs)
-
-
-
-class OsdId(RestController):
- def __init__(self, osd_id):
- self.osd_id = osd_id
- self.command = OsdIdCommand(osd_id)
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for the OSD id
- """
- osd = context.instance.get_osds(ids=[str(self.osd_id)])
- if len(osd) != 1:
- response.status = 500
- return {'message': 'Failed to identify the OSD id "{}"'.format(self.osd_id)}
-
- return osd[0]
-
-
- @expose(template='json')
- @auth
- def patch(self, **kwargs):
- """
- Modify the state (up, in) of the OSD id or reweight it
- """
- args = request.json
-
- commands = []
-
- if 'in' in args:
- if args['in']:
- commands.append({
- 'prefix': 'osd in',
- 'ids': [str(self.osd_id)]
- })
- else:
- commands.append({
- 'prefix': 'osd out',
- 'ids': [str(self.osd_id)]
- })
-
- if 'up' in args:
- if args['up']:
- response.status = 500
- return {'message': "It is not valid to set a down OSD to be up"}
- else:
- commands.append({
- 'prefix': 'osd down',
- 'ids': [str(self.osd_id)]
- })
-
- if 'reweight' in args:
- commands.append({
- 'prefix': 'osd reweight',
- 'id': self.osd_id,
- 'weight': args['reweight']
- })
-
- return context.instance.submit_request([commands], **kwargs)
-
-
-
-class Osd(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for all the OSDs
- """
- # Parse request args
- # TODO Filter by ids
- pool_id = kwargs.get('pool', None)
-
- return context.instance.get_osds(pool_id)
-
-
- @expose()
- def _lookup(self, osd_id, *remainder):
- return OsdId(int(osd_id)), remainder
diff --git a/src/pybind/mgr/restful/api/perf.py b/src/pybind/mgr/restful/api/perf.py
deleted file mode 100644
index c484ac55e44..00000000000
--- a/src/pybind/mgr/restful/api/perf.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from pecan import expose, request, response
-from pecan.rest import RestController
-
-from restful import context
-from restful.decorators import auth, lock, paginate
-
-import re
-
-class Perf(RestController):
- @expose(template='json')
- @paginate
- @auth
- def get(self, **kwargs):
- """
- List all the available performance counters
-
- Options:
- - 'daemon' -- filter by daemon, accepts Python regexp
- """
-
- counters = context.instance.get_unlabeled_perf_counters()
-
- if 'daemon' in kwargs:
- _re = re.compile(kwargs['daemon'])
- counters = {k: v for k, v in counters.items() if _re.match(k)}
-
- return counters
diff --git a/src/pybind/mgr/restful/api/pool.py b/src/pybind/mgr/restful/api/pool.py
deleted file mode 100644
index 40de54eb957..00000000000
--- a/src/pybind/mgr/restful/api/pool.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from pecan import expose, request, response
-from pecan.rest import RestController
-
-from restful import common, context
-from restful.decorators import auth
-
-
-class PoolId(RestController):
- def __init__(self, pool_id):
- self.pool_id = pool_id
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for the pool id
- """
- pool = context.instance.get_pool_by_id(self.pool_id)
-
- if not pool:
- response.status = 500
- return {'message': 'Failed to identify the pool id "{}"'.format(self.pool_id)}
-
- # pgp_num is called pg_placement_num, deal with that
- if 'pg_placement_num' in pool:
- pool['pgp_num'] = pool.pop('pg_placement_num')
- return pool
-
-
- @expose(template='json')
- @auth
- def patch(self, **kwargs):
- """
- Modify the information for the pool id
- """
- try:
- args = request.json
- except ValueError:
- response.status = 400
- return {'message': 'Bad request: malformed JSON or wrong Content-Type'}
-
- # Get the pool info for its name
- pool = context.instance.get_pool_by_id(self.pool_id)
- if not pool:
- response.status = 500
- return {'message': 'Failed to identify the pool id "{}"'.format(self.pool_id)}
-
- # Check for invalid pool args
- invalid = common.invalid_pool_args(args)
- if invalid:
- response.status = 500
- return {'message': 'Invalid arguments found: "{}"'.format(invalid)}
-
- # Schedule the update request
- return context.instance.submit_request(common.pool_update_commands(pool['pool_name'], args), **kwargs)
-
-
- @expose(template='json')
- @auth
- def delete(self, **kwargs):
- """
- Remove the pool data for the pool id
- """
- pool = context.instance.get_pool_by_id(self.pool_id)
-
- if not pool:
- response.status = 500
- return {'message': 'Failed to identify the pool id "{}"'.format(self.pool_id)}
-
- return context.instance.submit_request([[{
- 'prefix': 'osd pool delete',
- 'pool': pool['pool_name'],
- 'pool2': pool['pool_name'],
- 'yes_i_really_really_mean_it': True
- }]], **kwargs)
-
-
-
-class Pool(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for all the pools
- """
- pools = context.instance.get('osd_map')['pools']
-
- # pgp_num is called pg_placement_num, deal with that
- for pool in pools:
- if 'pg_placement_num' in pool:
- pool['pgp_num'] = pool.pop('pg_placement_num')
-
- return pools
-
-
- @expose(template='json')
- @auth
- def post(self, **kwargs):
- """
- Create a new pool
- Requires name and pg_num dict arguments
- """
- args = request.json
-
- # Check for the required arguments
- pool_name = args.pop('name', None)
- if pool_name is None:
- response.status = 500
- return {'message': 'You need to specify the pool "name" argument'}
-
- pg_num = args.pop('pg_num', None)
- if pg_num is None:
- response.status = 500
- return {'message': 'You need to specify the "pg_num" argument'}
-
- # Run the pool create command first
- create_command = {
- 'prefix': 'osd pool create',
- 'pool': pool_name,
- 'pg_num': pg_num
- }
-
- # Check for invalid pool args
- invalid = common.invalid_pool_args(args)
- if invalid:
- response.status = 500
- return {'message': 'Invalid arguments found: "{}"'.format(invalid)}
-
- # Schedule the creation and update requests
- return context.instance.submit_request(
- [[create_command]] +
- common.pool_update_commands(pool_name, args),
- **kwargs
- )
-
-
- @expose()
- def _lookup(self, pool_id, *remainder):
- return PoolId(int(pool_id)), remainder
diff --git a/src/pybind/mgr/restful/api/request.py b/src/pybind/mgr/restful/api/request.py
deleted file mode 100644
index 67143ef508d..00000000000
--- a/src/pybind/mgr/restful/api/request.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from pecan import expose, request, response
-from pecan.rest import RestController
-
-from restful import context
-from restful.decorators import auth, lock, paginate
-
-
-class RequestId(RestController):
- def __init__(self, request_id):
- self.request_id = request_id
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for the request id
- """
- request = [x for x in context.instance.requests
- if x.id == self.request_id]
- if len(request) != 1:
- response.status = 500
- return {'message': 'Unknown request id "{}"'.format(self.request_id)}
- return request[0]
-
-
- @expose(template='json')
- @auth
- @lock
- def delete(self, **kwargs):
- """
- Remove the request id from the database
- """
- for index in range(len(context.instance.requests)):
- if context.instance.requests[index].id == self.request_id:
- return context.instance.requests.pop(index)
-
- # Failed to find the job to cancel
- response.status = 500
- return {'message': 'No such request id'}
-
-
-
-class Request(RestController):
- @expose(template='json')
- @paginate
- @auth
- def get(self, **kwargs):
- """
- List all the available requests
- """
- return context.instance.requests
-
-
- @expose(template='json')
- @auth
- @lock
- def delete(self, **kwargs):
- """
- Remove all the finished requests
- """
- num_requests = len(context.instance.requests)
-
- context.instance.requests = [x for x in context.instance.requests
- if not x.is_finished()]
- remaining = len(context.instance.requests)
- # Return the job statistics
- return {
- 'cleaned': num_requests - remaining,
- 'remaining': remaining,
- }
-
-
- @expose(template='json')
- @auth
- def post(self, **kwargs):
- """
- Pass through method to create any request
- """
- if isinstance(request.json, list):
- if all(isinstance(element, list) for element in request.json):
- return context.instance.submit_request(request.json, **kwargs)
-
- # The request.json has wrong format
- response.status = 500
- return {'message': 'The request format should be [[{c1},{c2}]]'}
-
- return context.instance.submit_request([[request.json]], **kwargs)
-
-
- @expose()
- def _lookup(self, request_id, *remainder):
- return RequestId(request_id), remainder
diff --git a/src/pybind/mgr/restful/api/server.py b/src/pybind/mgr/restful/api/server.py
deleted file mode 100644
index 8ce63493754..00000000000
--- a/src/pybind/mgr/restful/api/server.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from pecan import expose
-from pecan.rest import RestController
-
-from restful import context
-from restful.decorators import auth
-
-
-class ServerFqdn(RestController):
- def __init__(self, fqdn):
- self.fqdn = fqdn
-
-
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for the server fqdn
- """
- return context.instance.get_server(self.fqdn)
-
-
-
-class Server(RestController):
- @expose(template='json')
- @auth
- def get(self, **kwargs):
- """
- Show the information for all the servers
- """
- return context.instance.list_servers()
-
-
- @expose()
- def _lookup(self, fqdn, *remainder):
- return ServerFqdn(fqdn), remainder
diff --git a/src/pybind/mgr/restful/common.py b/src/pybind/mgr/restful/common.py
deleted file mode 100644
index 1b957d6b5ec..00000000000
--- a/src/pybind/mgr/restful/common.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# List of valid osd flags
-OSD_FLAGS = [
- 'pause', 'noup', 'nodown', 'noout', 'noin', 'nobackfill',
- 'norecover', 'noscrub', 'nodeep-scrub',
-]
-
-# Implemented osd commands
-OSD_IMPLEMENTED_COMMANDS = [
- 'scrub', 'deep-scrub', 'repair'
-]
-
-# Valid values for the 'var' argument to 'ceph osd pool set'
-POOL_PROPERTIES_1 = [
- 'size', 'min_size', 'pg_num',
- 'crush_rule', 'hashpspool',
-]
-
-POOL_PROPERTIES_2 = [
- 'pgp_num'
-]
-
-POOL_PROPERTIES = POOL_PROPERTIES_1 + POOL_PROPERTIES_2
-
-# Valid values for the 'ceph osd pool set-quota' command
-POOL_QUOTA_PROPERTIES = [
- ('quota_max_bytes', 'max_bytes'),
- ('quota_max_objects', 'max_objects'),
-]
-
-POOL_ARGS = POOL_PROPERTIES + [x for x,_ in POOL_QUOTA_PROPERTIES]
-
-
-# Transform command to a human readable form
-def humanify_command(command):
- out = [command['prefix']]
-
- for arg, val in command.items():
- if arg != 'prefix':
- out.append("%s=%s" % (str(arg), str(val)))
-
- return " ".join(out)
-
-
-def invalid_pool_args(args):
- invalid = []
- for arg in args:
- if arg not in POOL_ARGS:
- invalid.append(arg)
-
- return invalid
-
-
-def pool_update_commands(pool_name, args):
- commands = [[], []]
-
- # We should increase pgp_num when we are re-setting pg_num
- if 'pg_num' in args and 'pgp_num' not in args:
- args['pgp_num'] = args['pg_num']
-
- # Run the first pool set and quota properties in parallel
- for var in POOL_PROPERTIES_1:
- if var in args:
- commands[0].append({
- 'prefix': 'osd pool set',
- 'pool': pool_name,
- 'var': var,
- 'val': args[var],
- })
-
- for (var, field) in POOL_QUOTA_PROPERTIES:
- if var in args:
- commands[0].append({
- 'prefix': 'osd pool set-quota',
- 'pool': pool_name,
- 'field': field,
- 'val': str(args[var]),
- })
-
- # The second pool set properties need to be run after the first wave
- for var in POOL_PROPERTIES_2:
- if var in args:
- commands[1].append({
- 'prefix': 'osd pool set',
- 'pool': pool_name,
- 'var': var,
- 'val': args[var],
- })
-
- return commands
-
-def crush_rule_osds(node_buckets, rule):
- nodes_by_id = dict((b['id'], b) for b in node_buckets)
-
- def _gather_leaf_ids(node_id):
- if node_id >= 0:
- return set([node_id])
-
- result = set()
- for item in nodes_by_id[node_id]['items']:
- result |= _gather_leaf_ids(item['id'])
-
- return result
-
- def _gather_descendent_ids(node, typ):
- result = set()
- for item in node['items']:
- if item['id'] >= 0:
- if typ == "osd":
- result.add(item['id'])
- else:
- child_node = nodes_by_id[item['id']]
- if child_node['type_name'] == typ:
- result.add(child_node['id'])
- elif 'items' in child_node:
- result |= _gather_descendent_ids(child_node, typ)
-
- return result
-
- def _gather_osds(root, steps):
- if root['id'] >= 0:
- return set([root['id']])
-
- osds = set()
- step = steps[0]
- if step['op'] == 'choose_firstn':
- # Choose all descendents of the current node of type 'type'
- descendent_ids = _gather_descendent_ids(root, step['type'])
- for node_id in descendent_ids:
- if node_id >= 0:
- osds.add(node_id)
- else:
- osds |= _gather_osds(nodes_by_id[node_id], steps[1:])
- elif step['op'] == 'chooseleaf_firstn':
- # Choose all descendents of the current node of type 'type',
- # and select all leaves beneath those
- descendent_ids = _gather_descendent_ids(root, step['type'])
- for node_id in descendent_ids:
- if node_id >= 0:
- osds.add(node_id)
- else:
- for desc_node in nodes_by_id[node_id]['items']:
- # Short circuit another iteration to find the emit
- # and assume anything we've done a chooseleaf on
- # is going to be part of the selected set of osds
- osds |= _gather_leaf_ids(desc_node['id'])
- elif step['op'] == 'emit':
- if root['id'] >= 0:
- osds |= root['id']
-
- return osds
-
- osds = set()
- for i, step in enumerate(rule['steps']):
- if step['op'] == 'take':
- osds |= _gather_osds(nodes_by_id[step['item']], rule['steps'][i + 1:])
- return osds
diff --git a/src/pybind/mgr/restful/context.py b/src/pybind/mgr/restful/context.py
deleted file mode 100644
index a05ea8548df..00000000000
--- a/src/pybind/mgr/restful/context.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# Global instance to share
-instance = None
diff --git a/src/pybind/mgr/restful/decorators.py b/src/pybind/mgr/restful/decorators.py
deleted file mode 100644
index 11840a9913a..00000000000
--- a/src/pybind/mgr/restful/decorators.py
+++ /dev/null
@@ -1,81 +0,0 @@
-
-from pecan import request, response
-from base64 import b64decode
-from functools import wraps
-
-import traceback
-
-from . import context
-
-
-# Handle authorization
-def auth(f):
- @wraps(f)
- def decorated(*args, **kwargs):
- if not context.instance.enable_auth:
- return f(*args, **kwargs)
-
- if not request.authorization:
- response.status = 401
- response.headers['WWW-Authenticate'] = 'Basic realm="Login Required"'
- return {'message': 'auth: No HTTP username/password'}
-
- username, password = b64decode(request.authorization[1]).decode('utf-8').split(':')
-
- # Check that the username exists
- if username not in context.instance.keys:
- response.status = 401
- response.headers['WWW-Authenticate'] = 'Basic realm="Login Required"'
- return {'message': 'auth: No such user'}
-
- # Check the password
- if context.instance.keys[username] != password:
- response.status = 401
- response.headers['WWW-Authenticate'] = 'Basic realm="Login Required"'
- return {'message': 'auth: Incorrect password'}
-
- return f(*args, **kwargs)
- return decorated
-
-
-# Helper function to lock the function
-def lock(f):
- @wraps(f)
- def decorated(*args, **kwargs):
- with context.instance.requests_lock:
- return f(*args, **kwargs)
- return decorated
-
-
-# Support ?page=N argument
-def paginate(f):
- @wraps(f)
- def decorated(*args, **kwargs):
- _out = f(*args, **kwargs)
-
- # Do not modify anything without a specific request
- if not 'page' in kwargs:
- return _out
-
- # A pass-through for errors, etc
- if not isinstance(_out, list):
- return _out
-
- # Parse the page argument
- _page = kwargs['page']
- try:
- _page = int(_page)
- except ValueError:
- response.status = 500
- return {'message': 'The requested page is not an integer'}
-
- # Raise _page so that 0 is the first page and -1 is the last
- _page += 1
-
- if _page > 0:
- _page *= 100
- else:
- _page = len(_out) - (_page*100)
-
- return _out[_page - 100: _page]
- return decorated
diff --git a/src/pybind/mgr/restful/hooks.py b/src/pybind/mgr/restful/hooks.py
deleted file mode 100644
index c57cbcd404d..00000000000
--- a/src/pybind/mgr/restful/hooks.py
+++ /dev/null
@@ -1,10 +0,0 @@
-
-from pecan.hooks import PecanHook
-
-import traceback
-
-from . import context
-
-class ErrorHook(PecanHook):
- def on_error(self, stat, exc):
- context.instance.log.error(str(traceback.format_exc()))
diff --git a/src/pybind/mgr/restful/module.py b/src/pybind/mgr/restful/module.py
deleted file mode 100644
index ad76473afd0..00000000000
--- a/src/pybind/mgr/restful/module.py
+++ /dev/null
@@ -1,633 +0,0 @@
-"""
-A RESTful API for Ceph
-"""
-
-import os
-import json
-import time
-import errno
-import inspect
-import tempfile
-import threading
-import traceback
-import socket
-import fcntl
-from typing import cast
-
-from . import common
-from . import context
-
-from uuid import uuid4
-from pecan import jsonify, make_app
-from OpenSSL import crypto
-from pecan.rest import RestController
-from werkzeug.serving import make_server, make_ssl_devcert
-
-from .hooks import ErrorHook
-from mgr_module import MgrModule, CommandResult, NotifyType, Option
-from mgr_util import build_url
-
-
-class CannotServe(Exception):
- pass
-
-
-class CommandsRequest(object):
- """
- This class handles parallel as well as sequential execution of
- commands. The class accept a list of iterables that should be
- executed sequentially. Each iterable can contain several commands
- that can be executed in parallel.
-
- Example:
- [[c1,c2],[c3,c4]]
- - run c1 and c2 in parallel
- - wait for them to finish
- - run c3 and c4 in parallel
- - wait for them to finish
- """
-
-
- def __init__(self, commands_arrays):
- self.id = str(id(self))
-
- # Filter out empty sub-requests
- commands_arrays = [x for x in commands_arrays
- if len(x) != 0]
-
- self.running = []
- self.waiting = commands_arrays[1:]
- self.finished = []
- self.failed = []
-
- self.lock = threading.RLock()
- if not len(commands_arrays):
- # Nothing to run
- return
-
- # Process first iteration of commands_arrays in parallel
- results = self.run(commands_arrays[0])
-
- self.running.extend(results)
-
-
- def run(self, commands):
- """
- A static method that will execute the given list of commands in
- parallel and will return the list of command results.
- """
-
- # Gather the results (in parallel)
- results = []
- for index, command in enumerate(commands):
- tag = '%s:%s:%d' % (__name__, self.id, index)
-
- # Store the result
- result = CommandResult(tag)
- result.command = common.humanify_command(command)
- results.append(result)
-
- # Run the command
- context.instance.send_command(result, 'mon', '', json.dumps(command), tag)
-
- return results
-
-
- def next(self):
- with self.lock:
- if not self.waiting:
- # Nothing to run
- return
-
- # Run a next iteration of commands
- commands = self.waiting[0]
- self.waiting = self.waiting[1:]
-
- self.running.extend(self.run(commands))
-
-
- def finish(self, tag):
- with self.lock:
- for index in range(len(self.running)):
- if self.running[index].tag == tag:
- if self.running[index].r == 0:
- self.finished.append(self.running.pop(index))
- else:
- self.failed.append(self.running.pop(index))
- return True
-
- # No such tag found
- return False
-
-
- def is_running(self, tag):
- for result in self.running:
- if result.tag == tag:
- return True
- return False
-
-
- def is_ready(self):
- with self.lock:
- return not self.running and self.waiting
-
-
- def is_waiting(self):
- return bool(self.waiting)
-
-
- def is_finished(self):
- with self.lock:
- return not self.running and not self.waiting
-
-
- def has_failed(self):
- return bool(self.failed)
-
-
- def get_state(self):
- with self.lock:
- if not self.is_finished():
- return "pending"
-
- if self.has_failed():
- return "failed"
-
- return "success"
-
-
- def __json__(self):
- return {
- 'id': self.id,
- 'running': [
- {
- 'command': x.command,
- 'outs': x.outs,
- 'outb': x.outb,
- } for x in self.running
- ],
- 'finished': [
- {
- 'command': x.command,
- 'outs': x.outs,
- 'outb': x.outb,
- } for x in self.finished
- ],
- 'waiting': [
- [common.humanify_command(y) for y in x]
- for x in self.waiting
- ],
- 'failed': [
- {
- 'command': x.command,
- 'outs': x.outs,
- 'outb': x.outb,
- } for x in self.failed
- ],
- 'is_waiting': self.is_waiting(),
- 'is_finished': self.is_finished(),
- 'has_failed': self.has_failed(),
- 'state': self.get_state(),
- }
-
-
-
-class Module(MgrModule):
- MODULE_OPTIONS = [
- Option(name='server_addr'),
- Option(name='server_port'),
- Option(name='key_file'),
- Option(name='enable_auth',
- type='bool',
- default=True),
- Option(name='max_requests',
- type='int',
- default=500,
- desc='Maximum number of requests to keep in memory. '
- ' When new request comes in, the oldest request will be removed if the number of requests exceeds the max request number.'
- 'if un-finished request is removed, error message will be logged in the ceph-mgr log.'),
- ]
-
- COMMANDS = [
- {
- "cmd": "restful create-key name=key_name,type=CephString",
- "desc": "Create an API key with this name",
- "perm": "rw"
- },
- {
- "cmd": "restful delete-key name=key_name,type=CephString",
- "desc": "Delete an API key with this name",
- "perm": "rw"
- },
- {
- "cmd": "restful list-keys",
- "desc": "List all API keys",
- "perm": "r"
- },
- {
- "cmd": "restful create-self-signed-cert",
- "desc": "Create localized self signed certificate",
- "perm": "rw"
- },
- {
- "cmd": "restful restart",
- "desc": "Restart API server",
- "perm": "rw"
- },
- ]
-
- NOTIFY_TYPES = [NotifyType.command]
-
- def __init__(self, *args, **kwargs):
- super(Module, self).__init__(*args, **kwargs)
- context.instance = self
-
- self.requests = []
- self.requests_lock = threading.RLock()
-
- self.keys = {}
- self.enable_auth = True
-
- self.server = None
-
- self.stop_server = False
- self.serve_event = threading.Event()
- self.max_requests = cast(int, self.get_localized_module_option('max_requests', 500))
-
-
- def serve(self):
- self.log.debug('serve enter')
- while not self.stop_server:
- try:
- self._serve()
- self.server.socket.close()
- except CannotServe as cs:
- self.log.warning("server not running: %s", cs)
- except:
- self.log.error(str(traceback.format_exc()))
-
- # Wait and clear the threading event
- self.serve_event.wait()
- self.serve_event.clear()
- self.log.debug('serve exit')
-
- def refresh_keys(self):
- self.keys = {}
- rawkeys = self.get_store_prefix('keys/') or {}
- for k, v in rawkeys.items():
- self.keys[k[5:]] = v # strip of keys/ prefix
-
- def _serve(self):
- # Load stored authentication keys
- self.refresh_keys()
-
- jsonify._instance = jsonify.GenericJSON(
- sort_keys=True,
- indent=4,
- separators=(',', ': '),
- )
-
- server_addr = self.get_localized_module_option('server_addr', '::')
- if server_addr is None:
- raise CannotServe('no server_addr configured; try "ceph config-key set mgr/restful/server_addr <ip>"')
-
- server_port = int(self.get_localized_module_option('server_port', '8003'))
- self.log.info('server_addr: %s server_port: %d',
- server_addr, server_port)
-
- cert = self.get_localized_store("crt")
- if cert is not None:
- cert_tmp = tempfile.NamedTemporaryFile()
- cert_tmp.write(cert.encode('utf-8'))
- cert_tmp.flush()
- cert_fname = cert_tmp.name
- else:
- cert_fname = self.get_localized_store('crt_file')
-
- pkey = self.get_localized_store("key")
- if pkey is not None:
- pkey_tmp = tempfile.NamedTemporaryFile()
- pkey_tmp.write(pkey.encode('utf-8'))
- pkey_tmp.flush()
- pkey_fname = pkey_tmp.name
- else:
- pkey_fname = self.get_localized_module_option('key_file')
-
- self.enable_auth = self.get_localized_module_option('enable_auth', True)
-
- if not cert_fname or not pkey_fname:
- raise CannotServe('no certificate configured')
- if not os.path.isfile(cert_fname):
- raise CannotServe('certificate %s does not exist' % cert_fname)
- if not os.path.isfile(pkey_fname):
- raise CannotServe('private key %s does not exist' % pkey_fname)
-
- # Publish the URI that others may use to access the service we're
- # about to start serving
- addr = self.get_mgr_ip() if server_addr == "::" else server_addr
- self.set_uri(build_url(scheme='https', host=addr, port=server_port, path='/'))
-
- # Create the HTTPS werkzeug server serving pecan app
- self.server = make_server(
- host=server_addr,
- port=server_port,
- app=make_app(
- root='restful.api.Root',
- hooks = [ErrorHook()], # use a callable if pecan >= 0.3.2
- ),
- ssl_context=(cert_fname, pkey_fname),
- )
- sock_fd_flag = fcntl.fcntl(self.server.socket.fileno(), fcntl.F_GETFD)
- if not (sock_fd_flag & fcntl.FD_CLOEXEC):
- self.log.debug("set server socket close-on-exec")
- fcntl.fcntl(self.server.socket.fileno(), fcntl.F_SETFD, sock_fd_flag | fcntl.FD_CLOEXEC)
- if self.stop_server:
- self.log.debug('made server, but stop flag set')
- else:
- self.log.debug('made server, serving forever')
- self.server.serve_forever()
-
-
- def shutdown(self):
- self.log.debug('shutdown enter')
- try:
- self.stop_server = True
- if self.server:
- self.log.debug('calling server.shutdown')
- self.server.shutdown()
- self.log.debug('called server.shutdown')
- self.serve_event.set()
- except:
- self.log.error(str(traceback.format_exc()))
- raise
- self.log.debug('shutdown exit')
-
-
- def restart(self):
- try:
- if self.server:
- self.server.shutdown()
- self.serve_event.set()
- except:
- self.log.error(str(traceback.format_exc()))
-
-
- def notify(self, notify_type: NotifyType, tag: str):
- try:
- self._notify(notify_type, tag)
- except:
- self.log.error(str(traceback.format_exc()))
-
-
- def _notify(self, notify_type: NotifyType, tag):
- if notify_type != NotifyType.command:
- self.log.debug("Unhandled notification type '%s'", notify_type)
- return
- # we can safely skip all the sequential commands
- if tag == 'seq':
- return
- try:
- with self.requests_lock:
- request = next(x for x in self.requests if x.is_running(tag))
- request.finish(tag)
- if request.is_ready():
- request.next()
- except StopIteration:
- # the command was not issued by me
- pass
-
- def config_notify(self):
- self.enable_auth = self.get_localized_module_option('enable_auth', True)
-
-
- def create_self_signed_cert(self):
- # create a key pair
- pkey = crypto.PKey()
- pkey.generate_key(crypto.TYPE_RSA, 2048)
-
- # create a self-signed cert
- cert = crypto.X509()
- cert.get_subject().O = "IT"
- cert.get_subject().CN = "ceph-restful"
- cert.set_serial_number(int(uuid4()))
- cert.gmtime_adj_notBefore(0)
- cert.gmtime_adj_notAfter(10*365*24*60*60)
- cert.set_issuer(cert.get_subject())
- cert.set_pubkey(pkey)
- cert.sign(pkey, 'sha512')
-
- return (
- crypto.dump_certificate(crypto.FILETYPE_PEM, cert),
- crypto.dump_privatekey(crypto.FILETYPE_PEM, pkey)
- )
-
-
- def handle_command(self, inbuf, command):
- self.log.warning("Handling command: '%s'" % str(command))
- if command['prefix'] == "restful create-key":
- if command['key_name'] in self.keys:
- return 0, self.keys[command['key_name']], ""
-
- else:
- key = str(uuid4())
- self.keys[command['key_name']] = key
- self.set_store('keys/' + command['key_name'], key)
-
- return (
- 0,
- self.keys[command['key_name']],
- "",
- )
-
- elif command['prefix'] == "restful delete-key":
- if command['key_name'] in self.keys:
- del self.keys[command['key_name']]
- self.set_store('keys/' + command['key_name'], None)
-
- return (
- 0,
- "",
- "",
- )
-
- elif command['prefix'] == "restful list-keys":
- self.refresh_keys()
- return (
- 0,
- json.dumps(self.keys, indent=4, sort_keys=True),
- "",
- )
-
- elif command['prefix'] == "restful create-self-signed-cert":
- cert, pkey = self.create_self_signed_cert()
- self.set_store(self.get_mgr_id() + '/crt', cert.decode('utf-8'))
- self.set_store(self.get_mgr_id() + '/key', pkey.decode('utf-8'))
-
- self.restart()
- return (
- 0,
- "Restarting RESTful API server...",
- ""
- )
-
- elif command['prefix'] == 'restful restart':
- self.restart();
- return (
- 0,
- "Restarting RESTful API server...",
- ""
- )
-
- else:
- return (
- -errno.EINVAL,
- "",
- "Command not found '{0}'".format(command['prefix'])
- )
-
-
- def get_doc_api(self, root, prefix=''):
- doc = {}
- for _obj in dir(root):
- obj = getattr(root, _obj)
-
- if isinstance(obj, RestController):
- doc.update(self.get_doc_api(obj, prefix + '/' + _obj))
-
- if getattr(root, '_lookup', None) and isinstance(root._lookup('0')[0], RestController):
- doc.update(self.get_doc_api(root._lookup('0')[0], prefix + '/<arg>'))
-
- prefix = prefix or '/'
-
- doc[prefix] = {}
- for method in 'get', 'post', 'patch', 'delete':
- if getattr(root, method, None):
- doc[prefix][method.upper()] = inspect.getdoc(getattr(root, method)).split('\n')
-
- if len(doc[prefix]) == 0:
- del doc[prefix]
-
- return doc
-
-
- def get_mons(self):
- mon_map_mons = self.get('mon_map')['mons']
- mon_status = json.loads(self.get('mon_status')['json'])
-
- # Add more information
- for mon in mon_map_mons:
- mon['in_quorum'] = mon['rank'] in mon_status['quorum']
- mon['server'] = self.get_metadata("mon", mon['name'])['hostname']
- mon['leader'] = mon['rank'] == mon_status['quorum'][0]
-
- return mon_map_mons
-
-
- def get_osd_pools(self):
- osds = dict(map(lambda x: (x['osd'], []), self.get('osd_map')['osds']))
- pools = dict(map(lambda x: (x['pool'], x), self.get('osd_map')['pools']))
- crush = self.get('osd_map_crush')
- crush_rules = crush['rules']
-
- osds_by_pool = {}
- for pool_id, pool in pools.items():
- pool_osds = None
- for rule in [r for r in crush_rules if r['rule_id'] == pool['crush_rule']]:
- pool_osds = common.crush_rule_osds(crush['buckets'], rule)
-
- osds_by_pool[pool_id] = pool_osds
-
- for pool_id in pools.keys():
- for in_pool_id in osds_by_pool[pool_id]:
- osds[in_pool_id].append(pool_id)
-
- return osds
-
-
- def get_osds(self, pool_id=None, ids=None):
- # Get data
- osd_map = self.get('osd_map')
- osd_metadata = self.get('osd_metadata')
-
- # Update the data with the additional info from the osd map
- osds = osd_map['osds']
-
- # Filter by osd ids
- if ids is not None:
- osds = [x for x in osds if str(x['osd']) in ids]
-
- # Get list of pools per osd node
- pools_map = self.get_osd_pools()
-
- # map osd IDs to reweight
- reweight_map = dict([
- (x.get('id'), x.get('reweight', None))
- for x in self.get('osd_map_tree')['nodes']
- ])
-
- # Build OSD data objects
- for osd in osds:
- osd['pools'] = pools_map[osd['osd']]
- osd['server'] = osd_metadata.get(str(osd['osd']), {}).get('hostname', None)
-
- osd['reweight'] = reweight_map.get(osd['osd'], 0.0)
-
- if osd['up']:
- osd['valid_commands'] = common.OSD_IMPLEMENTED_COMMANDS
- else:
- osd['valid_commands'] = []
-
- # Filter by pool
- if pool_id:
- pool_id = int(pool_id)
- osds = [x for x in osds if pool_id in x['pools']]
-
- return osds
-
-
- def get_osd_by_id(self, osd_id):
- osd = [x for x in self.get('osd_map')['osds']
- if x['osd'] == osd_id]
-
- if len(osd) != 1:
- return None
-
- return osd[0]
-
-
- def get_pool_by_id(self, pool_id):
- pool = [x for x in self.get('osd_map')['pools']
- if x['pool'] == pool_id]
-
- if len(pool) != 1:
- return None
-
- return pool[0]
-
-
- def submit_request(self, _request, **kwargs):
- with self.requests_lock:
- request = CommandsRequest(_request)
- self.requests.append(request)
- if len(self.requests) > self.max_requests:
- req_to_trim = 0
- for i, req in enumerate(self.requests):
- if req.is_finished():
- self.log.error("Trimmed one finished request due to exceeded maximum requests limit")
- req_to_trim = i
- break
- else:
- self.log.error("Trimmed the oldest unfinished request due to exceeded maximum requests limit")
- self.requests.pop(req_to_trim)
- if kwargs.get('wait', 0):
- while not request.is_finished():
- time.sleep(0.001)
- return request
-
-
- def run_command(self, command):
- # tag with 'seq' so that we can ignore these in notify function
- result = CommandResult('seq')
-
- self.send_command(result, 'mon', '', json.dumps(command), 'seq')
- return result.wait()
diff --git a/src/pybind/mgr/smb/handler.py b/src/pybind/mgr/smb/handler.py
index 7b993d5b60d..5adf319b2f5 100644
--- a/src/pybind/mgr/smb/handler.py
+++ b/src/pybind/mgr/smb/handler.py
@@ -29,6 +29,7 @@ from .enums import (
JoinSourceType,
LoginAccess,
LoginCategory,
+ SMBClustering,
State,
UserGroupSourceType,
)
@@ -788,24 +789,33 @@ def order_resources(
def _check_cluster(cluster: ClusterRef, staging: _Staging) -> None:
"""Check that the cluster resource can be updated."""
- if cluster.intent == Intent.REMOVED:
- share_ids = ShareEntry.ids(staging)
- clusters_used = {cid for cid, _ in share_ids}
- if cluster.cluster_id in clusters_used:
- raise ErrorResult(
- cluster,
- msg="cluster in use by shares",
- status={
- 'shares': [
- shid
- for cid, shid in share_ids
- if cid == cluster.cluster_id
- ]
- },
- )
- return
+ if cluster.intent == Intent.PRESENT:
+ return _check_cluster_present(cluster, staging)
+ return _check_cluster_removed(cluster, staging)
+
+
+def _check_cluster_removed(cluster: ClusterRef, staging: _Staging) -> None:
+ share_ids = ShareEntry.ids(staging)
+ clusters_used = {cid for cid, _ in share_ids}
+ if cluster.cluster_id in clusters_used:
+ raise ErrorResult(
+ cluster,
+ msg="cluster in use by shares",
+ status={
+ 'shares': [
+ shid
+ for cid, shid in share_ids
+ if cid == cluster.cluster_id
+ ]
+ },
+ )
+
+
+def _check_cluster_present(cluster: ClusterRef, staging: _Staging) -> None:
assert isinstance(cluster, resources.Cluster)
cluster.validate()
+ if not staging.is_new(cluster):
+ _check_cluster_modifications(cluster, staging)
for auth_ref in _auth_refs(cluster):
auth = staging.get_join_auth(auth_ref)
if (
@@ -834,6 +844,53 @@ def _check_cluster(cluster: ClusterRef, staging: _Staging) -> None:
)
+def _check_cluster_modifications(
+ cluster: resources.Cluster, staging: _Staging
+) -> None:
+    """A cluster has some fields that we do not permit changing after the
+    cluster has been created.
+ """
+ prev = ClusterEntry.from_store(
+ staging.destination_store, cluster.cluster_id
+ ).get_cluster()
+ if cluster.auth_mode != prev.auth_mode:
+ raise ErrorResult(
+ cluster,
+ 'auth_mode value may not be changed',
+ status={'existing_auth_mode': prev.auth_mode},
+ )
+ if cluster.auth_mode == AuthMode.ACTIVE_DIRECTORY:
+ assert prev.domain_settings
+ if not cluster.domain_settings:
+ # should not occur
+ raise ErrorResult(cluster, "domain settings missing from cluster")
+ if cluster.domain_settings.realm != prev.domain_settings.realm:
+ raise ErrorResult(
+ cluster,
+ 'domain/realm value may not be changed',
+ status={'existing_domain_realm': prev.domain_settings.realm},
+ )
+ if cluster.is_clustered() != prev.is_clustered():
+ prev_clustering = prev.is_clustered()
+ cterms = {True: 'enabled', False: 'disabled'}
+ msg = (
+ f'a cluster resource with clustering {cterms[prev_clustering]}'
+ f' may not be changed to clustering {cterms[not prev_clustering]}'
+ )
+ opt_terms = {
+ True: SMBClustering.ALWAYS.value,
+ False: SMBClustering.NEVER.value,
+ }
+ hint = {
+ 'note': (
+ 'Set "clustering" to an explicit value that matches the'
+ ' current clustering behavior'
+ ),
+ 'value': opt_terms[prev_clustering],
+ }
+ raise ErrorResult(cluster, msg, status={'hint': hint})
+
+
def _parse_earmark(earmark: str) -> dict:
parts = earmark.split('.')
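Note: the relocated checks above pin down the cluster fields that may not change after creation (auth_mode, the AD domain/realm, and the clustering mode). A minimal standalone sketch of the same compare-then-reject pattern, using hypothetical simplified types rather than the real resources.Cluster and ErrorResult classes:

    from dataclasses import dataclass

    class ValidationError(Exception):
        """Stand-in for ErrorResult in this sketch."""

    @dataclass
    class ClusterSketch:
        # hypothetical, simplified stand-in for resources.Cluster
        auth_mode: str
        realm: str
        clustered: bool

    def check_modifications(new: ClusterSketch, prev: ClusterSketch) -> None:
        # fields that may not change once the cluster exists
        if new.auth_mode != prev.auth_mode:
            raise ValidationError('auth_mode value may not be changed')
        if new.realm != prev.realm:
            raise ValidationError('domain/realm value may not be changed')
        if new.clustered != prev.clustered:
            raise ValidationError('clustering may not be toggled after creation')

With prev loaded from the store, check_modifications(new, prev) raises as soon as any of the pinned fields differs, which is the behaviour _check_cluster_modifications implements with richer error payloads.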
diff --git a/src/pybind/mgr/smb/internal.py b/src/pybind/mgr/smb/internal.py
index 3571ed44400..57e7a0c0278 100644
--- a/src/pybind/mgr/smb/internal.py
+++ b/src/pybind/mgr/smb/internal.py
@@ -4,7 +4,7 @@ resources that the internal store holds.
from typing import Collection, Tuple, Type, TypeVar
from . import resources
-from .enums import AuthMode, ConfigNS, State
+from .enums import ConfigNS, State
from .proto import (
ConfigEntry,
ConfigStore,
@@ -14,7 +14,6 @@ from .proto import (
Simplifiable,
)
from .resources import SMBResource
-from .results import ErrorResult
from .utils import one
T = TypeVar('T')
@@ -108,43 +107,6 @@ class ClusterEntry(ResourceEntry):
def get_cluster(self) -> resources.Cluster:
return self.get_resource_type(resources.Cluster)
- def create_or_update(self, resource: Simplifiable) -> State:
- assert isinstance(resource, resources.Cluster)
- try:
- previous = self.config_entry.get()
- except KeyError:
- previous = None
- current = resource.to_simplified()
- if current == previous:
- return State.PRESENT
- elif previous is None:
- self.config_entry.set(current)
- return State.CREATED
- # cluster is special in that is has some fields that we do not
- # permit changing.
- prev = getattr(
- resources.Cluster, '_resource_config'
- ).object_from_simplified(previous)
- if resource.auth_mode != prev.auth_mode:
- raise ErrorResult(
- resource,
- 'auth_mode value may not be changed',
- status={'existing_auth_mode': prev.auth_mode},
- )
- if resource.auth_mode == AuthMode.ACTIVE_DIRECTORY:
- assert resource.domain_settings
- assert prev.domain_settings
- if resource.domain_settings.realm != prev.domain_settings.realm:
- raise ErrorResult(
- resource,
- 'domain/realm value may not be changed',
- status={
- 'existing_domain_realm': prev.domain_settings.realm
- },
- )
- self.config_entry.set(current)
- return State.UPDATED
-
class ShareEntry(ResourceEntry):
"""Share resource getter/setter for the smb internal data store(s)."""
diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini
index f39ececa93d..5afbe93ace0 100644
--- a/src/pybind/mgr/tox.ini
+++ b/src/pybind/mgr/tox.ini
@@ -114,8 +114,7 @@ commands =
-m telegraf \
-m telemetry \
-m test_orchestrator \
- -m volumes \
- -m zabbix
+ -m volumes
[testenv:test]
@@ -161,7 +160,7 @@ commands =
flake8 --config=tox.ini {posargs} \
{posargs:{[testenv:flake8]modules}}
bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3'
- bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26'
+ bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 8'
[testenv:jinjalint]
deps =
diff --git a/src/pybind/mgr/volumes/fs/async_cloner.py b/src/pybind/mgr/volumes/fs/async_cloner.py
index 463c1000596..1525f57c3f8 100644
--- a/src/pybind/mgr/volumes/fs/async_cloner.py
+++ b/src/pybind/mgr/volumes/fs/async_cloner.py
@@ -313,6 +313,8 @@ class Cloner(AsyncJobs):
the driver. file types supported are directories, symbolic links and regular files.
"""
def __init__(self, volume_client, tp_size, snapshot_clone_delay, clone_no_wait):
+ super(Cloner, self).__init__(volume_client, "cloner", tp_size)
+
self.vc = volume_client
self.snapshot_clone_delay = snapshot_clone_delay
self.snapshot_clone_no_wait = clone_no_wait
@@ -323,7 +325,6 @@ class Cloner(AsyncJobs):
SubvolumeStates.STATE_FAILED : handle_clone_failed,
SubvolumeStates.STATE_CANCELED : handle_clone_failed,
}
- super(Cloner, self).__init__(volume_client, "cloner", tp_size)
def reconfigure_max_concurrent_clones(self, tp_size):
return super(Cloner, self).reconfigure_max_async_threads(tp_size)
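Note: the reordering above (and the matching hunks in async_job.py and purge_queue.py below) calls the base-class initializer at the top of each __init__, so the parent is fully constructed before subclass attributes are assigned. A minimal sketch of the resulting pattern, with hypothetical class names:

    import threading

    class BaseJobs(threading.Thread):
        def __init__(self, name):
            super().__init__(name=name)   # parent state established first
            self.threads = []

    class ClonerSketch(BaseJobs):
        # illustrative only, not the real Cloner
        def __init__(self, name, delay):
            super().__init__(name)        # base initializer first, as in the hunks above
            self.delay = delay            # subclass attributes afterwards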
diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py
index 83a119ca556..075fedf20a4 100644
--- a/src/pybind/mgr/volumes/fs/async_job.py
+++ b/src/pybind/mgr/volumes/fs/async_job.py
@@ -19,11 +19,12 @@ class JobThread(threading.Thread):
MAX_RETRIES_ON_EXCEPTION = 10
def __init__(self, async_job, volume_client, name):
+ threading.Thread.__init__(self, name=name)
+
self.vc = volume_client
self.async_job = async_job
# event object to cancel jobs
self.cancel_event = threading.Event()
- threading.Thread.__init__(self, name=name)
def run(self):
retries = 0
@@ -117,16 +118,21 @@ class AsyncJobs(threading.Thread):
def __init__(self, volume_client, name_pfx, nr_concurrent_jobs):
threading.Thread.__init__(self, name="{0}.tick".format(name_pfx))
+
self.vc = volume_client
- # queue of volumes for starting async jobs
+        # self.q is a deque of volume names for which async jobs need to be
+        # started.
self.q = deque() # type: deque
- # volume => job tracking
+
+        # self.jobs is a dictionary keyed by volume name; each value is a
+        # tuple of two members: the async job and the threading.Thread
+        # instance that performs that job.
+        # In short, self.jobs = {volname: (async_job, thread instance)}.
self.jobs = {}
+
# lock, cv for kickstarting jobs
self.lock = threading.Lock()
self.cv = threading.Condition(self.lock)
- # cv for job cancelation
- self.waiting = False
self.stopping = threading.Event()
self.cancel_cv = threading.Condition(self.lock)
self.nr_concurrent_jobs = nr_concurrent_jobs
@@ -136,11 +142,31 @@ class AsyncJobs(threading.Thread):
self.wakeup_timeout = None
self.threads = []
- for i in range(self.nr_concurrent_jobs):
- self.threads.append(JobThread(self, volume_client, name="{0}.{1}".format(self.name_pfx, i)))
- self.threads[-1].start()
+ self.spawn_all_threads()
self.start()
+ def spawn_new_thread(self, suffix):
+ t_name = f'{self.name_pfx}.{time.time()}.{suffix}'
+ log.debug(f'spawning new thread with name {t_name}')
+ t = JobThread(self, self.vc, name=t_name)
+ t.start()
+
+ self.threads.append(t)
+
+ def spawn_all_threads(self):
+        log.debug(f'spawning {self.nr_concurrent_jobs} threads to execute jobs '
+                  'concurrently')
+ for i in range(self.nr_concurrent_jobs):
+ self.spawn_new_thread(i)
+
+ def spawn_more_threads(self):
+ c = len(self.threads)
+ diff = self.nr_concurrent_jobs - c
+ log.debug(f'spawning {diff} threads to execute more jobs concurrently')
+
+ for i in range(c, self.nr_concurrent_jobs):
+ self.spawn_new_thread(i)
+
def set_wakeup_timeout(self):
with self.lock:
# not made configurable on purpose
@@ -163,10 +189,7 @@ class AsyncJobs(threading.Thread):
self.cv.notifyAll()
elif c < self.nr_concurrent_jobs:
# Increase concurrency: create more threads.
- log.debug("creating new threads to job increase")
- for i in range(c, self.nr_concurrent_jobs):
- self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(self.name_pfx, time.time(), i)))
- self.threads[-1].start()
+ self.spawn_more_threads()
self.cv.wait(timeout=self.wakeup_timeout)
def shutdown(self):
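Note: the comments added above describe the per-volume bookkeeping (self.q, self.jobs) and the thread top-up performed by spawn_more_threads(). A small hedged sketch of that top-up behaviour as a standalone helper (names are illustrative, not the real API):

    import threading

    def top_up_threads(threads, target):
        # spawn only the difference between the configured and current counts,
        # mirroring what spawn_more_threads() does with JobThread instances
        for i in range(len(threads), target):
            t = threading.Thread(name=f'job.{i}')
            t.start()
            threads.append(t)
        return threads

    workers = top_up_threads([], 2)       # initial pool of two
    workers = top_up_threads(workers, 4)  # raising concurrency adds two more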
diff --git a/src/pybind/mgr/volumes/fs/purge_queue.py b/src/pybind/mgr/volumes/fs/purge_queue.py
index abace19d029..8917b475ac6 100644
--- a/src/pybind/mgr/volumes/fs/purge_queue.py
+++ b/src/pybind/mgr/volumes/fs/purge_queue.py
@@ -103,9 +103,10 @@ class ThreadPoolPurgeQueueMixin(AsyncJobs):
_all_ threads purging entries for one volume (starving other volumes).
"""
def __init__(self, volume_client, tp_size):
- self.vc = volume_client
super(ThreadPoolPurgeQueueMixin, self).__init__(volume_client, "purgejob", tp_size)
+ self.vc = volume_client
+
def get_next_job(self, volname, running_jobs):
return get_trash_entry_for_volume(self.fs_client, self.vc.volspec, volname, running_jobs)
diff --git a/src/pybind/mgr/zabbix/__init__.py b/src/pybind/mgr/zabbix/__init__.py
deleted file mode 100644
index 8f210ac9247..00000000000
--- a/src/pybind/mgr/zabbix/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .module import Module
diff --git a/src/pybind/mgr/zabbix/module.py b/src/pybind/mgr/zabbix/module.py
deleted file mode 100644
index 2e348ab0391..00000000000
--- a/src/pybind/mgr/zabbix/module.py
+++ /dev/null
@@ -1,476 +0,0 @@
-"""
-Zabbix module for ceph-mgr
-
-Collect statistics from Ceph cluster and every X seconds send data to a Zabbix
-server using the zabbix_sender executable.
-"""
-import logging
-import json
-import errno
-import re
-from subprocess import Popen, PIPE
-from threading import Event
-from mgr_module import CLIReadCommand, CLIWriteCommand, MgrModule, Option, OptionValue
-from typing import cast, Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union
-
-
-def avg(data: Sequence[Union[int, float]]) -> float:
- if len(data):
- return sum(data) / float(len(data))
- else:
- return 0
-
-
-class ZabbixSender(object):
- def __init__(self, sender: str, host: str, port: int, log: logging.Logger) -> None:
- self.sender = sender
- self.host = host
- self.port = port
- self.log = log
-
- def send(self, hostname: str, data: Mapping[str, Union[int, float, str]]) -> None:
- if len(data) == 0:
- return
-
- cmd = [self.sender, '-z', self.host, '-p', str(self.port), '-s',
- hostname, '-vv', '-i', '-']
-
- self.log.debug('Executing: %s', cmd)
-
- proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, encoding='utf-8')
-
- for key, value in data.items():
- assert proc.stdin
- proc.stdin.write('{0} ceph.{1} {2}\n'.format(hostname, key, value))
-
- stdout, stderr = proc.communicate()
- if proc.returncode != 0:
- raise RuntimeError('%s exited non-zero: %s' % (self.sender,
- stderr))
-
- self.log.debug('Zabbix Sender: %s', stdout.rstrip())
-
-
-class Module(MgrModule):
- run = False
- config: Dict[str, OptionValue] = {}
- ceph_health_mapping = {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}
- _zabbix_hosts: List[Dict[str, Union[str, int]]] = list()
-
- @property
- def config_keys(self) -> Dict[str, OptionValue]:
- return dict((o['name'], o.get('default', None))
- for o in self.MODULE_OPTIONS)
-
- MODULE_OPTIONS = [
- Option(
- name='zabbix_sender',
- default='/usr/bin/zabbix_sender'),
- Option(
- name='zabbix_host',
- type='str',
- default=None),
- Option(
- name='zabbix_port',
- type='int',
- default=10051),
- Option(
- name='identifier',
- default=""),
- Option(
- name='interval',
- type='secs',
- default=60),
- Option(
- name='discovery_interval',
- type='uint',
- default=100)
- ]
-
- def __init__(self, *args: Any, **kwargs: Any) -> None:
- super(Module, self).__init__(*args, **kwargs)
- self.event = Event()
-
- def init_module_config(self) -> None:
- self.fsid = self.get('mon_map')['fsid']
- self.log.debug('Found Ceph fsid %s', self.fsid)
-
- for key, default in self.config_keys.items():
- self.set_config_option(key, self.get_module_option(key, default))
-
- if self.config['zabbix_host']:
- self._parse_zabbix_hosts()
-
- def set_config_option(self, option: str, value: OptionValue) -> bool:
- if option not in self.config_keys.keys():
- raise RuntimeError('{0} is a unknown configuration '
- 'option'.format(option))
-
- if option in ['zabbix_port', 'interval', 'discovery_interval']:
- try:
- int_value = int(value) # type: ignore
- except (ValueError, TypeError):
- raise RuntimeError('invalid {0} configured. Please specify '
- 'a valid integer'.format(option))
-
- if option == 'interval' and int_value < 10:
- raise RuntimeError('interval should be set to at least 10 seconds')
-
- if option == 'discovery_interval' and int_value < 10:
- raise RuntimeError(
- "discovery_interval should not be more frequent "
- "than once in 10 regular data collection"
- )
-
- self.log.debug('Setting in-memory config option %s to: %s', option,
- value)
- self.config[option] = value
- return True
-
- def _parse_zabbix_hosts(self) -> None:
- self._zabbix_hosts = list()
- servers = cast(str, self.config['zabbix_host']).split(",")
- for server in servers:
- uri = re.match(r"(?:(?:\[?)([a-z0-9-\.]+|[a-f0-9:\.]+)(?:\]?))(?:((?::))([0-9]{1,5}))?$", server)
- if uri:
- zabbix_host, sep, opt_zabbix_port = uri.groups()
- if sep == ':':
- zabbix_port = int(opt_zabbix_port)
- else:
- zabbix_port = cast(int, self.config['zabbix_port'])
- self._zabbix_hosts.append({'zabbix_host': zabbix_host, 'zabbix_port': zabbix_port})
- else:
- self.log.error('Zabbix host "%s" is not valid', server)
-
- self.log.error('Parsed Zabbix hosts: %s', self._zabbix_hosts)
-
- def get_pg_stats(self) -> Dict[str, int]:
- stats = dict()
-
- pg_states = ['active', 'peering', 'clean', 'scrubbing', 'undersized',
- 'backfilling', 'recovering', 'degraded', 'inconsistent',
- 'remapped', 'backfill_toofull', 'backfill_wait',
- 'recovery_wait']
-
- for state in pg_states:
- stats['num_pg_{0}'.format(state)] = 0
-
- pg_status = self.get('pg_status')
-
- stats['num_pg'] = pg_status['num_pgs']
-
- for state in pg_status['pgs_by_state']:
- states = state['state_name'].split('+')
- for s in pg_states:
- key = 'num_pg_{0}'.format(s)
- if s in states:
- stats[key] += state['count']
-
- return stats
-
- def get_data(self) -> Dict[str, Union[int, float]]:
- data = dict()
-
- health = json.loads(self.get('health')['json'])
- # 'status' is luminous+, 'overall_status' is legacy mode.
- data['overall_status'] = health.get('status',
- health.get('overall_status'))
- data['overall_status_int'] = \
- self.ceph_health_mapping.get(data['overall_status'])
-
- mon_status = json.loads(self.get('mon_status')['json'])
- data['num_mon'] = len(mon_status['monmap']['mons'])
-
- df = self.get('df')
- data['num_pools'] = len(df['pools'])
- data['total_used_bytes'] = df['stats']['total_used_bytes']
- data['total_bytes'] = df['stats']['total_bytes']
- data['total_avail_bytes'] = df['stats']['total_avail_bytes']
-
- wr_ops = 0
- rd_ops = 0
- wr_bytes = 0
- rd_bytes = 0
-
- for pool in df['pools']:
- wr_ops += pool['stats']['wr']
- rd_ops += pool['stats']['rd']
- wr_bytes += pool['stats']['wr_bytes']
- rd_bytes += pool['stats']['rd_bytes']
- data['[{0},rd_bytes]'.format(pool['name'])] = pool['stats']['rd_bytes']
- data['[{0},wr_bytes]'.format(pool['name'])] = pool['stats']['wr_bytes']
- data['[{0},rd_ops]'.format(pool['name'])] = pool['stats']['rd']
- data['[{0},wr_ops]'.format(pool['name'])] = pool['stats']['wr']
- data['[{0},bytes_used]'.format(pool['name'])] = pool['stats']['bytes_used']
- data['[{0},stored_raw]'.format(pool['name'])] = pool['stats']['stored_raw']
- data['[{0},percent_used]'.format(pool['name'])] = pool['stats']['percent_used'] * 100
-
- data['wr_ops'] = wr_ops
- data['rd_ops'] = rd_ops
- data['wr_bytes'] = wr_bytes
- data['rd_bytes'] = rd_bytes
-
- osd_map = self.get('osd_map')
- data['num_osd'] = len(osd_map['osds'])
- data['osd_nearfull_ratio'] = osd_map['nearfull_ratio']
- data['osd_full_ratio'] = osd_map['full_ratio']
- data['osd_backfillfull_ratio'] = osd_map['backfillfull_ratio']
-
- data['num_pg_temp'] = len(osd_map['pg_temp'])
-
- num_up = 0
- num_in = 0
- for osd in osd_map['osds']:
- data['[osd.{0},up]'.format(int(osd['osd']))] = osd['up']
- if osd['up'] == 1:
- num_up += 1
-
- data['[osd.{0},in]'.format(int(osd['osd']))] = osd['in']
- if osd['in'] == 1:
- num_in += 1
-
- data['num_osd_up'] = num_up
- data['num_osd_in'] = num_in
-
- osd_fill = list()
- osd_pgs = list()
- osd_apply_latency_ns = list()
- osd_commit_latency_ns = list()
-
- osd_stats = self.get('osd_stats')
- for osd in osd_stats['osd_stats']:
- try:
- osd_fill.append((float(osd['kb_used']) / float(osd['kb'])) * 100)
- data['[osd.{0},osd_fill]'.format(osd['osd'])] = (
- float(osd['kb_used']) / float(osd['kb'])) * 100
- except ZeroDivisionError:
- continue
- osd_pgs.append(osd['num_pgs'])
- osd_apply_latency_ns.append(osd['perf_stat']['apply_latency_ns'])
- osd_commit_latency_ns.append(osd['perf_stat']['commit_latency_ns'])
- data['[osd.{0},num_pgs]'.format(osd['osd'])] = osd['num_pgs']
- data[
- '[osd.{0},osd_latency_apply]'.format(osd['osd'])
- ] = osd['perf_stat']['apply_latency_ns'] / 1000000.0 # ns -> ms
- data[
- '[osd.{0},osd_latency_commit]'.format(osd['osd'])
- ] = osd['perf_stat']['commit_latency_ns'] / 1000000.0 # ns -> ms
-
- try:
- data['osd_max_fill'] = max(osd_fill)
- data['osd_min_fill'] = min(osd_fill)
- data['osd_avg_fill'] = avg(osd_fill)
- data['osd_max_pgs'] = max(osd_pgs)
- data['osd_min_pgs'] = min(osd_pgs)
- data['osd_avg_pgs'] = avg(osd_pgs)
- except ValueError:
- pass
-
- try:
- data['osd_latency_apply_max'] = max(osd_apply_latency_ns) / 1000000.0 # ns -> ms
- data['osd_latency_apply_min'] = min(osd_apply_latency_ns) / 1000000.0 # ns -> ms
- data['osd_latency_apply_avg'] = avg(osd_apply_latency_ns) / 1000000.0 # ns -> ms
-
- data['osd_latency_commit_max'] = max(osd_commit_latency_ns) / 1000000.0 # ns -> ms
- data['osd_latency_commit_min'] = min(osd_commit_latency_ns) / 1000000.0 # ns -> ms
- data['osd_latency_commit_avg'] = avg(osd_commit_latency_ns) / 1000000.0 # ns -> ms
- except ValueError:
- pass
-
- data.update(self.get_pg_stats())
-
- return data
-
- def send(self, data: Mapping[str, Union[int, float, str]]) -> bool:
- identifier = cast(Optional[str], self.config['identifier'])
- if identifier is None or len(identifier) == 0:
- identifier = 'ceph-{0}'.format(self.fsid)
-
- if not self.config['zabbix_host'] or not self._zabbix_hosts:
- self.log.error('Zabbix server not set, please configure using: '
- 'ceph zabbix config-set zabbix_host <zabbix_host>')
- self.set_health_checks({
- 'MGR_ZABBIX_NO_SERVER': {
- 'severity': 'warning',
- 'summary': 'No Zabbix server configured',
- 'detail': ['Configuration value zabbix_host not configured']
- }
- })
- return False
-
- result = True
-
- for server in self._zabbix_hosts:
- self.log.info(
- 'Sending data to Zabbix server %s, port %s as host/identifier %s',
- server['zabbix_host'], server['zabbix_port'], identifier)
- self.log.debug(data)
-
- try:
- zabbix = ZabbixSender(cast(str, self.config['zabbix_sender']),
- cast(str, server['zabbix_host']),
- cast(int, server['zabbix_port']), self.log)
- zabbix.send(identifier, data)
- except Exception as exc:
- self.log.exception('Failed to send.')
- self.set_health_checks({
- 'MGR_ZABBIX_SEND_FAILED': {
- 'severity': 'warning',
- 'summary': 'Failed to send data to Zabbix',
- 'detail': [str(exc)]
- }
- })
- result = False
-
- self.set_health_checks(dict())
- return result
-
- def discovery(self) -> bool:
- osd_map = self.get('osd_map')
- osd_map_crush = self.get('osd_map_crush')
-
- # Discovering ceph pools
- pool_discovery = {
- pool['pool_name']: step['item_name']
- for pool in osd_map['pools']
- for rule in osd_map_crush['rules'] if rule['rule_id'] == pool['crush_rule']
- for step in rule['steps'] if step['op'] == "take"
- }
- pools_discovery_data = {"data": [
- {
- "{#POOL}": pool,
- "{#CRUSH_RULE}": rule
- }
- for pool, rule in pool_discovery.items()
- ]}
-
- # Discovering OSDs
- # Getting hosts for found crush rules
- osd_roots = {
- step['item_name']: [
- item['id']
- for item in root_bucket['items']
- ]
- for rule in osd_map_crush['rules']
- for step in rule['steps'] if step['op'] == "take"
- for root_bucket in osd_map_crush['buckets']
- if root_bucket['id'] == step['item']
- }
- # Getting osds for hosts with map to crush_rule
- osd_discovery = {
- item['id']: crush_rule
- for crush_rule, roots in osd_roots.items()
- for root in roots
- for bucket in osd_map_crush['buckets']
- if bucket['id'] == root
- for item in bucket['items']
- }
- osd_discovery_data = {"data": [
- {
- "{#OSD}": osd,
- "{#CRUSH_RULE}": rule
- }
- for osd, rule in osd_discovery.items()
- ]}
- # Preparing recieved data for sending
- data = {
- "zabbix.pool.discovery": json.dumps(pools_discovery_data),
- "zabbix.osd.discovery": json.dumps(osd_discovery_data)
- }
- return bool(self.send(data))
-
- @CLIReadCommand('zabbix config-show')
- def config_show(self) -> Tuple[int, str, str]:
- """
- Show current configuration
- """
- return 0, json.dumps(self.config, indent=4, sort_keys=True), ''
-
- @CLIWriteCommand('zabbix config-set')
- def config_set(self, key: str, value: str) -> Tuple[int, str, str]:
- """
- Set a configuration value
- """
- if not value:
- return -errno.EINVAL, '', 'Value should not be empty or None'
-
- self.log.debug('Setting configuration option %s to %s', key, value)
- if self.set_config_option(key, value):
- self.set_module_option(key, value)
- if key == 'zabbix_host' or key == 'zabbix_port':
- self._parse_zabbix_hosts()
- return 0, 'Configuration option {0} updated'.format(key), ''
- return 1,\
- 'Failed to update configuration option {0}'.format(key), ''
-
- @CLIReadCommand('zabbix send')
- def do_send(self) -> Tuple[int, str, str]:
- """
- Force sending data to Zabbix
- """
- data = self.get_data()
- if self.send(data):
- return 0, 'Sending data to Zabbix', ''
-
- return 1, 'Failed to send data to Zabbix', ''
-
- @CLIReadCommand('zabbix discovery')
- def do_discovery(self) -> Tuple[int, str, str]:
- """
- Discovering Zabbix data
- """
- if self.discovery():
- return 0, 'Sending discovery data to Zabbix', ''
-
- return 1, 'Failed to send discovery data to Zabbix', ''
-
- def shutdown(self) -> None:
- self.log.info('Stopping zabbix')
- self.run = False
- self.event.set()
-
- def serve(self) -> None:
- self.log.info('Zabbix module starting up')
- self.run = True
-
- self.init_module_config()
-
- discovery_interval = self.config['discovery_interval']
- # We are sending discovery once plugin is loaded
- discovery_counter = cast(int, discovery_interval)
- while self.run:
- self.log.debug('Waking up for new iteration')
-
- if discovery_counter == discovery_interval:
- try:
- self.discovery()
- except Exception:
- # Shouldn't happen, but let's log it and retry next interval,
- # rather than dying completely.
- self.log.exception("Unexpected error during discovery():")
- finally:
- discovery_counter = 0
-
- try:
- data = self.get_data()
- self.send(data)
- except Exception:
- # Shouldn't happen, but let's log it and retry next interval,
- # rather than dying completely.
- self.log.exception("Unexpected error during send():")
-
- interval = cast(float, self.config['interval'])
- self.log.debug('Sleeping for %d seconds', interval)
- discovery_counter += 1
- self.event.wait(interval)
-
- def self_test(self) -> None:
- data = self.get_data()
-
- if data['overall_status'] not in self.ceph_health_mapping:
- raise RuntimeError('No valid overall_status found in data')
-
- int(data['overall_status_int'])
-
- if data['num_mon'] < 1:
- raise RuntimeError('num_mon is smaller than 1')
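Note: the module removed above streamed its metrics to Zabbix through the zabbix_sender executable, writing one "<host> ceph.<key> <value>" line per item on stdin (the "-i -" invocation). A minimal sketch of that invocation, assuming a zabbix_sender binary is available on the host:

    from subprocess import Popen, PIPE

    def send_items(sender, server, port, host, data):
        # stream items to zabbix_sender's trapper interface via stdin
        cmd = [sender, '-z', server, '-p', str(port), '-s', host, '-vv', '-i', '-']
        proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, encoding='utf-8')
        payload = ''.join(f'{host} ceph.{key} {value}\n' for key, value in data.items())
        out, err = proc.communicate(payload)
        if proc.returncode != 0:
            raise RuntimeError(f'{sender} exited non-zero: {err}')
        return out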
diff --git a/src/pybind/mgr/zabbix/zabbix_template.xml b/src/pybind/mgr/zabbix/zabbix_template.xml
deleted file mode 100644
index 3b933bcf32e..00000000000
--- a/src/pybind/mgr/zabbix/zabbix_template.xml
+++ /dev/null
@@ -1,3249 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<zabbix_export>
- <version>3.0</version>
- <date>2019-01-25T10:12:41Z</date>
- <groups>
- <group>
- <name>Templates</name>
- </group>
- </groups>
- <templates>
- <template>
- <template>ceph-mgr Zabbix module</template>
- <name>ceph-mgr Zabbix module</name>
- <description/>
- <groups>
- <group>
- <name>Templates</name>
- </group>
- </groups>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <items>
- <item>
- <name>Number of Monitors</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_mon</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Number of Monitors configured in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of OSDs</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_osd</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Number of OSDs in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of OSDs in state: IN</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_osd_in</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of IN OSDs in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of OSDs in state: UP</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_osd_up</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of UP OSDs in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Temporary state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_temp</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in pg_temp state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Active state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_active</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in active state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Clean state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_clean</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in clean state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Peering state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_peering</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in peering state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Scrubbing state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_scrubbing</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in scrubbing state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Undersized state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_undersized</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in undersized state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in Backfilling state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_backfilling</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in backfilling state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in degraded state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_degraded</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in degraded state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in inconsistent state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_inconsistent</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in inconsistent state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in remapped state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_remapped</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in remapped state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in recovering state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_recovering</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in recovering state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in backfill_toofull state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_backfill_toofull</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in backfill_toofull state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in backfill_wait state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_backfill_wait</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in backfill_wait state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Placement Groups in recovery_wait state</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pg_recovery_wait</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of Placement Groups in recovery_wait state</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Number of Pools</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.num_pools</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of pools in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD avg fill</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_avg_fill</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Average fill of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD max PGs</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_max_pgs</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Maximum amount of PGs on OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD min PGs</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_min_pgs</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Minimum amount of PGs on OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD avg PGs</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_avg_pgs</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Average amount of PGs on OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph backfill full ratio</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>1</multiplier>
- <snmp_oid/>
- <key>ceph.osd_backfillfull_ratio</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>100</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Backfill full ratio setting of Ceph cluster as configured on OSDMap</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph full ratio</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>1</multiplier>
- <snmp_oid/>
- <key>ceph.osd_full_ratio</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>100</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Full ratio setting of Ceph cluster as configured on OSDMap</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Apply latency Avg</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_apply_avg</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Average apply latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Apply latency Max</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_apply_max</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Maximum apply latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Apply latency Min</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_apply_min</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Minimum apply latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Commit latency Avg</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_commit_avg</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Average commit latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Commit latency Max</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_commit_max</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Maximum commit latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD Commit latency Min</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_latency_commit_min</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Minimum commit latency of OSDs</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD max fill</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_max_fill</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Percentage fill of maximum filled OSD</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph OSD min fill</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.osd_min_fill</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Percentage fill of minimum filled OSD</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph nearfull ratio</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>1</multiplier>
- <snmp_oid/>
- <key>ceph.osd_nearfull_ratio</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>100</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Near full ratio setting of Ceph cluster as configured on OSDMap</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Overall Ceph status</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.overall_status</key>
- <delay>0</delay>
- <history>90</history>
- <trends>0</trends>
- <status>0</status>
- <value_type>4</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Overall Ceph cluster status, e.g. HEALTH_OK, HEALTH_WARN or HEALTH_ERR</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Overall Ceph status (numeric)</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.overall_status_int</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Overall Ceph status as a numeric value. OK: 0, WARN: 1, ERR: 2</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph Read bandwidth</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.rd_bytes</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>b</units>
- <delta>1</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Global read bandwidth</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph Read operations</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.rd_ops</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>1</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Global read operations per second</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Total bytes available</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.total_avail_bytes</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>B</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total bytes available in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Total bytes</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.total_bytes</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>B</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total (RAW) capacity of Ceph cluster in bytes</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Total number of objects</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.total_objects</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total number of objects in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Total bytes used</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.total_used_bytes</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>B</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Total bytes used in Ceph cluster</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph Write bandwidth</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.wr_bytes</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>b</units>
- <delta>1</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Global write bandwidth</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- <item>
- <name>Ceph Write operations</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.wr_ops</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>1</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description>Global write operations per second</description>
- <inventory_link>0</inventory_link>
- <applications>
- <application>
- <name>Ceph</name>
- </application>
- </applications>
- <valuemap/>
- <logtimefmt/>
- </item>
- </items>
- <discovery_rules>
- <discovery_rule>
- <name>Ceph OSD discovery</name>
- <type>2</type>
- <snmp_community/>
- <snmp_oid/>
- <key>ceph.zabbix.osd.discovery</key>
- <delay>0</delay>
- <status>0</status>
- <allowed_hosts/>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <filter>
- <evaltype>0</evaltype>
- <formula/>
- <conditions/>
- </filter>
- <lifetime>90</lifetime>
- <description/>
- <item_prototypes>
- <item_prototype>
- <name>[osd.{#OSD}] OSD in</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},in]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[osd.{#OSD}] OSD PGs</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},num_pgs]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[osd.{#OSD}] OSD fill</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},osd_fill]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units>%</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[osd.{#OSD}] OSD latency apply</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},osd_latency_apply]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units>ms</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[osd.{#OSD}] OSD latency commit</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},osd_latency_commit]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units>ms</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[osd.{#OSD}] OSD up</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[osd.{#OSD},up]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units/>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- </item_prototypes>
- <trigger_prototypes>
- <trigger_prototype>
- <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},up].last()}=0</expression>
- <name>Ceph OSD osd.{#OSD} is DOWN</name>
- <url/>
- <status>0</status>
- <priority>2</priority>
- <description/>
- <type>0</type>
- <dependencies/>
- </trigger_prototype>
- <trigger_prototype>
- <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},osd_fill].last()}&gt;={ceph-mgr Zabbix module:ceph.osd_full_ratio.last()}</expression>
- <name>Ceph OSD osd.{#OSD} is full: {ITEM.VALUE}%</name>
- <url/>
- <status>0</status>
- <priority>4</priority>
- <description/>
- <type>0</type>
- <dependencies/>
- </trigger_prototype>
- <trigger_prototype>
- <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},osd_fill].last()}&gt;={ceph-mgr Zabbix module:ceph.osd_nearfull_ratio.last()}</expression>
- <name>Ceph OSD osd.{#OSD} is near full: {ITEM.VALUE}%</name>
- <url/>
- <status>0</status>
- <priority>2</priority>
- <description/>
- <type>0</type>
- <dependencies/>
- </trigger_prototype>
- </trigger_prototypes>
- <graph_prototypes/>
- <host_prototypes/>
- </discovery_rule>
- <discovery_rule>
- <name>Ceph pool discovery</name>
- <type>2</type>
- <snmp_community/>
- <snmp_oid/>
- <key>ceph.zabbix.pool.discovery</key>
- <delay>0</delay>
- <status>0</status>
- <allowed_hosts/>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <filter>
- <evaltype>0</evaltype>
- <formula/>
- <conditions/>
- </filter>
- <lifetime>90</lifetime>
- <description/>
- <item_prototypes>
- <item_prototype>
- <name>[{#POOL}] Pool Used</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},bytes_used]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>b</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool RAW Used</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},stored_raw]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>b</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool Percent Used</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},percent_used]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>0</value_type>
- <allowed_hosts/>
- <units>%</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool Read bandwidth</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},rd_bytes]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>bytes</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool Read operations</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},rd_ops]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>ops</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool Write bandwidth</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},wr_bytes]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>bytes</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- <item_prototype>
- <name>[{#POOL}] Pool Write operations</name>
- <type>2</type>
- <snmp_community/>
- <multiplier>0</multiplier>
- <snmp_oid/>
- <key>ceph.[{#POOL},wr_ops]</key>
- <delay>0</delay>
- <history>90</history>
- <trends>365</trends>
- <status>0</status>
- <value_type>3</value_type>
- <allowed_hosts/>
- <units>ops</units>
- <delta>0</delta>
- <snmpv3_contextname/>
- <snmpv3_securityname/>
- <snmpv3_securitylevel>0</snmpv3_securitylevel>
- <snmpv3_authprotocol>0</snmpv3_authprotocol>
- <snmpv3_authpassphrase/>
- <snmpv3_privprotocol>0</snmpv3_privprotocol>
- <snmpv3_privpassphrase/>
- <formula>1</formula>
- <delay_flex/>
- <params/>
- <ipmi_sensor/>
- <data_type>0</data_type>
- <authtype>0</authtype>
- <username/>
- <password/>
- <publickey/>
- <privatekey/>
- <port/>
- <description/>
- <inventory_link>0</inventory_link>
- <applications/>
- <valuemap/>
- <logtimefmt/>
- <application_prototypes>
- <application_prototype>
- <name>Ceph CRUSH [{#CRUSH_RULE}]</name>
- </application_prototype>
- </application_prototypes>
- </item_prototype>
- </item_prototypes>
- <trigger_prototypes/>
- <graph_prototypes/>
- <host_prototypes/>
- </discovery_rule>
- </discovery_rules>
- <macros/>
- <templates/>
- <screens>
- <screen>
- <name>Ceph</name>
- <hsize>1</hsize>
- <vsize>7</vsize>
- <screen_items>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>500</width>
- <height>100</height>
- <x>0</x>
- <y>0</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph storage overview</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>1</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph free space</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>2</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph health</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>3</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph bandwidth</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>4</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph I/O</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>5</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph OSD utilization</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- <screen_item>
- <resourcetype>0</resourcetype>
- <width>900</width>
- <height>200</height>
- <x>0</x>
- <y>6</y>
- <colspan>1</colspan>
- <rowspan>1</rowspan>
- <elements>0</elements>
- <valign>0</valign>
- <halign>0</halign>
- <style>0</style>
- <url/>
- <dynamic>0</dynamic>
- <sort_triggers>0</sort_triggers>
- <resource>
- <name>Ceph OSD latency</name>
- <host>ceph-mgr Zabbix module</host>
- </resource>
- <max_columns>3</max_columns>
- <application/>
- </screen_item>
- </screen_items>
- </screen>
- </screens>
- </template>
- </templates>
- <triggers>
- <trigger>
- <expression>{ceph-mgr Zabbix module:ceph.overall_status_int.last()}=2</expression>
- <name>Ceph cluster in ERR state</name>
- <url/>
- <status>0</status>
- <priority>5</priority>
- <description>Ceph cluster is in ERR state</description>
- <type>0</type>
- <dependencies/>
- </trigger>
- <trigger>
- <expression>{ceph-mgr Zabbix module:ceph.overall_status_int.avg(1h)}=1</expression>
- <name>Ceph cluster in WARN state</name>
- <url/>
- <status>0</status>
- <priority>4</priority>
- <description>Issue a trigger if Ceph cluster is in WARN state for &gt;1h</description>
- <type>0</type>
- <dependencies/>
- </trigger>
- <trigger>
- <expression>{ceph-mgr Zabbix module:ceph.num_osd_in.abschange()}&gt;0</expression>
- <name>Number of IN OSDs changed</name>
- <url/>
- <status>0</status>
- <priority>2</priority>
- <description>Amount of OSDs in IN state changed</description>
- <type>0</type>
- <dependencies/>
- </trigger>
- <trigger>
- <expression>{ceph-mgr Zabbix module:ceph.num_osd_up.abschange()}&gt;0</expression>
- <name>Number of UP OSDs changed</name>
- <url/>
- <status>0</status>
- <priority>2</priority>
- <description>Amount of OSDs in UP state changed</description>
- <type>0</type>
- <dependencies/>
- </trigger>
- </triggers>
- <graphs>
- <graph>
- <name>Ceph bandwidth</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>100.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>1</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>0</ymin_type_1>
- <ymax_type_1>0</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>1A7C11</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.rd_bytes</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>F63100</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.wr_bytes</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph free space</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>100.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>0</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>1</ymin_type_1>
- <ymax_type_1>2</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.total_bytes</key>
- </ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>00AA00</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.total_avail_bytes</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>DD0000</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.total_used_bytes</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph health</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>2.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>0</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>1</ymin_type_1>
- <ymax_type_1>1</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>1A7C11</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>7</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.overall_status_int</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph I/O</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>100.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>1</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>1</ymin_type_1>
- <ymax_type_1>0</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>1A7C11</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.rd_ops</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>F63100</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.wr_ops</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph OSD latency</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>100.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>0</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>0</ymin_type_1>
- <ymax_type_1>0</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>1A7C11</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_apply_avg</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>F63100</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_commit_avg</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>2</sortorder>
- <drawtype>0</drawtype>
- <color>2774A4</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_apply_max</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>3</sortorder>
- <drawtype>0</drawtype>
- <color>A54F10</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_commit_max</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>4</sortorder>
- <drawtype>0</drawtype>
- <color>FC6EA3</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_apply_min</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>5</sortorder>
- <drawtype>0</drawtype>
- <color>6C59DC</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>4</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_latency_commit_min</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph OSD utilization</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>100.0000</yaxismax>
- <show_work_period>1</show_work_period>
- <show_triggers>1</show_triggers>
- <type>0</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>1</ymin_type_1>
- <ymax_type_1>1</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>0000CC</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_nearfull_ratio</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>F63100</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_full_ratio</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>2</sortorder>
- <drawtype>0</drawtype>
- <color>CC00CC</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_backfillfull_ratio</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>3</sortorder>
- <drawtype>0</drawtype>
- <color>A54F10</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_max_fill</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>4</sortorder>
- <drawtype>0</drawtype>
- <color>FC6EA3</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_avg_fill</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>5</sortorder>
- <drawtype>0</drawtype>
- <color>6C59DC</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.osd_min_fill</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- <graph>
- <name>Ceph storage overview</name>
- <width>900</width>
- <height>200</height>
- <yaxismin>0.0000</yaxismin>
- <yaxismax>0.0000</yaxismax>
- <show_work_period>0</show_work_period>
- <show_triggers>0</show_triggers>
- <type>2</type>
- <show_legend>1</show_legend>
- <show_3d>0</show_3d>
- <percent_left>0.0000</percent_left>
- <percent_right>0.0000</percent_right>
- <ymin_type_1>0</ymin_type_1>
- <ymax_type_1>0</ymax_type_1>
- <ymin_item_1>0</ymin_item_1>
- <ymax_item_1>0</ymax_item_1>
- <graph_items>
- <graph_item>
- <sortorder>0</sortorder>
- <drawtype>0</drawtype>
- <color>F63100</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.total_used_bytes</key>
- </item>
- </graph_item>
- <graph_item>
- <sortorder>1</sortorder>
- <drawtype>0</drawtype>
- <color>00CC00</color>
- <yaxisside>0</yaxisside>
- <calc_fnc>2</calc_fnc>
- <type>0</type>
- <item>
- <host>ceph-mgr Zabbix module</host>
- <key>ceph.total_avail_bytes</key>
- </item>
- </graph_item>
- </graph_items>
- </graph>
- </graphs>
-</zabbix_export>
diff --git a/src/python-common/CMakeLists.txt b/src/python-common/CMakeLists.txt
index e89bbe2feef..08660342a6a 100644
--- a/src/python-common/CMakeLists.txt
+++ b/src/python-common/CMakeLists.txt
@@ -3,5 +3,5 @@ distutils_install_module(ceph)
if(WITH_TESTS)
include(AddCephTest)
- add_tox_test(python-common TOX_ENVS py3 lint)
+ add_tox_test(python-common TOX_ENVS __tox_defaults__)
endif()
diff --git a/src/python-common/ceph/cephadm/__init__.py b/src/python-common/ceph/cephadm/__init__.py
new file mode 100644
index 00000000000..3c74dfd3941
--- /dev/null
+++ b/src/python-common/ceph/cephadm/__init__.py
@@ -0,0 +1,2 @@
+# this directory is meant for things that will be shared only between
+# the cephadm binary and cephadm mgr module
diff --git a/src/python-common/ceph/cephadm/images.py b/src/python-common/ceph/cephadm/images.py
new file mode 100644
index 00000000000..2399cdb6dc9
--- /dev/null
+++ b/src/python-common/ceph/cephadm/images.py
@@ -0,0 +1,19 @@
+# Default container images -----------------------------------------------------
+DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
+DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
+DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.3.3'
+DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
+DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
+DEFAULT_ALERTMANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
+DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
+DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
+DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
+DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
+DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
+DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
+DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
+DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
+DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
+DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29'
+DEFAULT_SAMBA_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
+DEFAULT_SAMBA_METRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
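Not part of this change: the constants above are plain module-level strings, so either the cephadm binary or the mgr module can import them directly. The sketch below shows a hypothetical consumer that maps a few monitoring daemon types to their default images; the mapping and helper are illustrative only and do not exist in this patch.

    # Hypothetical consumer of ceph/cephadm/images.py (illustrative, not in this patch).
    from ceph.cephadm.images import (
        DEFAULT_ALERTMANAGER_IMAGE,
        DEFAULT_GRAFANA_IMAGE,
        DEFAULT_NODE_EXPORTER_IMAGE,
        DEFAULT_PROMETHEUS_IMAGE,
    )

    # Illustrative table only; the real cephadm lookup tables live elsewhere.
    _DEFAULT_MONITORING_IMAGES = {
        'prometheus': DEFAULT_PROMETHEUS_IMAGE,
        'node-exporter': DEFAULT_NODE_EXPORTER_IMAGE,
        'alertmanager': DEFAULT_ALERTMANAGER_IMAGE,
        'grafana': DEFAULT_GRAFANA_IMAGE,
    }

    def default_image_for(daemon_type: str) -> str:
        # Fail loudly instead of guessing an image for an unknown daemon type.
        if daemon_type not in _DEFAULT_MONITORING_IMAGES:
            raise ValueError(f'no default image known for {daemon_type!r}')
        return _DEFAULT_MONITORING_IMAGES[daemon_type]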
diff --git a/src/python-common/ceph/deployment/drive_group.py b/src/python-common/ceph/deployment/drive_group.py
index c68ee01a728..43175aa79fb 100644
--- a/src/python-common/ceph/deployment/drive_group.py
+++ b/src/python-common/ceph/deployment/drive_group.py
@@ -2,7 +2,7 @@ import enum
import yaml
from ceph.deployment.inventory import Device
-from ceph.deployment.service_spec import (
+from ceph.deployment.service_spec import ( # noqa: F401 (type comments)
CustomConfig,
GeneralArgList,
PlacementSpec,
@@ -11,7 +11,7 @@ from ceph.deployment.service_spec import (
from ceph.deployment.hostspec import SpecValidationError
try:
- from typing import Optional, List, Dict, Any, Union
+ from typing import Optional, List, Dict, Any, Union # noqa: F401
except ImportError:
pass
diff --git a/src/python-common/ceph/deployment/drive_selection/filter.py b/src/python-common/ceph/deployment/drive_selection/filter.py
index 0da1b5c3901..28f63ddc2f2 100644
--- a/src/python-common/ceph/deployment/drive_selection/filter.py
+++ b/src/python-common/ceph/deployment/drive_selection/filter.py
@@ -15,12 +15,10 @@ logger = logging.getLogger(__name__)
class FilterGenerator(object):
- def __init__(self, device_filter):
- # type: (DeviceSelection) -> None
+ def __init__(self, device_filter: DeviceSelection) -> None:
self.device_filter = device_filter
- def __iter__(self):
- # type: () -> Generator[Matcher, None, None]
+ def __iter__(self) -> Generator[Matcher, None, None]:
if self.device_filter.actuators:
yield EqualityMatcher('actuators', self.device_filter.actuators)
if self.device_filter.size:
diff --git a/src/python-common/ceph/deployment/drive_selection/matchers.py b/src/python-common/ceph/deployment/drive_selection/matchers.py
index df502410aeb..a6a2147ce9e 100644
--- a/src/python-common/ceph/deployment/drive_selection/matchers.py
+++ b/src/python-common/ceph/deployment/drive_selection/matchers.py
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
-from typing import Tuple, Optional, Any, Union, Iterator
+# TODO: remove noqa and update to python3/mypy style type annotations
+from typing import Tuple, Optional, Any, Union, Iterator # noqa: F401
-from ceph.deployment.inventory import Device
+from ceph.deployment.inventory import Device # noqa: F401
import re
import logging
diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py
index 59ebbb6347e..85fc95cf394 100644
--- a/src/python-common/ceph/deployment/drive_selection/selector.py
+++ b/src/python-common/ceph/deployment/drive_selection/selector.py
@@ -3,7 +3,7 @@ import logging
from typing import List, Optional, Dict, Callable
from ..inventory import Device
-from ..drive_group import DriveGroupSpec, DeviceSelection, DriveGroupValidationError
+from ..drive_group import DriveGroupSpec, DeviceSelection, DriveGroupValidationError # noqa: F401
from .filter import FilterGenerator
from .matchers import _MatchInvalid
diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py
index e2c1a5605f9..29475e94d82 100644
--- a/src/python-common/ceph/deployment/inventory.py
+++ b/src/python-common/ceph/deployment/inventory.py
@@ -1,5 +1,5 @@
try:
- from typing import List, Optional, Dict, Any, Union
+ from typing import List, Optional, Dict, Any, Union # noqa: F401
except ImportError:
pass # for type checking
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 459ab7df1a0..103fd3a8e98 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -527,8 +527,8 @@ pattern_type=PatternType.fnmatch))
labels = [x for x in strings if 'label:' in x]
if len(labels) > 1:
raise SpecValidationError('more than one label provided: {}'.format(labels))
- for l in labels:
- strings.remove(l)
+ for lbl in labels:
+ strings.remove(lbl)
label = labels[0][6:] if labels else None
host_patterns = strings
@@ -701,7 +701,7 @@ class ArgumentSpec:
if isinstance(data, str):
return cls(data, split=True, origin=cls.OriginalType.STRING)
if 'argument' not in data:
- raise SpecValidationError(f'ArgumentSpec must have an "argument" field')
+ raise SpecValidationError('ArgumentSpec must have an "argument" field')
for k in data.keys():
if k not in cls._fields:
raise SpecValidationError(f'ArgumentSpec got an unknown field {k!r}')
@@ -1313,6 +1313,10 @@ class RGWSpec(ServiceSpec):
raise SpecValidationError('"ssl" field must be set to true when "generate_cert" '
'is set to true')
+ if self.generate_cert and self.rgw_frontend_ssl_certificate:
+ raise SpecValidationError('"generate_cert" field and "rgw_frontend_ssl_certificate" '
+ 'field are mutually exclusive')
+
yaml.add_representer(RGWSpec, ServiceSpec.yaml_representer)
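The new check above makes generate_cert and rgw_frontend_ssl_certificate mutually exclusive: a spec may ask for a generated certificate or supply its own, but not both. A minimal sketch of the failing case, assuming RGWSpec and SpecValidationError are importable as elsewhere in this file and that validate() is the entry point running these checks:

    # Sketch only: exercising the new mutual-exclusion check added above.
    from ceph.deployment.service_spec import RGWSpec
    from ceph.deployment.hostspec import SpecValidationError

    spec = RGWSpec(
        service_id='myrealm.myzone',
        ssl=True,
        generate_cert=True,
        rgw_frontend_ssl_certificate='-----BEGIN CERTIFICATE-----\n...',
    )
    try:
        spec.validate()
    except SpecValidationError as err:
        # Expected: the two certificate fields are mutually exclusive.
        print(err)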
@@ -1340,15 +1344,25 @@ class NvmeofServiceSpec(ServiceSpec):
allowed_consecutive_spdk_ping_failures: Optional[int] = 1,
spdk_ping_interval_in_seconds: Optional[float] = 2.0,
ping_spdk_under_lock: Optional[bool] = False,
+ max_hosts_per_namespace: Optional[int] = 1,
+ max_namespaces_with_netmask: Optional[int] = 1000,
+ max_subsystems: Optional[int] = 128,
+ max_namespaces: Optional[int] = 1024,
+ max_namespaces_per_subsystem: Optional[int] = 256,
+ max_hosts_per_subsystem: Optional[int] = 32,
server_key: Optional[str] = None,
server_cert: Optional[str] = None,
client_key: Optional[str] = None,
client_cert: Optional[str] = None,
root_ca_cert: Optional[str] = None,
+ # unused and a duplicate of tgt_path below; consider removing
spdk_path: Optional[str] = None,
+ spdk_mem_size: Optional[int] = None,
tgt_path: Optional[str] = None,
spdk_timeout: Optional[float] = 60.0,
- spdk_log_level: Optional[str] = 'WARNING',
+ spdk_log_level: Optional[str] = '',
+ spdk_protocol_log_level: Optional[str] = 'WARNING',
+ spdk_log_file_dir: Optional[str] = '',
rpc_socket_dir: Optional[str] = '/var/tmp/',
rpc_socket_name: Optional[str] = 'spdk.sock',
conn_retries: Optional[int] = 10,
@@ -1368,6 +1382,7 @@ class NvmeofServiceSpec(ServiceSpec):
log_directory: Optional[str] = '/var/log/ceph/',
monitor_timeout: Optional[float] = 1.0,
enable_monitor_client: bool = True,
+ monitor_client_log_file_dir: Optional[str] = '',
placement: Optional[PlacementSpec] = None,
unmanaged: bool = False,
preview_only: bool = False,
@@ -1416,6 +1431,18 @@ class NvmeofServiceSpec(ServiceSpec):
self.omap_file_lock_retry_sleep_interval = omap_file_lock_retry_sleep_interval
#: ``omap_file_update_reloads`` number of attempt to reload OMAP when it differs from local
self.omap_file_update_reloads = omap_file_update_reloads
+ #: ``max_hosts_per_namespace`` max number of hosts per namespace
+ self.max_hosts_per_namespace = max_hosts_per_namespace
+ #: ``max_namespaces_with_netmask`` max number of namespaces which are not auto visible
+ self.max_namespaces_with_netmask = max_namespaces_with_netmask
+ #: ``max_subsystems`` max number of subsystems
+ self.max_subsystems = max_subsystems
+ #: ``max_namespaces`` max number of namespaces on all subsystems
+ self.max_namespaces = max_namespaces
+ #: ``max_namespaces_per_subsystem`` max number of namespaces per one subsystem
+ self.max_namespaces_per_subsystem = max_namespaces_per_subsystem
+ #: ``max_hosts_per_subsystem`` max number of hosts per subsystem
+ self.max_hosts_per_subsystem = max_hosts_per_subsystem
#: ``allowed_consecutive_spdk_ping_failures`` # of ping failures before aborting gateway
self.allowed_consecutive_spdk_ping_failures = allowed_consecutive_spdk_ping_failures
#: ``spdk_ping_interval_in_seconds`` sleep interval in seconds between SPDK pings
@@ -1434,14 +1461,20 @@ class NvmeofServiceSpec(ServiceSpec):
self.client_cert = client_cert
#: ``root_ca_cert`` CA cert for server/client certs
self.root_ca_cert = root_ca_cert
- #: ``spdk_path`` path to SPDK
+ #: ``spdk_path`` is unused and a duplicate of ``tgt_path`` below; consider removing
self.spdk_path = spdk_path or '/usr/local/bin/nvmf_tgt'
+ #: ``spdk_mem_size`` memory size in MB for DPDK
+ self.spdk_mem_size = spdk_mem_size
#: ``tgt_path`` nvmeof target path
self.tgt_path = tgt_path or '/usr/local/bin/nvmf_tgt'
#: ``spdk_timeout`` SPDK connectivity timeout
self.spdk_timeout = spdk_timeout
#: ``spdk_log_level`` the SPDK log level
- self.spdk_log_level = spdk_log_level or 'WARNING'
+ self.spdk_log_level = spdk_log_level
+ #: ``spdk_protocol_log_level`` the SPDK protocol log level
+ self.spdk_protocol_log_level = spdk_protocol_log_level or 'WARNING'
+ #: ``spdk_log_file_dir`` the SPDK log output file directory
+ self.spdk_log_file_dir = spdk_log_file_dir
#: ``rpc_socket_dir`` the SPDK RPC socket file directory
self.rpc_socket_dir = rpc_socket_dir or '/var/tmp/'
#: ``rpc_socket_name`` the SPDK RPC socket file name
@@ -1478,6 +1511,8 @@ class NvmeofServiceSpec(ServiceSpec):
self.monitor_timeout = monitor_timeout
#: ``enable_monitor_client`` whether to connect to the ceph monitor or not
self.enable_monitor_client = enable_monitor_client
+ #: ``monitor_client_log_file_dir`` the monitor client log output file directory
+ self.monitor_client_log_file_dir = monitor_client_log_file_dir
def get_port_start(self) -> List[int]:
return [5500, 4420, 8009]
@@ -1522,6 +1557,16 @@ class NvmeofServiceSpec(ServiceSpec):
'Invalid SPDK log level. Valid values are: '
'DEBUG, INFO, WARNING, ERROR, NOTICE')
+ if self.spdk_protocol_log_level:
+ if self.spdk_protocol_log_level.lower() not in ['debug',
+ 'info',
+ 'warning',
+ 'error',
+ 'notice']:
+ raise SpecValidationError(
+ 'Invalid SPDK protocol log level. Valid values are: '
+ 'DEBUG, INFO, WARNING, ERROR, NOTICE')
+
if (
self.spdk_ping_interval_in_seconds
and self.spdk_ping_interval_in_seconds < 1.0
@@ -1588,6 +1633,36 @@ class NvmeofServiceSpec(ServiceSpec):
):
raise SpecValidationError("Log file directory backups can't be negative")
+ if (self.max_hosts_per_namespace and self.max_hosts_per_namespace < 0):
+ raise SpecValidationError("Max hosts per namespace can't be negative")
+
+ if (self.max_namespaces_with_netmask and self.max_namespaces_with_netmask < 0):
+ raise SpecValidationError("Max namespaces with netmask can't be negative")
+
+ if type(self.max_subsystems) != int:
+ raise SpecValidationError("Max subsystems must be an integer")
+
+ if self.max_subsystems <= 0:
+ raise SpecValidationError("Max subsystems must be greater than zero")
+
+ if type(self.max_namespaces) != int:
+ raise SpecValidationError("Max namespaces must be an integer")
+
+ if self.max_namespaces <= 0:
+ raise SpecValidationError("Max namespaces must be greater than zero")
+
+ if type(self.max_namespaces_per_subsystem) != int:
+ raise SpecValidationError("Max namespaces per subsystem must be an integer")
+
+ if self.max_namespaces_per_subsystem <= 0:
+ raise SpecValidationError("Max namespaces per subsystem must be greater than zero")
+
+ if type(self.max_hosts_per_subsystem) != int:
+ raise SpecValidationError("Max hosts per subsystem must be an integer")
+
+ if self.max_hosts_per_subsystem <= 0:
+ raise SpecValidationError("Max hosts per subsystem must be greater than zero")
+
if (
self.monitor_timeout
and self.monitor_timeout < 0.0
@@ -1762,7 +1837,7 @@ class IngressSpec(ServiceSpec):
if not self.keepalive_only and not self.frontend_port:
raise SpecValidationError(
'Cannot add ingress: No frontend_port specified')
- if not self.monitor_port:
+ if not self.keepalive_only and not self.monitor_port:
raise SpecValidationError(
'Cannot add ingress: No monitor_port specified')
if not self.virtual_ip and not self.virtual_ips_list:
@@ -1805,6 +1880,7 @@ class MgmtGatewaySpec(ServiceSpec):
ssl_protocols: Optional[List[str]] = None,
ssl_ciphers: Optional[List[str]] = None,
enable_health_check_endpoint: bool = False,
+ virtual_ip: Optional[str] = None,
preview_only: bool = False,
unmanaged: bool = False,
extra_container_args: Optional[GeneralArgList] = None,
@@ -1851,6 +1927,7 @@ class MgmtGatewaySpec(ServiceSpec):
#: List of supported secure SSL ciphers. Changing this list may reduce system security.
self.ssl_ciphers = ssl_ciphers
self.enable_health_check_endpoint = enable_health_check_endpoint
+ self.virtual_ip = virtual_ip
def get_port_start(self) -> List[int]:
ports = []
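
Note: the new NvmeofServiceSpec limits introduced above are validated as positive integers during spec validation. As a rough, hypothetical illustration of the same pattern outside the class (the names and values below mirror the diff's defaults but are not part of the spec code itself):

    # Illustrative sketch only; the real checks live in NvmeofServiceSpec.validate()
    # and raise SpecValidationError rather than ValueError.
    def check_positive_int(name: str, value: object) -> None:
        # reject anything that is not an int, or is zero/negative
        if not isinstance(value, int):
            raise ValueError(f"{name} must be an integer")
        if value <= 0:
            raise ValueError(f"{name} must be greater than zero")

    limits = {"max_subsystems": 128, "max_namespaces": 1024,
              "max_namespaces_per_subsystem": 256, "max_hosts_per_subsystem": 32}
    for name, value in limits.items():
        check_positive_int(name, value)
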
diff --git a/src/python-common/ceph/deployment/translate.py b/src/python-common/ceph/deployment/translate.py
index 49fb17da725..9dfe7cfcf81 100644
--- a/src/python-common/ceph/deployment/translate.py
+++ b/src/python-common/ceph/deployment/translate.py
@@ -5,7 +5,7 @@ try:
except ImportError:
pass
-from ceph.deployment.drive_selection.selector import DriveSelection
+from ceph.deployment.drive_selection.selector import DriveSelection # noqa: F401
logger = logging.getLogger(__name__)
diff --git a/src/python-common/ceph/fs/earmarking.py b/src/python-common/ceph/fs/earmarking.py
index c5d4a59a4d5..f4fd4ddf96c 100644
--- a/src/python-common/ceph/fs/earmarking.py
+++ b/src/python-common/ceph/fs/earmarking.py
@@ -19,13 +19,25 @@ supported top-level scopes.
import errno
import enum
import logging
-from typing import List, NamedTuple, Optional, Tuple
+from typing import List, NamedTuple, Optional, Tuple, Protocol
log = logging.getLogger(__name__)
XATTR_SUBVOLUME_EARMARK_NAME = 'user.ceph.subvolume.earmark'
+class FSOperations(Protocol):
+ """Protocol class representing the file system operations earmarking
+ classes will perform.
+ """
+
+ def setxattr(
+ self, path: str, key: str, value: bytes, flags: int
+ ) -> None: ...
+
+ def getxattr(self, path: str, key: str) -> bytes: ...
+
+
class EarmarkTopScope(enum.Enum):
NFS = "nfs"
SMB = "smb"
@@ -53,11 +65,11 @@ class EarmarkParseError(ValueError):
class CephFSVolumeEarmarking:
- def __init__(self, fs, path: str) -> None:
+ def __init__(self, fs: FSOperations, path: str) -> None:
self.fs = fs
self.path = path
- def _handle_cephfs_error(self, e: Exception, action: str) -> None:
+ def _handle_cephfs_error(self, e: Exception, action: str) -> Optional[str]:
if isinstance(e, ValueError):
raise EarmarkException(errno.EINVAL, f"Invalid earmark specified: {e}") from e
elif isinstance(e, OSError):
@@ -135,7 +147,7 @@ class CephFSVolumeEarmarking:
except Exception as e:
return self._handle_cephfs_error(e, "getting")
- def set_earmark(self, earmark: str):
+ def set_earmark(self, earmark: str) -> None:
# Validate the earmark before attempting to set it
if not self._validate_earmark(earmark):
raise EarmarkException(
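
The FSOperations protocol added above only requires setxattr/getxattr, so a trivial in-memory stand-in can drive CephFSVolumeEarmarking without a live CephFS handle. A minimal sketch, assuming such a fake is acceptable for testing (the FakeFS class and the path/earmark values are illustrative, not part of the patch):

    # Hypothetical in-memory object satisfying FSOperations.
    class FakeFS:
        def __init__(self) -> None:
            self._xattrs: dict = {}

        def setxattr(self, path: str, key: str, value: bytes, flags: int) -> None:
            self._xattrs[(path, key)] = value

        def getxattr(self, path: str, key: str) -> bytes:
            return self._xattrs.get((path, key), b"")

    # Possible usage (illustrative):
    # earmarking = CephFSVolumeEarmarking(FakeFS(), "/volumes/_nogroup/vol1")
    # earmarking.set_earmark("smb")
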
diff --git a/src/python-common/ceph/tests/utils.py b/src/python-common/ceph/tests/utils.py
index 04b8a4e3895..20a39e4666b 100644
--- a/src/python-common/ceph/tests/utils.py
+++ b/src/python-common/ceph/tests/utils.py
@@ -35,8 +35,7 @@ def _mk_device(rotational=True,
)]
-def _mk_inventory(devices):
- # type: (Any) -> List[Device]
+def _mk_inventory(devices: Any) -> List[Device]:
devs = []
for dev_, name in zip(devices, map(chr, range(ord('a'), ord('z')))):
dev = Device.from_json(dev_.to_json())
diff --git a/src/python-common/requirements-lint.txt b/src/python-common/requirements-lint.txt
deleted file mode 100644
index 2a7142182c2..00000000000
--- a/src/python-common/requirements-lint.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-flake8==3.7.8
-rstcheck==3.3.1
diff --git a/src/python-common/tox.ini b/src/python-common/tox.ini
index 313a4334d51..e0b59c700ca 100644
--- a/src/python-common/tox.ini
+++ b/src/python-common/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = py3, mypy, lint
+envlist = lint, rstcheck, mypy, py3
skip_missing_interpreters = true
[testenv:py3]
@@ -26,9 +26,13 @@ exclude =
__pycache__
[testenv:lint]
-deps =
- -rrequirements-lint.txt
+deps =
+ flake8
commands =
flake8 {posargs:ceph}
- rstcheck --report info --debug README.rst
+[testenv:rstcheck]
+deps =
+ rstcheck
+commands =
+ rstcheck --report-level info README.rst
diff --git a/src/rgw/driver/dbstore/README.md b/src/rgw/driver/dbstore/README.md
index f7e5df331cc..2bf3391c8c4 100644
--- a/src/rgw/driver/dbstore/README.md
+++ b/src/rgw/driver/dbstore/README.md
@@ -15,23 +15,21 @@ Add below cmake option (enabled by default)
## Running Test cluster
-Edit ceph.conf to add below option
+Edit ceph.conf to add the following options
[client]
rgw backend store = dbstore
rgw config store = dbstore
-Start vstart cluster
+To start the `vstart` cluster, run the following command:
- MON=1 RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -o rgw_config_store=dbstore -n -d
+ MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -n -d --rgw_store dbstore
-The above vstart command brings up RGW server on dbstore. It creates default zonegroup, zone and few default users (eg., testid) to be used for s3 operations.
+The above `vstart` command brings up the RGW server on DBStore without the need for MONs or OSDs. It creates a default zonegroup, zone, and a few default users (e.g., `testid`) to be used for S3 operations, and by default stores its database files in the `dev` subdirectory.
-`radosgw-admin` can be used to create and remove other users, zonegroups and zones.
-
-
-By default, dbstore creates .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data and *'/var/lib/ceph/radosgw/dbstore-config.db'* file to store the configuration. This can be configured using below options in ceph.conf
+The `radosgw-admin` command can be used to create and remove other users, zonegroups and zones.
+The location and prefix for the database files can be configured using the following options:
[client]
dbstore db dir = <path for the directory for storing the db backend store data>
dbstore db name prefix = <prefix to the file names created by db backend store>
diff --git a/src/rgw/driver/posix/README.md b/src/rgw/driver/posix/README.md
index 02dc8dfbe85..73971edc86f 100644
--- a/src/rgw/driver/posix/README.md
+++ b/src/rgw/driver/posix/README.md
@@ -23,15 +23,15 @@ Edit ceph.conf to add below option
rgw config store = dbstore
rgw filter = posix
-Start vstart cluster
+To start the `vstart` cluster, run the following command:
- MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -o rgw_config_store=dbstore -o rgw_filter=posix -n -d
+ MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -n -d --rgw_store posix
-The above vstart command brings up RGW server on POSIXDriver. It creates default zonegroup, zone and few default users (eg., testid) to be used for s3 operations.
+The above `vstart` command brings up the RGW server on POSIXDriver. It creates a default zonegroup, zone, and a few default users (e.g., `testid`) to be used for S3 operations.
-`radosgw-admin` can be used to create and remove other users, zonegroups and zones.
+The `radosgw-admin` command can be used to create and remove other users, zonegroups and zones.
-By default, the directory exported is *'/tmp/rgw_posix_driver'*. This can be changed with the `rgw_posix_base_path` option, either in ceph.conf or on the vstart command line above.
+By default, the exported directory, *'rgw_posix_driver'*, is created in the `dev` subdirectory. This can be changed with the `rgw_posix_base_path` option.
-The POSIXDriver keeps a LMDB based cache of directories, so that it can provide ordered listings. This directory lives in `rgw_posix_database_root`, which by default is in *'/var/lib/ceph/radosgw'*
+The POSIXDriver keeps an LMDB-based cache of directories, so that it can provide ordered listings. This cache lives in `rgw_posix_database_root`, which by default is created in the `dev` subdirectory.
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.cc b/src/rgw/driver/rados/rgw_d3n_datacache.cc
index c81954fce1c..be1a4468696 100644
--- a/src/rgw/driver/rados/rgw_d3n_datacache.cc
+++ b/src/rgw/driver/rados/rgw_d3n_datacache.cc
@@ -86,6 +86,8 @@ void D3nDataCache::init(CephContext *_cct) {
// create the cache storage directory
lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl;
efs::create_directories(cache_location);
+ efs::permissions(cache_location,
+ efs::perms::owner_all | efs::perms::group_all | efs::perms::others_read);
}
} catch (const efs::filesystem_error& e) {
lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location <<
diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc
index e932c997621..b153a7b4a42 100644
--- a/src/rgw/driver/rados/rgw_lc_tier.cc
+++ b/src/rgw/driver/rados/rgw_lc_tier.cc
@@ -77,8 +77,9 @@ WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info)
static inline string get_key_instance(const rgw_obj_key& key)
{
- if (!key.instance.empty() &&
- !key.have_null_instance()) {
+ // for a non-current entry, add the versionID (including "null") to the
+ // transitioned object name.
+ if (!key.instance.empty()) {
return "-" + key.instance;
}
return "";
diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc
index 7b31fd72bd4..5734284d1a3 100644
--- a/src/rgw/driver/rados/rgw_notify.cc
+++ b/src/rgw/driver/rados/rgw_notify.cc
@@ -21,6 +21,7 @@
#include "common/dout.h"
#include "rgw_url.h"
#include <chrono>
+#include <fmt/format.h>
#define dout_subsys ceph_subsys_rgw_notification
@@ -769,9 +770,10 @@ public:
});
// start the worker threads to do the actual queue processing
- const std::string WORKER_THREAD_NAME = "notif-worker";
for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) {
- workers.emplace_back([this]() {
+ workers.emplace_back([this,worker_id]() {
+ const auto thread_name = fmt::format("notif-worker-{}", worker_id);
+ ceph_pthread_setname(thread_name.c_str());
try {
io_context.run();
} catch (const std::exception& err) {
@@ -779,11 +781,6 @@ public:
throw err;
}
});
- const auto thread_name = WORKER_THREAD_NAME+std::to_string(worker_id);
- if (const auto rc = ceph_pthread_setname(workers.back().native_handle(), thread_name.c_str()); rc != 0) {
- ldpp_dout(this, 1) << "ERROR: failed to set notification manager thread name to: " << thread_name
- << ". error: " << rc << dendl;
- }
}
ldpp_dout(this, 10) << "INfO: started notification manager with: " << worker_count << " workers" << dendl;
}
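
The change above has each notification worker name itself via ceph_pthread_setname inside its own lambda, rather than being named from the spawning thread through native_handle(). A rough, hypothetical Python analogue of that pattern, for illustration only:

    # The worker names itself on startup, so the name is in place before any
    # work runs and no thread-handle juggling is needed from the spawner.
    import threading

    def worker(worker_id: int) -> None:
        threading.current_thread().name = f"notif-worker-{worker_id}"
        # ... run the event loop / io_context here ...

    threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
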
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index d154082994e..a133b54dc59 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -5089,7 +5089,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
RGWBucketInfo& bucket_info,
- const rgw_obj& obj,
+ rgw_obj obj,
const rgw_placement_rule& placement_rule,
const real_time& mtime,
uint64_t olh_epoch,
@@ -5120,6 +5120,11 @@ int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
return -ECANCELED;
}
+ // bi expects empty instance for the entries created when bucket versioning
+ // is not enabled or suspended.
+ if (obj.key.instance == "null") {
+ obj.key.instance.clear();
+ }
attrs.erase(RGW_ATTR_ID_TAG);
attrs.erase(RGW_ATTR_TAIL_TAG);
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
index 9a2ba0af0e2..b24823b60dc 100644
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -1234,7 +1234,7 @@ public:
int transition_obj(RGWObjectCtx& obj_ctx,
RGWBucketInfo& bucket_info,
- const rgw_obj& obj,
+ rgw_obj obj,
const rgw_placement_rule& placement_rule,
const real_time& mtime,
uint64_t olh_epoch,
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
index bb416b0c2c3..11b86a25841 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.cc
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -2792,6 +2792,13 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
{
rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
map<string, bufferlist> attrs = get_attrs();
+ rgw_obj_key& obj_key = get_key();
+ // bi expects empty instance for the entries created when bucket versioning
+ // is not enabled or suspended.
+ if (obj_key.instance == "null") {
+ obj_key.instance.clear();
+ }
+
RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
RGWRados::Object::Write obj_op(&op_target);
diff --git a/src/rgw/rgw_amqp.cc b/src/rgw/rgw_amqp.cc
index 7504d47c6c9..5bc5d173c73 100644
--- a/src/rgw/rgw_amqp.cc
+++ b/src/rgw/rgw_amqp.cc
@@ -650,6 +650,9 @@ private:
// (4) TODO reconnect on connection errors
// (5) TODO cleanup timedout callbacks
void run() noexcept {
+ // give the runner thread a name for easier debugging
+ ceph_pthread_setname("amqp_manager");
+
amqp_frame_t frame;
while (!stopped) {
@@ -838,12 +841,6 @@ public:
// This is to prevent rehashing so that iterators are not invalidated
// when a new connection is added.
connections.max_load_factor(10.0);
- // give the runner thread a name for easier debugging
- const char* thread_name = "amqp_manager";
- if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) {
- ldout(cct, 1) << "ERROR: failed to set amqp manager thread name to: " << thread_name
- << ". error: " << rc << dendl;
- }
}
// non copyable
diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc
index c0ec3dc2c55..0807993338d 100644
--- a/src/rgw/rgw_kafka.cc
+++ b/src/rgw/rgw_kafka.cc
@@ -503,6 +503,7 @@ private:
}
void run() noexcept {
+ ceph_pthread_setname("kafka_manager");
while (!stopped) {
// publish all messages in the queue
@@ -575,12 +576,6 @@ public:
// This is to prevent rehashing so that iterators are not invalidated
// when a new connection is added.
connections.max_load_factor(10.0);
- // give the runner thread a name for easier debugging
- const char* thread_name = "kafka_manager";
- if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) {
- ldout(cct, 1) << "ERROR: failed to set kafka manager thread name to: " << thread_name
- << ". error: " << rc << dendl;
- }
}
// non copyable
diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc
index 78807888dfd..a7f2ceabad3 100644
--- a/src/rgw/rgw_lc.cc
+++ b/src/rgw/rgw_lc.cc
@@ -495,6 +495,14 @@ struct lc_op_ctx {
octx(env.driver), dpp(dpp), wq(wq)
{
obj = bucket->get_object(o.key);
+ /* once bucket versioning is enabled, non-current entries with an
+ * empty instance should have their instance set to "null" so that
+ * their olh version entry can be read correctly.
+ */
+ if (o.key.instance.empty() && bucket->versioned() && !o.is_current()) {
+ rgw_obj_key& obj_key = obj->get_key();
+ obj_key.instance = "null";
+ }
}
bool next_has_same_name(const std::string& key_name) {
@@ -1355,9 +1363,9 @@ public:
int delete_tier_obj(lc_op_ctx& oc) {
int ret = 0;
- /* If bucket is versioned, create delete_marker for current version
+ /* If bucket has versioning enabled, create delete_marker for current version
*/
- if (! oc.bucket->versioned()) {
+ if (! oc.bucket->versioning_enabled()) {
ret =
remove_expired_obj(oc.dpp, oc, true, {rgw::notify::ObjectTransition});
ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key
@@ -1387,9 +1395,10 @@ public:
int transition_obj_to_cloud(lc_op_ctx& oc) {
int ret{0};
- /* If CurrentVersion object, remove it & create delete marker */
+ /* If CurrentVersion object & bucket has versioning enabled, remove it &
+ * create delete marker */
bool delete_object = (!oc.tier->retain_head_object() ||
- (oc.o.is_current() && oc.bucket->versioned()));
+ (oc.o.is_current() && oc.bucket->versioning_enabled()));
/* notifications */
auto& bucket = oc.bucket;
diff --git a/src/rgw/rgw_lua_background.cc b/src/rgw/rgw_lua_background.cc
index ef97a5d6f65..c5b815f93f5 100644
--- a/src/rgw/rgw_lua_background.cc
+++ b/src/rgw/rgw_lua_background.cc
@@ -83,11 +83,6 @@ void Background::start() {
}
started = true;
runner = std::thread(&Background::run, this);
- const char* thread_name = "lua_background";
- if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) {
- ldout(cct, 1) << "ERROR: failed to set lua background thread name to: " << thread_name
- << ". error: " << rc << dendl;
- }
}
void Background::pause() {
@@ -127,6 +122,7 @@ const BackgroundMapValue& Background::get_table_value(const std::string& key) co
//(2) Executes the script
//(3) Sleep (configurable)
void Background::run() {
+ ceph_pthread_setname("lua_background");
const DoutPrefixProvider* const dpp = &dp;
lua_state_guard lguard(cct->_conf->rgw_lua_max_memory_per_state, dpp);
auto L = lguard.get();
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 67829e6320a..0dcf1e0f7d5 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -6811,6 +6811,8 @@ void RGWCompleteMultipart::execute(optional_yield y)
if (upload->cksum_type != rgw::cksum::Type::none) {
op_ret = try_sum_part_cksums(this, s->cct, upload.get(), parts, cksum, y);
if (op_ret < 0) {
+ ldpp_dout(this, 16) << "ERROR: try_sum_part_cksums failed, obj="
+ << meta_obj << " ret=" << op_ret << dendl;
return;
}
}
@@ -6835,13 +6837,23 @@ void RGWCompleteMultipart::execute(optional_yield y)
rgw::putobj::find_hdr_cksum(*(s->info.env));
ldpp_dout_fmt(this, 10,
- "INFO: client supplied checksum {}: {}",
+ "INFO: client supplied checksum {}: {} ",
hdr_cksum.header_name(), supplied_cksum);
if (! (supplied_cksum.empty()) &&
(supplied_cksum != armored_cksum)) {
- op_ret = -ERR_INVALID_REQUEST;
- return;
+ /* some minio SDK clients assert a checksum that is cryptographically
+ * valid but omits the part count */
+ auto parts_suffix = fmt::format("-{}", parts->parts.size());
+ auto suffix_len = armored_cksum->size() - parts_suffix.size();
+ if (armored_cksum->compare(0, suffix_len, supplied_cksum) != 0) {
+ ldpp_dout_fmt(this, 4,
+ "{} content checksum mismatch"
+ "\n\tcalculated={} != \n\texpected={}",
+ hdr_cksum.header_name(), armored_cksum, supplied_cksum);
+ op_ret = -ERR_INVALID_REQUEST;
+ return;
+ }
}
buffer::list cksum_bl;
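
The relaxed multipart checksum comparison above tolerates clients that supply the checksum without the trailing "-<part count>" suffix. A simplified, hypothetical restatement of the accept/reject logic (not the actual RGW code):

    # Accept the supplied checksum if it equals the calculated one, or equals it
    # minus the trailing "-<part count>" suffix that some SDKs omit.
    def cksum_matches(calculated: str, supplied: str, num_parts: int) -> bool:
        if supplied == calculated:
            return True
        suffix = f"-{num_parts}"
        return calculated.endswith(suffix) and calculated[: -len(suffix)] == supplied

    assert cksum_matches("abc123-3", "abc123-3", 3)
    assert cksum_matches("abc123-3", "abc123", 3)      # part count omitted by client
    assert not cksum_matches("abc123-3", "zzz999", 3)
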
diff --git a/src/rgw/rgw_ratelimit.h b/src/rgw/rgw_ratelimit.h
index 0db1813f050..beb0eb3b1d2 100644
--- a/src/rgw/rgw_ratelimit.h
+++ b/src/rgw/rgw_ratelimit.h
@@ -239,6 +239,7 @@ class ActiveRateLimiter : public DoutPrefix {
std::atomic_uint8_t current_active = 0;
std::shared_ptr<RateLimiter> ratelimit[2];
void replace_active() {
+ ceph_pthread_setname("ratelimit_gc");
using namespace std::chrono_literals;
std::unique_lock<std::mutex> lk(cv_m);
while (!stopped) {
@@ -286,8 +287,5 @@ class ActiveRateLimiter : public DoutPrefix {
void start() {
ldpp_dout(this, 20) << "starting ratelimit_gc thread" << dendl;
runner = std::thread(&ActiveRateLimiter::replace_active, this);
- if (const auto rc = ceph_pthread_setname(runner.native_handle(), "ratelimit_gc"); rc != 0) {
- ldpp_dout(this, 1) << "ERROR: failed to set ratelimit_gc thread name. error: " << rc << dendl;
- }
}
};
diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc
index c0345a4f88a..adfc86d87cb 100644
--- a/src/rgw/rgw_rest_pubsub.cc
+++ b/src/rgw/rgw_rest_pubsub.cc
@@ -494,11 +494,11 @@ void RGWPSListTopicsOp::execute(optional_yield y) {
const RGWPubSub ps(driver, get_account_or_tenant(s->owner.id), *s->penv.site);
if (rgw::all_zonegroups_support(*s->penv.site, rgw::zone_features::notification_v2) &&
- driver->stat_topics_v1(s->bucket->get_tenant(), null_yield, this) == -ENOENT) {
- op_ret = ps.get_topics_v1(this, result, y);
- } else {
+ driver->stat_topics_v1(get_account_or_tenant(s->owner.id), null_yield, this) == -ENOENT) {
constexpr int max_items = 100;
op_ret = ps.get_topics_v2(this, start_token, max_items, result, next_token, y);
+ } else {
+ op_ret = ps.get_topics_v1(this, result, y);
}
// if there are no topics it is not considered an error
op_ret = op_ret == -ENOENT ? 0 : op_ret;
diff --git a/src/rgw/rgw_s3select.cc b/src/rgw/rgw_s3select.cc
index 800d276a6aa..d8be76a6b1c 100644
--- a/src/rgw/rgw_s3select.cc
+++ b/src/rgw/rgw_s3select.cc
@@ -344,7 +344,7 @@ RGWSelectObj_ObjStore_S3::~RGWSelectObj_ObjStore_S3()
int RGWSelectObj_ObjStore_S3::get_params(optional_yield y)
{
- if(m_s3select_query.empty() == false) {
+ if (m_s3select_query.empty() == false) {
return 0;
}
#ifndef _ARROW_EXIST
@@ -416,14 +416,14 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_csv(const char* query, const char*
if (output_escape_char.size()) {
csv.output_escape_char = *output_escape_char.c_str();
}
- if(output_quote_fields.compare("ALWAYS") == 0) {
+ if (output_quote_fields.compare("ALWAYS") == 0) {
csv.quote_fields_always = true;
- } else if(output_quote_fields.compare("ASNEEDED") == 0) {
+ } else if (output_quote_fields.compare("ASNEEDED") == 0) {
csv.quote_fields_asneeded = true;
}
- if(m_header_info.compare("IGNORE")==0) {
+ if (m_header_info.compare("IGNORE")==0) {
csv.ignore_header_info=true;
- } else if(m_header_info.compare("USE")==0) {
+ } else if (m_header_info.compare("USE")==0) {
csv.use_header_info=true;
}
@@ -478,6 +478,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query)
if (!m_s3_parquet_object.is_set()) {
//parsing the SQL statement.
s3select_syntax.parse_query(m_sql_query.c_str());
+ parquet_object::csv_definitions parquet;
m_s3_parquet_object.set_external_system_functions(fp_s3select_continue,
fp_s3select_result_format,
@@ -485,8 +486,10 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query)
fp_debug_mesg);
try {
+ //set the Parquet-reader properties, i.e. the buffer size for the Parquet-reader
+ parquet::ceph::S3select_Config::getInstance().set_s3select_reader_properties(s->cct->_conf->rgw_parquet_buffer_size);
//at this stage the Parquet-processing requires for the meta-data that reside on Parquet object
- m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api);
+ m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api, parquet);
} catch(base_s3select_exception& e) {
ldpp_dout(this, 10) << "S3select: failed upon parquet-reader construction: " << e.what() << dendl;
fp_result_header_format(m_aws_response_handler.get_sql_result());
@@ -524,6 +527,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char
fp_s3select_result_format,
fp_result_header_format,
fp_debug_mesg);
+ json_object::csv_definitions json;
m_aws_response_handler.init_response();
@@ -547,8 +551,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char
}
//initializing json processor
- json_object::csv_definitions output_definition;
- m_s3_json_object.set_json_query(&s3select_syntax,output_definition);
+ m_s3_json_object.set_json_query(&s3select_syntax, json);
if (input == nullptr) {
input = "";
@@ -706,6 +709,7 @@ int RGWSelectObj_ObjStore_S3::range_request(int64_t ofs, int64_t len, void* buff
RGWGetObj::parse_range();
requested_buffer.clear();
m_request_range = len;
+ m_aws_response_handler.update_processed_size(len);
ldout(s->cct, 10) << "S3select: calling execute(async):" << " request-offset :" << ofs << " request-length :" << len << " buffer size : " << requested_buffer.size() << dendl;
RGWGetObj::execute(y);
if (buff) {
@@ -730,7 +734,7 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y)
m_aws_response_handler.set(s, this, fp_chunked_transfer_encoding);
}
- if(s->cct->_conf->rgw_disable_s3select == true)
+ if (s->cct->_conf->rgw_disable_s3select == true)
{
std::string error_msg="s3select : is disabled by rgw_disable_s3select configuration parameter";
ldpp_dout(this, 10) << error_msg << dendl;
@@ -749,12 +753,26 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y)
return;
}
s3select_syntax.parse_query(m_sql_query.c_str());
+ //run_s3select_on_parquet() calls the s3select query engine, which reads and processes the parquet object via RGW::range_request;
+ //once the query engine finishes processing, control returns to execute();
+ //the parquet-reader indicates the end of the parquet object.
status = run_s3select_on_parquet(m_sql_query.c_str());
if (status) {
ldout(s->cct, 10) << "S3select: failed to process query <" << m_sql_query << "> on object " << s->object->get_name() << dendl;
op_ret = -ERR_INVALID_REQUEST;
} else {
- ldout(s->cct, 10) << "S3select: complete query with success " << dendl;
+ //status per amount of processed data
+#ifdef _ARROW_EXIST
+ m_aws_response_handler.update_total_bytes_returned(m_s3_parquet_object.get_return_result_size());
+#endif
+ m_aws_response_handler.init_stats_response();
+ m_aws_response_handler.send_stats_response();
+ m_aws_response_handler.init_end_response();
+ ldpp_dout(this, 10) << "s3select : reached the end of parquet query request : aws_response_handler.get_processed_size() "
+ << m_aws_response_handler.get_processed_size()
+ << "m_object_size_for_processing : " << m_object_size_for_processing << dendl;
+
+ ldout(s->cct, 10) << "S3select: complete parquet query with success " << dendl;
}
} else {
//CSV or JSON processing
@@ -762,7 +780,7 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y)
m_requested_range = (m_end_scan_sz - m_start_scan_sz);
- if(m_is_trino_request){
+ if (m_is_trino_request){
// fetch more than requested(m_scan_offset), that additional bytes are scanned for end of row,
// thus the additional length will be processed, and no broken row for Trino.
// assumption: row is smaller than m_scan_offset. (a different approach is to request for additional range)
@@ -778,7 +796,8 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y)
}
int RGWSelectObj_ObjStore_S3::parquet_processing(bufferlist& bl, off_t ofs, off_t len)
-{
+{//purpose: process the buffer returned by the range-request and hand it to the Parquet-reader.
+ //range_request() is called by arrow::ReadAt, and upon request completion control returns to RGWSelectObj_ObjStore_S3::execute()
fp_chunked_transfer_encoding();
size_t append_in_callback = 0;
int part_no = 1;
@@ -809,7 +828,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp,
//the purpose is to return "perfect" results, with no broken or missing lines.
off_t new_offset = 0;
- if(m_scan_range_ind){//only upon range-scan
+ if (m_scan_range_ind){//only upon range-scan
int64_t sc=0;
int64_t start =0;
const char* row_delimiter = m_row_delimiter.c_str();
@@ -817,10 +836,10 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp,
ldpp_dout(this, 10) << "s3select query: per Trino request the first and last chunk should modified." << dendl;
//chop the head of the first chunk and only upon the slice does not include the head of the object.
- if(m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){
+ if (m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){
char* p = const_cast<char*>(it_cp+ofs);
while(strncmp(row_delimiter,p,1) && (p - (it_cp+ofs)) < len)p++;
- if(!strncmp(row_delimiter,p,1)){
+ if (!strncmp(row_delimiter,p,1)){
new_offset += (p - (it_cp+ofs))+1;
}
}
@@ -831,14 +850,14 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp,
//chop the end of the last chunk for this request
//if it's the last chunk, search for first row-delimiter for the following different use-cases
- if((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){
+ if ((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){
//had pass the requested range, start to search for first delimiter
- if(m_aws_response_handler.get_processed_size()>m_requested_range){
+ if (m_aws_response_handler.get_processed_size()>m_requested_range){
//the previous chunk contain the complete request(all data) and an extra bytes.
//thus, search for the first row-delimiter
//[:previous (RR) ... ][:current (RD) ]
start = 0;
- } else if(m_aws_response_handler.get_processed_size()){
+ } else if (m_aws_response_handler.get_processed_size()){
//the *current* chunk contain the complete request in the middle of the chunk.
//thus, search for the first row-delimiter after the complete request position
//[:current (RR) .... (RD) ]
@@ -852,7 +871,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp,
for(sc=start;sc<len;sc++)//assumption: a row-delimiter must exist or it is the end of the object
{
char* p = const_cast<char*>(it_cp) + ofs + sc;
- if(!strncmp(row_delimiter,p,1)){
+ if (!strncmp(row_delimiter,p,1)){
ldout(s->cct, 10) << "S3select: found row-delimiter on " << sc << " get_processed_size = " << m_aws_response_handler.get_processed_size() << dendl;
len = sc + 1;//+1 is for delimiter. TODO what about m_object_size_for_processing (to update according to len)
//the end of row exist in current chunk.
@@ -872,7 +891,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp,
int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t len)
{
int status = 0;
- if(m_skip_next_chunk == true){
+ if (m_skip_next_chunk == true){
return status;
}
@@ -894,13 +913,13 @@ int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t le
}
- if(ofs > it.length()){
+ if (ofs > it.length()){
//safety check
ldpp_dout(this, 10) << "offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl;
ofs = 0;
}
- if(m_is_trino_request){
+ if (m_is_trino_request){
//TODO replace len with it.length() ? ; test Trino flow with compressed objects.
//is it possible to send get-by-ranges? in parallel?
shape_chunk_per_trino_requests(&(it)[0], ofs, len);
@@ -964,7 +983,7 @@ int RGWSelectObj_ObjStore_S3::json_processing(bufferlist& bl, off_t ofs, off_t l
continue;
}
- if((ofs + len) > it.length()){
+ if ((ofs + len) > it.length()){
ldpp_dout(this, 10) << "s3select: offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl;
ofs = 0;
len = it.length();
@@ -1025,7 +1044,7 @@ int RGWSelectObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t ofs, off_
if (len == 0 && s->obj_size != 0) {
return 0;
}
- if (m_parquet_type) {
+ if (m_parquet_type) {//bufferlist sent back upon range-request
return parquet_processing(bl,ofs,len);
}
if (m_json_type) {
diff --git a/src/s3select b/src/s3select
-Subproject f333ec82e6e8a3f7eb9ba1041d1442b2c7cd0f0
+Subproject 0a0f6d439441f5b121ed1052dac54542e4f1d89
diff --git a/src/script/ceph-debug-docker.sh b/src/script/ceph-debug-docker.sh
index 76d3b126153..c3edc5df76d 100755
--- a/src/script/ceph-debug-docker.sh
+++ b/src/script/ceph-debug-docker.sh
@@ -117,13 +117,15 @@ FROM ${env}
WORKDIR /root
RUN apt-get update --yes --quiet && \
- apt-get install --yes --quiet screen gdb software-properties-common apt-transport-https curl
+ apt-get install --yes --quiet screen gdb software-properties-common apt-transport-https curl debuginfod ubuntu-dbgsym-keyring
COPY cephdev.asc cephdev.asc
RUN apt-key add cephdev.asc && \
curl -L $repo_url | tee /etc/apt/sources.list.d/ceph_dev.list && \
cat /etc/apt/sources.list.d/ceph_dev.list|sed -e 's/^deb/deb-src/' >>/etc/apt/sources.list.d/ceph_dev.list && \
apt-get update --yes && \
- DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get --assume-yes -q --no-install-recommends install -o Dpkg::Options::=--force-confnew --allow-unauthenticated ceph ceph-osd-dbg ceph-mds-dbg ceph-mgr-dbg ceph-mon-dbg ceph-common-dbg ceph-fuse-dbg ceph-test-dbg radosgw-dbg python3-cephfs python3-rados
+ DEBIAN_FRONTEND=noninteractive DEBIAN_PRIORITY=critical apt-get --assume-yes -q --no-install-recommends install -o Dpkg::Options::=--force-confnew --allow-unauthenticated ceph ceph-osd-dbg ceph-mds-dbg ceph-mgr-dbg ceph-mon-dbg ceph-common-dbg ceph-fuse-dbg ceph-test-dbg radosgw-dbg python3-cephfs python3-rados ; \
+ printf 'set debuginfod enabled on\n' | tee -a ~/.gdbinit
+ENV DEBUGINFOD_URLS="https://debuginfod.ubuntu.com"
EOF
time run $SUDO docker build $CACHE --tag "$tag" .
else
@@ -180,7 +182,7 @@ EOF
printf "built image %s\n" "$tag"
- run $SUDO docker run $PRIVILEGED -ti -v /ceph:/ceph:ro -v /cephfs:/cephfs:ro -v /teuthology:/teuthology:ro "$tag"
+ run $SUDO docker run $PRIVILEGED -ti -v /teuthology:/teuthology:ro "$tag"
return 0
}
diff --git a/src/spdk b/src/spdk
-Subproject 1a527e501f810e2b39b9862c96f3e8bdc465db8
+Subproject fcfcc4aab16419c49f208032ca77a0a8de80d35
diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h
index de84ede9049..8f6381dd52b 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.h
+++ b/src/test/ObjectMap/KeyValueDBMemory.h
@@ -69,7 +69,14 @@ public:
explicit TransactionImpl_(KeyValueDBMemory *db) : db(db) {}
-
+ // dummy implementation
+ size_t get_count() const override {
+ return 0;
+ }
+ // dummy implementation
+ size_t get_size_bytes() const override {
+ return 0;
+ }
struct SetOp : public Context {
KeyValueDBMemory *db;
std::pair<std::string,std::string> key;
diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc
index 69a3cbefd0e..dea29f96f11 100644
--- a/src/test/admin_socket.cc
+++ b/src/test/admin_socket.cc
@@ -27,6 +27,8 @@
#include <sys/un.h>
#include <signal.h>
+#include <iostream> // for std::cout
+
using namespace std;
class AdminSocketTest
diff --git a/src/test/admin_socket_output.h b/src/test/admin_socket_output.h
index 1df12e4a9a5..5d22e8757ee 100644
--- a/src/test/admin_socket_output.h
+++ b/src/test/admin_socket_output.h
@@ -16,6 +16,7 @@
#define CEPH_ADMIN_SOCKET_OUTPUT_H
#include <filesystem>
+#include <iostream> // for std::cout
#include <string>
#include <map>
#include <set>
diff --git a/src/test/bench_log.cc b/src/test/bench_log.cc
index 60fda462e87..9e7c02afc41 100644
--- a/src/test/bench_log.cc
+++ b/src/test/bench_log.cc
@@ -1,6 +1,8 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include <iostream> // for std::cout
+
#include "include/types.h"
#include "common/Thread.h"
#include "common/debug.h"
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index 013335d8177..4b3ca95ca6c 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -23,6 +23,8 @@
#include <errno.h>
#include <sys/uio.h>
+#include <iostream> // for std::cout
+
#include "include/buffer.h"
#include "include/buffer_raw.h"
#include "include/compat.h"
diff --git a/src/test/ceph_argparse.cc b/src/test/ceph_argparse.cc
index 738879c5ba8..436ddc86363 100644
--- a/src/test/ceph_argparse.cc
+++ b/src/test/ceph_argparse.cc
@@ -15,6 +15,7 @@
#include "common/ceph_argparse.h"
#include "gtest/gtest.h"
+#include <iostream> // for std::cout
#include <vector>
#include "include/stringify.h"
diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc
index b36d0a901de..6ca05f6dae3 100644
--- a/src/test/common/Throttle.cc
+++ b/src/test/common/Throttle.cc
@@ -23,6 +23,7 @@
#include <signal.h>
#include <chrono>
+#include <iostream> // for std::cout
#include <list>
#include <mutex>
#include <random>
diff --git a/src/test/common/test_cdc.cc b/src/test/common/test_cdc.cc
index 620ecf4679f..61a5aa3708c 100644
--- a/src/test/common/test_cdc.cc
+++ b/src/test/common/test_cdc.cc
@@ -3,6 +3,7 @@
#include <vector>
#include <cstring>
+#include <iostream> // for std::cout
#include <random>
#include "include/types.h"
diff --git a/src/test/common/test_config.cc b/src/test/common/test_config.cc
index a70d567a434..4805c14a32e 100644
--- a/src/test/common/test_config.cc
+++ b/src/test/common/test_config.cc
@@ -19,6 +19,9 @@
*
*
*/
+
+#include <iostream> // for std::cout
+
#include "common/config_proxy.h"
#include "common/errno.h"
#include "gtest/gtest.h"
diff --git a/src/test/common/test_context.cc b/src/test/common/test_context.cc
index 889d000da85..8afded98951 100644
--- a/src/test/common/test_context.cc
+++ b/src/test/common/test_context.cc
@@ -19,6 +19,9 @@
*
*
*/
+
+#include <iostream> // for std::cout
+
#include "gtest/gtest.h"
#include "include/types.h"
#include "include/msgr.h"
diff --git a/src/test/common/test_shared_cache.cc b/src/test/common/test_shared_cache.cc
index 91120c7e59f..b7a392426d5 100644
--- a/src/test/common/test_shared_cache.cc
+++ b/src/test/common/test_shared_cache.cc
@@ -22,6 +22,9 @@
#include <stdio.h>
#include <signal.h>
+
+#include <iostream> // for std::cout
+
#include "gtest/gtest.h"
#include "common/Thread.h"
#include "common/shared_cache.hpp"
diff --git a/src/test/common/test_url_escape.cc b/src/test/common/test_url_escape.cc
index 6c27b64da7a..52de8db8d9c 100644
--- a/src/test/common/test_url_escape.cc
+++ b/src/test/common/test_url_escape.cc
@@ -3,6 +3,8 @@
#include "common/url_escape.h"
+#include <iostream> // for std::cout
+
#include "gtest/gtest.h"
TEST(url_escape, escape) {
diff --git a/src/test/compressor/test_compression.cc b/src/test/compressor/test_compression.cc
index 98ef159dfb8..c5e4724cefc 100644
--- a/src/test/compressor/test_compression.cc
+++ b/src/test/compressor/test_compression.cc
@@ -17,6 +17,9 @@
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
+
+#include <iostream> // for std::cout
+
#include "gtest/gtest.h"
#include "common/ceph_context.h"
#include "common/config.h"
diff --git a/src/test/crimson/seastar_runner.h b/src/test/crimson/seastar_runner.h
index 63cc50d9f05..590eef13adf 100644
--- a/src/test/crimson/seastar_runner.h
+++ b/src/test/crimson/seastar_runner.h
@@ -71,6 +71,19 @@ struct SeastarRunner {
auto ret = app.run(argc, argv, [this] {
on_end.reset(new seastar::readable_eventfd);
return seastar::now().then([this] {
+// FIXME: The stall detector uses the glibc backtrace function to
+// collect backtraces, which causes ASAN failures on ARM.
+// For now we just extend the timeout duration to 10000h to get the same
+// effect as disabling the stall detector, which seastar does not support directly.
+// Ticket about migrating to libunwind: https://github.com/scylladb/seastar/issues/1878
+// Will remove once that ticket is fixed.
+// Ceph ticket: https://tracker.ceph.com/issues/65635
+#ifdef __aarch64__
+ seastar::smp::invoke_on_all([] {
+ using namespace std::chrono;
+ seastar::engine().update_blocked_reactor_notify_ms(duration_cast<milliseconds>(10000h));
+ }).get();
+#endif
begin_signaled = true;
[[maybe_unused]] auto r = ::eventfd_write(begin_fd.get(), APP_RUNNING);
assert(r == 0);
diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc
index e7aabf2c8af..5dbc3748e5b 100644
--- a/src/test/crimson/seastore/test_object_data_handler.cc
+++ b/src/test/crimson/seastore/test_object_data_handler.cc
@@ -218,14 +218,20 @@ struct object_data_handler_test_t:
objaddr_t offset,
extent_len_t length) {
auto ret = with_trans_intr(t, [&](auto &t) {
- return tm->get_pins(t, laddr_t::from_byte_offset(offset), length);
+ auto &layout = onode->get_layout();
+ auto odata = layout.object_data.get();
+ auto obase = odata.get_reserved_data_base();
+ return tm->get_pins(t, (obase + offset).checked_to_laddr(), length);
}).unsafe_get();
return ret;
}
std::list<LBAMappingRef> get_mappings(objaddr_t offset, extent_len_t length) {
auto t = create_mutate_transaction();
auto ret = with_trans_intr(*t, [&](auto &t) {
- return tm->get_pins(t, laddr_t::from_byte_offset(offset), length);
+ auto &layout = onode->get_layout();
+ auto odata = layout.object_data.get();
+ auto obase = odata.get_reserved_data_base();
+ return tm->get_pins(t, (obase + offset).checked_to_laddr(), length);
}).unsafe_get();
return ret;
}
@@ -253,12 +259,16 @@ struct object_data_handler_test_t:
ObjectDataBlockRef get_extent(
Transaction &t,
- laddr_t addr,
+ loffset_t addr,
extent_len_t len) {
+ auto &layout = onode->get_layout();
+ auto odata = layout.object_data.get();
+ auto obase = odata.get_reserved_data_base();
auto ext = with_trans_intr(t, [&](auto& trans) {
- return tm->read_extent<ObjectDataBlock>(trans, addr, len);
- }).unsafe_get();
- EXPECT_EQ(addr, ext->get_laddr());
+ return tm->read_extent<ObjectDataBlock>(
+ trans, (obase + addr).checked_to_laddr(), len);
+ }).unsafe_get();
+ EXPECT_EQ((obase + addr).checked_to_laddr(), ext->get_laddr());
return ext;
}
@@ -798,7 +808,7 @@ TEST_P(object_data_handler_test_t, overwrite_then_read_within_transaction) {
auto pins = get_mappings(*t, base, len);
assert(pins.size() == 1);
auto pin1 = remap_pin(*t, std::move(pins.front()), 4096, 8192);
- auto ext = get_extent(*t, laddr_t::from_byte_offset(base + 4096), 4096 * 2);
+ auto ext = get_extent(*t, base + 4096, 4096 * 2);
ASSERT_TRUE(ext->is_exist_clean());
write(*t, base + 4096, 4096, 'y');
ASSERT_TRUE(ext->is_exist_mutation_pending());
diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc
index 6ad111dca5b..2d20c5fff94 100644
--- a/src/test/crimson/seastore/test_transaction_manager.cc
+++ b/src/test/crimson/seastore/test_transaction_manager.cc
@@ -26,6 +26,10 @@ namespace {
}
}
+laddr_t get_laddr_hint(uint64_t offset) {
+ return laddr_t::from_byte_offset(RootMetaBlock::SIZE + offset);
+}
+
struct test_extent_record_t {
test_extent_desc_t desc;
unsigned refcount = 0;
@@ -67,8 +71,9 @@ struct transaction_manager_test_t :
}
laddr_t get_random_laddr(size_t block_size, size_t limit) {
- return laddr_t::from_byte_offset(block_size *
- std::uniform_int_distribution<>(0, (limit / block_size) - 1)(gen));
+ auto offset = block_size *
+ std::uniform_int_distribution<>(0, (limit / block_size) - 1)(gen);
+ return get_laddr_hint(offset);
}
char get_random_contents() {
@@ -719,7 +724,7 @@ struct transaction_manager_test_t :
[this, &overlay](auto &t) {
return lba_manager->scan_mappings(
t,
- L_ADDR_MIN,
+ get_laddr_hint(0),
L_ADDR_MAX,
[iter=overlay.begin(), &overlay](auto l, auto p, auto len) mutable {
EXPECT_NE(iter, overlay.end());
@@ -830,9 +835,9 @@ struct transaction_manager_test_t :
auto t = create_transaction();
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * BSIZE),
+ get_laddr_hint(i * BSIZE),
BSIZE);
- ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr());
+ ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr());
submit_transaction(std::move(t));
}
@@ -844,7 +849,7 @@ struct transaction_manager_test_t :
boost::make_counting_iterator(0lu),
boost::make_counting_iterator(BLOCKS),
[this, &t](auto i) {
- return tm->read_extent<TestBlock>(t, laddr_t::from_byte_offset(i * BSIZE), BSIZE
+ return tm->read_extent<TestBlock>(t, get_laddr_hint(i * BSIZE), BSIZE
).si_then([](auto) {
return seastar::now();
});
@@ -870,9 +875,9 @@ struct transaction_manager_test_t :
auto t = create_transaction();
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * BSIZE),
+ get_laddr_hint(i * BSIZE),
BSIZE);
- ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr());
+ ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr());
if (try_submit_transaction(std::move(t)))
break;
}
@@ -973,6 +978,7 @@ struct transaction_manager_test_t :
extent_types_t::ROOT,
extent_types_t::LADDR_INTERNAL,
extent_types_t::LADDR_LEAF,
+ extent_types_t::ROOT_META,
extent_types_t::OMAP_INNER,
extent_types_t::OMAP_LEAF,
extent_types_t::ONODE_BLOCK_STAGED,
@@ -1346,9 +1352,9 @@ struct transaction_manager_test_t :
void test_remap_pin() {
run_async([this] {
disable_max_extent_size();
- laddr_t l_offset = laddr_t::from_byte_offset(32 << 10);
+ laddr_t l_offset = get_laddr_hint(32 << 10);
size_t l_len = 32 << 10;
- laddr_t r_offset = laddr_t::from_byte_offset(64 << 10);
+ laddr_t r_offset = get_laddr_hint(64 << 10);
size_t r_len = 32 << 10;
{
auto t = create_transaction();
@@ -1400,12 +1406,12 @@ struct transaction_manager_test_t :
void test_clone_and_remap_pin() {
run_async([this] {
disable_max_extent_size();
- laddr_t l_offset = laddr_t::from_byte_offset(32 << 10);
+ laddr_t l_offset = get_laddr_hint(32 << 10);
size_t l_len = 32 << 10;
- laddr_t r_offset = laddr_t::from_byte_offset(64 << 10);
+ laddr_t r_offset = get_laddr_hint(64 << 10);
size_t r_len = 32 << 10;
- laddr_t l_clone_offset = laddr_t::from_byte_offset(96 << 10);
- laddr_t r_clone_offset = laddr_t::from_byte_offset(128 << 10);
+ laddr_t l_clone_offset = get_laddr_hint(96 << 10);
+ laddr_t r_clone_offset = get_laddr_hint(128 << 10);
{
auto t = create_transaction();
auto lext = alloc_extent(t, l_offset, l_len);
@@ -1455,11 +1461,11 @@ struct transaction_manager_test_t :
void test_overwrite_pin() {
run_async([this] {
disable_max_extent_size();
- laddr_t m_offset = laddr_t::from_byte_offset(8 << 10);
+ laddr_t m_offset = get_laddr_hint(8 << 10);
size_t m_len = 56 << 10;
- laddr_t l_offset = laddr_t::from_byte_offset(64 << 10);
+ laddr_t l_offset = get_laddr_hint(64 << 10);
size_t l_len = 64 << 10;
- laddr_t r_offset = laddr_t::from_byte_offset(128 << 10);
+ laddr_t r_offset = get_laddr_hint(128 << 10);
size_t r_len = 64 << 10;
{
auto t = create_transaction();
@@ -1538,7 +1544,7 @@ struct transaction_manager_test_t :
run_async([this] {
disable_max_extent_size();
constexpr unsigned REMAP_NUM = 32;
- constexpr laddr_t offset = L_ADDR_MIN;
+ laddr_t offset = get_laddr_hint(0);
constexpr size_t length = 256 << 10;
{
auto t = create_transaction();
@@ -1575,7 +1581,7 @@ struct transaction_manager_test_t :
if (off == 0 || off >= 255) {
continue;
}
- auto new_off = laddr_t::from_byte_offset(off << 10)
+ auto new_off = get_laddr_hint(off << 10)
.get_byte_distance<extent_len_t>(last_pin->get_key());
auto new_len = last_pin->get_length() - new_off;
//always remap right extent at new split_point
@@ -1621,7 +1627,7 @@ struct transaction_manager_test_t :
run_async([this] {
disable_max_extent_size();
constexpr unsigned REMAP_NUM = 32;
- constexpr laddr_t offset = L_ADDR_MIN;
+ laddr_t offset = get_laddr_hint(0);
constexpr size_t length = 256 << 10;
{
auto t = create_transaction();
@@ -1661,12 +1667,12 @@ struct transaction_manager_test_t :
ASSERT_TRUE(!split_points.empty());
while(!split_points.empty()) {
// new overwrite area: start_off ~ end_off
- auto start_off = split_points.front();
+ auto start_off = split_points.front() + 4 /*RootMetaBlock*/;
split_points.pop_front();
- auto end_off = split_points.front();
+ auto end_off = split_points.front() + 4 /*RootMetaBlock*/;
split_points.pop_front();
ASSERT_TRUE(start_off <= end_off);
- if ((laddr_t::from_byte_offset(end_off << 10) == pin0->get_key() + pin0->get_length())
+ if ((get_laddr_hint(end_off << 10) == pin0->get_key() + pin0->get_length())
|| (start_off == end_off)) {
if (split_points.empty() && empty_transaction) {
early_exit++;
@@ -1675,7 +1681,7 @@ struct transaction_manager_test_t :
continue;
}
empty_transaction = false;
- auto new_off = laddr_t::from_byte_offset(start_off << 10)
+ auto new_off = get_laddr_hint(start_off << 10)
.get_byte_distance<extent_len_t>(last_rpin->get_key());
auto new_len = (end_off - start_off) << 10;
bufferlist bl;
@@ -1768,7 +1774,7 @@ struct tm_random_block_device_test_t :
TEST_P(tm_random_block_device_test_t, scatter_allocation)
{
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * 4096);
+ laddr_t ADDR = get_laddr_hint(0xFF * 4096);
epm->prefill_fragmented_devices();
auto t = create_transaction();
for (int i = 0; i < 1991; i++) {
@@ -1786,7 +1792,7 @@ TEST_P(tm_single_device_test_t, basic)
{
constexpr size_t SIZE = 4096;
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE);
+ laddr_t ADDR = get_laddr_hint(0xFF * SIZE);
{
auto t = create_transaction();
auto extent = alloc_extent(
@@ -1807,7 +1813,7 @@ TEST_P(tm_single_device_test_t, mutate)
{
constexpr size_t SIZE = 4096;
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE);
+ laddr_t ADDR = get_laddr_hint(0xFF * SIZE);
{
auto t = create_transaction();
auto extent = alloc_extent(
@@ -1845,8 +1851,8 @@ TEST_P(tm_single_device_test_t, allocate_lba_conflict)
{
constexpr size_t SIZE = 4096;
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE);
- laddr_t ADDR2 = laddr_t::from_byte_offset(0xFE * SIZE);
+ laddr_t ADDR = get_laddr_hint(0xFF * SIZE);
+ laddr_t ADDR2 = get_laddr_hint(0xFE * SIZE);
auto t = create_transaction();
auto t2 = create_transaction();
@@ -1883,7 +1889,7 @@ TEST_P(tm_single_device_test_t, mutate_lba_conflict)
for (unsigned i = 0; i < 300; ++i) {
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * SIZE),
+ get_laddr_hint(i * SIZE),
SIZE);
}
check_mappings(t);
@@ -1891,7 +1897,7 @@ TEST_P(tm_single_device_test_t, mutate_lba_conflict)
check();
}
- laddr_t ADDR = laddr_t::from_byte_offset(150 * SIZE);
+ laddr_t ADDR = get_laddr_hint(150 * SIZE);
{
auto t = create_transaction();
auto t2 = create_transaction();
@@ -1917,15 +1923,15 @@ TEST_P(tm_single_device_test_t, concurrent_mutate_lba_no_conflict)
{
constexpr size_t SIZE = 4096;
constexpr size_t NUM = 500;
- laddr_t addr = L_ADDR_MIN;
- laddr_t addr2 = laddr_t::from_byte_offset(SIZE * (NUM - 1));
+ laddr_t addr = get_laddr_hint(0);
+ laddr_t addr2 = get_laddr_hint(SIZE * (NUM - 1));
run_async([this, addr, addr2] {
{
auto t = create_transaction();
for (unsigned i = 0; i < NUM; ++i) {
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * SIZE),
+ get_laddr_hint(i * SIZE),
SIZE);
}
submit_transaction(std::move(t));
@@ -1949,7 +1955,7 @@ TEST_P(tm_single_device_test_t, create_remove_same_transaction)
{
constexpr size_t SIZE = 4096;
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE);
+ laddr_t ADDR = get_laddr_hint(0xFF * SIZE);
{
auto t = create_transaction();
auto extent = alloc_extent(
@@ -1985,7 +1991,7 @@ TEST_P(tm_single_device_test_t, split_merge_read_same_transaction)
for (unsigned i = 0; i < 300; ++i) {
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * SIZE),
+ get_laddr_hint(i * SIZE),
SIZE);
}
check_mappings(t);
@@ -1997,7 +2003,7 @@ TEST_P(tm_single_device_test_t, split_merge_read_same_transaction)
for (unsigned i = 0; i < 240; ++i) {
dec_ref(
t,
- laddr_t::from_byte_offset(i * SIZE));
+ get_laddr_hint(i * SIZE));
}
check_mappings(t);
submit_transaction(std::move(t));
@@ -2010,7 +2016,7 @@ TEST_P(tm_single_device_test_t, inc_dec_ref)
{
constexpr size_t SIZE = 4096;
run_async([this] {
- laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE);
+ laddr_t ADDR = get_laddr_hint(0xFF * SIZE);
{
auto t = create_transaction();
auto extent = alloc_extent(
@@ -2061,10 +2067,10 @@ TEST_P(tm_single_device_test_t, cause_lba_split)
auto t = create_transaction();
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * SIZE),
+ get_laddr_hint(i * SIZE),
SIZE,
(char)(i & 0xFF));
- ASSERT_EQ(laddr_t::from_byte_offset(i * SIZE), extent->get_laddr());
+ ASSERT_EQ(get_laddr_hint(i * SIZE), extent->get_laddr());
submit_transaction(std::move(t));
}
check();
@@ -2082,9 +2088,9 @@ TEST_P(tm_single_device_test_t, random_writes)
auto t = create_transaction();
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * BSIZE),
+ get_laddr_hint(i * BSIZE),
BSIZE);
- ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr());
+ ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr());
submit_transaction(std::move(t));
}
@@ -2100,7 +2106,7 @@ TEST_P(tm_single_device_test_t, random_writes)
// pad out transaction
auto paddings = alloc_extents(
t,
- laddr_t::from_byte_offset(TOTAL + (k * PADDING_SIZE)),
+ get_laddr_hint(TOTAL + (k * PADDING_SIZE)),
PADDING_SIZE);
for (auto &padding : paddings) {
dec_ref(t, padding->get_laddr());
@@ -2133,7 +2139,7 @@ TEST_P(tm_single_device_test_t, find_hole_assert_trigger)
TEST_P(tm_single_device_intergrity_check_test_t, remap_lazy_read)
{
- constexpr laddr_t offset = L_ADDR_MIN;
+ laddr_t offset = get_laddr_hint(0);
constexpr size_t length = 256 << 10;
run_async([this, offset] {
disable_max_extent_size();
@@ -2186,7 +2192,7 @@ TEST_P(tm_single_device_test_t, invalid_lba_mapping_detect)
for (int i = 0; i < LEAF_NODE_CAPACITY; i++) {
auto extent = alloc_extent(
t,
- laddr_t::from_byte_offset(i * 4096),
+ get_laddr_hint(i * 4096),
4096,
'a');
}
@@ -2195,12 +2201,12 @@ TEST_P(tm_single_device_test_t, invalid_lba_mapping_detect)
{
auto t = create_transaction();
- auto pin = get_pin(t, laddr_t::from_byte_offset((LEAF_NODE_CAPACITY - 1) * 4096));
+ auto pin = get_pin(t, get_laddr_hint((LEAF_NODE_CAPACITY - 1) * 4096));
assert(pin->is_parent_viewable());
- auto extent = alloc_extent(t, laddr_t::from_byte_offset(LEAF_NODE_CAPACITY * 4096), 4096, 'a');
+ auto extent = alloc_extent(t, get_laddr_hint(LEAF_NODE_CAPACITY * 4096), 4096, 'a');
assert(!pin->is_parent_viewable());
- pin = get_pin(t, laddr_t::from_byte_offset(LEAF_NODE_CAPACITY * 4096));
- std::ignore = alloc_extent(t, laddr_t::from_byte_offset((LEAF_NODE_CAPACITY + 1) * 4096), 4096, 'a');
+ pin = get_pin(t, get_laddr_hint(LEAF_NODE_CAPACITY * 4096));
+ std::ignore = alloc_extent(t, get_laddr_hint((LEAF_NODE_CAPACITY + 1) * 4096), 4096, 'a');
assert(pin->is_parent_viewable());
assert(pin->parent_modified());
pin->maybe_fix_pos();
diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc
index df743327aaa..7e058c80ed6 100644
--- a/src/test/crimson/test_backfill.cc
+++ b/src/test/crimson/test_backfill.cc
@@ -91,9 +91,11 @@ struct FakePrimary {
eversion_t last_update;
eversion_t projected_last_update;
eversion_t log_tail;
+ PGLog pg_log;
+ PGLog::IndexedLog projected_log;
FakePrimary(FakeStore&& store)
- : store(std::move(store)) {
+ : store(std::move(store)), pg_log(nullptr) {
}
};
@@ -234,6 +236,10 @@ struct BackfillFixture::PeeringFacade
return backfill_source.log_tail;
}
+ const PGLog& get_pg_log() const override {
+ return backfill_source.pg_log;
+ }
+
void scan_log_after(eversion_t, scan_log_func_t) const override {
/* NOP */
}
@@ -263,6 +269,11 @@ struct BackfillFixture::PGFacade : public crimson::osd::BackfillState::PGFacade
const eversion_t& get_projected_last_update() const override {
return backfill_source.projected_last_update;
}
+
+ const PGLog::IndexedLog& get_projected_log() const override {
+ return backfill_source.projected_log;
+ }
+
};
BackfillFixture::BackfillFixture(
diff --git a/src/test/crypto.cc b/src/test/crypto.cc
index 819d41c7218..67fb440eeb9 100644
--- a/src/test/crypto.cc
+++ b/src/test/crypto.cc
@@ -1,6 +1,8 @@
#include <errno.h>
#include <time.h>
+#include <iostream> // for std::cout
+
#include <boost/container/small_vector.hpp>
#include "gtest/gtest.h"
diff --git a/src/test/daemon_config.cc b/src/test/daemon_config.cc
index cdea3b05932..4c7abd70b20 100644
--- a/src/test/daemon_config.cc
+++ b/src/test/daemon_config.cc
@@ -21,6 +21,8 @@
#include "include/rados/librados.h"
#include <errno.h>
+
+#include <iostream> // for std::cout
#include <sstream>
#include <string>
#include <string.h>
diff --git a/src/test/encoding.cc b/src/test/encoding.cc
index 3c83716b048..3d508909d6d 100644
--- a/src/test/encoding.cc
+++ b/src/test/encoding.cc
@@ -4,6 +4,8 @@
#include <fmt/format.h>
#include "gtest/gtest.h"
+#include <iostream> // for std::cout
+
using namespace std;
template < typename T >
diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index c86e58697c0..00054bf4784 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -52,9 +52,9 @@ int ErasureCodeBench::setup(int argc, char** argv) {
desc.add_options()
("help,h", "produce help message")
("verbose,v", "explain what happens")
- ("size,s", po::value<int>()->default_value(1024 * 1024),
+ ("size,s", po::value<int>()->default_value(80 * 1024 * 1024),
"size of the buffer to be encoded")
- ("iterations,i", po::value<int>()->default_value(1),
+ ("iterations,i", po::value<int>()->default_value(100),
"number of encode/decode runs")
("plugin,p", po::value<string>()->default_value("jerasure"),
"erasure code plugin name")
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 2dc870411bb..062980d8655 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -23,6 +23,8 @@
#include "common/Thread.h"
#include "include/stringify.h"
#include "osd/ReplicatedBackend.h"
+
+#include <iostream> // for std::cout
#include <sstream>
using namespace std;
diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc
index 0bfdd48eb98..b32c5660112 100644
--- a/src/test/osdc/object_cacher_stress.cc
+++ b/src/test/osdc/object_cacher_stress.cc
@@ -23,6 +23,7 @@
#include "MemWriteback.h"
#include <atomic>
+#include <iostream> // for std::cout
using namespace std;
diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc
index 7ab9561bc19..b75e6a50825 100644
--- a/src/test/perf_counters.cc
+++ b/src/test/perf_counters.cc
@@ -15,7 +15,7 @@
#include "include/types.h" // FIXME: ordering shouldn't be important, but right
// now, this include has to come before the others.
-
+#include "include/utime.h"
#include "common/perf_counters_key.h"
#include "common/perf_counters_collection.h"
#include "common/admin_socket_client.h"
diff --git a/src/test/pybind/test_cephfs.py b/src/test/pybind/test_cephfs.py
index 3761056efdf..577cb9e4171 100644
--- a/src/test/pybind/test_cephfs.py
+++ b/src/test/pybind/test_cephfs.py
@@ -610,10 +610,10 @@ def test_ftruncate(testdir):
def test_fallocate(testdir):
fd = cephfs.open(b'/file-fallocate', 'w', 0o755)
assert_raises(TypeError, cephfs.fallocate, b'/file-fallocate', 0, 10)
- cephfs.fallocate(fd, 0, 10)
+ assert_raises(libcephfs.OperationNotSupported, cephfs.fallocate, fd, 0, 10)
stat = cephfs.fsync(fd, 0)
st = cephfs.fstat(fd)
- assert_equal(st.st_size, 10)
+ assert_equal(st.st_size, 0)
cephfs.close(fd)
cephfs.unlink(b'/file-fallocate')
diff --git a/src/test/strtol.cc b/src/test/strtol.cc
index ec3f6715b8e..aac52c6627f 100644
--- a/src/test/strtol.cc
+++ b/src/test/strtol.cc
@@ -184,6 +184,18 @@ TEST(IECStrToLL, WithUnits) {
units["Ti"] = 40;
units["Pi"] = 50;
units["Ei"] = 60;
+ units["KB"] = 10;
+ units["MB"] = 20;
+ units["GB"] = 30;
+ units["TB"] = 40;
+ units["PB"] = 50;
+ units["EB"] = 60;
+ units["KiB"] = 10;
+ units["MiB"] = 20;
+ units["GiB"] = 30;
+ units["TiB"] = 40;
+ units["PiB"] = 50;
+ units["EiB"] = 60;
for (std::map<std::string,int>::iterator p = units.begin();
p != units.end(); ++p) {
@@ -259,6 +271,21 @@ TEST(StrictIECCast, Error) {
}
{
std::string err;
+ (void)strict_iec_cast<int64_t>("1GT", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int64_t>("1TG", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
+ (void)strict_iec_cast<int64_t>("1KD", &err);
+ ASSERT_NE(err, "");
+ }
+ {
+ std::string err;
(void)strict_iec_cast<int64_t>("2E", &err);
ASSERT_EQ(err, "");
}
diff --git a/src/test/test_addrs.cc b/src/test/test_addrs.cc
index 4062d0431c3..e70d234d743 100644
--- a/src/test/test_addrs.cc
+++ b/src/test/test_addrs.cc
@@ -17,6 +17,7 @@
#include "msg/msg_types.h"
#include "gtest/gtest.h"
+#include <iostream> // for std::cout
#include <sstream>
using namespace std;
diff --git a/src/test/test_denc.cc b/src/test/test_denc.cc
index 02dd1454ef8..c9144bd05bf 100644
--- a/src/test/test_denc.cc
+++ b/src/test/test_denc.cc
@@ -15,6 +15,8 @@
*/
#include <stdio.h>
+
+#include <iostream> // for std::cout
#include <numeric>
#include "global/global_init.h"
diff --git a/src/test/test_features.cc b/src/test/test_features.cc
index 1ae758bfb34..bdd8838224b 100644
--- a/src/test/test_features.cc
+++ b/src/test/test_features.cc
@@ -2,6 +2,8 @@
// vim: ts=8 sw=2 smarttab
#include <stdio.h>
+#include <iostream> // for std::cout
+
#include "global/global_init.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
diff --git a/src/test/test_mempool.cc b/src/test/test_mempool.cc
index bb46b19aa4e..b806282d039 100644
--- a/src/test/test_mempool.cc
+++ b/src/test/test_mempool.cc
@@ -16,6 +16,8 @@
#include <stdio.h>
+#include <iostream> // for std::cout
+
#include "global/global_init.h"
#include "common/ceph_argparse.h"
#include "global/global_context.h"
diff --git a/src/test/test_perf_counters_cache.cc b/src/test/test_perf_counters_cache.cc
index 1fa147ee273..fa2d541b7f7 100644
--- a/src/test/test_perf_counters_cache.cc
+++ b/src/test/test_perf_counters_cache.cc
@@ -4,6 +4,7 @@
#include "global/global_context.h"
#include "global/global_init.h"
#include "include/msgr.h" // for CEPH_ENTITY_TYPE_CLIENT
+#include "include/utime.h"
#include "gtest/gtest.h"
using namespace ceph::perf_counters;
diff --git a/src/test/test_rewrite_latency.cc b/src/test/test_rewrite_latency.cc
index 348c8dde5c6..48a95cf183b 100644
--- a/src/test/test_rewrite_latency.cc
+++ b/src/test/test_rewrite_latency.cc
@@ -1,5 +1,6 @@
#include <unistd.h>
+#include <iostream> // for std::cout
#include <map>
#include <errno.h>
diff --git a/src/test/test_snap_mapper.cc b/src/test/test_snap_mapper.cc
index a47d2538c3a..7a9ac62defe 100644
--- a/src/test/test_snap_mapper.cc
+++ b/src/test/test_snap_mapper.cc
@@ -1,4 +1,5 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include <iostream> // for std::cout
#include <iterator>
#include <map>
#include <set>
diff --git a/src/test/test_striper.cc b/src/test/test_striper.cc
index ee70304ebc8..1e5f93a49be 100644
--- a/src/test/test_striper.cc
+++ b/src/test/test_striper.cc
@@ -3,6 +3,8 @@
#include "osdc/Striper.h"
+#include <iostream> // for std::cout
+
using namespace std;
TEST(Striper, Stripe1)
diff --git a/src/test/test_utime.cc b/src/test/test_utime.cc
index b1cee0e805c..de1d6f46878 100644
--- a/src/test/test_utime.cc
+++ b/src/test/test_utime.cc
@@ -1,4 +1,7 @@
#include "include/utime.h"
+
+#include <iostream> // for std::cout
+
#include "gtest/gtest.h"
#include "include/stringify.h"
#include "common/ceph_context.h"
diff --git a/src/test/test_workqueue.cc b/src/test/test_workqueue.cc
index 771b9d65952..5c2fc459da2 100644
--- a/src/test/test_workqueue.cc
+++ b/src/test/test_workqueue.cc
@@ -1,6 +1,9 @@
#include "gtest/gtest.h"
#include "common/WorkQueue.h"
+
+#include <iostream> // for std::cout
+
#include "common/ceph_argparse.h"
using namespace std;
diff --git a/src/test/testcrypto.cc b/src/test/testcrypto.cc
index 2efb9b219b9..8e3337babea 100644
--- a/src/test/testcrypto.cc
+++ b/src/test/testcrypto.cc
@@ -1,6 +1,8 @@
#include "auth/Crypto.h"
-#include "common/Clock.h"
+#include <iostream> // for std::cout
+
+#include "common/Clock.h"
#include "common/config.h"
#include "common/debug.h"
diff --git a/src/test/testkeys.cc b/src/test/testkeys.cc
index 85d0b56676f..dacddb08786 100644
--- a/src/test/testkeys.cc
+++ b/src/test/testkeys.cc
@@ -1,4 +1,7 @@
#include "auth/cephx/CephxKeyServer.h"
+
+#include <iostream> // for std::cout
+
#include "common/ceph_argparse.h"
#include "global/global_init.h"
#include "common/config.h"
diff --git a/src/tools/radosacl.cc b/src/tools/radosacl.cc
index 3bfef8fb157..a6c9b9f8dc4 100644
--- a/src/tools/radosacl.cc
+++ b/src/tools/radosacl.cc
@@ -16,6 +16,8 @@
#include <time.h>
#include <errno.h>
+#include <iostream> // for std::cerr
+
#include "include/types.h"
#include "include/rados/librados.hpp"
diff --git a/src/vstart.sh b/src/vstart.sh
index 4b62db677d1..45d3ba9b070 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -192,7 +192,6 @@ if [[ "$(get_cmake_variable WITH_MGR_DASHBOARD_FRONTEND)" != "ON" ]] ||
debug echo "ceph-mgr dashboard not built - disabling."
with_mgr_dashboard=false
fi
-with_mgr_restful=false
kstore_path=
declare -a block_devs
@@ -205,7 +204,6 @@ VSTART_SEC="client.vstart.sh"
MON_ADDR=""
DASH_URLS=""
-RESTFUL_URLS=""
conf_fn="$CEPH_CONF_PATH/ceph.conf"
keyring_fn="$CEPH_CONF_PATH/keyring"
@@ -558,9 +556,6 @@ case $1 in
--without-dashboard)
with_mgr_dashboard=false
;;
- --with-restful)
- with_mgr_restful=true
- ;;
--seastore-device-size)
seastore_size="$2"
shift
@@ -782,9 +777,6 @@ prepare_conf() {
if $with_mgr_dashboard; then
mgr_modules+=" dashboard"
fi
- if $with_mgr_restful; then
- mgr_modules+=" restful"
- fi
local msgr_conf=''
if [ $msgr -eq 21 ]; then
@@ -1010,7 +1002,7 @@ EOF
; see src/vstart.sh for more info
public bind addr =
EOF
- fi
+ fi
}
write_logrotate_conf() {
@@ -1254,22 +1246,6 @@ EOF
fi
}
-create_mgr_restful_secret() {
- while ! ceph_adm -h | grep -c -q ^restful ; do
- debug echo 'waiting for mgr restful module to start'
- sleep 1
- done
- local secret_file
- if ceph_adm restful create-self-signed-cert > /dev/null; then
- secret_file=`mktemp`
- ceph_adm restful create-key admin -o $secret_file
- RESTFUL_SECRET=`cat $secret_file`
- rm $secret_file
- else
- debug echo MGR Restful is not working, perhaps the package is not installed?
- fi
-}
-
start_mgr() {
local mgr=0
local ssl=${DASHBOARD_SSL:-1}
@@ -1309,15 +1285,7 @@ EOF
MGR_PORT=$(($MGR_PORT + 1000))
ceph_adm config set mgr mgr/prometheus/$name/server_port $PROMETHEUS_PORT --force
PROMETHEUS_PORT=$(($PROMETHEUS_PORT + 1000))
-
- ceph_adm config set mgr mgr/restful/$name/server_port $MGR_PORT --force
- if [ $mgr -eq 1 ]; then
- RESTFUL_URLS="https://$IP:$MGR_PORT"
- else
- RESTFUL_URLS+=", https://$IP:$MGR_PORT"
- fi
- MGR_PORT=$(($MGR_PORT + 1000))
- fi
+ fi
debug echo "Starting mgr.${name}"
run 'mgr' $name $CEPH_BIN/ceph-mgr -i $name $ARGS
@@ -1327,7 +1295,7 @@ EOF
debug echo 'waiting for mgr to become available'
sleep 1
done
-
+
if [ "$new" -eq 1 ]; then
# setting login credentials for dashboard
if $with_mgr_dashboard; then
@@ -1353,9 +1321,6 @@ EOF
ceph_adm dashboard nvmeof-gateway-add -i <(echo "${NVMEOF_GW}") "${NVMEOF_GW/:/_}"
fi
fi
- if $with_mgr_restful; then
- create_mgr_restful_secret
- fi
fi
if [ "$cephadm" -eq 1 ]; then
@@ -2046,12 +2011,6 @@ dashboard urls: $DASH_URLS
w/ user/pass: admin / admin
EOF
fi
- if $with_mgr_restful; then
- cat <<EOF
-restful urls: $RESTFUL_URLS
- w/ user/pass: admin / $RESTFUL_SECRET
-EOF
- fi
fi
echo ""