diff options
Diffstat (limited to 'src')
198 files changed, 1625 insertions, 665 deletions
diff --git a/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py b/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py index ba3719cd3f3..aa11d553723 100644 --- a/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py +++ b/src/ceph-volume/ceph_volume/objectstore/lvmbluestore.py @@ -367,7 +367,7 @@ class LvmBlueStore(BlueStore): if is_encrypted: osd_lv_path = '/dev/mapper/%s' % osd_block_lv.__dict__['lv_uuid'] lockbox_secret = osd_block_lv.tags['ceph.cephx_lockbox_secret'] - self.with_tpm = bool(osd_block_lv.tags.get('ceph.with_tpm', 0)) + self.with_tpm = osd_block_lv.tags.get('ceph.with_tpm') == '1' if not self.with_tpm: encryption_utils.write_lockbox_keyring(osd_id, osd_fsid, diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index 5a917fa807c..ba8726a2be3 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -81,7 +81,7 @@ static void handle_mds_signal(int signum) int main(int argc, const char **argv) { - ceph_pthread_setname(pthread_self(), "ceph-mds"); + ceph_pthread_setname("ceph-mds"); auto args = argv_to_vec(argc, argv); if (args.empty()) { diff --git a/src/ceph_mgr.cc b/src/ceph_mgr.cc index 67bda0c51be..bd2c643bc6b 100644 --- a/src/ceph_mgr.cc +++ b/src/ceph_mgr.cc @@ -41,7 +41,7 @@ static void usage() */ int main(int argc, const char **argv) { - ceph_pthread_setname(pthread_self(), "ceph-mgr"); + ceph_pthread_setname("ceph-mgr"); auto args = argv_to_vec(argc, argv); if (args.empty()) { diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 279fdb20ccb..63eb252e38f 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -250,7 +250,7 @@ int main(int argc, const char **argv) { // reset our process name, in case we did a respawn, so that it's not // left as "exe". 
- ceph_pthread_setname(pthread_self(), "ceph-mon"); + ceph_pthread_setname("ceph-mon"); int err; diff --git a/src/ceph_nvmeof_monitor_client.cc b/src/ceph_nvmeof_monitor_client.cc index 05457998cb8..fa41bed08ad 100644 --- a/src/ceph_nvmeof_monitor_client.cc +++ b/src/ceph_nvmeof_monitor_client.cc @@ -45,7 +45,7 @@ static void usage() */ int main(int argc, const char **argv) { - ceph_pthread_setname(pthread_self(), "ceph-nvmeof-monitor-client"); + ceph_pthread_setname("ceph-nvmeof-monitor-client"); auto args = argv_to_vec(argc, argv); if (args.empty()) { diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index f2172bf9083..d2ddf564116 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -57,6 +57,7 @@ from cephadmlib.constants import ( LOG_DIR_MODE, SYSCTL_DIR, UNIT_DIR, + DAEMON_FAILED_ERROR, ) from cephadmlib.context import CephadmContext from cephadmlib.context_getters import ( @@ -72,6 +73,7 @@ from cephadmlib.exceptions import ( ClusterAlreadyExists, Error, UnauthorizedRegistryError, + DaemonStartException, ) from cephadmlib.exe_utils import find_executable, find_program from cephadmlib.call_wrappers import ( @@ -1246,7 +1248,11 @@ def deploy_daemon_units( call_throws(ctx, ['systemctl', 'enable', unit_name]) if start: clean_cgroup(ctx, ident.fsid, unit_name) - call_throws(ctx, ['systemctl', 'start', unit_name]) + try: + call_throws(ctx, ['systemctl', 'start', unit_name]) + except Exception as e: + logger.error(f'systemctl start failed for {unit_name}: {str(e)}') + raise DaemonStartException() def _osd_unit_run_commands( @@ -3050,7 +3056,10 @@ def get_deployment_type( @deprecated_command def command_deploy(ctx): # type: (CephadmContext) -> None - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def apply_deploy_config_to_ctx( @@ -3093,7 +3102,10 @@ def command_deploy_from(ctx: CephadmContext) -> None: config_data = read_configuration_source(ctx) logger.debug('Loaded deploy 
configuration: %r', config_data) apply_deploy_config_to_ctx(config_data, ctx) - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def _common_deploy(ctx: CephadmContext) -> None: @@ -4489,8 +4501,9 @@ def _rm_cluster(ctx: CephadmContext, keep_logs: bool, zap_osds: bool) -> None: ################################## -def check_time_sync(ctx, enabler=None): - # type: (CephadmContext, Optional[Packager]) -> bool +def check_time_sync( + ctx: CephadmContext, enabler: Optional[Packager] = None +) -> bool: units = [ 'chrony.service', # 18.04 (at least) 'chronyd.service', # el / opensuse diff --git a/src/cephadm/cephadmlib/call_wrappers.py b/src/cephadm/cephadmlib/call_wrappers.py index 3fe2171e99d..d3d327c218c 100644 --- a/src/cephadm/cephadmlib/call_wrappers.py +++ b/src/cephadm/cephadmlib/call_wrappers.py @@ -311,14 +311,14 @@ def call_throws( return out, err, ret -def call_timeout(ctx, command, timeout): - # type: (CephadmContext, List[str], int) -> int +def call_timeout( + ctx: CephadmContext, command: List[str], timeout: int +) -> int: logger.debug( 'Running command (timeout=%s): %s' % (timeout, ' '.join(command)) ) - def raise_timeout(command, timeout): - # type: (List[str], int) -> NoReturn + def raise_timeout(command: List[str], timeout: int) -> NoReturn: msg = 'Command `%s` timed out after %s seconds' % (command, timeout) logger.debug(msg) raise TimeoutExpired(msg) diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index 215d207729d..1df46353fb3 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -36,3 +36,4 @@ QUIET_LOG_LEVEL = 9 # DEBUG is 10, so using 9 to be lower level than DEBUG NO_DEPRECATED = False UID_NOBODY = 65534 GID_NOGROUP = 65534 +DAEMON_FAILED_ERROR = 17 diff --git a/src/cephadm/cephadmlib/daemon_identity.py b/src/cephadm/cephadmlib/daemon_identity.py index 52a18092bf0..bfe1a855186 100644 --- 
a/src/cephadm/cephadmlib/daemon_identity.py +++ b/src/cephadm/cephadmlib/daemon_identity.py @@ -157,7 +157,7 @@ class DaemonSubIdentity(DaemonIdentity): ) def sidecar_script(self, base_data_dir: Union[str, os.PathLike]) -> str: - sname = f'sidecar-{ self.subcomponent }.run' + sname = f'sidecar-{self.subcomponent}.run' return str(pathlib.Path(self.data_dir(base_data_dir)) / sname) @property diff --git a/src/cephadm/cephadmlib/daemons/ingress.py b/src/cephadm/cephadmlib/daemons/ingress.py index 8f4f6b08991..c88e39ac025 100644 --- a/src/cephadm/cephadmlib/daemons/ingress.py +++ b/src/cephadm/cephadmlib/daemons/ingress.py @@ -82,8 +82,7 @@ class HAproxy(ContainerDaemonForm): def get_daemon_args(self) -> List[str]: return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg'] - def validate(self): - # type: () -> None + def validate(self) -> None: if not is_fsid(self.fsid): raise Error('not an fsid: %s' % self.fsid) if not self.daemon_id: @@ -99,12 +98,10 @@ class HAproxy(ContainerDaemonForm): 'required file missing from config-json: %s' % fname ) - def get_daemon_name(self): - # type: () -> str + def get_daemon_name(self) -> str: return '%s.%s' % (self.daemon_type, self.daemon_id) - def get_container_name(self, desc=None): - # type: (Optional[str]) -> str + def get_container_name(self, desc: Optional[str] = None) -> str: cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name()) if desc: cname = '%s-%s' % (cname, desc) @@ -212,8 +209,7 @@ class Keepalived(ContainerDaemonForm): # populate files from the config-json populate_files(data_dir, self.files, uid, gid) - def validate(self): - # type: () -> None + def validate(self) -> None: if not is_fsid(self.fsid): raise Error('not an fsid: %s' % self.fsid) if not self.daemon_id: @@ -229,20 +225,17 @@ class Keepalived(ContainerDaemonForm): 'required file missing from config-json: %s' % fname ) - def get_daemon_name(self): - # type: () -> str + def get_daemon_name(self) -> str: return '%s.%s' % (self.daemon_type, self.daemon_id) - 
def get_container_name(self, desc=None): - # type: (Optional[str]) -> str + def get_container_name(self, desc: Optional[str] = None) -> str: cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name()) if desc: cname = '%s-%s' % (cname, desc) return cname @staticmethod - def get_container_envs(): - # type: () -> List[str] + def get_container_envs() -> List[str]: envs = [ 'KEEPALIVED_AUTOCONF=false', 'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf', diff --git a/src/cephadm/cephadmlib/daemons/nfs.py b/src/cephadm/cephadmlib/daemons/nfs.py index f09374d5f46..70ccea65b5b 100644 --- a/src/cephadm/cephadmlib/daemons/nfs.py +++ b/src/cephadm/cephadmlib/daemons/nfs.py @@ -42,9 +42,13 @@ class NFSGanesha(ContainerDaemonForm): return cls.daemon_type == daemon_type def __init__( - self, ctx, fsid, daemon_id, config_json, image=DEFAULT_IMAGE - ): - # type: (CephadmContext, str, Union[int, str], Dict, str) -> None + self, + ctx: CephadmContext, + fsid: str, + daemon_id: Union[int, str], + config_json: Dict, + image: str = DEFAULT_IMAGE, + ) -> None: self.ctx = ctx self.fsid = fsid self.daemon_id = daemon_id @@ -62,8 +66,9 @@ class NFSGanesha(ContainerDaemonForm): self.validate() @classmethod - def init(cls, ctx, fsid, daemon_id): - # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha + def init( + cls, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str] + ) -> 'NFSGanesha': return cls(ctx, fsid, daemon_id, fetch_configs(ctx), ctx.image) @classmethod diff --git a/src/cephadm/cephadmlib/daemons/nvmeof.py b/src/cephadm/cephadmlib/daemons/nvmeof.py index 2c20a900f45..d916c7e6391 100644 --- a/src/cephadm/cephadmlib/daemons/nvmeof.py +++ b/src/cephadm/cephadmlib/daemons/nvmeof.py @@ -33,9 +33,13 @@ class CephNvmeof(ContainerDaemonForm): return cls.daemon_type == daemon_type def __init__( - self, ctx, fsid, daemon_id, config_json, image=DEFAULT_NVMEOF_IMAGE - ): - # type: (CephadmContext, str, Union[int, str], Dict, str) -> None + self, + ctx: CephadmContext, + fsid: 
str, + daemon_id: Union[int, str], + config_json: Dict, + image: str = DEFAULT_NVMEOF_IMAGE, + ) -> None: self.ctx = ctx self.fsid = fsid self.daemon_id = daemon_id @@ -48,8 +52,9 @@ class CephNvmeof(ContainerDaemonForm): self.validate() @classmethod - def init(cls, ctx, fsid, daemon_id): - # type: (CephadmContext, str, Union[int, str]) -> CephNvmeof + def init( + cls, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str] + ) -> 'CephNvmeof': return cls(ctx, fsid, daemon_id, fetch_configs(ctx), ctx.image) @classmethod @@ -73,13 +78,18 @@ class CephNvmeof(ContainerDaemonForm): os.path.join(data_dir, 'ceph-nvmeof.conf') ] = '/src/ceph-nvmeof.conf:z' mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config' - mounts['/dev/hugepages'] = '/dev/hugepages' - mounts['/dev/vfio/vfio'] = '/dev/vfio/vfio' mounts[log_dir] = '/var/log/ceph:z' if mtls_dir: mounts[mtls_dir] = '/src/mtls:z' return mounts + def _get_huge_pages_mounts(self, files: Dict[str, str]) -> Dict[str, str]: + mounts = dict() + if 'spdk_mem_size' not in files: + mounts['/dev/hugepages'] = '/dev/hugepages' + mounts['/dev/vfio/vfio'] = '/dev/vfio/vfio' + return mounts + def _get_tls_cert_key_mounts( self, data_dir: str, files: Dict[str, str] ) -> Dict[str, str]: @@ -111,6 +121,7 @@ class CephNvmeof(ContainerDaemonForm): ) else: mounts.update(self._get_container_mounts(data_dir, log_dir)) + mounts.update(self._get_huge_pages_mounts(self.files)) mounts.update(self._get_tls_cert_key_mounts(data_dir, self.files)) def customize_container_binds( @@ -198,11 +209,13 @@ class CephNvmeof(ContainerDaemonForm): ) return cmd.split() - @staticmethod - def get_sysctl_settings() -> List[str]: - return [ - 'vm.nr_hugepages = 4096', - ] + def get_sysctl_settings(self) -> List[str]: + if 'spdk_mem_size' not in self.files: + return [ + 'vm.nr_hugepages = 4096', + ] + else: + return [] def container(self, ctx: CephadmContext) -> CephContainer: ctr = daemon_to_container(ctx, self) @@ -222,4 +235,6 @@ class 
CephNvmeof(ContainerDaemonForm): args.append(ctx.container_engine.unlimited_pids_option) args.extend(['--ulimit', 'memlock=-1:-1']) args.extend(['--ulimit', 'nofile=10240']) - args.extend(['--cap-add=SYS_ADMIN', '--cap-add=CAP_SYS_NICE']) + args.extend(['--cap-add=CAP_SYS_NICE']) + if 'spdk_mem_size' not in self.files: + args.extend(['--cap-add=SYS_ADMIN']) diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py index 0ab8b38d2b5..9caef3f72e5 100644 --- a/src/cephadm/cephadmlib/data_utils.py +++ b/src/cephadm/cephadmlib/data_utils.py @@ -189,8 +189,9 @@ def normalize_image_digest(digest: str) -> str: return digest -def get_legacy_config_fsid(cluster, legacy_dir=None): - # type: (str, Optional[str]) -> Optional[str] +def get_legacy_config_fsid( + cluster: str, legacy_dir: Optional[str] = None +) -> Optional[str]: config_file = '/etc/ceph/%s.conf' % cluster if legacy_dir is not None: config_file = os.path.abspath(legacy_dir + config_file) diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py index 0d215fdd332..762ce782127 100644 --- a/src/cephadm/cephadmlib/exceptions.py +++ b/src/cephadm/cephadmlib/exceptions.py @@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error): class PortOccupiedError(Error): pass + + +class DaemonStartException(Exception): + """ + Special exception type we raise when the + systemctl start command fails during daemon + deployment. 
Necessary because the cephadm mgr module + needs to handle this case differently than a failure + earlier in the deploy process where no attempt was made + to actually start the daemon + """ + + pass diff --git a/src/cephadm/cephadmlib/file_utils.py b/src/cephadm/cephadmlib/file_utils.py index 399729f2dcc..27e70e31756 100644 --- a/src/cephadm/cephadmlib/file_utils.py +++ b/src/cephadm/cephadmlib/file_utils.py @@ -52,8 +52,9 @@ def write_new( os.rename(tempname, destination) -def populate_files(config_dir, config_files, uid, gid): - # type: (str, Dict, int, int) -> None +def populate_files( + config_dir: str, config_files: Dict, uid: int, gid: int +) -> None: """create config files for different services""" for fname in config_files: config_file = os.path.join(config_dir, fname) @@ -71,8 +72,7 @@ def touch( os.chown(file_path, uid, gid) -def write_tmp(s, uid, gid): - # type: (str, int, int) -> IO[str] +def write_tmp(s: str, uid: int, gid: int) -> IO[str]: tmp_f = tempfile.NamedTemporaryFile(mode='w', prefix='ceph-tmp') os.fchown(tmp_f.fileno(), uid, gid) tmp_f.write(s) @@ -97,8 +97,7 @@ def recursive_chown(path: str, uid: int, gid: int) -> None: os.chown(os.path.join(dirpath, filename), uid, gid) -def read_file(path_list, file_name=''): - # type: (List[str], str) -> str +def read_file(path_list: List[str], file_name: str = '') -> str: """Returns the content of the first file found within the `path_list` :param path_list: list of file paths to search @@ -123,14 +122,12 @@ def read_file(path_list, file_name=''): return 'Unknown' -def pathify(p): - # type: (str) -> str +def pathify(p: str) -> str: p = os.path.expanduser(p) return os.path.abspath(p) -def get_file_timestamp(fn): - # type: (str) -> Optional[str] +def get_file_timestamp(fn: str) -> Optional[str]: try: mt = os.path.getmtime(fn) return datetime.datetime.fromtimestamp( diff --git a/src/cephadm/cephadmlib/systemd.py b/src/cephadm/cephadmlib/systemd.py index a07757eccad..1956957d457 100644 --- 
a/src/cephadm/cephadmlib/systemd.py +++ b/src/cephadm/cephadmlib/systemd.py @@ -11,8 +11,7 @@ from .packagers import Packager logger = logging.getLogger() -def check_unit(ctx, unit_name): - # type: (CephadmContext, str) -> Tuple[bool, str, bool] +def check_unit(ctx: CephadmContext, unit_name: str) -> Tuple[bool, str, bool]: # NOTE: we ignore the exit code here because systemctl outputs # various exit codes based on the state of the service, but the # string result is more explicit (and sufficient). @@ -56,8 +55,9 @@ def check_unit(ctx, unit_name): return (enabled, state, installed) -def check_units(ctx, units, enabler=None): - # type: (CephadmContext, List[str], Optional[Packager]) -> bool +def check_units( + ctx: CephadmContext, units: List[str], enabler: Optional[Packager] = None +) -> bool: for u in units: (enabled, state, installed) = check_unit(ctx, u) if enabled and state == 'running': diff --git a/src/cephadm/tests/test_agent.py b/src/cephadm/tests/test_agent.py index 52cce74e1fb..8e453e3ac3c 100644 --- a/src/cephadm/tests/test_agent.py +++ b/src/cephadm/tests/test_agent.py @@ -668,7 +668,7 @@ def test_mgr_listener_run(_load_cert_chain, _load_verify_locations, _handle_json agent.mgr_listener.run() # verify payload was correctly extracted - assert _handle_json_payload.called_with(json.loads(payload)) + _handle_json_payload.assert_called_with(json.loads(payload)) FakeConn.send.assert_called_once_with(b'ACK') # second run, with bad json data received diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index f27b9bcd362..bbaaf2d39f8 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -1,5 +1,6 @@ # type: ignore +import contextlib import copy import errno import json @@ -38,6 +39,13 @@ def get_ceph_conf( mon_host = {mon_host} ''' +@contextlib.contextmanager +def bootstrap_test_ctx(*args, **kwargs): + with with_cephadm_ctx(*args, **kwargs) as ctx: + ctx.no_cleanup_on_failure = True + yield ctx + 
+ class TestCephAdm(object): @mock.patch('cephadm.logger') @@ -1432,13 +1440,13 @@ class TestBootstrap(object): '--config', conf_file, ) - with with_cephadm_ctx(cmd) as ctx: + with bootstrap_test_ctx(cmd) as ctx: msg = r'No such file or directory' with pytest.raises(_cephadm.Error, match=msg): _cephadm.command_bootstrap(ctx) cephadm_fs.create_file(conf_file) - with with_cephadm_ctx(cmd) as ctx: + with bootstrap_test_ctx(cmd) as ctx: retval = _cephadm.command_bootstrap(ctx) assert retval == 0 @@ -1446,7 +1454,7 @@ class TestBootstrap(object): funkypatch.patch('cephadmlib.systemd.call') cmd = self._get_cmd() - with with_cephadm_ctx(cmd) as ctx: + with bootstrap_test_ctx(cmd) as ctx: msg = r'must specify --mon-ip or --mon-addrv' with pytest.raises(_cephadm.Error, match=msg): _cephadm.command_bootstrap(ctx) @@ -1455,13 +1463,13 @@ class TestBootstrap(object): funkypatch.patch('cephadmlib.systemd.call') cmd = self._get_cmd('--mon-ip', '192.168.1.1') - with with_cephadm_ctx(cmd, list_networks={}) as ctx: + with bootstrap_test_ctx(cmd, list_networks={}) as ctx: msg = r'--skip-mon-network' with pytest.raises(_cephadm.Error, match=msg): _cephadm.command_bootstrap(ctx) cmd += ['--skip-mon-network'] - with with_cephadm_ctx(cmd, list_networks={}) as ctx: + with bootstrap_test_ctx(cmd, list_networks={}) as ctx: retval = _cephadm.command_bootstrap(ctx) assert retval == 0 @@ -1540,12 +1548,12 @@ class TestBootstrap(object): cmd = self._get_cmd('--mon-ip', mon_ip) if not result: - with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx: + with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx: msg = r'--skip-mon-network' with pytest.raises(_cephadm.Error, match=msg): _cephadm.command_bootstrap(ctx) else: - with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx: + with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx: retval = _cephadm.command_bootstrap(ctx) assert retval == 0 @@ -1604,11 +1612,11 @@ class TestBootstrap(object): cmd = 
self._get_cmd('--mon-addrv', mon_addrv) if err: - with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx: + with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx: with pytest.raises(_cephadm.Error, match=err): _cephadm.command_bootstrap(ctx) else: - with with_cephadm_ctx(cmd, list_networks=list_networks) as ctx: + with bootstrap_test_ctx(cmd, list_networks=list_networks) as ctx: retval = _cephadm.command_bootstrap(ctx) assert retval == 0 @@ -1621,13 +1629,13 @@ class TestBootstrap(object): '--skip-mon-network', ) - with with_cephadm_ctx(cmd, hostname=hostname) as ctx: + with bootstrap_test_ctx(cmd, hostname=hostname) as ctx: msg = r'--allow-fqdn-hostname' with pytest.raises(_cephadm.Error, match=msg): _cephadm.command_bootstrap(ctx) cmd += ['--allow-fqdn-hostname'] - with with_cephadm_ctx(cmd, hostname=hostname) as ctx: + with bootstrap_test_ctx(cmd, hostname=hostname) as ctx: retval = _cephadm.command_bootstrap(ctx) assert retval == 0 @@ -1646,7 +1654,7 @@ class TestBootstrap(object): '--fsid', fsid, ) - with with_cephadm_ctx(cmd) as ctx: + with bootstrap_test_ctx(cmd) as ctx: if err: with pytest.raises(_cephadm.Error, match=err): _cephadm.command_bootstrap(ctx) @@ -1661,7 +1669,7 @@ class TestShell(object): fsid = '00000000-0000-0000-0000-0000deadbeef' cmd = ['shell', '--fsid', fsid] - with with_cephadm_ctx(cmd) as ctx: + with bootstrap_test_ctx(cmd) as ctx: retval = _cephadm.command_shell(ctx) assert retval == 0 assert ctx.fsid == fsid diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index 58f212beff6..c5094db335f 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -495,6 +495,7 @@ def test_deploy_ceph_exporter_container(cephadm_fs, funkypatch): def test_deploy_and_rm_iscsi(cephadm_fs, funkypatch): # Test that the deploy and remove paths for iscsi (which has sidecar container) # create and remove the correct unit files. 
+ funkypatch.patch('shutil.rmtree') # fakefs + shutil.rmtree breaks on py3.12 mocks = _common_patches(funkypatch) _firewalld = mocks['Firewalld'] fsid = 'b01dbeef-701d-9abe-0000-e1e5a47004a7' diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini index b999a0f552b..d643b1ba74f 100644 --- a/src/cephadm/tox.ini +++ b/src/cephadm/tox.ini @@ -12,14 +12,14 @@ skipsdist = true max-line-length = 100 inline-quotes = ' ignore = - E501, \ + E501, W503, exclude = - .tox, \ - .vagrant, \ - __pycache__, \ - *.pyc, \ - templates, \ + .tox, + .vagrant, + __pycache__, + *.pyc, + templates, .eggs statistics = True @@ -53,7 +53,7 @@ commands = mypy --config-file ../mypy.ini {posargs:cephadm.py cephadmlib} [testenv:flake8] allowlist_externals = bash deps = - flake8 == 5.0.4 + flake8 flake8-quotes commands = flake8 --config=tox.ini {posargs:cephadm.py cephadmlib} diff --git a/src/client/Client.cc b/src/client/Client.cc index 21555d0d07c..c404057b929 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -3843,6 +3843,7 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap, want, flush, cap->mseq, + cap->issue_seq, cap_epoch_barrier); /* * Since the setattr will check the cephx mds auth access before @@ -3856,7 +3857,6 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap, m->caller_uid = -1; m->caller_gid = -1; - m->head.issue_seq = cap->issue_seq; m->set_tid(flush_tid); m->head.uid = in->uid; @@ -5521,10 +5521,10 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef< if (it != in->caps.end()) { Cap &tcap = it->second; if (tcap.cap_id == m->peer.cap_id && - ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) { + ceph_seq_cmp(tcap.seq, m->peer.issue_seq) < 0) { tcap.cap_id = m->peer.cap_id; - tcap.seq = m->peer.seq - 1; - tcap.issue_seq = tcap.seq; + tcap.seq = m->peer.issue_seq - 1; + tcap.issue_seq = tcap.issue_seq; tcap.issued |= cap.issued; tcap.implemented |= cap.issued; if (&cap == in->auth_cap) @@ -5534,7 +5534,7 @@ void 
Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef< } } else { add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0, - m->peer.seq - 1, m->peer.mseq, (uint64_t)-1, + m->peer.issue_seq - 1, m->peer.mseq, (uint64_t)-1, &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0, cap.latest_perms); } diff --git a/src/client/MetaSession.cc b/src/client/MetaSession.cc index b5160a84331..3baa833851f 100644 --- a/src/client/MetaSession.cc +++ b/src/client/MetaSession.cc @@ -56,7 +56,7 @@ void MetaSession::enqueue_cap_release(inodeno_t ino, uint64_t cap_id, ceph_seq_t ceph_mds_cap_item i; i.ino = ino; i.cap_id = cap_id; - i.seq = iseq; + i.issue_seq = iseq; i.migrate_seq = mseq; release->caps.push_back(i); } diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index 3b408dd3f2d..6b315d2dee3 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -290,6 +290,7 @@ SyntheticClient::SyntheticClient(StandaloneClient *client, int w) void *synthetic_client_thread_entry(void *ptr) { + ceph_pthread_setname("client"); SyntheticClient *sc = static_cast<SyntheticClient*>(ptr); //int r = sc->run(); @@ -945,7 +946,6 @@ int SyntheticClient::start_thread() pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); ceph_assert(thread_id); - ceph_pthread_setname(thread_id, "client"); return 0; } diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h index 9455ecc5a33..30570c72a30 100644 --- a/src/common/DecayCounter.h +++ b/src/common/DecayCounter.h @@ -16,7 +16,6 @@ #define CEPH_DECAYCOUNTER_H #include "include/buffer.h" -#include "common/Formatter.h" #include "common/StackStringStream.h" #include "common/ceph_time.h" @@ -24,6 +23,8 @@ #include <list> #include <sstream> +namespace ceph { class Formatter; } + /** * * TODO: normalize value based on some function of half_life, diff --git a/src/common/Graylog.cc b/src/common/Graylog.cc index cbd63fab25f..099acacd803 100644 --- a/src/common/Graylog.cc 
+++ b/src/common/Graylog.cc @@ -2,6 +2,9 @@ // vim: ts=8 sw=2 smarttab #include "Graylog.h" + +#include <iostream> // for std::cerr + #include "common/Formatter.h" #include "common/LogEntry.h" #include "log/Entry.h" diff --git a/src/common/Journald.cc b/src/common/Journald.cc index 164b65834a6..12e1a97e998 100644 --- a/src/common/Journald.cc +++ b/src/common/Journald.cc @@ -14,6 +14,9 @@ #include <sys/un.h> #include <syslog.h> #include <unistd.h> + +#include <iostream> // for std::cerr + #include <fmt/format.h> #include <fmt/ostream.h> @@ -23,7 +26,6 @@ #include "log/SubsystemMap.h" #include "msg/msg_fmt.h" - namespace ceph::logging { namespace { diff --git a/src/common/StackStringStream.h b/src/common/StackStringStream.h index 8cb48ff6fcd..6a144fb938a 100644 --- a/src/common/StackStringStream.h +++ b/src/common/StackStringStream.h @@ -18,10 +18,9 @@ #include <boost/container/small_vector.hpp> #include <algorithm> -#include <iostream> #include <memory> #include <ostream> -#include <sstream> +#include <string> #include <string_view> #include <vector> diff --git a/src/common/Thread.cc b/src/common/Thread.cc index 3903e8c0ed7..c714aa0aa87 100644 --- a/src/common/Thread.cc +++ b/src/common/Thread.cc @@ -83,7 +83,7 @@ void *Thread::entry_wrapper() if (pid && cpuid >= 0) _set_affinity(cpuid); - ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str()); + ceph_pthread_setname(thread_name.c_str()); return entry(); } @@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize) void Thread::create(const char *name, size_t stacksize) { ceph_assert(strlen(name) < 16); - Thread::thread_name = name; + thread_name = name; int ret = try_create(stacksize); if (ret != 0) { @@ -203,24 +203,6 @@ int Thread::set_affinity(int id) // Functions for std::thread // ========================= -void set_thread_name(std::thread& t, const std::string& s) { - int r = ceph_pthread_setname(t.native_handle(), s.c_str()); - if (r != 0) { - throw std::system_error(r, std::generic_category()); 
- } -} -std::string get_thread_name(const std::thread& t) { - std::string s(256, '\0'); - - int r = ceph_pthread_getname(const_cast<std::thread&>(t).native_handle(), - s.data(), s.length()); - if (r != 0) { - throw std::system_error(r, std::generic_category()); - } - s.resize(std::strlen(s.data())); - return s; -} - void kill(std::thread& t, int signal) { auto r = ceph_pthread_kill(t.native_handle(), signal); diff --git a/src/common/Thread.h b/src/common/Thread.h index d3892c1b36b..8dc0e6c3cbe 100644 --- a/src/common/Thread.h +++ b/src/common/Thread.h @@ -17,8 +17,8 @@ #define CEPH_THREAD_H #include <functional> +#include <string> #include <string_view> -#include <system_error> #include <thread> #include <cstring> @@ -27,7 +27,6 @@ #include "include/ceph_assert.h" #include "include/compat.h" -#include "include/spinlock.h" extern pid_t ceph_gettid(); @@ -36,7 +35,7 @@ class Thread { pthread_t thread_id; pid_t pid; int cpuid; - static inline thread_local std::string thread_name; + std::string thread_name; void *entry_wrapper(); @@ -64,15 +63,10 @@ class Thread { int join(void **prval = 0); int detach(); int set_affinity(int cpuid); - static const std::string get_thread_name() { - return Thread::thread_name; - } }; // Functions for with std::thread -void set_thread_name(std::thread& t, const std::string& s); -std::string get_thread_name(const std::thread& t); void kill(std::thread& t, int signal); template<typename Fun, typename... 
Args> @@ -81,7 +75,7 @@ std::thread make_named_thread(std::string_view n, Args&& ...args) { return std::thread([n = std::string(n)](auto&& fun, auto&& ...args) { - ceph_pthread_setname(pthread_self(), n.data()); + ceph_pthread_setname(n.data()); std::invoke(std::forward<Fun>(fun), std::forward<Args>(args)...); }, std::forward<Fun>(fun), std::forward<Args>(args)...); diff --git a/src/common/Throttle.h b/src/common/Throttle.h index e190b946c45..fb5d949b438 100644 --- a/src/common/Throttle.h +++ b/src/common/Throttle.h @@ -6,7 +6,7 @@ #include <atomic> #include <chrono> -#include <iostream> +#include <iosfwd> #include <list> #include <map> diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc index 1e73ce0836a..55b87de3207 100644 --- a/src/common/admin_socket.cc +++ b/src/common/admin_socket.cc @@ -12,7 +12,13 @@ * */ #include <poll.h> +#include <signal.h> #include <sys/un.h> + +#ifndef WIN32 +#include <sys/wait.h> +#endif + #include <optional> #include <stdlib.h> diff --git a/src/common/assert.cc b/src/common/assert.cc index 7fb4c2d726b..68ad99c878e 100644 --- a/src/common/assert.cc +++ b/src/common/assert.cc @@ -44,8 +44,7 @@ namespace ceph { g_assert_line = line; g_assert_func = func; g_assert_thread = (unsigned long long)pthread_self(); - ceph_pthread_getname(pthread_self(), g_assert_thread_name, - sizeof(g_assert_thread_name)); + ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name)); ostringstream tss; tss << ceph_clock_now(); @@ -122,8 +121,7 @@ namespace ceph { g_assert_line = line; g_assert_func = func; g_assert_thread = (unsigned long long)pthread_self(); - ceph_pthread_getname(pthread_self(), g_assert_thread_name, - sizeof(g_assert_thread_name)); + ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name)); BufAppender ba(g_assert_msg, sizeof(g_assert_msg)); BackTrace *bt = new ClibBackTrace(1); @@ -168,8 +166,7 @@ namespace ceph { g_assert_line = line; g_assert_func = func; g_assert_thread = (unsigned long 
long)pthread_self(); - ceph_pthread_getname(pthread_self(), g_assert_thread_name, - sizeof(g_assert_thread_name)); + ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name)); BackTrace *bt = new ClibBackTrace(1); snprintf(g_assert_msg, sizeof(g_assert_msg), @@ -210,8 +207,7 @@ namespace ceph { g_assert_line = line; g_assert_func = func; g_assert_thread = (unsigned long long)pthread_self(); - ceph_pthread_getname(pthread_self(), g_assert_thread_name, - sizeof(g_assert_thread_name)); + ceph_pthread_getname(g_assert_thread_name, sizeof(g_assert_thread_name)); BufAppender ba(g_assert_msg, sizeof(g_assert_msg)); BackTrace *bt = new ClibBackTrace(1); diff --git a/src/common/buffer.cc b/src/common/buffer.cc index b4640979289..4443ef14124 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -19,6 +19,8 @@ #include <sys/uio.h> +#include <iostream> + #include "include/ceph_assert.h" #include "include/types.h" #include "include/buffer_raw.h" diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc index 9b989fe7270..ad12e0b6764 100644 --- a/src/common/ceph_argparse.cc +++ b/src/common/ceph_argparse.cc @@ -16,6 +16,7 @@ #include "auth/Auth.h" #include "common/ceph_argparse.h" #include "common/config.h" +#include "common/strtol.h" // for strict_strtof() #include "common/version.h" #include "include/str_list.h" diff --git a/src/common/ceph_argparse.h b/src/common/ceph_argparse.h index d63a2bdd796..5a160dd0b79 100644 --- a/src/common/ceph_argparse.h +++ b/src/common/ceph_argparse.h @@ -29,6 +29,8 @@ #include "common/entity_name.h" #include "include/encoding.h" +class entity_addrvec_t; + /////////////////////// Types /////////////////////// class CephInitParameters { diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h index bae038862cf..01feff4c063 100644 --- a/src/common/ceph_time.h +++ b/src/common/ceph_time.h @@ -16,7 +16,7 @@ #define COMMON_CEPH_TIME_H #include <chrono> -#include <iostream> +#include <iosfwd> #include <string> 
#include <optional> #include <fmt/chrono.h> diff --git a/src/common/ceph_timer.h b/src/common/ceph_timer.h index bc324bfa243..7fb2c7bac12 100644 --- a/src/common/ceph_timer.h +++ b/src/common/ceph_timer.h @@ -98,6 +98,7 @@ class timer { std::thread thread; void timer_thread() { + ceph_pthread_setname("ceph_timer"); std::unique_lock l(lock); while (!suspended) { auto now = TC::now(); @@ -155,7 +156,6 @@ class timer { public: timer() : suspended(false) { thread = std::thread(&timer::timer_thread, this); - set_thread_name(thread, "ceph_timer"); } // Create a suspended timer, jobs will be executed in order when diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc index 14d55f60c30..21633fc5d41 100644 --- a/src/common/code_environment.cc +++ b/src/common/code_environment.cc @@ -11,6 +11,7 @@ * Foundation. See file COPYING. * */ +#include "include/compat.h" #include "common/code_environment.h" @@ -18,10 +19,6 @@ #include "acconfig.h" -#ifdef HAVE_PTHREAD_GETNAME_NP -#include <pthread.h> -#endif - #include <string.h> code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY; @@ -57,7 +54,7 @@ int get_process_name(char *buf, int len) } // FIPS zeroization audit 20191115: this memset is not security related. memset(buf, 0, len); - return pthread_getname_np(pthread_self(), buf, len); + return ceph_pthread_getname(buf, len); } #elif defined(HAVE_GETPROGNAME) diff --git a/src/common/compat.cc b/src/common/compat.cc index 82b57ad94b5..84a395c5a19 100644 --- a/src/common/compat.cc +++ b/src/common/compat.cc @@ -565,3 +565,66 @@ ssize_t get_self_exe_path(char* path, int buff_length) { } #endif /* _WIN32 */ + + +static thread_local char cached_thread_name[256]{}; + +int ceph_pthread_setname(char const* name) +{ + strncpy(cached_thread_name, name, sizeof cached_thread_name - 1); +#if defined(_WIN32) && defined(__clang__) && \ + !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) + // In this case, llvm doesn't use the pthread api for std::thread. 
+ // We cannot use native_handle() with the pthread api, nor can we pass + // it to Windows API functions. + return 0; +#elif defined(HAVE_PTHREAD_SETNAME_NP) + #if defined(__APPLE__) + return pthread_setname_np(name); + #else + return pthread_setname_np(pthread_self(), name); + #endif +#elif defined(HAVE_PTHREAD_SET_NAME_NP) + pthread_set_name_np(pthread_self(), name); \ + return 0; +#else + return 0; +#endif +} + +int ceph_pthread_getname(char* name, size_t len) +{ + if (cached_thread_name[0]) { + if (len > 0) { + strncpy(name, cached_thread_name, len); + name[len-1] = 0; + } + return 0; + } else { +#if defined(_WIN32) && defined(__clang__) && \ + !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) + if (len > 0) { + strcpy(name, ""); + } + return 0; +#elif defined(HAVE_PTHREAD_GETNAME_NP) || defined(HAVE_PTHREAD_GET_NAME_NP) +# if defined(HAVE_PTHREAD_GETNAME_NP) + int rc = pthread_getname_np(pthread_self(), cached_thread_name, sizeof cached_thread_name); +# else + int rc = pthread_get_name_np(pthread_self(), cached_thread_name, sizeof cached_thread_name); +# endif + if (rc == 0) { + strncpy(name, cached_thread_name, len); + name[len-1] = 0; + return 0; + } else { + return rc; + } +#else + if (len > 0) { + strcpy(name, ""); + } + return 0; +#endif + } +} diff --git a/src/common/config_cacher.h b/src/common/config_cacher.h index a84bad08eee..91b8152dde1 100644 --- a/src/common/config_cacher.h +++ b/src/common/config_cacher.h @@ -50,7 +50,7 @@ public: conf.remove_observer(this); } - operator ValueT() const { + ValueT operator*() const { return value_cache.load(); } }; diff --git a/src/common/error_code.cc b/src/common/error_code.cc index ed0e681b22b..9c981a21077 100644 --- a/src/common/error_code.cc +++ b/src/common/error_code.cc @@ -13,10 +13,9 @@ * COPYING. 
*/ -#include <exception> +#include "common/error_code.h" #include <boost/asio/error.hpp> -#include "common/error_code.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnon-virtual-dtor" diff --git a/src/common/error_code.h b/src/common/error_code.h index e39122f8ce3..93a1bf31c00 100644 --- a/src/common/error_code.h +++ b/src/common/error_code.h @@ -16,9 +16,8 @@ #ifndef COMMON_CEPH_ERROR_CODE #define COMMON_CEPH_ERROR_CODE -#include <netdb.h> - -#include <boost/system.hpp> +#include <boost/system/error_code.hpp> +#include <boost/system/system_error.hpp> #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnon-virtual-dtor" diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc index 32ecc958618..f5e744e2339 100644 --- a/src/common/obj_bencher.cc +++ b/src/common/obj_bencher.cc @@ -99,6 +99,7 @@ ostream& ObjBencher::out(ostream& os) } void *ObjBencher::status_printer(void *_bencher) { + ceph_pthread_setname("OB::stat_print"); ObjBencher *bencher = static_cast<ObjBencher *>(_bencher); bench_data& data = bencher->data; Formatter *formatter = bencher->formatter; @@ -453,7 +454,6 @@ int ObjBencher::write_bench(int secondsToRun, pthread_t print_thread; pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this); - ceph_pthread_setname(print_thread, "write_stat"); std::unique_lock locker{lock}; data.finished = 0; data.start_time = mono_clock::now(); @@ -691,7 +691,6 @@ int ObjBencher::seq_read_bench( pthread_t print_thread; pthread_create(&print_thread, NULL, status_printer, (void *)this); - ceph_pthread_setname(print_thread, "seq_read_stat"); mono_time finish_time = data.start_time + time_to_run; //start initial reads @@ -903,7 +902,6 @@ int ObjBencher::rand_read_bench( pthread_t print_thread; pthread_create(&print_thread, NULL, status_printer, (void *)this); - ceph_pthread_setname(print_thread, "rand_read_stat"); mono_time finish_time = data.start_time + time_to_run; //start initial reads diff --git 
a/src/common/options.cc b/src/common/options.cc index a68e2474a3d..3f6894b01c1 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -5,6 +5,7 @@ #include "options.h" #include "common/Formatter.h" #include "common/options/build_options.h" +#include "common/strtol.h" // for strict_si_cast() // Helpers for validators #include "include/stringify.h" diff --git a/src/common/options.h b/src/common/options.h index abded4cc0dd..ec6db7770c3 100644 --- a/src/common/options.h +++ b/src/common/options.h @@ -4,6 +4,7 @@ #pragma once #include <chrono> +#include <iostream> // for std::cerr #include <string> #include <variant> #include <vector> diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 18efba561ed..94824faef6b 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -586,16 +586,6 @@ options: min: 1 services: - mds -- name: mds_log_major_segment_event_ratio - type: uint - level: advanced - desc: multiple of mds_log_events_per_segment between major segments - default: 12 - services: - - mds - min: 1 - see_also: - - mds_log_events_per_segment # segment size for mds log, default to default file_layout_t - name: mds_log_segment_size type: size @@ -1741,3 +1731,12 @@ options: - mds flags: - runtime +- name: mds_log_minor_segments_per_major_segment + type: uint + level: advanced + desc: number of minor segments per major segment. + long_desc: The number of minor mds log segments since last major segment after which a major segment is started/logged. 
+ default: 16 + services: + - mds + min: 8 diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 3bfff92c907..49099f42b71 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -346,7 +346,7 @@ options: default: 5 see_also: - osd_scrub_chunk_max - with_legacy: true + with_legacy: false - name: osd_scrub_chunk_max type: int level: advanced @@ -357,7 +357,7 @@ options: default: 15 see_also: - osd_scrub_chunk_min - with_legacy: true + with_legacy: false - name: osd_shallow_scrub_chunk_min type: int level: advanced @@ -369,7 +369,7 @@ options: see_also: - osd_shallow_scrub_chunk_max - osd_scrub_chunk_min - with_legacy: true + with_legacy: false - name: osd_shallow_scrub_chunk_max type: int level: advanced @@ -380,7 +380,7 @@ options: see_also: - osd_shallow_scrub_chunk_min - osd_scrub_chunk_max - with_legacy: true + with_legacy: false # sleep between [deep]scrub ops - name: osd_scrub_sleep type: float diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc index b5e361b505c..2eeaa80aae8 100644 --- a/src/common/perf_counters.cc +++ b/src/common/perf_counters.cc @@ -18,6 +18,7 @@ #include "common/dout.h" #include "common/valgrind.h" #include "include/common_fwd.h" +#include "include/utime.h" using std::ostringstream; using std::make_pair; diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h index 942edf6d7e5..0d0fe86a092 100644 --- a/src/common/perf_counters.h +++ b/src/common/perf_counters.h @@ -17,6 +17,8 @@ #ifndef CEPH_COMMON_PERF_COUNTERS_H #define CEPH_COMMON_PERF_COUNTERS_H +#include <functional> +#include <set> #include <string> #include <vector> #include <memory> @@ -24,11 +26,12 @@ #include <cstdint> #include "common/perf_histogram.h" -#include "include/utime.h" #include "include/common_fwd.h" #include "common/ceph_mutex.h" #include "common/ceph_time.h" +class utime_t; + namespace TOPNSPC::common { class CephContext; class PerfCountersBuilder; diff --git 
a/src/common/perf_counters_cache.h b/src/common/perf_counters_cache.h index 866f56ee350..aa786fc5bf0 100644 --- a/src/common/perf_counters_cache.h +++ b/src/common/perf_counters_cache.h @@ -3,6 +3,7 @@ #include "common/perf_counters.h" #include "common/ceph_context.h" #include "common/intrusive_lru.h" +#include "include/utime.h" namespace ceph::perf_counters { diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc index aa6b765bc56..d125d7171e0 100644 --- a/src/common/pick_address.cc +++ b/src/common/pick_address.cc @@ -15,6 +15,7 @@ #include "common/pick_address.h" #include <bitset> +#include <ifaddrs.h> // for struct ifaddrs #include <netdb.h> #include <netinet/in.h> #ifdef _WIN32 @@ -40,6 +41,7 @@ #include "common/debug.h" #include "common/errno.h" #include "common/numa.h" +#include "common/safe_io.h" #ifndef HAVE_IN_ADDR_T typedef uint32_t in_addr_t; diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index 3fd2bb1fd15..f390823a8a0 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -590,6 +590,19 @@ seastar::future<struct stat> AlienStore::stat( }); } +seastar::future<std::string> AlienStore::get_default_device_class() +{ + logger().debug("{}", __func__); + assert(tp); + return op_gates.simple_dispatch("get_default_device_class", [=, this] { + return tp->submit([=, this] { + return store->get_default_device_class(); + }).then([] (std::string device_class) { + return seastar::make_ready_future<std::string>(device_class); + }); + }); +} + auto AlienStore::omap_get_header(CollectionRef ch, const ghobject_t& oid) -> get_attr_errorator::future<ceph::bufferlist> diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index d36f449afd8..853585dac9c 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -98,6 +98,7 @@ public: seastar::future<struct stat> stat( CollectionRef, 
const ghobject_t&) final; + seastar::future<std::string> get_default_device_class() final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef, const ghobject_t&) final; diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc index 5cf9590e61e..277055ec51e 100644 --- a/src/crimson/os/alienstore/thread_pool.cc +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -27,7 +27,7 @@ ThreadPool::ThreadPool(size_t n_threads, pin(*cpus); } block_sighup(); - (void) pthread_setname_np(pthread_self(), "alien-store-tp"); + (void) ceph_pthread_setname("alien-store-tp"); loop(queue_max_wait, i); }); } diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc index 7b945e5aa15..3f861a9271f 100644 --- a/src/crimson/os/cyanstore/cyan_store.cc +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -143,6 +143,12 @@ CyanStore::list_collections() }); } +seastar::future<std::string> +CyanStore::get_default_device_class() +{ + return seastar::make_ready_future<std::string>(""); +} + CyanStore::mount_ertr::future<> CyanStore::Shard::mount() { static const char read_file_errmsg[]{"read_file"}; diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h index 99583d07d36..e9394991bc2 100644 --- a/src/crimson/os/cyanstore/cyan_store.h +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -221,6 +221,8 @@ public: seastar::future<std::vector<coll_core_t>> list_collections() final; + seastar::future<std::string> get_default_device_class() final; + private: seastar::sharded<CyanStore::Shard> shard_stores; const std::string path; diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index 0dca695ba3a..51ef2331014 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -203,6 +203,7 @@ public: using coll_core_t = std::pair<coll_t, core_id_t>; virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0; + virtual 
seastar::future<std::string> get_default_device_class() = 0; protected: const core_id_t primary_core; }; diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 5dcb7514ee1..70fec7caca4 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -172,6 +172,7 @@ void Cache::register_metrics() {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")}, {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")}, {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")}, + {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")}, {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")}, {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")}, {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")}, @@ -1093,6 +1094,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type( case extent_types_t::LADDR_LEAF: return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>( t, length, hint, gen); + case extent_types_t::ROOT_META: + return alloc_new_non_data_extent<RootMetaBlock>( + t, length, hint, gen); case extent_types_t::ONODE_BLOCK_STAGED: return alloc_new_non_data_extent<onode::SeastoreNodeExtent>( t, length, hint, gen); @@ -2193,6 +2197,12 @@ Cache::do_get_caching_extent_by_type( ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); + case extent_types_t::ROOT_META: + return do_get_caching_extent<RootMetaBlock>( + offset, length, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); case extent_types_t::OMAP_INNER: return do_get_caching_extent<omap_manager::OMapInnerNode>( offset, length, std::move(extent_init_func), std::move(on_cache) diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index dba3610e95f..c37d9c5c7cd 100644 --- 
a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -978,7 +978,8 @@ public: auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen); #endif if (!result) { - return nullptr; + SUBERRORT(seastore_cache, "insufficient space", t); + std::rethrow_exception(crimson::ct_error::enospc::exception_ptr()); } auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result->bp)); ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING, @@ -1019,6 +1020,10 @@ public: #else auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen); #endif + if (results.empty()) { + SUBERRORT(seastore_cache, "insufficient space", t); + std::rethrow_exception(crimson::ct_error::enospc::exception_ptr()); + } std::vector<TCachedExtentRef<T>> extents; for (auto &result : results) { auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp)); diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 072c57864be..fa2ed65c0f3 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -36,8 +36,8 @@ struct onode_layout_t { object_data_le_t object_data; - char oi[MAX_OI_LENGTH]; - char ss[MAX_SS_LENGTH]; + char oi[MAX_OI_LENGTH] = {0}; + char ss[MAX_SS_LENGTH] = {0}; } __attribute__((packed)); class Transaction; diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc index 9f6a566d15c..97b7902edf5 100644 --- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc @@ -188,10 +188,10 @@ BlockRBManager::write_ertr::future<> BlockRBManager::write( void BlockRBManager::prefill_fragmented_device() { LOG_PREFIX(BlockRBManager::prefill_fragmented_device); - // the first 2 blocks must be allocated to lba root + // the first 3 blocks must be allocated to lba root // and backref root during mkfs - for (size_t block = 
get_block_size() * 2; - block <= get_size() - get_block_size() * 2; + for (size_t block = get_block_size() * 3; + block <= get_size() - get_block_size() * 3; block += get_block_size() * 2) { DEBUG("marking {}~{} used", get_start_rbm_addr() + block, diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h new file mode 100644 index 00000000000..edf082f1e38 --- /dev/null +++ b/src/crimson/os/seastore/root_meta.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +struct RootMetaBlock : LogicalCachedExtent { + using meta_t = std::map<std::string, std::string>; + using Ref = TCachedExtentRef<RootMetaBlock>; + static constexpr size_t SIZE = 4096; + static constexpr int MAX_META_LENGTH = 1024; + + explicit RootMetaBlock(ceph::bufferptr &&ptr) + : LogicalCachedExtent(std::move(ptr)) {} + explicit RootMetaBlock(extent_len_t length) + : LogicalCachedExtent(length) {} + RootMetaBlock(const RootMetaBlock &rhs) + : LogicalCachedExtent(rhs) {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + return CachedExtentRef(new RootMetaBlock(*this)); + } + + static constexpr extent_types_t TYPE = extent_types_t::ROOT_META; + extent_types_t get_type() const final { + return extent_types_t::ROOT_META; + } + + /// dumps root meta as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH); + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta(const ceph::bufferlist &_bl) final + { + assert(_bl.length() == MAX_META_LENGTH); + ceph::bufferlist bl = _bl; + bl.rebuild(); + get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str()); + } + + meta_t get_meta() const { + bufferlist bl; + bl.append(get_bptr()); + meta_t ret; + auto iter = bl.cbegin(); + decode(ret, iter); + return ret; + } + + 
void set_meta(const meta_t &m) { + ceph::bufferlist bl; + encode(m, bl); + ceph_assert(bl.length() <= MAX_META_LENGTH); + bl.rebuild(); + get_bptr().zero(0, MAX_META_LENGTH); + get_bptr().copy_in(0, bl.length(), bl.front().c_str()); + } + +}; +using RootMetaBlockRef = RootMetaBlock::Ref; + +} // crimson::os::seastore + + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock> + : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index d90edbb20db..2bf55d5cde7 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -2721,6 +2721,13 @@ SeaStore::read_meta(const std::string& key) ); } +seastar::future<std::string> SeaStore::get_default_device_class() +{ + using crimson::common::get_conf; + std::string type = get_conf<std::string>("seastore_main_device_type"); + return seastar::make_ready_future<std::string>(type); +} + uuid_d SeaStore::Shard::get_fsid() const { return device->get_meta().seastore_id; diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 185072744f2..fd7e177da63 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -191,6 +191,8 @@ public: seastar::future<> write_meta(const std::string& key, const std::string& value); + seastar::future<std::string> get_default_device_class(); + store_statfs_t stat() const; uuid_d get_fsid() const; @@ -567,6 +569,8 @@ public: seastar::future<std::vector<coll_core_t>> list_collections() final; + seastar::future<std::string> get_default_device_class() final; + FuturizedStore::Shard& get_sharded_store() final { return shard_stores.local(); } diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index f379dd0117c..450118e5e75 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -246,6 +246,8 @@ 
std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "LADDR_LEAF"; case extent_types_t::ONODE_BLOCK_STAGED: return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::ROOT_META: + return out << "ROOT_META"; case extent_types_t::OMAP_INNER: return out << "OMAP_INNER"; case extent_types_t::OMAP_LEAF: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index df5c184e7ab..65cad878fba 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1378,23 +1378,24 @@ enum class extent_types_t : uint8_t { LADDR_INTERNAL = 1, LADDR_LEAF = 2, DINK_LADDR_LEAF = 3, // should only be used for unitttests - OMAP_INNER = 4, - OMAP_LEAF = 5, - ONODE_BLOCK_STAGED = 6, - COLL_BLOCK = 7, - OBJECT_DATA_BLOCK = 8, - RETIRED_PLACEHOLDER = 9, + ROOT_META = 4, + OMAP_INNER = 5, + OMAP_LEAF = 6, + ONODE_BLOCK_STAGED = 7, + COLL_BLOCK = 8, + OBJECT_DATA_BLOCK = 9, + RETIRED_PLACEHOLDER = 10, // the following two types are not extent types, // they are just used to indicates paddr allocation deltas - ALLOC_INFO = 10, - JOURNAL_TAIL = 11, + ALLOC_INFO = 11, + JOURNAL_TAIL = 12, // Test Block Types - TEST_BLOCK = 12, - TEST_BLOCK_PHYSICAL = 13, - BACKREF_INTERNAL = 14, - BACKREF_LEAF = 15, + TEST_BLOCK = 13, + TEST_BLOCK_PHYSICAL = 14, + BACKREF_INTERNAL = 15, + BACKREF_LEAF = 16, // None and the number of valid extent_types_t - NONE = 16, + NONE = 17, }; using extent_types_le_t = uint8_t; constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE); @@ -1409,12 +1410,12 @@ constexpr bool is_data_type(extent_types_t type) { } constexpr bool is_logical_metadata_type(extent_types_t type) { - return type >= extent_types_t::OMAP_INNER && + return type >= extent_types_t::ROOT_META && type <= extent_types_t::COLL_BLOCK; } constexpr bool is_logical_type(extent_types_t type) { - if ((type >= extent_types_t::OMAP_INNER && + if ((type >= extent_types_t::ROOT_META && type <= 
extent_types_t::OBJECT_DATA_BLOCK) || type == extent_types_t::TEST_BLOCK) { assert(is_logical_metadata_type(type) || @@ -1926,44 +1927,18 @@ using backref_root_t = phy_tree_root_t; * TODO: generalize this to permit more than one lba_manager implementation */ struct __attribute__((packed)) root_t { - using meta_t = std::map<std::string, std::string>; - - static constexpr int MAX_META_LENGTH = 1024; - backref_root_t backref_root; lba_root_t lba_root; laddr_le_t onode_root; coll_root_le_t collection_root; + laddr_le_t meta; - char meta[MAX_META_LENGTH]; - - root_t() { - set_meta(meta_t{}); - } + root_t() = default; void adjust_addrs_from_base(paddr_t base) { lba_root.adjust_addrs_from_base(base); backref_root.adjust_addrs_from_base(base); } - - meta_t get_meta() { - bufferlist bl; - bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta)); - meta_t ret; - auto iter = bl.cbegin(); - decode(ret, iter); - return ret; - } - - void set_meta(const meta_t &m) { - ceph::bufferlist bl; - encode(m, bl); - ceph_assert(bl.length() < MAX_META_LENGTH); - bl.rebuild(); - auto &bptr = bl.front(); - ::memset(meta, 0, MAX_META_LENGTH); - ::memcpy(meta, bptr.c_str(), bl.length()); - } }; struct alloc_blk_t { diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index f4e3b0858f2..717c3822db9 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -74,6 +74,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return lba_manager->mkfs(t); }).si_then([this, &t] { return backref_manager->mkfs(t); + }).si_then([this, &t] { + return init_root_meta(t); }).si_then([this, FNAME, &t] { INFOT("submitting mkfs transaction", t); return submit_transaction_direct(t); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index c7a94a9ef11..841c5638abc 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ 
b/src/crimson/os/seastore/transaction_manager.h @@ -23,6 +23,7 @@ #include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/root_meta.h" #include "crimson/os/seastore/lba_manager.h" #include "crimson/os/seastore/backref_manager.h" #include "crimson/os/seastore/journal.h" @@ -303,10 +304,6 @@ public: len, placement_hint, INIT_GENERATION); - if (!ext) { - SUBERRORT(seastore_tm, "insufficient space!", t); - return crimson::ct_error::enospc::make(); - } return lba_manager->alloc_extent( t, laddr_hint, @@ -342,10 +339,6 @@ public: len, placement_hint, INIT_GENERATION); - if (exts.empty()) { - SUBERRORT(seastore_tm, "insufficient space!", t); - return crimson::ct_error::enospc::make(); - } return lba_manager->alloc_extents( t, laddr_hint, @@ -690,9 +683,11 @@ public: const std::string &key) { return cache->get_root( t - ).si_then([&key, &t](auto root) { + ).si_then([&t, this](auto root) { + return read_extent<RootMetaBlock>(t, root->root.meta); + }).si_then([key, &t](auto mblock) { LOG_PREFIX(TransactionManager::read_root_meta); - auto meta = root->root.get_meta(); + auto meta = mblock->get_meta(); auto iter = meta.find(key); if (iter == meta.end()) { SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key); @@ -701,7 +696,35 @@ public: SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second); return seastar::make_ready_future<read_root_meta_bare>(iter->second); } - }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); + } + + /** + * init_root_meta + * + * create the root meta block + */ + using init_root_meta_iertr = base_iertr; + using init_root_meta_ret = init_root_meta_iertr::future<>; + init_root_meta_ret init_root_meta(Transaction &t) { + return alloc_non_data_extent<RootMetaBlock>( + t, L_ADDR_MIN, RootMetaBlock::SIZE + ).si_then([this, &t](auto meta) { + 
meta->set_meta(RootMetaBlock::meta_t{}); + return cache->get_root(t + ).si_then([this, &t, meta](auto root) { + auto mroot = cache->duplicate_for_write( + t, root)->template cast<RootBlock>(); + mroot->root.meta = meta->get_laddr(); + return seastar::now(); + }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /** @@ -719,15 +742,21 @@ public: SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t - ).si_then([this, &t, &key, &value](RootBlockRef root) { - root = cache->duplicate_for_write(t, root)->cast<RootBlock>(); + ).si_then([this, &t](RootBlockRef root) { + return read_extent<RootMetaBlock>(t, root->root.meta); + }).si_then([this, key, value, &t](auto mblock) { + mblock = get_mutable_extent(t, mblock + )->template cast<RootMetaBlock>(); - auto meta = root->root.get_meta(); + auto meta = mblock->get_meta(); meta[key] = value; - root->root.set_meta(meta); + mblock->set_meta(meta); return seastar::now(); - }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /** diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index a77cbe87652..837fd2eb2af 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -379,16 +379,17 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - } else { + backfill_listener().maybe_flush(); + } else if (!primary_bi.empty()) { auto result = update_on_peers(check); trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - if (!primary_bi.empty()) { - primary_bi.pop_front(); - } + primary_bi.pop_front(); + 
backfill_listener().maybe_flush(); + } else { + break; } - backfill_listener().maybe_flush(); } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); if (backfill_state().progress_tracker->tracked_objects_completed() @@ -610,4 +611,12 @@ void BackfillState::ProgressTracker::complete_to( } } +void BackfillState::enqueue_standalone_push( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) { + progress_tracker->enqueue_push(obj); + backfill_machine.backfill_listener.enqueue_push(obj, v, peers); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index a49cbeaac06..072c91e079d 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -304,6 +304,15 @@ public: backfill_machine.process_event(*std::move(evt)); } + void enqueue_standalone_push( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + + bool is_triggered() const { + return backfill_machine.triggering_event() != nullptr; + } + hobject_t get_last_backfill_started() const { return last_backfill_started; } diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc index 32eaaf02b3f..007d0bf35f3 100644 --- a/src/crimson/osd/ec_backend.cc +++ b/src/crimson/osd/ec_backend.cc @@ -26,6 +26,7 @@ ECBackend::_read(const hobject_t& hoid, ECBackend::rep_op_fut_t ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h index 90a7e2b1f4d..b14c78c9fc4 100644 --- a/src/crimson/osd/ec_backend.h +++ b/src/crimson/osd/ec_backend.h @@ -28,6 +28,7 @@ private: rep_op_fut_t submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, 
ceph::os::Transaction&& txn, osd_op_params_t&& req, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 4e735c3b4cb..97b241fdce4 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -940,6 +940,7 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( }; encode(cloned_snaps, cloning_ctx->log_entry.snaps); cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size); + cloning_ctx->clone_obc = clone_obc; return cloning_ctx; } @@ -966,7 +967,7 @@ void OpsExecuter::update_clone_overlap() { void OpsExecuter::CloningContext::apply_to( std::vector<pg_log_entry_t>& log_entries, - ObjectContext& processed_obc) && + ObjectContext& processed_obc) { log_entry.mtime = processed_obc.obs.oi.mtime; log_entries.insert(log_entries.begin(), std::move(log_entry)); @@ -983,7 +984,7 @@ OpsExecuter::flush_clone_metadata( assert(!txn.empty()); update_clone_overlap(); if (cloning_ctx) { - std::move(*cloning_ctx).apply_to(log_entries, *obc); + cloning_ctx->apply_to(log_entries, *obc); } if (snapc.seq > obc->ssc->snapset.seq) { // update snapset with latest snap context diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index e770e825b32..94b64ccebb1 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -197,10 +197,11 @@ private: struct CloningContext { SnapSet new_snapset; pg_log_entry_t log_entry; + ObjectContextRef clone_obc; void apply_to( std::vector<pg_log_entry_t>& log_entries, - ObjectContext& processed_obc) &&; + ObjectContext& processed_obc); }; std::unique_ptr<CloningContext> cloning_ctx; @@ -504,6 +505,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( ceph_assert(want_mutate); } + apply_stats(); if (want_mutate) { auto log_entries = flush_clone_metadata( prepare_transaction(ops), @@ -519,14 +521,15 @@ OpsExecuter::flush_changes_n_do_ops_effects( std::move(txn), std::move(obc), 
std::move(*osd_op_params), - std::move(log_entries)); + std::move(log_entries), + cloning_ctx + ? std::move(cloning_ctx->clone_obc) + : nullptr); submitted = std::move(_submitted); all_completed = std::move(_all_completed); } - apply_stats(); - if (op_effects.size()) [[unlikely]] { // need extra ref pg due to apply_stats() which can be executed after // informing snap mapper diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 34ad97ceb06..0f19bfd7145 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -504,6 +504,8 @@ seastar::future<> OSD::start() }).then_unpack([this] { return _add_me_to_crush(); }).then([this] { + return _add_device_class(); + }).then([this] { monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); monc->sub_want("mgrmap", 0, 0); monc->sub_want("osdmap", 0, 0); @@ -608,6 +610,38 @@ seastar::future<> OSD::_send_boot() return monc->send_message(std::move(m)); } +seastar::future<> OSD::_add_device_class() +{ + LOG_PREFIX(OSD::_add_device_class); + if (!local_conf().get_val<bool>("osd_class_update_on_start")) { + co_return; + } + + std::string device_class = co_await store.get_default_device_class(); + if (device_class.empty()) { + WARN("Device class is empty; skipping crush update."); + co_return; + } + + INFO("device_class is {} ", device_class); + + std::string cmd = fmt::format( + R"({{"prefix": "osd crush set-device-class", "class": "{}", "ids": ["{}"]}})", + device_class, stringify(whoami) + ); + + auto [code, message, out] = co_await monc->run_command(std::move(cmd), {}); + if (code) { + // to be caught by crimson/osd/main.cc + WARN("fail to set device_class : {} ({})", message, code); + throw std::runtime_error("fail to set device_class"); + } else { + INFO("device_class was set: {}", message); + } + + co_return; +} + seastar::future<> OSD::_add_me_to_crush() { LOG_PREFIX(OSD::_add_me_to_crush); diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index d7d54d5d2c3..1a84ccd6a3f 100644 --- 
a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -188,6 +188,7 @@ private: seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); seastar::future<> _send_boot(); seastar::future<> _add_me_to_crush(); + seastar::future<> _add_device_class(); seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index c5bdcae47f2..8cab6125682 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -435,6 +435,7 @@ SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc, auto [submitted, all_completed] = co_await pg->submit_transaction( std::move(clone_obc), + nullptr, std::move(txn), std::move(osd_op_p), std::move(log_entries) diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index de39fb45716..1e2988efbbe 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -907,11 +907,23 @@ void PG::mutate_object( } } +void PG::enqueue_push_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + assert(recovery_handler); + assert(recovery_handler->backfill_state); + auto backfill_state = recovery_handler->backfill_state.get(); + backfill_state->enqueue_standalone_push(obj, v, peers); +} + PG::interruptible_future< std::tuple<PG::interruptible_future<>, PG::interruptible_future<>>> PG::submit_transaction( ObjectContextRef&& obc, + ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, std::vector<pg_log_entry_t>&& log_entries) @@ -924,8 +936,9 @@ PG::submit_transaction( } epoch_t map_epoch = get_osdmap_epoch(); + auto at_version = osd_op_p.at_version; - peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, at_version); peering_state.update_trim_to(); 
ceph_assert(!log_entries.empty()); @@ -939,6 +952,7 @@ PG::submit_transaction( auto [submitted, all_completed] = co_await backend->submit_transaction( peering_state.get_acting_recovery_backfill(), obc->obs.oi.soid, + std::move(new_clone), std::move(txn), std::move(osd_op_p), peering_state.get_last_peering_reset(), @@ -947,8 +961,8 @@ PG::submit_transaction( co_return std::make_tuple( std::move(submitted), all_completed.then_interruptible( - [this, last_complete=peering_state.get_info().last_complete, - at_version=osd_op_p.at_version](auto acked) { + [this, last_complete=peering_state.get_info().last_complete, at_version] + (auto acked) { for (const auto& peer : acked) { peering_state.update_peer_last_complete_ondisk( peer.shard, peer.last_complete_ondisk); @@ -1153,11 +1167,13 @@ PG::submit_executer_fut PG::submit_executer( [FNAME, this](auto&& txn, auto&& obc, auto&& osd_op_p, - auto&& log_entries) { + auto&& log_entries, + auto&& new_clone) { DEBUGDPP("object {} submitting txn", *this, obc->get_oid()); mutate_object(obc, txn, osd_op_p); return submit_transaction( std::move(obc), + std::move(new_clone), std::move(txn), std::move(osd_op_p), std::move(log_entries)); @@ -1604,7 +1620,7 @@ bool PG::should_send_op( // missing set hoid <= peering_state.get_peer_info(peer).last_backfill || (has_backfill_state() && hoid <= get_last_backfill_started() && - !peering_state.get_peer_missing(peer).is_missing(hoid))); + !is_missing_on_peer(peer, hoid))); if (!should_send) { ceph_assert(is_backfill_target(peer)); logger().debug("{} issue_repop shipping empty opt to osd." 
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index b0429c8fb4f..15aeec0e4f3 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -45,6 +45,7 @@ class MQuery; class OSDMap; class PGBackend; +class ReplicatedBackend; class PGPeeringEvent; class osd_op_params_t; @@ -678,6 +679,7 @@ private: std::tuple<interruptible_future<>, interruptible_future<>>> submit_transaction( ObjectContextRef&& obc, + ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& oop, std::vector<pg_log_entry_t>&& log_entries); @@ -885,6 +887,10 @@ private: friend class SnapTrimObjSubEvent; private: + void enqueue_push_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); void mutate_object( ObjectContextRef& obc, ceph::os::Transaction& txn, @@ -893,7 +899,7 @@ private: bool can_discard_op(const MOSDOp& m) const; void context_registry_on_change(); bool is_missing_object(const hobject_t& soid) const { - return peering_state.get_pg_log().get_missing().get_items().count(soid); + return get_local_missing().is_missing(soid); } bool is_unreadable_object(const hobject_t &oid, eversion_t* v = 0) const final { @@ -901,6 +907,11 @@ private: !peering_state.get_missing_loc().readable_with_acting( oid, get_actingset(), v); } + bool is_missing_on_peer( + const pg_shard_t &peer, + const hobject_t &soid) const { + return peering_state.get_peer_missing(peer).is_missing(soid); + } bool is_degraded_or_backfilling_object(const hobject_t& soid) const; const std::set<pg_shard_t> &get_actingset() const { return peering_state.get_actingset(); @@ -908,6 +919,7 @@ private: private: friend class IOInterruptCondition; + friend class ::ReplicatedBackend; struct log_update_t { std::set<pg_shard_t> waiting_on; seastar::shared_promise<> all_committed; diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index fa1f1405ffe..813218983fd 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -414,6 
+414,7 @@ public: virtual rep_op_fut_t submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 705b3176b97..657e6d3e888 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -45,6 +45,10 @@ public: seastar::future<> stop() { return seastar::now(); } void on_pg_clean(); + void enqueue_push( + const hobject_t& obj, + const eversion_t& v, + const std::vector<pg_shard_t> &peers) final; private: PGRecoveryListener* pg; size_t start_primary_recovery_ops( @@ -108,10 +112,6 @@ private: const hobject_t& end) final; void request_primary_scan( const hobject_t& begin) final; - void enqueue_push( - const hobject_t& obj, - const eversion_t& v, - const std::vector<pg_shard_t> &peers) final; void enqueue_drop( const pg_shard_t& target, const hobject_t& obj, diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index 12ee38b4370..f09cd147ea9 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -36,19 +36,59 @@ ReplicatedBackend::_read(const hobject_t& hoid, return store->read(coll, ghobject_t{hoid}, off, len, flags); } +MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg( + const pg_shard_t &pg_shard, + const hobject_t &hoid, + const bufferlist &encoded_txn, + const osd_op_params_t &osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + const std::vector<pg_log_entry_t> &log_entries, + bool send_op, + ceph_tid_t tid) +{ + ceph_assert(pg_shard != whoami); + auto m = crimson::make_message<MOSDRepOp>( + osd_op_p.req_id, + whoami, + spg_t{pgid, pg_shard.shard}, + hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, + min_epoch, + tid, + osd_op_p.at_version); + if (send_op) { + m->set_data(encoded_txn); + } else { + 
ceph::os::Transaction t; + bufferlist bl; + encode(t, bl); + m->set_data(bl); + } + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->pg_committed_to = osd_op_p.pg_committed_to; + m->pg_stats = pg.get_info().stats; + return m; +} + ReplicatedBackend::rep_op_fut_t -ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, - const hobject_t& hoid, - ceph::os::Transaction&& t, - osd_op_params_t&& opp, - epoch_t min_epoch, epoch_t map_epoch, - std::vector<pg_log_entry_t>&& logv) +ReplicatedBackend::submit_transaction( + const std::set<pg_shard_t> &pg_shards, + const hobject_t& hoid, + crimson::osd::ObjectContextRef &&new_clone, + ceph::os::Transaction&& t, + osd_op_params_t&& opp, + epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& logv) { LOG_PREFIX(ReplicatedBackend::submit_transaction); DEBUGDPP("object {}", dpp, hoid); auto log_entries = std::move(logv); auto txn = std::move(t); auto osd_op_p = std::move(opp); + auto _new_clone = std::move(new_clone); const ceph_tid_t tid = shard_services.get_tid(); auto pending_txn = @@ -60,37 +100,34 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, le.mark_unrollbackable(); } + std::vector<pg_shard_t> to_push_clone; auto sends = std::make_unique<std::vector<seastar::future<>>>(); - for (auto pg_shard : pg_shards) { - if (pg_shard != whoami) { - auto m = crimson::make_message<MOSDRepOp>( - osd_op_p.req_id, - whoami, - spg_t{pgid, pg_shard.shard}, - hoid, - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, - map_epoch, - min_epoch, - tid, - osd_op_p.at_version); - if (pg.should_send_op(pg_shard, hoid)) { - m->set_data(encoded_txn); - } else { - ceph::os::Transaction t; - bufferlist bl; - encode(t, bl); - m->set_data(bl); + for (auto &pg_shard : pg_shards) { + if (pg_shard == whoami) { + continue; + } + MURef<MOSDRepOp> m; + if (pg.should_send_op(pg_shard, hoid)) { + m = new_repop_msg( + pg_shard, hoid, encoded_txn, osd_op_p, + min_epoch, 
map_epoch, log_entries, true, tid); + } else { + m = new_repop_msg( + pg_shard, hoid, encoded_txn, osd_op_p, + min_epoch, map_epoch, log_entries, false, tid); + if (_new_clone && pg.is_missing_on_peer(pg_shard, hoid)) { + // The head is in the push queue but hasn't been pushed yet. + // We need to ensure that the newly created clone will be + // pushed as well, otherwise we might skip it. + // See: https://tracker.ceph.com/issues/68808 + to_push_clone.push_back(pg_shard); } - pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); - encode(log_entries, m->logbl); - m->pg_trim_to = osd_op_p.pg_trim_to; - m->pg_committed_to = osd_op_p.pg_committed_to; - m->pg_stats = pg.get_info().stats; - // TODO: set more stuff. e.g., pg_states - sends->emplace_back( - shard_services.send_to_osd( - pg_shard.osd, std::move(m), map_epoch)); } + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + // TODO: set more stuff. e.g., pg_states + sends->emplace_back( + shard_services.send_to_osd( + pg_shard.osd, std::move(m), map_epoch)); } co_await pg.update_snap_map(log_entries, txn); @@ -120,9 +157,16 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, return seastar::now(); } return peers->all_committed.get_shared_future(); - }).then_interruptible([pending_txn, this] { + }).then_interruptible([pending_txn, this, _new_clone, + to_push_clone=std::move(to_push_clone)] { auto acked_peers = std::move(pending_txn->second.acked_peers); pending_trans.erase(pending_txn); + if (_new_clone && !to_push_clone.empty()) { + pg.enqueue_push_for_backfill( + _new_clone->obs.oi.soid, + _new_clone->obs.oi.version, + to_push_clone); + } return seastar::make_ready_future< crimson::osd::acked_peers_t>(std::move(acked_peers)); }); diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h index fb8704d8742..d5844b23a0c 100644 --- a/src/crimson/osd/replicated_backend.h +++ b/src/crimson/osd/replicated_backend.h @@ -35,6 
+35,7 @@ private: rep_op_fut_t submit_transaction( const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, @@ -60,6 +61,17 @@ private: pending_transactions_t pending_trans; crimson::osd::PG& pg; + MURef<MOSDRepOp> new_repop_msg( + const pg_shard_t &pg_shard, + const hobject_t &hoid, + const bufferlist &encoded_txn, + const osd_op_params_t &osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + const std::vector<pg_log_entry_t> &log_entries, + bool send_op, + ceph_tid_t tid); + seastar::future<> request_committed( const osd_reqid_t& reqid, const eversion_t& at_version) final; }; diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index 4b8a8131bcf..d27b3ac43c5 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -29,9 +29,16 @@ using json_object = boost::json::object; using json_value = boost::json::value; using json_array = boost::json::array; -void DaemonMetricCollector::request_loop(boost::asio::steady_timer &timer) { - timer.async_wait([&](const boost::system::error_code &e) { - std::cerr << e << std::endl; +void DaemonMetricCollector::request_loop() { + timer.async_wait([this](const boost::system::error_code &e) { + if (shutdown_flag) { + dout(1) << "Metric collector request loop cancelled" << dendl; + return; + } + + if (e) return; // Exit on error or cancellation + + dout(10) << "Getting metrics loop..." 
<< dendl; update_sockets(); bool sort_metrics = g_conf().get_val<bool>("exporter_sort_metrics"); @@ -42,19 +49,24 @@ void DaemonMetricCollector::request_loop(boost::asio::steady_timer &timer) { auto stats_period = g_conf().get_val<int64_t>("exporter_stats_period"); // time to wait before sending requests again timer.expires_from_now(std::chrono::seconds(stats_period)); - request_loop(timer); + request_loop(); }); } void DaemonMetricCollector::main() { - // time to wait before sending requests again - - boost::asio::io_context io; - boost::asio::steady_timer timer{io, std::chrono::seconds(0)}; - request_loop(timer); + shutdown_flag = false; + timer.expires_from_now(std::chrono::seconds(0)); + request_loop(); io.run(); } +void DaemonMetricCollector::shutdown(){ + shutdown_flag = true; + timer.cancel(); // Explicitly cancel the timer + dout(1) << "Collector shutdown initiated, timer canceled" << dendl; + io.stop(); +} + std::string DaemonMetricCollector::get_metrics() { const std::lock_guard<std::mutex> lock(metrics_mutex); return metrics; @@ -499,3 +511,4 @@ DaemonMetricCollector &collector_instance() { static DaemonMetricCollector instance; return instance; } + diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h index 3302e95df91..5831a0fa3b0 100644 --- a/src/exporter/DaemonMetricCollector.h +++ b/src/exporter/DaemonMetricCollector.h @@ -1,17 +1,20 @@ #pragma once #include "common/admin_socket_client.h" +#include <atomic> #include <map> #include <string> #include <vector> #include <boost/asio/steady_timer.hpp> +#include <boost/thread.hpp> #include <boost/json/object.hpp> #include <filesystem> #include <map> #include <string> #include <vector> + struct pstat { unsigned long utime; unsigned long stime; @@ -43,11 +46,16 @@ public: std::string metrics; std::pair<labels_t, std::string> add_fixed_name_metrics(std::string metric_name); void update_sockets(); + void shutdown(); private: std::mutex metrics_mutex; 
std::unique_ptr<MetricsBuilder> builder; - void request_loop(boost::asio::steady_timer &timer); + boost::asio::io_context io; + boost::asio::steady_timer timer{io}; + std::atomic<bool> shutdown_flag{false}; + + void request_loop(); void dump_asok_metric(boost::json::object perf_info, boost::json::value perf_values, std::string name, @@ -108,3 +116,4 @@ public: }; DaemonMetricCollector &collector_instance(); + diff --git a/src/exporter/ceph_exporter.cc b/src/exporter/ceph_exporter.cc index 2e2c16bb085..44b67c7e615 100644 --- a/src/exporter/ceph_exporter.cc +++ b/src/exporter/ceph_exporter.cc @@ -1,16 +1,31 @@ #include "common/ceph_argparse.h" #include "common/config.h" -#include "exporter/DaemonMetricCollector.h" -#include "exporter/web_server.h" +#include "common/debug.h" #include "global/global_init.h" #include "global/global_context.h" - +#include "global/signal_handler.h" +#include "exporter/DaemonMetricCollector.h" +#include "exporter/web_server.h" #include <boost/thread/thread.hpp> #include <iostream> #include <map> #include <string> +#include <atomic> +#include <chrono> +#include <thread> #define dout_context g_ceph_context +#define dout_subsys ceph_subsys_ceph_exporter + +DaemonMetricCollector &collector = collector_instance(); + +static void handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + // Finish the DaemonMetricCollector + collector.shutdown(); +} static void usage() { std::cout << "usage: ceph-exporter [options]\n" @@ -27,7 +42,6 @@ static void usage() { } int main(int argc, char **argv) { - auto args = argv_to_vec(argc, argv); if (args.empty()) { std::cerr << argv[0] << ": -h or --help for usage" << std::endl; @@ -64,8 +78,30 @@ int main(int argc, char **argv) { } common_init_finish(g_ceph_context); + // Register signal handlers + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + 
register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + // Start the web server thread boost::thread server_thread(web_server_thread_entrypoint); - DaemonMetricCollector &collector = collector_instance(); + + // Start the DaemonMetricCollector collector.main(); + + // Interrupted. Time to terminate + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + // Stop the web server thread by interrupting it + stop_web_server(); + server_thread.interrupt(); // Interrupt the web server thread server_thread.join(); + + dout(1) << "Ceph exporter stopped" << dendl; + + return 0; } diff --git a/src/exporter/web_server.cc b/src/exporter/web_server.cc index 96cc02b389f..c01205f26bb 100644 --- a/src/exporter/web_server.cc +++ b/src/exporter/web_server.cc @@ -28,6 +28,9 @@ namespace net = boost::asio; // from <boost/asio.hpp> namespace ssl = boost::asio::ssl; // from <boost/asio/ssl.hpp> using tcp = boost::asio::ip::tcp; // from <boost/asio/ip/tcp.hpp> +//common io context for the web servers +std::shared_ptr<net::io_context> global_ioc; + // Base class for common functionality class web_connection { public: @@ -43,7 +46,7 @@ protected: web_connection(net::any_io_executor executor, std::chrono::seconds timeout) : deadline_(executor, timeout) {} - // Common request processing logic + // Common request processing logic void process_request() { response_.version(request_.version()); response_.keep_alive(request_.keep_alive()); @@ -64,7 +67,7 @@ protected: write_response(); } - // Construct a response message based on the request target + // Construct a response message based on the request target void create_response() { if (request_.target() == "/") { response_.result(http::status::moved_permanently); @@ -81,7 +84,7 @@ protected: } } - // Asynchronously 
transmit the response message + // Asynchronously transmit the response message virtual void write_response() = 0; // Check whether we have spent enough time on this connection @@ -228,28 +231,33 @@ void https_server(tcp::acceptor &acceptor, ssl::context &ssl_ctx) { } void run_http_server(const std::string& exporter_addr, short unsigned int port) { - net::io_context ioc{1}; - tcp::acceptor acceptor{ioc, {net::ip::make_address(exporter_addr), port}}; - tcp::socket socket{ioc}; + tcp::acceptor acceptor{*global_ioc, {net::ip::make_address(exporter_addr), port}}; + tcp::socket socket{*global_ioc}; http_server(acceptor, socket); dout(1) << "HTTP server running on " << exporter_addr << ":" << port << dendl; - ioc.run(); + global_ioc->run(); } void run_https_server(const std::string& exporter_addr, short unsigned int port, const std::string& cert_file, const std::string& key_file) { - net::io_context ioc{1}; ssl::context ssl_ctx(ssl::context::tlsv13); ssl_ctx.use_certificate_chain_file(cert_file); ssl_ctx.use_private_key_file(key_file, ssl::context::pem); - tcp::acceptor acceptor{ioc, {net::ip::make_address(exporter_addr), port}}; + tcp::acceptor acceptor{*global_ioc, {net::ip::make_address(exporter_addr), port}}; https_server(acceptor, ssl_ctx); dout(1) << "HTTPS server running on " << exporter_addr << ":" << port << dendl; - ioc.run(); + global_ioc->run(); +} + +void stop_web_server() { + if (global_ioc) { + global_ioc->stop(); + dout(1) << "Ceph exporter web server stopped" << dendl; + } } void web_server_thread_entrypoint() { @@ -259,18 +267,21 @@ void web_server_thread_entrypoint() { std::string cert_file = g_conf().get_val<std::string>("exporter_cert_file"); std::string key_file = g_conf().get_val<std::string>("exporter_key_file"); + // Initialize global_ioc + global_ioc = std::make_shared<net::io_context>(1); + if (cert_file.empty() && key_file.empty()) { run_http_server(exporter_addr, port); } else { try { run_https_server(exporter_addr, port, cert_file, 
key_file); } catch (const std::exception &e) { - dout(1) << "Failed to start HTTPS server: " << e.what() << dendl; + derr << "Failed to start HTTPS server: " << e.what() << dendl; exit(EXIT_FAILURE); } } } catch (std::exception const &e) { - dout(1) << "Error: " << e.what() << dendl; + derr << "Error: " << e.what() << dendl; exit(EXIT_FAILURE); } } diff --git a/src/exporter/web_server.h b/src/exporter/web_server.h index c3339a8d43a..c6d4c54eca4 100644 --- a/src/exporter/web_server.h +++ b/src/exporter/web_server.h @@ -3,3 +3,4 @@ #include <string> void web_server_thread_entrypoint(); +void stop_web_server(); diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc index d3387267871..b8149718724 100644 --- a/src/global/signal_handler.cc +++ b/src/global/signal_handler.cc @@ -307,7 +307,7 @@ static void handle_oneshot_fatal_signal(int signum) char buf[1024]; char pthread_name[16] = {0}; //limited by 16B include terminating null byte. - int r = ceph_pthread_getname(pthread_self(), pthread_name, sizeof(pthread_name)); + int r = ceph_pthread_getname(pthread_name, sizeof(pthread_name)); (void)r; #if defined(__sun) char message[SIG2STR_MAX]; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 627f4a3e85b..137669c1963 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -1005,7 +1005,7 @@ extern const char *ceph_cap_op_name(int op); /* extra info for cap import/export */ struct ceph_mds_cap_peer { __le64 cap_id; - __le32 seq; + __le32 issue_seq; __le32 mseq; __le32 mds; __u8 flags; @@ -1058,7 +1058,7 @@ struct ceph_mds_cap_release { struct ceph_mds_cap_item { __le64 ino; __le64 cap_id; - __le32 migrate_seq, seq; + __le32 migrate_seq, issue_seq; } __attribute__ ((packed)); #define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ diff --git a/src/include/compat.h b/src/include/compat.h index 53285243d91..a7d10fc5425 100644 --- a/src/include/compat.h +++ b/src/include/compat.h @@ -179,60 +179,12 @@ struct cpu_set_t; #define MSG_DONTWAIT 
MSG_NONBLOCK #endif -/* compiler warning free success noop */ -#define pthread_setname_noop_helper(thread, name) ({ \ - int __i = 0; \ - __i; }) - -#define pthread_getname_noop_helper(thread, name, len) ({ \ - if (name != NULL) \ - *name = '\0'; \ - 0; }) - #define pthread_kill_unsupported_helper(thread, signal) ({ \ int __i = -ENOTSUP; \ __i; }) #if defined(_WIN32) && defined(__clang__) && \ !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) - // In this case, llvm doesn't use the pthread api for std::thread. - // We cannot use native_handle() with the pthread api, nor can we pass - // it to Windows API functions. - #define ceph_pthread_setname pthread_setname_noop_helper -#elif defined(HAVE_PTHREAD_SETNAME_NP) - #if defined(__APPLE__) - #define ceph_pthread_setname(thread, name) ({ \ - int __result = 0; \ - if (thread == pthread_self()) \ - __result = pthread_setname_np(name); \ - __result; }) - #else - #define ceph_pthread_setname pthread_setname_np - #endif -#elif defined(HAVE_PTHREAD_SET_NAME_NP) - /* Fix a small name diff and return 0 */ - #define ceph_pthread_setname(thread, name) ({ \ - pthread_set_name_np(thread, name); \ - 0; }) -#else - #define ceph_pthread_setname pthread_setname_noop_helper -#endif - -#if defined(_WIN32) && defined(__clang__) && \ - !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) - #define ceph_pthread_getname pthread_getname_noop_helper -#elif defined(HAVE_PTHREAD_GETNAME_NP) - #define ceph_pthread_getname pthread_getname_np -#elif defined(HAVE_PTHREAD_GET_NAME_NP) - #define ceph_pthread_getname(thread, name, len) ({ \ - pthread_get_name_np(thread, name, len); \ - 0; }) -#else - #define ceph_pthread_getname pthread_getname_noop_helper -#endif - -#if defined(_WIN32) && defined(__clang__) && \ - !defined(_LIBCPP_HAS_THREAD_API_PTHREAD) #define ceph_pthread_kill pthread_kill_unsupported_helper #else #define ceph_pthread_kill pthread_kill @@ -244,6 +196,9 @@ int ceph_posix_fallocate(int fd, off_t offset, off_t len); extern "C" { #endif +int 
ceph_pthread_getname(char* name, size_t size); +int ceph_pthread_setname(const char* name); + int pipe_cloexec(int pipefd[2], int flags); char *ceph_strerror_r(int errnum, char *buf, size_t buflen); unsigned get_page_size(); diff --git a/src/include/elist.h b/src/include/elist.h index edfb7955494..e777873b045 100644 --- a/src/include/elist.h +++ b/src/include/elist.h @@ -15,6 +15,10 @@ #ifndef CEPH_ELIST_H #define CEPH_ELIST_H +#include <cstddef> // for size_t + +#include "include/ceph_assert.h" + /* * elist: embedded list. * diff --git a/src/include/str_list.h b/src/include/str_list.h index cad76c1d6f5..a4c7432c6ef 100644 --- a/src/include/str_list.h +++ b/src/include/str_list.h @@ -2,7 +2,6 @@ #define CEPH_STRLIST_H #include <list> -#include <set> #include <string> #include <string_view> #include <vector> diff --git a/src/log/Entry.h b/src/log/Entry.h index db39eca0ef3..eeb25c5f593 100644 --- a/src/log/Entry.h +++ b/src/log/Entry.h @@ -24,6 +24,7 @@ namespace logging { class Entry { public: using time = log_time; + using thread_name_t = std::array<char, 16>; Entry() = delete; Entry(short pr, short sub) : @@ -32,8 +33,7 @@ public: m_prio(pr), m_subsys(sub) { - strncpy(m_thread_name, Thread::get_thread_name().data(), 16); - m_thread_name[15] = '\0'; + ceph_pthread_getname(m_thread_name.data(), m_thread_name.size()); } Entry(const Entry &) = default; Entry& operator=(const Entry &) = default; @@ -47,7 +47,7 @@ public: time m_stamp; pthread_t m_thread; short m_prio, m_subsys; - char m_thread_name[16]; + thread_name_t m_thread_name{}; static log_clock& clock() { static log_clock clock; diff --git a/src/log/Log.cc b/src/log/Log.cc index 49dd03c06c0..63d5205d9e2 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -31,6 +31,7 @@ #include <fmt/format.h> #include <fmt/ostream.h> +#include <fmt/ranges.h> #define MAX_LOG_BUF 65536 @@ -372,6 +373,7 @@ void Log::_flush_logbuf() void Log::_flush(EntryVector& t, bool crash) { + auto now = mono_clock::now(); long len = 0; if 
(t.empty()) { assert(m_log_buf.empty()); @@ -443,10 +445,29 @@ void Log::_flush(EntryVector& t, bool crash) m_journald->log_entry(e); } + { + auto [it, _] = m_recent_thread_names.try_emplace(e.m_thread, now, DEFAULT_MAX_THREAD_NAMES); + auto& [t, names] = it->second; + if (names.size() == 0 || names.front() != e.m_thread_name.data()) { + names.push_front(e.m_thread_name.data()); + } + t = now; + } + m_recent.push_back(std::move(e)); } t.clear(); + for (auto it = m_recent_thread_names.begin(); it != m_recent_thread_names.end(); ) { + auto t = it->second.first; + auto since = now - t; + if (since > std::chrono::seconds(60*60*24)) { + it = m_recent_thread_names.erase(it); + } else { + ++it; + } + } + _flush_logbuf(); } @@ -493,14 +514,10 @@ void Log::dump_recent() _flush(m_flush, false); _log_message("--- begin dump of recent events ---", true); - std::set<std::pair<pthread_t, const char *>> recent_pthread_ids; { EntryVector t; t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end())); m_recent.clear(); - for (const auto& e : t) { - recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name)); - } _flush(t, true); } @@ -515,11 +532,15 @@ void Log::dump_recent() m_stderr_log, m_stderr_crash), true); _log_message("--- pthread ID / name mapping for recent threads ---", true); - for (auto& [pthread_id, pthread_name] : recent_pthread_ids) + for (const auto& [tid, t_names] : m_recent_thread_names) { + [[maybe_unused]] auto [t, names] = t_names; // we want the ID to be printed in the same format as we use for a log entry. // The reason is easier grepping. 
- _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true); + auto msg = fmt::format(" {:x} / {}", + tid_to_int(tid), + fmt::join(names, ", ")); + _log_message(msg, true); } _log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true); diff --git a/src/log/Log.h b/src/log/Log.h index 3a60937af55..46d97734305 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -7,6 +7,7 @@ #include <boost/circular_buffer.hpp> #include <condition_variable> +#include <map> #include <memory> #include <mutex> #include <queue> @@ -14,6 +15,7 @@ #include <string_view> #include "common/Thread.h" +#include "common/ceph_time.h" #include "common/likely.h" #include "log/Entry.h" @@ -86,9 +88,14 @@ protected: private: using EntryRing = boost::circular_buffer<ConcreteEntry>; + using mono_clock = ceph::coarse_mono_clock; + using mono_time = ceph::coarse_mono_time; + + using RecentThreadNames = std::map<pthread_t, std::pair<mono_time, boost::circular_buffer<std::string> > >; static const std::size_t DEFAULT_MAX_NEW = 100; static const std::size_t DEFAULT_MAX_RECENT = 10000; + static constexpr std::size_t DEFAULT_MAX_THREAD_NAMES = 4; Log **m_indirect_this; @@ -102,6 +109,7 @@ private: pthread_t m_queue_mutex_holder; pthread_t m_flush_mutex_holder; + RecentThreadNames m_recent_thread_names; // protected by m_flush_mutex EntryVector m_new; ///< new entries EntryRing m_recent; ///< recent (less new) entries we've already written at low detail EntryVector m_flush; ///< entries to be flushed (here to optimize heap allocations) diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 642d3428a27..6fbfc79d416 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -61,6 +61,7 @@ void Beacon::shutdown() std::unique_lock<std::mutex> lock(mutex); if (!finished) { finished = true; + cvar.notify_all(); lock.unlock(); if (sender.joinable()) sender.join(); @@ -74,7 +75,7 @@ void Beacon::init(const MDSMap &mdsmap) _notify_mdsmap(mdsmap); sender = std::thread([this]() { - 
ceph_pthread_setname(pthread_self(), "beacon"); + ceph_pthread_setname("mds-beacon"); std::unique_lock<std::mutex> lock(mutex); bool sent; while (!finished) { @@ -320,16 +321,15 @@ void Beacon::notify_health(MDSRank const *mds) // Detect MDS_HEALTH_TRIM condition // Indicates MDS is not trimming promptly { - const auto log_max_segments = mds->mdlog->get_max_segments(); - const auto log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor"); - if (mds->mdlog->get_num_segments() > (size_t)(log_max_segments * log_warn_factor)) { + if (mds->mdlog->is_trim_slow()) { + auto num_segments = mds->mdlog->get_num_segments(); + auto max_segments = mds->mdlog->get_max_segments(); CachedStackStringStream css; - *css << "Behind on trimming (" << mds->mdlog->get_num_segments() - << "/" << log_max_segments << ")"; + *css << "Behind on trimming (" << num_segments << "/" << max_segments << ")"; MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, css->strv()); - m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments()); - m.metadata["max_segments"] = stringify(log_max_segments); + m.metadata["num_segments"] = stringify(num_segments); + m.metadata["max_segments"] = stringify(max_segments); health.metrics.push_back(m); } } diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index af9f8edfffa..7d28e039d9c 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -2494,6 +2494,10 @@ void CDir::_omap_commit_ops(int r, int op_prio, int64_t metapool, version_t vers mdcache->mds->heartbeat_reset(); } + // the last omap commit includes the omap header, so account for + // that size early on so that when we reach `commit_one(true)`, + // there is enough space for the header. 
+ write_size += sizeof(fnode_t); using ceph::encode; for (auto &item : to_set) { bufferlist bl; diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 9adcf3b25b9..0782464ad94 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -218,8 +218,6 @@ public: void set_cap_id(uint64_t i) { cap_id = i; } uint64_t get_cap_id() const { return cap_id; } - //ceph_seq_t get_last_issue() { return last_issue; } - bool is_suppress() const { return suppress > 0; } void inc_suppress() { suppress++; } void dec_suppress() { suppress--; } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 63608d48864..d64f176acb6 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2599,6 +2599,7 @@ int Locker::issue_caps(CInode *in, Capability *only_cap) in->find_snaprealm()->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), pending, wanted, 0, cap->get_mseq(), + cap->get_last_issue(), mds->get_osd_epoch_barrier()); in->encode_cap_message(m, cap); @@ -2649,6 +2650,7 @@ int Locker::issue_caps(CInode *in, Capability *only_cap) in->find_snaprealm()->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), after, wanted, 0, cap->get_mseq(), + cap->get_last_issue(), mds->get_osd_epoch_barrier()); in->encode_cap_message(m, cap); @@ -2675,6 +2677,7 @@ void Locker::issue_truncate(CInode *in) cap->get_cap_id(), cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(), + cap->get_last_issue(), mds->get_osd_epoch_barrier()); in->encode_cap_message(m, cap); mds->send_message_client_counted(m, cap->get_session()); @@ -3165,6 +3168,7 @@ void Locker::share_inode_max_size(CInode *in, Capability *only_cap) cap->pending(), cap->wanted(), 0, cap->get_mseq(), + cap->get_last_issue(), mds->get_osd_epoch_barrier()); in->encode_cap_message(m, cap); mds->send_message_client_counted(m, cap->get_session()); @@ -3375,10 +3379,10 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m) ref_t<MClientCaps> ack; if (op == CEPH_CAP_OP_FLUSHSNAP) { if (mds->logger) 
mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack); - ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier()); + ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, 0, mds->get_osd_epoch_barrier()); } else { if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack); - ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier()); + ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, 0, mds->get_osd_epoch_barrier()); } ack->set_snap_follows(follows); ack->set_client_tid(m->get_client_tid()); @@ -3500,7 +3504,7 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m) // case we get a dup response, so whatever.) ref_t<MClientCaps> ack; if (dirty) { - ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier()); + ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, 0, mds->get_osd_epoch_barrier()); ack->set_snap_follows(follows); ack->set_client_tid(m->get_client_tid()); ack->set_oldest_flush_tid(m->get_oldest_flush_tid()); @@ -3589,7 +3593,7 @@ void Locker::handle_client_caps(const cref_t<MClientCaps> &m) dout(7) << " flush client." 
<< client << " dirty " << ccap_string(dirty) << " seq " << m->get_seq() << " on " << *in << dendl; ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(), - m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier()); + m->get_caps(), 0, dirty, 0, cap->get_last_issue(), mds->get_osd_epoch_barrier()); ack->set_client_tid(m->get_client_tid()); ack->set_oldest_flush_tid(m->get_oldest_flush_tid()); } @@ -4222,7 +4226,7 @@ void Locker::handle_client_cap_release(const cref_t<MClientCapRelease> &m) Session *session = mds->get_session(m); for (const auto &cap : m->caps) { - _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.seq); + _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.issue_seq); } if (session) { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index eb2b529dcfa..3af0d8c6b1e 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5891,7 +5891,7 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap, auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), cap->pending(), cap->wanted(), - 0, cap->get_mseq(), mds->get_osd_epoch_barrier()); + 0, cap->get_mseq(), cap->get_last_issue(), mds->get_osd_epoch_barrier()); in->encode_cap_message(reap, cap); reap->snapbl = mds->server->get_snap_trace(session, realm); reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags); @@ -14378,6 +14378,7 @@ bool MDCache::is_ready_to_trim_cache(void) void MDCache::upkeep_main(void) { + ceph_pthread_setname("mds-cache-trim"); std::unique_lock lock(upkeep_mutex); // create a "memory model" for the upkeep thread. 
The object maintains diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 0be568433ef..4bbf2a1a141 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -53,11 +53,12 @@ MDLog::MDLog(MDSRank* m) event_large_threshold = g_conf().get_val<uint64_t>("mds_log_event_large_threshold"); events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment"); pause = g_conf().get_val<bool>("mds_log_pause"); - major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio"); max_segments = g_conf().get_val<uint64_t>("mds_log_max_segments"); max_events = g_conf().get_val<int64_t>("mds_log_max_events"); skip_corrupt_events = g_conf().get_val<bool>("mds_log_skip_corrupt_events"); skip_unbounded_events = g_conf().get_val<bool>("mds_log_skip_unbounded_events"); + log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor"); + minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment"); upkeep_thread = std::thread(&MDLog::log_trim_upkeep, this); } @@ -258,7 +259,7 @@ void MDLog::create(MDSContext *c) logger->set(l_mdl_expos, journaler->get_expire_pos()); logger->set(l_mdl_wrpos, journaler->get_write_pos()); - submit_thread.create("md_submit"); + submit_thread.create("mds-log-submit"); } void MDLog::open(MDSContext *c) @@ -267,9 +268,9 @@ void MDLog::open(MDSContext *c) ceph_assert(!recovery_thread.is_started()); recovery_thread.set_completion(c); - recovery_thread.create("md_recov_open"); + recovery_thread.create("mds-log-recvr"); - submit_thread.create("md_submit"); + submit_thread.create("mds-log-submit"); // either append() or replay() will follow. 
} @@ -311,7 +312,7 @@ void MDLog::reopen(MDSContext *c) recovery_thread.join(); recovery_thread.set_completion(new C_ReopenComplete(this, c)); - recovery_thread.create("md_recov_reopen"); + recovery_thread.create("mds-log-reopen"); } void MDLog::append() @@ -357,14 +358,15 @@ void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase* c) ceph_assert(!mds_is_shutting_down); event_seq++; - events_since_last_major_segment++; if (auto sb = dynamic_cast<SegmentBoundary*>(le); sb) { auto ls = _start_new_segment(sb); if (sb->is_major_segment_boundary()) { major_segments.insert(ls->seq); logger->set(l_mdl_segmjr, major_segments.size()); - events_since_last_major_segment = 0; + minor_segments_since_last_major_segment = 0; + } else { + ++minor_segments_since_last_major_segment; } } @@ -403,7 +405,7 @@ void MDLog::_segment_upkeep() uint64_t period = journaler->get_layout_period(); auto ls = get_current_segment(); // start a new segment? - if (events_since_last_major_segment > events_per_segment*major_segment_event_ratio) { + if (minor_segments_since_last_major_segment > minor_segments_per_major_segment) { dout(10) << __func__ << ": starting new major segment, current " << *ls << dendl; auto sle = mds->mdcache->create_subtree_map(); _submit_entry(sle, NULL); @@ -656,7 +658,13 @@ void MDLog::try_to_commit_open_file_table(uint64_t last_seq) } } +bool MDLog::is_trim_slow() const { + return (segments.size() > (size_t)(max_segments * log_warn_factor)); +} + void MDLog::log_trim_upkeep(void) { + ceph_pthread_setname("mds-log-trim"); + dout(10) << dendl; std::unique_lock mds_lock(mds->mds_lock); @@ -1008,7 +1016,7 @@ void MDLog::replay(MDSContext *c) } already_replayed = true; - replay_thread.create("md_log_replay"); + replay_thread.create("mds-log-replay"); } @@ -1474,7 +1482,6 @@ void MDLog::_replay_thread() } le->set_start_off(pos); - events_since_last_major_segment++; if (auto sb = dynamic_cast<SegmentBoundary*>(le.get()); sb) { auto seq = sb->get_seq(); if (seq > 0) { @@ -1487,7 
+1494,9 @@ void MDLog::_replay_thread() if (sb->is_major_segment_boundary()) { major_segments.insert(event_seq); logger->set(l_mdl_segmjr, major_segments.size()); - events_since_last_major_segment = 0; + minor_segments_since_last_major_segment = 0; + } else { + ++minor_segments_since_last_major_segment; } } else { event_seq++; @@ -1618,9 +1627,6 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa if (changed.count("mds_log_events_per_segment")) { events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment"); } - if (changed.count("mds_log_major_segment_event_ratio")) { - major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio"); - } if (changed.count("mds_log_max_events")) { max_events = g_conf().get_val<int64_t>("mds_log_max_events"); } @@ -1642,4 +1648,10 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa if (changed.count("mds_log_trim_decay_rate")){ log_trim_counter = DecayCounter(g_conf().get_val<double>("mds_log_trim_decay_rate")); } + if (changed.count("mds_log_warn_factor")) { + log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor"); + } + if (changed.count("mds_log_minor_segments_per_major_segment")) { + minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment"); + } } diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index e2ab4e686cd..a858b40fa03 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -173,6 +173,9 @@ public: // replay state std::map<inodeno_t, std::set<inodeno_t>> pending_exports; + // beacon needs me too + bool is_trim_slow() const; + protected: struct PendingEvent { PendingEvent(LogEvent *e, Context* c, bool f=false) : le(e), fin(c), flush(f) {} @@ -302,9 +305,9 @@ private: bool debug_subtrees; std::atomic_uint64_t event_large_threshold; // accessed by submit thread uint64_t events_per_segment; - uint64_t major_segment_event_ratio; int64_t max_events; uint64_t 
max_segments; + uint64_t minor_segments_per_major_segment; bool pause; bool skip_corrupt_events; bool skip_unbounded_events; @@ -312,7 +315,8 @@ private: std::set<uint64_t> major_segments; std::set<LogSegment*> expired_segments; std::set<LogSegment*> expiring_segments; - uint64_t events_since_last_major_segment = 0; + uint64_t minor_segments_since_last_major_segment = 0; + double log_warn_factor; // log trimming decay counter DecayCounter log_trim_counter; diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 30820493cda..52ed930d71b 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -496,7 +496,7 @@ MDSRank::MDSRank( objecter->unset_honor_pool_full(); - finisher = new Finisher(cct, "MDSRank", "MR_Finisher"); + finisher = new Finisher(cct, "MDSRank", "mds-rank-fin"); mdcache = new MDCache(this, purge_queue); mdlog = new MDLog(this); @@ -581,7 +581,7 @@ void MDSRankDispatcher::init() // who is interested in it. handle_osd_map(); - progress_thread.create("mds_rank_progr"); + progress_thread.create("mds-rank-progr"); purge_queue.init(); diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc index 1d17bbf3e92..6cbd9a094c0 100644 --- a/src/mds/MetricAggregator.cc +++ b/src/mds/MetricAggregator.cc @@ -73,6 +73,7 @@ int MetricAggregator::init() { m_cct->get_perfcounters_collection()->add(m_perf_counters); pinger = std::thread([this]() { + ceph_pthread_setname("mds-ping"); std::unique_lock locker(lock); while (!stopping) { ping_all_active_ranks(); diff --git a/src/mds/MetricsHandler.cc b/src/mds/MetricsHandler.cc index 9ad10b9d6e6..9fc4c6122a4 100644 --- a/src/mds/MetricsHandler.cc +++ b/src/mds/MetricsHandler.cc @@ -51,6 +51,7 @@ void MetricsHandler::init() { dout(10) << dendl; updater = std::thread([this]() { + ceph_pthread_setname("mds-metrics"); std::unique_lock locker(lock); while (!stopping) { double after = g_conf().get_val<std::chrono::seconds>("mds_metrics_update_interval").count(); diff --git a/src/mds/Migrator.cc 
b/src/mds/Migrator.cc index 722b6bd7422..6b12f710db4 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1957,10 +1957,10 @@ void Migrator::handle_export_ack(const cref_t<MExportDirAck> &m) // this keeps authority().first in sync with subtree auth state in the journal. mdcache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid()); + ceph_assert(g_conf()->mds_kill_export_at != 10); // log export completion, then finish (unfreeze, trigger finish context, etc.) mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir)); mds->mdlog->flush(); - ceph_assert(g_conf()->mds_kill_export_at != 10); } void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds) @@ -2844,7 +2844,6 @@ void Migrator::import_reverse(CDir *dir) dout(7) << *dir << dendl; import_state_t& stat = import_state[dir->dirfrag()]; - stat.state = IMPORT_ABORTING; set<CDir*> bounds; mdcache->get_subtree_bounds(dir, bounds); @@ -2950,10 +2949,14 @@ void Migrator::import_reverse(CDir *dir) } in->put(CInode::PIN_IMPORTINGCAPS); } + } + + if (stat.state == IMPORT_LOGGINGSTART || stat.state == IMPORT_ACKING) { for (auto& p : stat.session_map) { Session *session = p.second.first; session->dec_importing(); } + mds->server->close_forced_opened_sessions(stat.session_map); } // log our failure @@ -2962,6 +2965,7 @@ void Migrator::import_reverse(CDir *dir) mdcache->trim(num_dentries); // try trimming dentries // notify bystanders; wait in aborting state + stat.state = IMPORT_ABORTING; import_notify_abort(dir, bounds); } @@ -3054,10 +3058,9 @@ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from, dout(7) << *dir << dendl; map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag()); - if (it == import_state.end() || - it->second.state != IMPORT_LOGGINGSTART) { + ceph_assert(it != import_state.end()); + if (it->second.state != IMPORT_LOGGINGSTART) { dout(7) << "import " << df << " must have aborted" << dendl; - 
mds->server->finish_force_open_sessions(imported_session_map); return; } diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 925bff16542..4426d3ca6fe 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -122,7 +122,7 @@ PurgeQueue::PurgeQueue( cct(cct_), rank(rank_), metadata_pool(metadata_pool_), - finisher(cct, "PurgeQueue", "PQ_Finisher"), + finisher(cct, "PurgeQueue", "mds-pq-fin"), timer(cct, lock), filer(objecter_, &finisher), objecter(objecter_), diff --git a/src/mds/QuiesceAgent.h b/src/mds/QuiesceAgent.h index 5c07d6d8074..85900e8e71b 100644 --- a/src/mds/QuiesceAgent.h +++ b/src/mds/QuiesceAgent.h @@ -30,7 +30,7 @@ class QuiesceAgent { : quiesce_control(quiesce_control) , stop_agent_thread(false) , agent_thread(this) { - agent_thread.create("quiesce.agt"); + agent_thread.create("mds-q-agt"); }; virtual ~QuiesceAgent() { diff --git a/src/mds/QuiesceDbEncoding.h b/src/mds/QuiesceDbEncoding.h index c76ed2d0c52..27c7e3ca2d0 100644 --- a/src/mds/QuiesceDbEncoding.h +++ b/src/mds/QuiesceDbEncoding.h @@ -15,7 +15,7 @@ #include "include/encoding.h" #include <stdint.h> -void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(v.epoch, bl, features); @@ -23,7 +23,7 @@ void encode(QuiesceDbVersion const& v, bufferlist& bl, uint64_t features = 0) ENCODE_FINISH(bl); } -void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p) +inline void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(v.epoch, p); @@ -31,33 +31,33 @@ void decode(QuiesceDbVersion& v, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceState const & state, bufferlist& bl, uint64_t features=0) +inline void encode(QuiesceState const & state, bufferlist& bl, uint64_t features=0) { static_assert(QuiesceState::QS__MAX <= UINT8_MAX); uint8_t v = (uint8_t)state; encode(v, bl, 
features); } -void decode(QuiesceState & state, bufferlist::const_iterator& p) +inline void decode(QuiesceState & state, bufferlist::const_iterator& p) { uint8_t v = 0; decode(v, p); state = (QuiesceState)v; } -void encode(QuiesceTimeInterval const & interval, bufferlist& bl, uint64_t features=0) +inline void encode(QuiesceTimeInterval const & interval, bufferlist& bl, uint64_t features=0) { encode(interval.count(), bl, features); } -void decode(QuiesceTimeInterval & interval, bufferlist::const_iterator& p) +inline void decode(QuiesceTimeInterval & interval, bufferlist::const_iterator& p) { QuiesceClock::rep count; decode(count, p); interval = QuiesceTimeInterval { count }; } -void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t features = 0) +inline void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(rstate.state, bl, features); @@ -65,7 +65,7 @@ void encode(RecordedQuiesceState const& rstate, bufferlist& bl, uint64_t feature ENCODE_FINISH(bl); } -void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p) +inline void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(rstate.state, p); @@ -73,7 +73,7 @@ void decode(RecordedQuiesceState& rstate, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(member.rstate, bl, features); @@ -81,7 +81,7 @@ void encode(QuiesceSet::MemberInfo const& member, bufferlist& bl, uint64_t featu ENCODE_FINISH(bl); } -void decode(QuiesceSet::MemberInfo& member, bufferlist::const_iterator& p) +inline void decode(QuiesceSet::MemberInfo& member, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(member.rstate, p); @@ -89,7 +89,7 @@ void decode(QuiesceSet::MemberInfo& 
member, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(set.version, bl, features); @@ -100,7 +100,7 @@ void encode(QuiesceSet const& set, bufferlist& bl, uint64_t features = 0) ENCODE_FINISH(bl); } -void decode(QuiesceSet& set, bufferlist::const_iterator& p) +inline void decode(QuiesceSet& set, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(set.version, p); @@ -111,7 +111,7 @@ void decode(QuiesceSet& set, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(req.control.raw, bl, features); @@ -124,7 +124,7 @@ void encode(QuiesceDbRequest const& req, bufferlist& bl, uint64_t features = 0) ENCODE_FINISH(bl); } -void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p) +inline void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(req.control.raw, p); @@ -137,7 +137,7 @@ void decode(QuiesceDbRequest& req, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(listing.db_version, bl, features); @@ -146,7 +146,7 @@ void encode(QuiesceDbListing const& listing, bufferlist& bl, uint64_t features = ENCODE_FINISH(bl); } -void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p) +inline void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(listing.db_version, p); @@ -155,7 +155,7 @@ void decode(QuiesceDbListing& listing, bufferlist::const_iterator& p) DECODE_FINISH(p); } 
-void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(listing.origin, bl, features); @@ -163,7 +163,7 @@ void encode(QuiesceDbPeerListing const& listing, bufferlist& bl, uint64_t featur ENCODE_FINISH(bl); } -void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p) +inline void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(listing.origin, p); @@ -171,7 +171,7 @@ void decode(QuiesceDbPeerListing& listing, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(root.state, bl, features); @@ -179,7 +179,7 @@ void encode(QuiesceMap::RootInfo const& root, bufferlist& bl, uint64_t features ENCODE_FINISH(bl); } -void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p) +inline void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(root.state, p); @@ -187,7 +187,7 @@ void decode(QuiesceMap::RootInfo& root, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(map.db_version, bl, features); @@ -195,7 +195,7 @@ void encode(QuiesceMap const& map, bufferlist& bl, uint64_t features = 0) ENCODE_FINISH(bl); } -void decode(QuiesceMap& map, bufferlist::const_iterator& p) +inline void decode(QuiesceMap& map, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(map.db_version, p); @@ -203,7 +203,7 @@ void decode(QuiesceMap& map, bufferlist::const_iterator& p) DECODE_FINISH(p); } -void 
encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0) +inline void encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0) { ENCODE_START(1, 1, bl); encode(ack.origin, bl, features); @@ -211,7 +211,7 @@ void encode(QuiesceDbPeerAck const& ack, bufferlist& bl, uint64_t features = 0) ENCODE_FINISH(bl); } -void decode(QuiesceDbPeerAck& ack, bufferlist::const_iterator& p) +inline void decode(QuiesceDbPeerAck& ack, bufferlist::const_iterator& p) { DECODE_START(1, p); decode(ack.origin, p); diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc index 12c83634e54..bb3ae93e378 100644 --- a/src/mds/QuiesceDbManager.cc +++ b/src/mds/QuiesceDbManager.cc @@ -200,7 +200,7 @@ void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_mem // start the thread dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl; db_thread_should_exit = false; - quiesce_db_thread.create("quiesce_db_mgr"); + quiesce_db_thread.create("mds-q-db"); } else if (quiesce_db_thread.is_started()) { submit_condition.notify_all(); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index cf286b46d46..5874a3dce56 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -615,6 +615,9 @@ void Server::handle_client_session(const cref_t<MClientSession> &m) mds->send_message(reply, m->get_connection()); return; } + if (!session->client_opened) { + session->client_opened = true; + } if (session->is_opening() || session->is_open() || session->is_stale() || @@ -1054,7 +1057,7 @@ version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm, return pv; } -void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap, +void Server::finish_force_open_sessions(map<client_t,pair<Session*,uint64_t> >& smap, bool dec_import) { /* @@ -1073,7 +1076,7 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_ dout(10) << "force_open_sessions skipping 
changed " << session->info.inst << dendl; } else { dout(10) << "force_open_sessions opened " << session->info.inst << dendl; - mds->sessionmap.set_state(session, Session::STATE_OPEN); + it.second.second = mds->sessionmap.set_state(session, Session::STATE_OPEN); mds->sessionmap.touch_session(session); metrics_handler->add_session(session); @@ -1103,6 +1106,29 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_ dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl; } +void Server::close_forced_opened_sessions(const map<client_t,pair<Session*,uint64_t> >& smap) +{ + dout(10) << __func__ << " on " << smap.size() << " clients" << dendl; + + for (auto &it : smap) { + Session *session = it.second.first; + uint64_t sseq = it.second.second; + if (sseq == 0) + continue; + if (session->get_state_seq() != sseq) { + dout(10) << "skipping changed session (" << session->get_state_name() << ") " + << session->info.inst << dendl; + continue; + } + if (session->client_opened) + continue; + dout(10) << "closing forced opened session (" << session->get_state_name() << ") " + << session->info.inst << dendl; + ceph_assert(!session->is_importing()); + journal_close_session(session, Session::STATE_CLOSING, NULL); + } +} + class C_MDS_TerminatedSessions : public ServerContext { void finish(int r) override { server->terminating_sessions = false; diff --git a/src/mds/Server.h b/src/mds/Server.h index 68842ea01cb..5f9a763e550 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -129,8 +129,9 @@ public: version_t prepare_force_open_sessions(std::map<client_t,entity_inst_t> &cm, std::map<client_t,client_metadata_t>& cmm, std::map<client_t,std::pair<Session*,uint64_t> >& smap); - void finish_force_open_sessions(const std::map<client_t,std::pair<Session*,uint64_t> >& smap, + void finish_force_open_sessions(std::map<client_t,std::pair<Session*,uint64_t> >& smap, bool dec_import=true); + void close_forced_opened_sessions(const 
std::map<client_t,std::pair<Session*,uint64_t> >& smap); void flush_client_sessions(std::set<client_t>& client_set, MDSGatherBuilder& gather); void finish_flush_session(Session *session, version_t seq); void terminate_sessions(); diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index ba0b0817738..0f6038eb82b 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -615,6 +615,7 @@ void Session::dump(Formatter *f, bool cap_dump) const f->dump_unsigned("num_completed_requests", get_num_completed_requests()); f->dump_unsigned("num_completed_flushes", get_num_completed_flushes()); f->dump_bool("reconnecting", reconnecting); + f->dump_int("importing_count", importing_count); f->dump_object("recall_caps", recall_caps); f->dump_object("release_caps", release_caps); f->dump_object("recall_caps_throttle", recall_caps_throttle); diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 9e82f00a9bf..bfe7dcd4895 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -417,6 +417,10 @@ public: session_info_t info; ///< durable bits MDSAuthCaps auth_caps; + // True if the session is opened by the client. + // False if the session is forced to open, until it is opened again by the client. 
+ bool client_opened = false; + xlist<Session*>::item item_session_list; std::list<ceph::ref_t<Message>> preopen_out_queue; ///< messages for client, queued before they connect diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h index 96b2cb7d8b8..b001032225e 100644 --- a/src/messages/MClientCaps.h +++ b/src/messages/MClientCaps.h @@ -117,9 +117,9 @@ private: void set_ctime(const utime_t &t) { ctime = t; } void set_atime(const utime_t &t) { atime = t; } - void set_cap_peer(uint64_t id, ceph_seq_t seq, ceph_seq_t mseq, int mds, int flags) { + void set_cap_peer(uint64_t id, ceph_seq_t issue_seq, ceph_seq_t mseq, int mds, int flags) { peer.cap_id = id; - peer.seq = seq; + peer.issue_seq = issue_seq; peer.mseq = mseq; peer.mds = mds; peer.flags = flags; @@ -137,11 +137,12 @@ protected: inodeno_t ino, inodeno_t realm, uint64_t id, - long seq, + ceph_seq_t seq, int caps, int wanted, int dirty, - int mseq, + ceph_seq_t mseq, + ceph_seq_t issue_seq, epoch_t oeb) : SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION}, osd_epoch_barrier(oeb) { @@ -155,11 +156,12 @@ protected: head.wanted = wanted; head.dirty = dirty; head.migrate_seq = mseq; + head.issue_seq = issue_seq; memset(&peer, 0, sizeof(peer)); } MClientCaps(int op, inodeno_t ino, inodeno_t realm, - uint64_t id, int mseq, epoch_t oeb) + uint64_t id, ceph_seq_t mseq, epoch_t oeb) : SafeMessage{CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION}, osd_epoch_barrier(oeb) { memset(&head, 0, sizeof(head)); @@ -181,7 +183,8 @@ public: out << "client_caps(" << ceph_cap_op_name(head.op) << " ino " << inodeno_t(head.ino) << " " << head.cap_id - << " seq " << head.seq; + << " seq " << head.seq + << " issue_seq " << head.issue_seq; if (get_tid()) out << " tid " << get_tid(); out << " caps=" << ccap_string(head.caps) diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index d8cca4ceb61..f742303c6e9 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -758,6 +758,14 @@ bool 
MDSMonitor::prepare_beacon(MonOpRequestRef op) if (state == MDSMap::STATE_DNE) { dout(1) << __func__ << ": DNE from " << info << dendl; + + /* send a beacon reply so MDSDaemon::suicide() finishes the + Beacon::send_and_wait() call */ + auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid, + m->get_global_id(), m->get_name(), get_fsmap().get_epoch(), + m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT); + mon.send_reply(op, beacon.detach()); + goto evict; } diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc index 5598bbe8709..8d0540d71f2 100644 --- a/src/mon/MonMap.cc +++ b/src/mon/MonMap.cc @@ -196,7 +196,12 @@ void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const if (!HAVE_FEATURE(con_features, MONENC) || !HAVE_FEATURE(con_features, SERVER_NAUTILUS)) { for (auto& [name, info] : mon_info) { - legacy_mon_addr[name] = info.public_addrs.legacy_addr(); + // see note in mon_info_t::encode() + auto addr = info.public_addrs.legacy_addr(); + if (addr == entity_addr_t()) { + addr = info.public_addrs.as_legacy_addr(); + } + legacy_mon_addr[name] = addr; } } diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 9fda03b4905..719403925ad 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -16,7 +16,9 @@ #include "NVMeofGwMon.h" #include "NVMeofGwMap.h" #include "OSDMonitor.h" +#include "mon/health_check.h" +using std::list; using std::map; using std::make_pair; using std::ostream; @@ -893,6 +895,47 @@ struct CMonRequestProposal : public Context { } }; +void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const +{ + list<string> singleGatewayDetail; + list<string> gatewayDownDetail; + for (const auto& created_map_pair: created_gws) { + const auto& group_key = created_map_pair.first; + auto& group = group_key.second; + const NvmeGwMonStates& gw_created_map = created_map_pair.second; + if ( gw_created_map.size() == 1) { + ostringstream ss; + ss << "NVMeoF Gateway Group '" << group << "' has 1 gateway." 
; + singleGatewayDetail.push_back(ss.str()); + } + for (const auto& gw_created_pair: gw_created_map) { + const auto& gw_id = gw_created_pair.first; + const auto& gw_created = gw_created_pair.second; + if (gw_created.availability == gw_availability_t::GW_UNAVAILABLE) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ; + gatewayDownDetail.push_back(ss.str()); + } + } + } + if (!singleGatewayDetail.empty()) { + ostringstream ss; + ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway" + << "; HA is not possible with single gateway."; + auto& d = checks->add("NVMEOF_SINGLE_GATEWAY", HEALTH_WARN, + ss.str(), singleGatewayDetail.size()); + d.detail.swap(singleGatewayDetail); + } + if (!gatewayDownDetail.empty()) { + ostringstream ss; + ss << gatewayDownDetail.size() << " gateway(s) are in unavailable state" + << "; gateway might be down, try to redeploy."; + auto& d = checks->add("NVMEOF_GATEWAY_DOWN", HEALTH_WARN, + ss.str(), gatewayDownDetail.size()); + d.detail.swap(gatewayDownDetail); + } +} + int NVMeofGwMap::blocklist_gw( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId grpid, epoch_t &epoch, bool failover) diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 267d85b10f9..5f657733012 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -27,6 +27,9 @@ #include "NVMeofGwTypes.h" using ceph::coarse_mono_clock; + +class health_check_map_t; + class Monitor; /*-------------------*/ class NVMeofGwMap @@ -140,6 +143,8 @@ public: decode(fsm_timers, bl); DECODE_FINISH(bl); } + + void get_health_checks(health_check_map_t *checks) const; }; #include "NVMeofGwSerialize.h" diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 9d80c23a4c1..0fe5c3e655f 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -181,6 +181,11 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t) << HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl; 
put_version(t, pending_map.epoch, bl); put_last_committed(t, pending_map.epoch); + + //health + health_check_map_t checks; + pending_map.get_health_checks(&checks); + encode_health(checks, t); } void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) @@ -193,6 +198,7 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap) bufferlist bl; int err = get_version(version, bl); ceph_assert(err == 0); + load_health(); auto p = bl.cbegin(); map.decode(p); diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h index 6739968f4e2..5f8bbc172df 100644 --- a/src/msg/async/Stack.h +++ b/src/msg/async/Stack.h @@ -352,7 +352,7 @@ class NetworkStack { static constexpr int TASK_COMM_LEN = 16; char tp_name[TASK_COMM_LEN]; sprintf(tp_name, "msgr-worker-%u", id); - ceph_pthread_setname(pthread_self(), tp_name); + ceph_pthread_setname(tp_name); } protected: diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc index 12db599d684..789a624cf90 100644 --- a/src/msg/async/rdma/RDMAStack.cc +++ b/src/msg/async/rdma/RDMAStack.cc @@ -92,7 +92,6 @@ void RDMADispatcher::polling_start() ceph_assert(rx_cq); t = std::thread(&RDMADispatcher::polling, this); - ceph_pthread_setname(t.native_handle(), "rdma-polling"); } void RDMADispatcher::polling_stop() @@ -263,6 +262,7 @@ int RDMADispatcher::post_chunks_to_rq(int num, QueuePair *qp) void RDMADispatcher::polling() { + ceph_pthread_setname("rdma-polling"); static int MAX_COMPLETIONS = 32; ibv_wc wc[MAX_COMPLETIONS]; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 14d2f85f40f..44f8e85b5ef 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -6006,7 +6006,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) object_info_t& oi = obs.oi; const hobject_t& soid = oi.soid; const bool skip_data_digest = osd->store->has_builtin_csum() && - osd->osd_skip_data_digest; + *osd->osd_skip_data_digest; PGTransaction* t = ctx->op_t.get(); @@ -6069,9 +6069,9 @@ int 
PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes) if (op.op == CEPH_OSD_OP_ZERO && obs.exists && - op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.offset < *osd->osd_max_object_size && op.extent.length >= 1 && - op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) && + op.extent.length <= *osd->osd_max_object_size && op.extent.offset + op.extent.length >= oi.size) { if (op.extent.offset >= oi.size) { // no-op @@ -6781,7 +6781,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) } result = check_offset_and_length( op.extent.offset, op.extent.length, - static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + *osd->osd_max_object_size, get_dpp()); if (result < 0) break; @@ -6838,7 +6838,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) } result = check_offset_and_length( 0, op.extent.length, - static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + *osd->osd_max_object_size, get_dpp()); if (result < 0) break; @@ -6888,7 +6888,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) { // zero result = check_offset_and_length( op.extent.offset, op.extent.length, - static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + *osd->osd_max_object_size, get_dpp()); if (result < 0) break; @@ -6953,7 +6953,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) result = check_offset_and_length( op.extent.offset, op.extent.length, - static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp()); + *osd->osd_max_object_size, get_dpp()); if (result < 0) break; diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index c8cf27d2116..110c2c7d266 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -65,7 +65,7 @@ void OsdScrub::dump_scrubs(ceph::Formatter* f) const void 
OsdScrub::dump_scrub_reservations(ceph::Formatter* f) const { m_resource_bookkeeper.dump_scrub_reservations(f); - f->open_array_section("remote_scrub_reservations"); + f->open_object_section("remote_scrub_reservations"); m_osd_svc.get_scrub_reserver().dump(f); f->close_section(); } diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index cd80625aaec..c116bcbb4c2 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -140,9 +140,10 @@ bool ScrubQueue::remove_entry_unlocked(spg_t pgid, scrub_level_t s_or_d) void ScrubQueue::dump_scrubs(ceph::Formatter* f) const { ceph_assert(f != nullptr); + const auto query_time = ceph_clock_now(); f->open_array_section("scrubs"); for_each_job( - [&f](const Scrub::SchedEntry& e) { + [&f, query_time](const Scrub::SchedEntry& e) { f->open_object_section("scrub"); f->dump_stream("pgid") << e.pgid; f->dump_stream("sched_time") << e.schedule.not_before; @@ -151,6 +152,15 @@ void ScrubQueue::dump_scrubs(ceph::Formatter* f) const f->dump_bool( "forced", e.schedule.scheduled_at == PgScrubber::scrub_must_stamp()); + + f->dump_stream("level") << (e.level == scrub_level_t::shallow + ? "shallow" + : "deep"); + f->dump_stream("urgency") << fmt::format("{}", e.urgency); + f->dump_bool("eligible", e.schedule.not_before <= query_time); + f->dump_bool("overdue", e.schedule.deadline < query_time); + f->dump_stream("last_issue") << fmt::format("{}", e.last_issue); + f->close_section(); }, std::numeric_limits<int>::max()); diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index c37f31d28dc..aa53df5ae8a 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -3,13 +3,13 @@ #include "./pg_scrubber.h" // '.' 
notation used to affect clang-format order +#include <fmt/ranges.h> + #include <cmath> #include <iostream> #include <span> #include <vector> -#include <fmt/ranges.h> - #include "debug.h" #include "common/ceph_time.h" @@ -766,8 +766,13 @@ void PgScrubber::on_operator_periodic_cmd( asok_response_section(f, true, scrub_level, stamp); if (scrub_level == scrub_level_t::deep) { + const auto saved_shallow_stamp = m_pg->info.history.last_scrub_stamp; // this call sets both stamps m_pg->set_last_deep_scrub_stamp(stamp); + // restore the shallow stamp, as otherwise it will be scheduled before + // the deep, failing whatever test code called us (this is a test-only + // interface). + m_pg->set_last_scrub_stamp(saved_shallow_stamp); } else { m_pg->set_last_scrub_stamp(stamp); } @@ -819,21 +824,21 @@ namespace { * an aux function to be used in select_range() below, to * select the correct chunk size based on the type of scrub */ -int size_from_conf( +int64_t size_from_conf( bool is_deep, const ceph::common::ConfigProxy& conf, - std::string_view deep_opt, - std::string_view shallow_opt) + const md_config_cacher_t<int64_t>& deep_opt, + const md_config_cacher_t<int64_t>& shallow_opt) { if (!is_deep) { - auto sz = conf.get_val<int64_t>(shallow_opt); + auto sz = *shallow_opt; if (sz != 0) { // assuming '0' means that no distinction was yet configured between // deep and shallow scrubbing - return static_cast<int>(sz); + return sz; } } - return static_cast<int>(conf.get_val<int64_t>(deep_opt)); + return *deep_opt; } } // anonymous namespace @@ -912,16 +917,16 @@ std::optional<uint64_t> PgScrubber::select_range() dout(20) << fmt::format( "{} {} mins: {}d {}s, max: {}d {}s", __func__, (m_is_deep ? 
"D" : "S"), - conf.get_val<int64_t>("osd_scrub_chunk_min"), - conf.get_val<int64_t>("osd_shallow_scrub_chunk_min"), - conf.get_val<int64_t>("osd_scrub_chunk_max"), - conf.get_val<int64_t>("osd_shallow_scrub_chunk_max")) + *osd_scrub_chunk_min, + *osd_shallow_scrub_chunk_min, + *osd_scrub_chunk_max, + *osd_shallow_scrub_chunk_max) << dendl; - const int min_from_conf = size_from_conf( - m_is_deep, conf, "osd_scrub_chunk_min", "osd_shallow_scrub_chunk_min"); - const int max_from_conf = size_from_conf( - m_is_deep, conf, "osd_scrub_chunk_max", "osd_shallow_scrub_chunk_max"); + const int min_from_conf = static_cast<int>(size_from_conf( + m_is_deep, conf, osd_scrub_chunk_min, osd_shallow_scrub_chunk_min)); + const int max_from_conf = static_cast<int>(size_from_conf( + m_is_deep, conf, osd_scrub_chunk_max, osd_shallow_scrub_chunk_max)); const int divisor = static_cast<int>(preemption_data.chunk_divisor()); const int min_chunk_sz = std::max(3, min_from_conf / divisor); @@ -1635,7 +1640,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op) advance_token(); const auto& conf = m_pg->get_cct()->_conf; const int max_from_conf = size_from_conf( - m_is_deep, conf, "osd_scrub_chunk_max", "osd_shallow_scrub_chunk_max"); + m_is_deep, conf, osd_scrub_chunk_max, osd_shallow_scrub_chunk_max); auto cost = get_scrub_cost(max_from_conf); m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, @@ -2541,6 +2546,16 @@ PgScrubber::PgScrubber(PG* pg) , m_pg_id{pg->pg_id} , m_osds{m_pg->osd} , m_pg_whoami{pg->pg_whoami} + , osd_scrub_chunk_max{m_osds->cct->_conf, "osd_scrub_chunk_max"} + , osd_shallow_scrub_chunk_max{m_osds->cct->_conf, + "osd_shallow_scrub_chunk_max"} + , osd_scrub_chunk_min{m_osds->cct->_conf, "osd_scrub_chunk_min"} + , osd_shallow_scrub_chunk_min{m_osds->cct->_conf, + "osd_shallow_scrub_chunk_min"} + , osd_stats_update_period_scrubbing{ + m_osds->cct->_conf, "osd_stats_update_period_scrubbing"} + , osd_stats_update_period_not_scrubbing{ + m_osds->cct->_conf, 
"osd_stats_update_period_not_scrubbing"} , preemption_data{pg} { m_fsm = std::make_unique<ScrubMachine>(m_pg, this); @@ -2669,7 +2684,8 @@ const OSDMapRef& PgScrubber::get_osdmap() const LoggerSinkSet& PgScrubber::get_logger() const { return *m_osds->clog.get(); } -ostream &operator<<(ostream &out, const PgScrubber &scrubber) { +ostream& operator<<(ostream& out, const PgScrubber& scrubber) +{ return out << scrubber.m_flags; } @@ -2783,16 +2799,14 @@ void PgScrubber::update_scrub_stats(ceph::coarse_real_clock::time_point now_is) using clock = ceph::coarse_real_clock; using namespace std::chrono; - const seconds period_active = seconds(m_pg->get_cct()->_conf.get_val<int64_t>( - "osd_stats_update_period_scrubbing")); + const seconds period_active = seconds(*osd_stats_update_period_scrubbing); if (!period_active.count()) { // a way for the operator to disable these stats updates return; } - const seconds period_inactive = - seconds(m_pg->get_cct()->_conf.get_val<int64_t>( - "osd_stats_update_period_not_scrubbing") + - m_pg_id.pgid.m_seed % 30); + const seconds period_inactive = seconds( + *osd_stats_update_period_not_scrubbing + + m_pg_id.pgid.m_seed % 30); // determine the required update period, based on our current state auto period{period_inactive}; @@ -2826,10 +2840,10 @@ void PgScrubber::update_scrub_stats(ceph::coarse_real_clock::time_point now_is) // ///////////////////// preemption_data_t ////////////////////////////////// -PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg} +PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}, + osd_scrub_max_preemptions{pg->cct->_conf, "osd_scrub_max_preemptions"} { - m_left = static_cast<int>( - m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions")); + m_left = *osd_scrub_max_preemptions; } void PgScrubber::preemption_data_t::reset() @@ -2838,8 +2852,7 @@ void PgScrubber::preemption_data_t::reset() m_preemptable = false; m_preempted = false; - m_left = static_cast<int>( - 
m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions")); + m_left = *osd_scrub_max_preemptions; m_size_divisor = 1; } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 3d7e16cd359..0d9e8c1e9f6 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -75,6 +75,8 @@ Main Scrubber interfaces: #include <string_view> #include <vector> +#include "common/config_proxy.h" +#include "common/config_cacher.h" #include "osd/PG.h" #include "osd/scrubber_common.h" @@ -895,6 +897,24 @@ class PgScrubber : public ScrubPgIF, // scrub state. ceph::coarse_real_clock::time_point m_last_stat_upd{}; + // ------------------ cached (frequently used) configuration values + + /// initial (& max) number of objects to scrub in one pass - deep scrub + md_config_cacher_t<int64_t> osd_scrub_chunk_max; + /// initial (& max) number of objects to scrub in one pass - shallow + md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_max; + + /// chunk size won't be reduced (when preempted) below this + /// value (deep scrub) + md_config_cacher_t<int64_t> osd_scrub_chunk_min; + /// chunk size won't be reduced below this value (shallow scrub) + md_config_cacher_t<int64_t> osd_shallow_scrub_chunk_min; + + /// stats update (publish_stats_to_osd()) interval while scrubbing + md_config_cacher_t<int64_t> osd_stats_update_period_scrubbing; + /// stats update interval while not scrubbing + md_config_cacher_t<int64_t> osd_stats_update_period_not_scrubbing; + // ------------ members used if we are a replica epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message @@ -991,6 +1011,9 @@ class PgScrubber : public ScrubPgIF, mutable ceph::mutex m_preemption_lock = ceph::make_mutex("preemption_lock"); bool m_preemptable{false}; bool m_preempted{false}; + + /// the number of preemptions allowed before we start blocking + md_config_cacher_t<uint64_t> osd_scrub_max_preemptions; int m_left; size_t m_size_divisor{1}; bool 
are_preemptions_left() const { return m_left > 0; } diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index c98ce9aec41..476304275c1 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -325,6 +325,11 @@ class Module(MgrModule): type='str', default='', desc='pools which the automatic balancing will be limited to', + runtime=True), + Option(name='update_pg_upmap_activity', + type='bool', + default=False, + desc='Updates pg_upmap activity stats to be used in `balancer status detail`', runtime=True) ] @@ -339,12 +344,10 @@ class Module(MgrModule): no_optimization_needed = False success_string = 'Optimization plan created successfully' in_progress_string = 'in progress' - last_pg_upmap: List[Dict[str, Any]] = [] pg_upmap_items_added: List[Dict[str, Any]] = [] pg_upmap_items_removed: List[Dict[str, Any]] = [] - last_pg_upmap_primaries: List[Dict[str, Any]] = [] pg_upmap_primaries_added: List[Dict[str, Any]] = [] - pg_upmap_activity_initalized = False + pg_upmap_primaries_removed: List[Dict[str, Any]] = [] def __init__(self, *args: Any, **kwargs: Any) -> None: super(Module, self).__init__(*args, **kwargs) @@ -371,6 +374,11 @@ class Module(MgrModule): """ Show balancer status (detailed) """ + pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity')) + if not pg_upmap_activity: + msg = 'This command is disabled.\n' \ + 'To enable, run `ceph config set mgr mgr/balancer/update_pg_upmap_activity True`.\n' + return 0, msg, '' s = { 'plans': list(self.plans.keys()), 'active': self.active, @@ -665,7 +673,9 @@ class Module(MgrModule): if not plan_: return (-errno.ENOENT, '', f'plan {plan} not found') r, detail = self.execute(plan_) - self.update_pg_upmap_activity() # update pg activity in `balancer status detail` + pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity')) + if pg_upmap_activity: + self.update_pg_upmap_activity(plan_) # update pg activity in 
`balancer status detail` self.plan_rm(plan) return (r, '', detail) @@ -757,7 +767,9 @@ class Module(MgrModule): self.execute(plan) else: self.optimize_result = detail - self.update_pg_upmap_activity() # update pg activity in `balancer status detail` + pg_upmap_activity = cast(bool, self.get_module_option('update_pg_upmap_activity')) + if pg_upmap_activity: + self.update_pg_upmap_activity(plan) # update pg activity in `balancer status detail` self.optimizing = False self.log.debug('Sleeping for %d', sleep_interval) self.event.wait(sleep_interval) @@ -1582,22 +1594,16 @@ class Module(MgrModule): 'mode': self.mode, } - def update_pg_upmap_activity(self) -> None: - osdmap = self.get_osdmap() - if not self.pg_upmap_activity_initalized: - self.last_pg_upmap = osdmap.dump().get('pg_upmap_items', '') - self.last_pg_upmap_primaries = osdmap.dump().get('pg_upmap_primaries', '') - self.pg_upmap_activity_initalized = True + def update_pg_upmap_activity(self, plan: Plan) -> None: + incdump = plan.inc.dump() # update pg_upmap_items - self.pg_upmap_items_added = [pg for pg in osdmap.dump().get('pg_upmap_items', '') if pg not in self.last_pg_upmap] - self.pg_upmap_items_removed = [pg for pg in self.last_pg_upmap if pg not in osdmap.dump().get('pg_upmap_items', '')] - self.last_pg_upmap = osdmap.dump().get('pg_upmap_items', '') + self.pg_upmap_items_added = incdump.get('new_pg_upmap_items', []) + self.pg_upmap_items_removed = incdump.get('old_pg_upmap_items', []) # update pg_upmap_primaries - self.pg_upmap_primaries_added = [pg for pg in osdmap.dump().get('pg_upmap_primaries', '') if pg not in self.last_pg_upmap_primaries] - self.pg_upmap_primaries_removed = [pg for pg in self.last_pg_upmap_primaries if pg not in osdmap.dump().get('pg_upmap_primaries', '')] - self.last_pg_upmap_primaries = osdmap.dump().get('pg_upmap_primaries', '') + self.pg_upmap_primaries_added = incdump.get('new_pg_upmap_primaries', []) + self.pg_upmap_primaries_removed = incdump.get('old_pg_upmap_primaries', 
[]) def self_test(self) -> None: # turn balancer on diff --git a/src/pybind/mgr/cephadm/cert_mgr.py b/src/pybind/mgr/cephadm/cert_mgr.py index 9b68e85ca44..0c56c704788 100644 --- a/src/pybind/mgr/cephadm/cert_mgr.py +++ b/src/pybind/mgr/cephadm/cert_mgr.py @@ -1,6 +1,6 @@ from cephadm.ssl_cert_utils import SSLCerts, SSLConfigException -from typing import TYPE_CHECKING, Tuple, Union, List +from typing import TYPE_CHECKING, Tuple, Union, List, Optional if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -28,5 +28,10 @@ class CertMgr: def get_root_ca(self) -> str: return self.ssl_certs.get_root_cert() - def generate_cert(self, host_fqdn: Union[str, List[str]], node_ip: Union[str, List[str]]) -> Tuple[str, str]: - return self.ssl_certs.generate_cert(host_fqdn, node_ip) + def generate_cert( + self, + host_fqdn: Union[str, List[str]], + node_ip: Union[str, List[str]], + custom_san_list: Optional[List[str]] = None, + ) -> Tuple[str, str]: + return self.ssl_certs.generate_cert(host_fqdn, node_ip, custom_san_list=custom_san_list) diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 00410c1bc38..d3681cbe60a 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -638,6 +638,9 @@ class TunedProfileStore(): logger.error( f'Attempted to set setting "{setting}" for nonexistent os tuning profile "{profile}"') + def add_settings(self, profile: str, settings: dict) -> None: + self.process_settings(profile, settings, action='add') + def rm_setting(self, profile: str, setting: str) -> None: if profile in self.profiles: if setting in self.profiles[profile].settings: @@ -651,6 +654,39 @@ class TunedProfileStore(): logger.error( f'Attempted to remove setting "{setting}" from nonexistent os tuning profile "{profile}"') + def rm_settings(self, profile: str, settings: List[str]) -> None: + self.process_settings(profile, settings, action='remove') + + def process_settings(self, profile: str, 
settings: Union[dict, list], action: str) -> None: + """ + Process settings by either adding or removing them based on the action specified. + """ + if profile not in self.profiles: + logger.error(f'Attempted to {action} settings for nonexistent os tuning profile "{profile}"') + return + profile_settings = self.profiles[profile].settings + if action == 'remove' and isinstance(settings, list): + invalid_settings = [s for s in settings if '=' in s or s not in profile_settings] + if invalid_settings: + raise OrchestratorError( + f"Invalid settings: {', '.join(invalid_settings)}. " + "Ensure settings are specified without '=' and exist in the profile. Correct format: key1,key2" + ) + if action == 'add' and isinstance(settings, dict): + for setting, value in settings.items(): + self.profiles[profile].settings[setting] = value + elif action == 'remove' and isinstance(settings, list): + for setting in settings: + self.profiles[profile].settings.pop(setting, '') + else: + logger.error( + f'Invalid action "{action}" for settings modification for tuned profile ' + f'"{profile}". Valid actions are "add" and "remove"' + ) + return + self.profiles[profile]._last_updated = datetime_to_str(datetime_now()) + self.save() + def add_profile(self, spec: TunedProfileSpec) -> None: spec._last_updated = datetime_to_str(datetime_now()) self.profiles[spec.profile_name] = spec diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index e93a286cec6..e851f1ee3fc 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3499,6 +3499,33 @@ Then run the following: return f'Added setting {setting} with value {value} to tuned profile {profile_name}' @handle_orch_error + def tuned_profile_add_settings(self, profile_name: str, settings: dict) -> str: + if profile_name not in self.tuned_profiles: + raise OrchestratorError( + f"Tuned profile {profile_name} does not exist. Cannot add setting." 
+ ) + self.tuned_profiles.add_settings(profile_name, settings) + results = [ + f"Added setting {key} with value {value} to tuned profile {profile_name}" + for key, value in settings.items() + ] + self._kick_serve_loop() + return "\n".join(results) + + @handle_orch_error + def tuned_profile_rm_settings(self, profile_name: str, settings: List[str]) -> str: + if profile_name not in self.tuned_profiles: + raise OrchestratorError( + f"Tuned profile {profile_name} does not exist. Cannot remove setting." + ) + self.tuned_profiles.rm_settings(profile_name, settings) + results = [ + f'Removed setting {settings} from tuned profile {profile_name}' + ] + self._kick_serve_loop() + return "\n".join(results) + + @handle_orch_error def tuned_profile_rm_setting(self, profile_name: str, setting: str) -> str: if profile_name not in self.tuned_profiles: raise OrchestratorError( diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 4a7959ae045..8e9cd00fa81 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1436,8 +1436,24 @@ class CephadmServe: config_blobs=daemon_spec.final_config, ).dump_json_str(), use_current_daemon_image=reconfig, + error_ok=True ) + # return number corresponding to DAEMON_FAILED_ERROR + # in src/cephadm/cephadmlib/constants. + # TODO: link these together so one cannot be changed without the other + if code == 17: + # daemon failed on systemctl start command, meaning while + # deployment failed the daemon is present and we should handle + # this as if the deploy command "succeeded" and mark the daemon + # as failed later when we fetch its status + self.mgr.log.error(f'Deployment of {daemon_spec.name()} failed during "systemctl start" command') + elif code: + # some other failure earlier in the deploy process. 
Just raise an exception + # the same as we would in _run_cephadm on a nonzero rc + raise OrchestratorError( + f'cephadm exited with an error code: {code}, stderr: {err}') + if daemon_spec.daemon_type == 'agent': self.mgr.agent_cache.agent_timestamp[daemon_spec.host] = datetime_now() self.mgr.agent_cache.agent_counter[daemon_spec.host] = 1 diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 9043577bc5a..04f5af28a9b 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -1015,12 +1015,6 @@ class RgwService(CephService): # set rgw_realm rgw_zonegroup and rgw_zone, if present self.set_realm_zg_zone(spec) - if spec.generate_cert and not spec.rgw_frontend_ssl_certificate: - # generate a self-signed cert for the rgw service - cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames) - spec.rgw_frontend_ssl_certificate = ''.join([key, cert]) - self.mgr.spec_store.save(spec) - if spec.rgw_frontend_ssl_certificate: if isinstance(spec.rgw_frontend_ssl_certificate, list): cert_data = '\n'.join(spec.rgw_frontend_ssl_certificate) @@ -1068,6 +1062,19 @@ class RgwService(CephService): # and it matches the spec. 
port = spec.get_port() + if spec.generate_cert: + cert, key = self.mgr.cert_mgr.generate_cert( + daemon_spec.host, + self.mgr.inventory.get_addr(daemon_spec.host), + custom_san_list=spec.zonegroup_hostnames + ) + pem = ''.join([key, cert]) + ret, out, err = self.mgr.check_mon_command({ + 'prefix': 'config-key set', + 'key': f'rgw/cert/{daemon_spec.name()}', + 'val': pem, + }) + # configure frontend args = [] ftype = spec.rgw_frontend_type or "beast" @@ -1078,7 +1085,10 @@ class RgwService(CephService): f"ssl_endpoint={build_url(host=daemon_spec.ip, port=port).lstrip('/')}") else: args.append(f"ssl_port={port}") - args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}") + if spec.generate_cert: + args.append(f"ssl_certificate=config://rgw/cert/{daemon_spec.name()}") + else: + args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}") else: if daemon_spec.ip: args.append(f"endpoint={build_url(host=daemon_spec.ip, port=port).lstrip('/')}") @@ -1091,7 +1101,10 @@ class RgwService(CephService): args.append(f"port={build_url(host=daemon_spec.ip, port=port).lstrip('/')}s") else: args.append(f"port={port}s") # note the 's' suffix on port - args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}") + if spec.generate_cert: + args.append(f"ssl_certificate=config://rgw/cert/{daemon_spec.name()}") + else: + args.append(f"ssl_certificate=config://rgw/cert/{spec.service_name()}") else: if daemon_spec.ip: args.append(f"port={build_url(host=daemon_spec.ip, port=port).lstrip('/')}") @@ -1180,6 +1193,10 @@ class RgwService(CephService): 'who': utils.name_to_config_section(daemon.name()), 'name': 'rgw_frontends', }) + self.mgr.check_mon_command({ + 'prefix': 'config-key rm', + 'key': f'rgw/cert/{daemon.name()}', + }) def ok_to_stop( self, diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 13f75881453..418be93b6af 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ 
b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -66,6 +66,10 @@ class NvmeofService(CephService): daemon_spec.keyring = keyring daemon_spec.extra_files = {'ceph-nvmeof.conf': gw_conf} + # Indicate to the daemon whether to utilize huge pages + if spec.spdk_mem_size: + daemon_spec.extra_files['spdk_mem_size'] = str(spec.spdk_mem_size) + if spec.enable_auth: if ( not spec.client_cert diff --git a/src/pybind/mgr/cephadm/ssl_cert_utils.py b/src/pybind/mgr/cephadm/ssl_cert_utils.py index 930b276c8de..467b32a4df0 100644 --- a/src/pybind/mgr/cephadm/ssl_cert_utils.py +++ b/src/pybind/mgr/cephadm/ssl_cert_utils.py @@ -70,7 +70,12 @@ class SSLCerts: return (cert_str, key_str) - def generate_cert(self, _hosts: Union[str, List[str]], _addrs: Union[str, List[str]]) -> Tuple[str, str]: + def generate_cert( + self, + _hosts: Union[str, List[str]], + _addrs: Union[str, List[str]], + custom_san_list: Optional[List[str]] = None, + ) -> Tuple[str, str]: addrs = [_addrs] if isinstance(_addrs, str) else _addrs hosts = [_hosts] if isinstance(_hosts, str) else _hosts @@ -97,6 +102,8 @@ class SSLCerts: san_list: List[x509.GeneralName] = [x509.DNSName(host) for host in hosts] if valid_ips: san_list.extend(ips) + if custom_san_list: + san_list.extend([x509.DNSName(n) for n in custom_san_list]) builder = builder.add_extension( x509.SubjectAlternativeName( diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 760bc97e515..73f36952ee7 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -8,6 +8,7 @@ enable_auth = {{ spec.enable_auth }} state_update_notify = {{ spec.state_update_notify }} state_update_interval_sec = {{ spec.state_update_interval_sec }} enable_spdk_discovery_controller = {{ spec.enable_spdk_discovery_controller }} +enable_key_encryption = {{ spec.enable_key_encryption }} 
enable_prometheus_exporter = {{ spec.enable_prometheus_exporter }} prometheus_exporter_ssl = False prometheus_port = 10008 @@ -22,6 +23,10 @@ ping_spdk_under_lock = {{ spec.ping_spdk_under_lock }} enable_monitor_client = {{ spec.enable_monitor_client }} max_hosts_per_namespace = {{ spec.max_hosts_per_namespace }} max_namespaces_with_netmask = {{ spec.max_namespaces_with_netmask }} +max_subsystems = {{ spec.max_subsystems }} +max_namespaces = {{ spec.max_namespaces }} +max_namespaces_per_subsystem = {{ spec.max_namespaces_per_subsystem }} +max_hosts_per_subsystem = {{ spec.max_hosts_per_subsystem }} [gateway-logs] log_level = {{ spec.log_level }} @@ -65,6 +70,9 @@ protocol_log_level = {{ spec.spdk_protocol_log_level }} log_file_dir = {{ spec.spdk_log_file_dir }} {% endif %} conn_retries = {{ spec.conn_retries }} +{% if spec.spdk_mem_size %} +mem_size = {{ spec.spdk_mem_size }} +{% endif %} transports = {{ spec.transports }} {% if transport_tcp_options %} transport_tcp_options = {{ transport_tcp_options }} diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 6d0c00d408c..3c647476e44 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -136,7 +136,7 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str mock.call(host, 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), - mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False), + mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, error_ok=True, use_current_daemon_image=False), mock.call(host, 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), @@ -499,7 +499,7 @@ class TestCephadm(object): 
CephadmServe(cephadm_module)._check_daemons() - assert _save_host.called_with('test') + _save_host.assert_called_with('test') assert cephadm_module.cache.get_scheduled_daemon_action('test', daemon_name) is None @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -563,6 +563,7 @@ class TestCephadm(object): }, }, }), + error_ok=True, use_current_daemon_image=True, ) @@ -618,6 +619,7 @@ class TestCephadm(object): "crush_location": "datacenter=a", }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -660,6 +662,7 @@ class TestCephadm(object): "keyring": "[client.crash.test]\nkey = None\n", }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -702,6 +705,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -752,6 +756,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -806,6 +811,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2849,15 +2855,15 @@ Traceback (most recent call last): # pass force=true in these tests to bypass _admin label check with with_host(cephadm_module, 'test', refresh_hosts=False, rm_with_force=True): cephadm_module.drain_host('test', force=True, zap_osd_devices=False) - assert _rm_osds.called_with([], zap=False) + _rm_osds.assert_called_with([], zap=False) with with_host(cephadm_module, 'test', refresh_hosts=False, rm_with_force=True): cephadm_module.drain_host('test', force=True, zap_osd_devices=True) - assert _rm_osds.called_with([], zap=True) + _rm_osds.assert_called_with([], zap=True) with pytest.raises(OrchestratorError, match=r"Cannot find host 'host1' in the inventory."): cephadm_module.drain_host('host1', force=True, zap_osd_devices=True) - assert _rm_osds.called_with([], zap=True) + _rm_osds.assert_called_with([], zap=True) def test_process_ls_output(self, cephadm_module): sample_ls_output = """[ diff --git 
a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 824e37cf4d4..75c7c3c5bf7 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -349,6 +349,7 @@ log_to_file = False""" }, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -397,6 +398,7 @@ enable_auth = False state_update_notify = True state_update_interval_sec = 5 enable_spdk_discovery_controller = False +enable_key_encryption = True enable_prometheus_exporter = True prometheus_exporter_ssl = False prometheus_port = 10008 @@ -409,8 +411,12 @@ allowed_consecutive_spdk_ping_failures = 1 spdk_ping_interval_in_seconds = 2.0 ping_spdk_under_lock = False enable_monitor_client = True -max_hosts_per_namespace = 1 +max_hosts_per_namespace = 8 max_namespaces_with_netmask = 1000 +max_subsystems = 128 +max_namespaces = 1024 +max_namespaces_per_subsystem = 256 +max_hosts_per_subsystem = 32 [gateway-logs] log_level = INFO @@ -489,6 +495,7 @@ timeout = 1.0\n""" } } }), + error_ok=True, use_current_daemon_image=False, ) @@ -601,6 +608,7 @@ class TestMonitoring: "use_url_prefix": False, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -696,6 +704,7 @@ class TestMonitoring: "use_url_prefix": True, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -788,6 +797,7 @@ class TestMonitoring: "use_url_prefix": False, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -833,6 +843,7 @@ class TestMonitoring: "files": { "ceph-exporter.crt": "mycert", "ceph-exporter.key": "mykey"}}}), + error_ok=True, use_current_daemon_image=False) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -876,6 +887,7 @@ class TestMonitoring: }, "config_blobs": {} }), + error_ok=True, use_current_daemon_image=False, ) @@ -936,6 +948,7 @@ class TestMonitoring: 'web_config': '/etc/node-exporter/web.yml', } }), + error_ok=True, use_current_daemon_image=False, ) @@ -1071,6 +1084,7 @@ class TestMonitoring: 
"use_url_prefix": False }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1303,6 +1317,7 @@ class TestMonitoring: "use_url_prefix": False }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1379,6 +1394,7 @@ class TestMonitoring: }, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1438,6 +1454,7 @@ class TestMonitoring: }, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1588,6 +1605,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1718,6 +1736,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1831,6 +1850,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2005,6 +2025,7 @@ spec: }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2112,6 +2133,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2160,6 +2182,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2212,6 +2235,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2269,6 +2293,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3361,6 +3386,7 @@ class TestJaeger: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3401,6 +3427,7 @@ class TestJaeger: }, "config_blobs": es_config, }), + error_ok=True, use_current_daemon_image=False, ) with with_service(cephadm_module, collector_spec): @@ -3429,6 +3456,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3469,6 +3497,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + error_ok=True, use_current_daemon_image=False, ) with with_service(cephadm_module, agent_spec): @@ 
-3497,6 +3526,7 @@ class TestJaeger: }, "config_blobs": agent_config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3554,6 +3584,7 @@ class TestCustomContainer: }, } ), + error_ok=True, use_current_daemon_image=False, ) @@ -3641,6 +3672,7 @@ class TestCustomContainer: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -3694,6 +3726,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -3766,6 +3799,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4011,6 +4045,7 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4352,6 +4387,7 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4475,5 +4511,6 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 29ddff2ffc2..76ad8d9d0ce 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -1285,7 +1285,7 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin): if latest < version: raise RuntimeError(f"main.db version is newer ({version}) than module ({latest})") for i in range(version, latest): - self.log.info(f"upgrading main.db for {self.module_name} from {i-1}:{i}") + self.log.info(f"upgrading main.db for {self.module_name} from {i - 1}:{i}") for sql in self.SCHEMA_VERSIONED[i]: db.execute(sql) if version < latest: diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index efe0de55bd0..a505801eea5 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -901,10 
+901,18 @@ class Orchestrator(object): """Change/Add a specific setting for a tuned profile""" raise NotImplementedError() + def tuned_profile_add_settings(self, profile_name: str, setting: dict) -> OrchResult[str]: + """Change/Add multiple settings for a tuned profile""" + raise NotImplementedError() + def tuned_profile_rm_setting(self, profile_name: str, setting: str) -> OrchResult[str]: """Remove a specific setting for a tuned profile""" raise NotImplementedError() + def tuned_profile_rm_settings(self, profile_name: str, settings: List[str]) -> OrchResult[str]: + """Remove multiple settings from a tuned profile""" + raise NotImplementedError() + def upgrade_check(self, image: Optional[str], version: Optional[str]) -> OrchResult[str]: raise NotImplementedError() diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index fdb9280d7ed..332bc75d862 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -2250,6 +2250,39 @@ Usage: res = raise_if_exception(completion) return HandleCommandResult(stdout=res) + @_cli_write_command("orch tuned-profile add-settings") + def _tuned_profile_add_settings(self, profile_name: str, settings: str) -> HandleCommandResult: + try: + setting_pairs = settings.split(",") + # parse "key=value" pairs into a dict + parsed_setting = {key.strip(): value.strip() for key, value in (s.split('=', 1) for s in setting_pairs)} + completion = self.tuned_profile_add_settings(profile_name, parsed_setting) + res = raise_if_exception(completion) + return HandleCommandResult(stdout=res) + except ValueError: + error_message = ( + "Error: Invalid format detected. 
" + "The correct format is key=value pairs separated by commas," + "e.g., 'vm.swappiness=11,vm.user_reserve_kbytes=116851'" + ) + return HandleCommandResult(stderr=error_message) + + @_cli_write_command("orch tuned-profile rm-settings") + def _tuned_profile_rm_settings(self, profile_name: str, settings: str) -> HandleCommandResult: + try: + setting = [s.strip() for s in settings.split(",") if s.strip()] + if not setting: + raise ValueError( + "Error: Invalid format." + "The correct format is key1,key2" + "e.g., vm.swappiness,vm.user_reserve_kbytes" + ) + completion = self.tuned_profile_rm_settings(profile_name, setting) + res = raise_if_exception(completion) + return HandleCommandResult(stdout=res) + except ValueError as e: + return HandleCommandResult(stderr=str(e)) + def self_test(self) -> None: old_orch = self._select_orchestrator() self._set_backend('') diff --git a/src/python-common/CMakeLists.txt b/src/python-common/CMakeLists.txt index e89bbe2feef..08660342a6a 100644 --- a/src/python-common/CMakeLists.txt +++ b/src/python-common/CMakeLists.txt @@ -3,5 +3,5 @@ distutils_install_module(ceph) if(WITH_TESTS) include(AddCephTest) - add_tox_test(python-common TOX_ENVS py3 lint) + add_tox_test(python-common TOX_ENVS __tox_defaults__) endif() diff --git a/src/python-common/ceph/deployment/drive_group.py b/src/python-common/ceph/deployment/drive_group.py index c68ee01a728..43175aa79fb 100644 --- a/src/python-common/ceph/deployment/drive_group.py +++ b/src/python-common/ceph/deployment/drive_group.py @@ -2,7 +2,7 @@ import enum import yaml from ceph.deployment.inventory import Device -from ceph.deployment.service_spec import ( +from ceph.deployment.service_spec import ( # noqa: F401 (type comments) CustomConfig, GeneralArgList, PlacementSpec, @@ -11,7 +11,7 @@ from ceph.deployment.service_spec import ( from ceph.deployment.hostspec import SpecValidationError try: - from typing import Optional, List, Dict, Any, Union + from typing import Optional, List, Dict, Any, 
Union # noqa: F401 except ImportError: pass diff --git a/src/python-common/ceph/deployment/drive_selection/filter.py b/src/python-common/ceph/deployment/drive_selection/filter.py index 0da1b5c3901..28f63ddc2f2 100644 --- a/src/python-common/ceph/deployment/drive_selection/filter.py +++ b/src/python-common/ceph/deployment/drive_selection/filter.py @@ -15,12 +15,10 @@ logger = logging.getLogger(__name__) class FilterGenerator(object): - def __init__(self, device_filter): - # type: (DeviceSelection) -> None + def __init__(self, device_filter: DeviceSelection) -> None: self.device_filter = device_filter - def __iter__(self): - # type: () -> Generator[Matcher, None, None] + def __iter__(self) -> Generator[Matcher, None, None]: if self.device_filter.actuators: yield EqualityMatcher('actuators', self.device_filter.actuators) if self.device_filter.size: diff --git a/src/python-common/ceph/deployment/drive_selection/matchers.py b/src/python-common/ceph/deployment/drive_selection/matchers.py index df502410aeb..a6a2147ce9e 100644 --- a/src/python-common/ceph/deployment/drive_selection/matchers.py +++ b/src/python-common/ceph/deployment/drive_selection/matchers.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- -from typing import Tuple, Optional, Any, Union, Iterator +# TODO: remove noqa and update to python3/mypy style type annotations +from typing import Tuple, Optional, Any, Union, Iterator # noqa: F401 -from ceph.deployment.inventory import Device +from ceph.deployment.inventory import Device # noqa: F401 import re import logging diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py index 59ebbb6347e..85fc95cf394 100644 --- a/src/python-common/ceph/deployment/drive_selection/selector.py +++ b/src/python-common/ceph/deployment/drive_selection/selector.py @@ -3,7 +3,7 @@ import logging from typing import List, Optional, Dict, Callable from ..inventory import Device -from ..drive_group import 
DriveGroupSpec, DeviceSelection, DriveGroupValidationError +from ..drive_group import DriveGroupSpec, DeviceSelection, DriveGroupValidationError # noqa: F401 from .filter import FilterGenerator from .matchers import _MatchInvalid diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py index e2c1a5605f9..29475e94d82 100644 --- a/src/python-common/ceph/deployment/inventory.py +++ b/src/python-common/ceph/deployment/inventory.py @@ -1,5 +1,5 @@ try: - from typing import List, Optional, Dict, Any, Union + from typing import List, Optional, Dict, Any, Union # noqa: F401 except ImportError: pass # for type checking diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 979c14f7d00..10e9f267940 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -527,8 +527,8 @@ pattern_type=PatternType.fnmatch)) labels = [x for x in strings if 'label:' in x] if len(labels) > 1: raise SpecValidationError('more than one label provided: {}'.format(labels)) - for l in labels: - strings.remove(l) + for lbl in labels: + strings.remove(lbl) label = labels[0][6:] if labels else None host_patterns = strings @@ -701,7 +701,7 @@ class ArgumentSpec: if isinstance(data, str): return cls(data, split=True, origin=cls.OriginalType.STRING) if 'argument' not in data: - raise SpecValidationError(f'ArgumentSpec must have an "argument" field') + raise SpecValidationError('ArgumentSpec must have an "argument" field') for k in data.keys(): if k not in cls._fields: raise SpecValidationError(f'ArgumentSpec got an unknown field {k!r}') @@ -1313,6 +1313,10 @@ class RGWSpec(ServiceSpec): raise SpecValidationError('"ssl" field must be set to true when "generate_cert" ' 'is set to true') + if self.generate_cert and self.rgw_frontend_ssl_certificate: + raise SpecValidationError('"generate_cert" field and 
"rgw_frontend_ssl_certificate" ' + 'field are mutually exclusive') + yaml.add_representer(RGWSpec, ServiceSpec.yaml_representer) @@ -1330,6 +1334,7 @@ class NvmeofServiceSpec(ServiceSpec): state_update_notify: Optional[bool] = True, state_update_interval_sec: Optional[int] = 5, enable_spdk_discovery_controller: Optional[bool] = False, + enable_key_encryption: Optional[bool] = True, omap_file_lock_duration: Optional[int] = 20, omap_file_lock_retries: Optional[int] = 30, omap_file_lock_retry_sleep_interval: Optional[float] = 1.0, @@ -1340,14 +1345,20 @@ class NvmeofServiceSpec(ServiceSpec): allowed_consecutive_spdk_ping_failures: Optional[int] = 1, spdk_ping_interval_in_seconds: Optional[float] = 2.0, ping_spdk_under_lock: Optional[bool] = False, - max_hosts_per_namespace: Optional[int] = 1, + max_hosts_per_namespace: Optional[int] = 8, max_namespaces_with_netmask: Optional[int] = 1000, + max_subsystems: Optional[int] = 128, + max_namespaces: Optional[int] = 1024, + max_namespaces_per_subsystem: Optional[int] = 256, + max_hosts_per_subsystem: Optional[int] = 32, server_key: Optional[str] = None, server_cert: Optional[str] = None, client_key: Optional[str] = None, client_cert: Optional[str] = None, root_ca_cert: Optional[str] = None, + # unused and duplicate of tgt_path below, consider removing spdk_path: Optional[str] = None, + spdk_mem_size: Optional[int] = None, tgt_path: Optional[str] = None, spdk_timeout: Optional[float] = 60.0, spdk_log_level: Optional[str] = '', @@ -1409,6 +1420,8 @@ class NvmeofServiceSpec(ServiceSpec): self.state_update_interval_sec = state_update_interval_sec #: ``enable_spdk_discovery_controller`` SPDK or ceph-nvmeof discovery service self.enable_spdk_discovery_controller = enable_spdk_discovery_controller + #: ``enable_key_encryption`` encrypt DHCHAP and PSK keys before saving in OMAP + self.enable_key_encryption = enable_key_encryption #: ``enable_prometheus_exporter`` enables Prometheus exporter self.enable_prometheus_exporter = 
enable_prometheus_exporter #: ``verify_nqns`` enables verification of subsystem and host NQNs for validity @@ -1425,6 +1438,14 @@ class NvmeofServiceSpec(ServiceSpec): self.max_hosts_per_namespace = max_hosts_per_namespace #: ``max_namespaces_with_netmask`` max number of namespaces which are not auto visible self.max_namespaces_with_netmask = max_namespaces_with_netmask + #: ``max_subsystems`` max number of subsystems + self.max_subsystems = max_subsystems + #: ``max_namespaces`` max number of namespaces on all subsystems + self.max_namespaces = max_namespaces + #: ``max_namespaces_per_subsystem`` max number of namespaces per one subsystem + self.max_namespaces_per_subsystem = max_namespaces_per_subsystem + #: ``max_hosts_per_subsystem`` max number of hosts per subsystems + self.max_hosts_per_subsystem = max_hosts_per_subsystem #: ``allowed_consecutive_spdk_ping_failures`` # of ping failures before aborting gateway self.allowed_consecutive_spdk_ping_failures = allowed_consecutive_spdk_ping_failures #: ``spdk_ping_interval_in_seconds`` sleep interval in seconds between SPDK pings @@ -1443,8 +1464,10 @@ class NvmeofServiceSpec(ServiceSpec): self.client_cert = client_cert #: ``root_ca_cert`` CA cert for server/client certs self.root_ca_cert = root_ca_cert - #: ``spdk_path`` path to SPDK + #: ``spdk_path`` path is unused and duplicate of tgt_path below, consider removing self.spdk_path = spdk_path or '/usr/local/bin/nvmf_tgt' + #: ``spdk_mem_size`` memory size in MB for DPDK + self.spdk_mem_size = spdk_mem_size #: ``tgt_path`` nvmeof target path self.tgt_path = tgt_path or '/usr/local/bin/nvmf_tgt' #: ``spdk_timeout`` SPDK connectivity timeout @@ -1613,6 +1636,36 @@ class NvmeofServiceSpec(ServiceSpec): ): raise SpecValidationError("Log file directory backups can't be negative") + if (self.max_hosts_per_namespace and self.max_hosts_per_namespace < 0): + raise SpecValidationError("Max hosts per namespace can't be negative") + + if (self.max_namespaces_with_netmask and 
self.max_namespaces_with_netmask < 0): + raise SpecValidationError("Max namespaces with netmask can't be negative") + + if not isinstance(self.max_subsystems, int): + raise SpecValidationError("Max subsystems must be an integer") + + if self.max_subsystems <= 0: + raise SpecValidationError("Max subsystems must be greater than zero") + + if not isinstance(self.max_namespaces, int): + raise SpecValidationError("Max namespaces must be an integer") + + if self.max_namespaces <= 0: + raise SpecValidationError("Max namespaces must be greater than zero") + + if not isinstance(self.max_namespaces_per_subsystem, int): + raise SpecValidationError("Max namespaces per subsystem must be an integer") + + if self.max_namespaces_per_subsystem <= 0: + raise SpecValidationError("Max namespaces per subsystem must be greater than zero") + + if not isinstance(self.max_hosts_per_subsystem, int): + raise SpecValidationError("Max hosts per subsystem must be an integer") + + if self.max_hosts_per_subsystem <= 0: + raise SpecValidationError("Max hosts per subsystem must be greater than zero") + if ( self.monitor_timeout and self.monitor_timeout < 0.0 diff --git a/src/python-common/ceph/deployment/translate.py b/src/python-common/ceph/deployment/translate.py index 49fb17da725..9dfe7cfcf81 100644 --- a/src/python-common/ceph/deployment/translate.py +++ b/src/python-common/ceph/deployment/translate.py @@ -5,7 +5,7 @@ try: except ImportError: pass -from ceph.deployment.drive_selection.selector import DriveSelection +from ceph.deployment.drive_selection.selector import DriveSelection # noqa: F401 logger = logging.getLogger(__name__) diff --git a/src/python-common/ceph/fs/earmarking.py b/src/python-common/ceph/fs/earmarking.py index c5d4a59a4d5..f4fd4ddf96c 100644 --- a/src/python-common/ceph/fs/earmarking.py +++ b/src/python-common/ceph/fs/earmarking.py @@ -19,13 +19,25 @@ supported top-level scopes. 
import errno import enum import logging -from typing import List, NamedTuple, Optional, Tuple +from typing import List, NamedTuple, Optional, Tuple, Protocol log = logging.getLogger(__name__) XATTR_SUBVOLUME_EARMARK_NAME = 'user.ceph.subvolume.earmark' +class FSOperations(Protocol): + """Protocol class representing the file system operations earmarking + classes will perform. + """ + + def setxattr( + self, path: str, key: str, value: bytes, flags: int + ) -> None: ... + + def getxattr(self, path: str, key: str) -> bytes: ... + + class EarmarkTopScope(enum.Enum): NFS = "nfs" SMB = "smb" @@ -53,11 +65,11 @@ class EarmarkParseError(ValueError): class CephFSVolumeEarmarking: - def __init__(self, fs, path: str) -> None: + def __init__(self, fs: FSOperations, path: str) -> None: self.fs = fs self.path = path - def _handle_cephfs_error(self, e: Exception, action: str) -> None: + def _handle_cephfs_error(self, e: Exception, action: str) -> Optional[str]: if isinstance(e, ValueError): raise EarmarkException(errno.EINVAL, f"Invalid earmark specified: {e}") from e elif isinstance(e, OSError): @@ -135,7 +147,7 @@ class CephFSVolumeEarmarking: except Exception as e: return self._handle_cephfs_error(e, "getting") - def set_earmark(self, earmark: str): + def set_earmark(self, earmark: str) -> None: # Validate the earmark before attempting to set it if not self._validate_earmark(earmark): raise EarmarkException( diff --git a/src/python-common/ceph/tests/utils.py b/src/python-common/ceph/tests/utils.py index 04b8a4e3895..20a39e4666b 100644 --- a/src/python-common/ceph/tests/utils.py +++ b/src/python-common/ceph/tests/utils.py @@ -35,8 +35,7 @@ def _mk_device(rotational=True, )] -def _mk_inventory(devices): - # type: (Any) -> List[Device] +def _mk_inventory(devices: Any) -> List[Device]: devs = [] for dev_, name in zip(devices, map(chr, range(ord('a'), ord('z')))): dev = Device.from_json(dev_.to_json()) diff --git a/src/python-common/requirements-lint.txt 
b/src/python-common/requirements-lint.txt deleted file mode 100644 index 2a7142182c2..00000000000 --- a/src/python-common/requirements-lint.txt +++ /dev/null @@ -1,2 +0,0 @@ -flake8==3.7.8 -rstcheck==3.3.1 diff --git a/src/python-common/tox.ini b/src/python-common/tox.ini index 313a4334d51..e0b59c700ca 100644 --- a/src/python-common/tox.ini +++ b/src/python-common/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py3, mypy, lint +envlist = lint, rstcheck, mypy, py3 skip_missing_interpreters = true [testenv:py3] @@ -26,9 +26,13 @@ exclude = __pycache__ [testenv:lint] -deps = - -rrequirements-lint.txt +deps = + flake8 commands = flake8 {posargs:ceph} - rstcheck --report info --debug README.rst +[testenv:rstcheck] +deps = + rstcheck +commands = + rstcheck --report-level info README.rst diff --git a/src/rgw/driver/dbstore/README.md b/src/rgw/driver/dbstore/README.md index f7e5df331cc..2bf3391c8c4 100644 --- a/src/rgw/driver/dbstore/README.md +++ b/src/rgw/driver/dbstore/README.md @@ -15,23 +15,21 @@ Add below cmake option (enabled by default) ## Running Test cluster -Edit ceph.conf to add below option +Edit ceph.conf to add below options [client] rgw backend store = dbstore rgw config store = dbstore -Start vstart cluster +To start the `vstart` cluster, run the following cmd: - MON=1 RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -o rgw_config_store=dbstore -n -d + MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -n -d --rgw_store dbstore -The above vstart command brings up RGW server on dbstore. It creates default zonegroup, zone and few default users (eg., testid) to be used for s3 operations. +The above `vstart` command brings up the RGW server on DBStore without the need for MONs or OSDs. It creates a default zonegroup, zone, and few default users (e.g., `testid`) to be used for S3 operations, and generates database files in the `dev` subdirectory, by default, to store them. -`radosgw-admin` can be used to create and remove other users, zonegroups and zones. 
- - -By default, dbstore creates .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data and *'/var/lib/ceph/radosgw/dbstore-config.db'* file to store the configuration. This can be configured using below options in ceph.conf +`radosgw-admin` command can be used to create and remove other users, zonegroups and zones. +The location and prefix for the database files can be configured using the following options: [client] dbstore db dir = <path for the directory for storing the db backend store data> dbstore db name prefix = <prefix to the file names created by db backend store> diff --git a/src/rgw/driver/posix/README.md b/src/rgw/driver/posix/README.md index 02dc8dfbe85..73971edc86f 100644 --- a/src/rgw/driver/posix/README.md +++ b/src/rgw/driver/posix/README.md @@ -23,15 +23,15 @@ Edit ceph.conf to add below option rgw config store = dbstore rgw filter = posix -Start vstart cluster +To start the `vstart` cluster, run the following cmd: - MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -o rgw_config_store=dbstore -o rgw_filter=posix -n -d + MON=0 OSD=0 MDS=0 MGR=0 RGW=1 ../src/vstart.sh -n -d --rgw_store posix -The above vstart command brings up RGW server on POSIXDriver. It creates default zonegroup, zone and few default users (eg., testid) to be used for s3 operations. +The above vstart command brings up RGW server on POSIXDriver. It creates default zonegroup, zone and few default users (e.g., testid) to be used for s3 operations. -`radosgw-admin` can be used to create and remove other users, zonegroups and zones. +`radosgw-admin` command can be used to create and remove other users, zonegroups and zones. -By default, the directory exported is *'/tmp/rgw_posix_driver'*. This can be changed with the `rgw_posix_base_path` option, either in ceph.conf or on the vstart command line above. +By default, the directory exported, *'rgw_posix_driver'*, is created in the `dev` subdirectory. 
This can be changed with the `rgw_posix_base_path` option. -The POSIXDriver keeps a LMDB based cache of directories, so that it can provide ordered listings. This directory lives in `rgw_posix_database_root`, which by default is in *'/var/lib/ceph/radosgw'* +The POSIXDriver keeps a LMDB based cache of directories, so that it can provide ordered listings. This directory lives in `rgw_posix_database_root`, which by default is created in the `dev` subdirectory diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc index 7b31fd72bd4..5734284d1a3 100644 --- a/src/rgw/driver/rados/rgw_notify.cc +++ b/src/rgw/driver/rados/rgw_notify.cc @@ -21,6 +21,7 @@ #include "common/dout.h" #include "rgw_url.h" #include <chrono> +#include <fmt/format.h> #define dout_subsys ceph_subsys_rgw_notification @@ -769,9 +770,10 @@ public: }); // start the worker threads to do the actual queue processing - const std::string WORKER_THREAD_NAME = "notif-worker"; for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) { - workers.emplace_back([this]() { + workers.emplace_back([this,worker_id]() { + const auto thread_name = fmt::format("notif-worker-{}", worker_id); + ceph_pthread_setname(thread_name.c_str()); try { io_context.run(); } catch (const std::exception& err) { @@ -779,11 +781,6 @@ public: throw err; } }); - const auto thread_name = WORKER_THREAD_NAME+std::to_string(worker_id); - if (const auto rc = ceph_pthread_setname(workers.back().native_handle(), thread_name.c_str()); rc != 0) { - ldpp_dout(this, 1) << "ERROR: failed to set notification manager thread name to: " << thread_name - << ". 
error: " << rc << dendl; - } } ldpp_dout(this, 10) << "INfO: started notification manager with: " << worker_count << " workers" << dendl; } diff --git a/src/rgw/rgw_amqp.cc b/src/rgw/rgw_amqp.cc index 7504d47c6c9..5bc5d173c73 100644 --- a/src/rgw/rgw_amqp.cc +++ b/src/rgw/rgw_amqp.cc @@ -650,6 +650,9 @@ private: // (4) TODO reconnect on connection errors // (5) TODO cleanup timedout callbacks void run() noexcept { + // give the runner thread a name for easier debugging + ceph_pthread_setname("amqp_manager"); + amqp_frame_t frame; while (!stopped) { @@ -838,12 +841,6 @@ public: // This is to prevent rehashing so that iterators are not invalidated // when a new connection is added. connections.max_load_factor(10.0); - // give the runner thread a name for easier debugging - const char* thread_name = "amqp_manager"; - if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) { - ldout(cct, 1) << "ERROR: failed to set amqp manager thread name to: " << thread_name - << ". error: " << rc << dendl; - } } // non copyable diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index 68fb9a29278..1a59ba02999 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -2994,7 +2994,9 @@ void RGWAccessKey::decode_json(JSONObj *obj) { subuser = user.substr(pos + 1); } } - JSONDecoder::decode_json("active", active, obj); + if (bool tmp = false; JSONDecoder::decode_json("active", tmp, obj)) { + active = tmp; // update only if "active" is present + } JSONDecoder::decode_json("create_date", create_date, obj); } diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc index c0ec3dc2c55..0807993338d 100644 --- a/src/rgw/rgw_kafka.cc +++ b/src/rgw/rgw_kafka.cc @@ -503,6 +503,7 @@ private: } void run() noexcept { + ceph_pthread_setname("kafka_manager"); while (!stopped) { // publish all messages in the queue @@ -575,12 +576,6 @@ public: // This is to prevent rehashing so that iterators are not invalidated // when a new connection is added. 
connections.max_load_factor(10.0); - // give the runner thread a name for easier debugging - const char* thread_name = "kafka_manager"; - if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) { - ldout(cct, 1) << "ERROR: failed to set kafka manager thread name to: " << thread_name - << ". error: " << rc << dendl; - } } // non copyable diff --git a/src/rgw/rgw_lua_background.cc b/src/rgw/rgw_lua_background.cc index ef97a5d6f65..c5b815f93f5 100644 --- a/src/rgw/rgw_lua_background.cc +++ b/src/rgw/rgw_lua_background.cc @@ -83,11 +83,6 @@ void Background::start() { } started = true; runner = std::thread(&Background::run, this); - const char* thread_name = "lua_background"; - if (const auto rc = ceph_pthread_setname(runner.native_handle(), thread_name); rc != 0) { - ldout(cct, 1) << "ERROR: failed to set lua background thread name to: " << thread_name - << ". error: " << rc << dendl; - } } void Background::pause() { @@ -127,6 +122,7 @@ const BackgroundMapValue& Background::get_table_value(const std::string& key) co //(2) Executes the script //(3) Sleep (configurable) void Background::run() { + ceph_pthread_setname("lua_background"); const DoutPrefixProvider* const dpp = &dp; lua_state_guard lguard(cct->_conf->rgw_lua_max_memory_per_state, dpp); auto L = lguard.get(); diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 67829e6320a..0dcf1e0f7d5 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -6811,6 +6811,8 @@ void RGWCompleteMultipart::execute(optional_yield y) if (upload->cksum_type != rgw::cksum::Type::none) { op_ret = try_sum_part_cksums(this, s->cct, upload.get(), parts, cksum, y); if (op_ret < 0) { + ldpp_dout(this, 16) << "ERROR: try_sum_part_cksums failed, obj=" + << meta_obj << " ret=" << op_ret << dendl; return; } } @@ -6835,13 +6837,23 @@ void RGWCompleteMultipart::execute(optional_yield y) rgw::putobj::find_hdr_cksum(*(s->info.env)); ldpp_dout_fmt(this, 10, - "INFO: client supplied checksum {}: {}", + "INFO: 
client supplied checksum {}: {} ", hdr_cksum.header_name(), supplied_cksum); if (! (supplied_cksum.empty()) && (supplied_cksum != armored_cksum)) { - op_ret = -ERR_INVALID_REQUEST; - return; + /* some minio SDK clients assert a checksum that is cryptographically + * valid but omits the part count */ + auto parts_suffix = fmt::format("-{}", parts->parts.size()); + auto suffix_len = armored_cksum->size() - parts_suffix.size(); + if (armored_cksum->compare(0, suffix_len, supplied_cksum) != 0) { + ldpp_dout_fmt(this, 4, + "{} content checksum mismatch" + "\n\tcalculated={} != \n\texpected={}", + hdr_cksum.header_name(), armored_cksum, supplied_cksum); + op_ret = -ERR_INVALID_REQUEST; + return; + } } buffer::list cksum_bl; diff --git a/src/rgw/rgw_ratelimit.h b/src/rgw/rgw_ratelimit.h index 0db1813f050..beb0eb3b1d2 100644 --- a/src/rgw/rgw_ratelimit.h +++ b/src/rgw/rgw_ratelimit.h @@ -239,6 +239,7 @@ class ActiveRateLimiter : public DoutPrefix { std::atomic_uint8_t current_active = 0; std::shared_ptr<RateLimiter> ratelimit[2]; void replace_active() { + ceph_pthread_setname("ratelimit_gc"); using namespace std::chrono_literals; std::unique_lock<std::mutex> lk(cv_m); while (!stopped) { @@ -286,8 +287,5 @@ class ActiveRateLimiter : public DoutPrefix { void start() { ldpp_dout(this, 20) << "starting ratelimit_gc thread" << dendl; runner = std::thread(&ActiveRateLimiter::replace_active, this); - if (const auto rc = ceph_pthread_setname(runner.native_handle(), "ratelimit_gc"); rc != 0) { - ldpp_dout(this, 1) << "ERROR: failed to set ratelimit_gc thread name. 
error: " << rc << dendl; - } } }; diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc index c0345a4f88a..adfc86d87cb 100644 --- a/src/rgw/rgw_rest_pubsub.cc +++ b/src/rgw/rgw_rest_pubsub.cc @@ -494,11 +494,11 @@ void RGWPSListTopicsOp::execute(optional_yield y) { const RGWPubSub ps(driver, get_account_or_tenant(s->owner.id), *s->penv.site); if (rgw::all_zonegroups_support(*s->penv.site, rgw::zone_features::notification_v2) && - driver->stat_topics_v1(s->bucket->get_tenant(), null_yield, this) == -ENOENT) { - op_ret = ps.get_topics_v1(this, result, y); - } else { + driver->stat_topics_v1(get_account_or_tenant(s->owner.id), null_yield, this) == -ENOENT) { constexpr int max_items = 100; op_ret = ps.get_topics_v2(this, start_token, max_items, result, next_token, y); + } else { + op_ret = ps.get_topics_v1(this, result, y); } // if there are no topics it is not considered an error op_ret = op_ret == -ENOENT ? 0 : op_ret; diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index a245fca9945..68223405cf4 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -2399,11 +2399,17 @@ static void dump_bucket_metadata(req_state *s, rgw::sal::Bucket* bucket, if (s->auth.identity->is_owner_of(bucket->get_owner())) { const auto& user_info = s->user->get_info(); const auto& bucket_quota = s->bucket->get_info().quota; // bucket quota - dump_header(s, "X-RGW-Quota-User-Size", static_cast<long long>(user_info.quota.user_quota.max_size)); - dump_header(s, "X-RGW-Quota-User-Objects", static_cast<long long>(user_info.quota.user_quota.max_objects)); dump_header(s, "X-RGW-Quota-Max-Buckets", static_cast<long long>(user_info.max_buckets)); - dump_header(s, "X-RGW-Quota-Bucket-Size", static_cast<long long>(bucket_quota.max_size)); - dump_header(s, "X-RGW-Quota-Bucket-Objects", static_cast<long long>(bucket_quota.max_objects)); + + if (user_info.quota.user_quota.enabled){ + dump_header(s, "X-RGW-Quota-User-Size", static_cast<long 
long>(user_info.quota.user_quota.max_size)); + dump_header(s, "X-RGW-Quota-User-Objects", static_cast<long long>(user_info.quota.user_quota.max_objects)); + } + + if (bucket_quota.enabled){ + dump_header(s, "X-RGW-Quota-Bucket-Size", static_cast<long long>(bucket_quota.max_size)); + dump_header(s, "X-RGW-Quota-Bucket-Objects", static_cast<long long>(bucket_quota.max_objects)); + } } } diff --git a/src/test/admin_socket.cc b/src/test/admin_socket.cc index 69a3cbefd0e..dea29f96f11 100644 --- a/src/test/admin_socket.cc +++ b/src/test/admin_socket.cc @@ -27,6 +27,8 @@ #include <sys/un.h> #include <signal.h> +#include <iostream> // for std::cout + using namespace std; class AdminSocketTest diff --git a/src/test/admin_socket_output.h b/src/test/admin_socket_output.h index 1df12e4a9a5..5d22e8757ee 100644 --- a/src/test/admin_socket_output.h +++ b/src/test/admin_socket_output.h @@ -16,6 +16,7 @@ #define CEPH_ADMIN_SOCKET_OUTPUT_H #include <filesystem> +#include <iostream> // for std::cout #include <string> #include <map> #include <set> diff --git a/src/test/bench_log.cc b/src/test/bench_log.cc index 60fda462e87..9e7c02afc41 100644 --- a/src/test/bench_log.cc +++ b/src/test/bench_log.cc @@ -1,6 +1,8 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab +#include <iostream> // for std::cout + #include "include/types.h" #include "common/Thread.h" #include "common/debug.h" diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 013335d8177..4b3ca95ca6c 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -23,6 +23,8 @@ #include <errno.h> #include <sys/uio.h> +#include <iostream> // for std::cout + #include "include/buffer.h" #include "include/buffer_raw.h" #include "include/compat.h" diff --git a/src/test/ceph_argparse.cc b/src/test/ceph_argparse.cc index 738879c5ba8..436ddc86363 100644 --- a/src/test/ceph_argparse.cc +++ b/src/test/ceph_argparse.cc @@ -15,6 +15,7 @@ #include "common/ceph_argparse.h" 
#include "gtest/gtest.h" +#include <iostream> // for std::cout #include <vector> #include "include/stringify.h" diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc index b36d0a901de..6ca05f6dae3 100644 --- a/src/test/common/Throttle.cc +++ b/src/test/common/Throttle.cc @@ -23,6 +23,7 @@ #include <signal.h> #include <chrono> +#include <iostream> // for std::cout #include <list> #include <mutex> #include <random> diff --git a/src/test/common/test_cdc.cc b/src/test/common/test_cdc.cc index 620ecf4679f..61a5aa3708c 100644 --- a/src/test/common/test_cdc.cc +++ b/src/test/common/test_cdc.cc @@ -3,6 +3,7 @@ #include <vector> #include <cstring> +#include <iostream> // for std::cout #include <random> #include "include/types.h" diff --git a/src/test/common/test_config.cc b/src/test/common/test_config.cc index a70d567a434..4805c14a32e 100644 --- a/src/test/common/test_config.cc +++ b/src/test/common/test_config.cc @@ -19,6 +19,9 @@ * * */ + +#include <iostream> // for std::cout + #include "common/config_proxy.h" #include "common/errno.h" #include "gtest/gtest.h" diff --git a/src/test/common/test_context.cc b/src/test/common/test_context.cc index 889d000da85..8afded98951 100644 --- a/src/test/common/test_context.cc +++ b/src/test/common/test_context.cc @@ -19,6 +19,9 @@ * * */ + +#include <iostream> // for std::cout + #include "gtest/gtest.h" #include "include/types.h" #include "include/msgr.h" diff --git a/src/test/common/test_shared_cache.cc b/src/test/common/test_shared_cache.cc index 91120c7e59f..b7a392426d5 100644 --- a/src/test/common/test_shared_cache.cc +++ b/src/test/common/test_shared_cache.cc @@ -22,6 +22,9 @@ #include <stdio.h> #include <signal.h> + +#include <iostream> // for std::cout + #include "gtest/gtest.h" #include "common/Thread.h" #include "common/shared_cache.hpp" diff --git a/src/test/common/test_url_escape.cc b/src/test/common/test_url_escape.cc index 6c27b64da7a..52de8db8d9c 100644 --- a/src/test/common/test_url_escape.cc +++ 
b/src/test/common/test_url_escape.cc @@ -3,6 +3,8 @@ #include "common/url_escape.h" +#include <iostream> // for std::cout + #include "gtest/gtest.h" TEST(url_escape, escape) { diff --git a/src/test/compressor/test_compression.cc b/src/test/compressor/test_compression.cc index 98ef159dfb8..c5e4724cefc 100644 --- a/src/test/compressor/test_compression.cc +++ b/src/test/compressor/test_compression.cc @@ -17,6 +17,9 @@ #include <errno.h> #include <signal.h> #include <stdlib.h> + +#include <iostream> // for std::cout + #include "gtest/gtest.h" #include "common/ceph_context.h" #include "common/config.h" diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc index e7aabf2c8af..5dbc3748e5b 100644 --- a/src/test/crimson/seastore/test_object_data_handler.cc +++ b/src/test/crimson/seastore/test_object_data_handler.cc @@ -218,14 +218,20 @@ struct object_data_handler_test_t: objaddr_t offset, extent_len_t length) { auto ret = with_trans_intr(t, [&](auto &t) { - return tm->get_pins(t, laddr_t::from_byte_offset(offset), length); + auto &layout = onode->get_layout(); + auto odata = layout.object_data.get(); + auto obase = odata.get_reserved_data_base(); + return tm->get_pins(t, (obase + offset).checked_to_laddr(), length); }).unsafe_get(); return ret; } std::list<LBAMappingRef> get_mappings(objaddr_t offset, extent_len_t length) { auto t = create_mutate_transaction(); auto ret = with_trans_intr(*t, [&](auto &t) { - return tm->get_pins(t, laddr_t::from_byte_offset(offset), length); + auto &layout = onode->get_layout(); + auto odata = layout.object_data.get(); + auto obase = odata.get_reserved_data_base(); + return tm->get_pins(t, (obase + offset).checked_to_laddr(), length); }).unsafe_get(); return ret; } @@ -253,12 +259,16 @@ struct object_data_handler_test_t: ObjectDataBlockRef get_extent( Transaction &t, - laddr_t addr, + loffset_t addr, extent_len_t len) { + auto &layout = onode->get_layout(); + auto odata = 
layout.object_data.get(); + auto obase = odata.get_reserved_data_base(); auto ext = with_trans_intr(t, [&](auto& trans) { - return tm->read_extent<ObjectDataBlock>(trans, addr, len); - }).unsafe_get(); - EXPECT_EQ(addr, ext->get_laddr()); + return tm->read_extent<ObjectDataBlock>( + trans, (obase + addr).checked_to_laddr(), len); + }).unsafe_get(); + EXPECT_EQ((obase + addr).checked_to_laddr(), ext->get_laddr()); return ext; } @@ -798,7 +808,7 @@ TEST_P(object_data_handler_test_t, overwrite_then_read_within_transaction) { auto pins = get_mappings(*t, base, len); assert(pins.size() == 1); auto pin1 = remap_pin(*t, std::move(pins.front()), 4096, 8192); - auto ext = get_extent(*t, laddr_t::from_byte_offset(base + 4096), 4096 * 2); + auto ext = get_extent(*t, base + 4096, 4096 * 2); ASSERT_TRUE(ext->is_exist_clean()); write(*t, base + 4096, 4096, 'y'); ASSERT_TRUE(ext->is_exist_mutation_pending()); diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc index 6ad111dca5b..2d20c5fff94 100644 --- a/src/test/crimson/seastore/test_transaction_manager.cc +++ b/src/test/crimson/seastore/test_transaction_manager.cc @@ -26,6 +26,10 @@ namespace { } } +laddr_t get_laddr_hint(uint64_t offset) { + return laddr_t::from_byte_offset(RootMetaBlock::SIZE + offset); +} + struct test_extent_record_t { test_extent_desc_t desc; unsigned refcount = 0; @@ -67,8 +71,9 @@ struct transaction_manager_test_t : } laddr_t get_random_laddr(size_t block_size, size_t limit) { - return laddr_t::from_byte_offset(block_size * - std::uniform_int_distribution<>(0, (limit / block_size) - 1)(gen)); + auto offset = block_size * + std::uniform_int_distribution<>(0, (limit / block_size) - 1)(gen); + return get_laddr_hint(offset); } char get_random_contents() { @@ -719,7 +724,7 @@ struct transaction_manager_test_t : [this, &overlay](auto &t) { return lba_manager->scan_mappings( t, - L_ADDR_MIN, + get_laddr_hint(0), L_ADDR_MAX, 
[iter=overlay.begin(), &overlay](auto l, auto p, auto len) mutable { EXPECT_NE(iter, overlay.end()); @@ -830,9 +835,9 @@ struct transaction_manager_test_t : auto t = create_transaction(); auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * BSIZE), + get_laddr_hint(i * BSIZE), BSIZE); - ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr()); + ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr()); submit_transaction(std::move(t)); } @@ -844,7 +849,7 @@ struct transaction_manager_test_t : boost::make_counting_iterator(0lu), boost::make_counting_iterator(BLOCKS), [this, &t](auto i) { - return tm->read_extent<TestBlock>(t, laddr_t::from_byte_offset(i * BSIZE), BSIZE + return tm->read_extent<TestBlock>(t, get_laddr_hint(i * BSIZE), BSIZE ).si_then([](auto) { return seastar::now(); }); @@ -870,9 +875,9 @@ struct transaction_manager_test_t : auto t = create_transaction(); auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * BSIZE), + get_laddr_hint(i * BSIZE), BSIZE); - ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr()); + ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr()); if (try_submit_transaction(std::move(t))) break; } @@ -973,6 +978,7 @@ struct transaction_manager_test_t : extent_types_t::ROOT, extent_types_t::LADDR_INTERNAL, extent_types_t::LADDR_LEAF, + extent_types_t::ROOT_META, extent_types_t::OMAP_INNER, extent_types_t::OMAP_LEAF, extent_types_t::ONODE_BLOCK_STAGED, @@ -1346,9 +1352,9 @@ struct transaction_manager_test_t : void test_remap_pin() { run_async([this] { disable_max_extent_size(); - laddr_t l_offset = laddr_t::from_byte_offset(32 << 10); + laddr_t l_offset = get_laddr_hint(32 << 10); size_t l_len = 32 << 10; - laddr_t r_offset = laddr_t::from_byte_offset(64 << 10); + laddr_t r_offset = get_laddr_hint(64 << 10); size_t r_len = 32 << 10; { auto t = create_transaction(); @@ -1400,12 +1406,12 @@ struct transaction_manager_test_t : void test_clone_and_remap_pin() { run_async([this] { 
disable_max_extent_size(); - laddr_t l_offset = laddr_t::from_byte_offset(32 << 10); + laddr_t l_offset = get_laddr_hint(32 << 10); size_t l_len = 32 << 10; - laddr_t r_offset = laddr_t::from_byte_offset(64 << 10); + laddr_t r_offset = get_laddr_hint(64 << 10); size_t r_len = 32 << 10; - laddr_t l_clone_offset = laddr_t::from_byte_offset(96 << 10); - laddr_t r_clone_offset = laddr_t::from_byte_offset(128 << 10); + laddr_t l_clone_offset = get_laddr_hint(96 << 10); + laddr_t r_clone_offset = get_laddr_hint(128 << 10); { auto t = create_transaction(); auto lext = alloc_extent(t, l_offset, l_len); @@ -1455,11 +1461,11 @@ struct transaction_manager_test_t : void test_overwrite_pin() { run_async([this] { disable_max_extent_size(); - laddr_t m_offset = laddr_t::from_byte_offset(8 << 10); + laddr_t m_offset = get_laddr_hint(8 << 10); size_t m_len = 56 << 10; - laddr_t l_offset = laddr_t::from_byte_offset(64 << 10); + laddr_t l_offset = get_laddr_hint(64 << 10); size_t l_len = 64 << 10; - laddr_t r_offset = laddr_t::from_byte_offset(128 << 10); + laddr_t r_offset = get_laddr_hint(128 << 10); size_t r_len = 64 << 10; { auto t = create_transaction(); @@ -1538,7 +1544,7 @@ struct transaction_manager_test_t : run_async([this] { disable_max_extent_size(); constexpr unsigned REMAP_NUM = 32; - constexpr laddr_t offset = L_ADDR_MIN; + laddr_t offset = get_laddr_hint(0); constexpr size_t length = 256 << 10; { auto t = create_transaction(); @@ -1575,7 +1581,7 @@ struct transaction_manager_test_t : if (off == 0 || off >= 255) { continue; } - auto new_off = laddr_t::from_byte_offset(off << 10) + auto new_off = get_laddr_hint(off << 10) .get_byte_distance<extent_len_t>(last_pin->get_key()); auto new_len = last_pin->get_length() - new_off; //always remap right extent at new split_point @@ -1621,7 +1627,7 @@ struct transaction_manager_test_t : run_async([this] { disable_max_extent_size(); constexpr unsigned REMAP_NUM = 32; - constexpr laddr_t offset = L_ADDR_MIN; + laddr_t offset = 
get_laddr_hint(0); constexpr size_t length = 256 << 10; { auto t = create_transaction(); @@ -1661,12 +1667,12 @@ struct transaction_manager_test_t : ASSERT_TRUE(!split_points.empty()); while(!split_points.empty()) { // new overwrite area: start_off ~ end_off - auto start_off = split_points.front(); + auto start_off = split_points.front() + 4 /*RootMetaBlock*/; split_points.pop_front(); - auto end_off = split_points.front(); + auto end_off = split_points.front() + 4 /*RootMetaBlock*/; split_points.pop_front(); ASSERT_TRUE(start_off <= end_off); - if ((laddr_t::from_byte_offset(end_off << 10) == pin0->get_key() + pin0->get_length()) + if ((get_laddr_hint(end_off << 10) == pin0->get_key() + pin0->get_length()) || (start_off == end_off)) { if (split_points.empty() && empty_transaction) { early_exit++; @@ -1675,7 +1681,7 @@ struct transaction_manager_test_t : continue; } empty_transaction = false; - auto new_off = laddr_t::from_byte_offset(start_off << 10) + auto new_off = get_laddr_hint(start_off << 10) .get_byte_distance<extent_len_t>(last_rpin->get_key()); auto new_len = (end_off - start_off) << 10; bufferlist bl; @@ -1768,7 +1774,7 @@ struct tm_random_block_device_test_t : TEST_P(tm_random_block_device_test_t, scatter_allocation) { run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * 4096); + laddr_t ADDR = get_laddr_hint(0xFF * 4096); epm->prefill_fragmented_devices(); auto t = create_transaction(); for (int i = 0; i < 1991; i++) { @@ -1786,7 +1792,7 @@ TEST_P(tm_single_device_test_t, basic) { constexpr size_t SIZE = 4096; run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE); + laddr_t ADDR = get_laddr_hint(0xFF * SIZE); { auto t = create_transaction(); auto extent = alloc_extent( @@ -1807,7 +1813,7 @@ TEST_P(tm_single_device_test_t, mutate) { constexpr size_t SIZE = 4096; run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE); + laddr_t ADDR = get_laddr_hint(0xFF * SIZE); { auto t = create_transaction(); 
auto extent = alloc_extent( @@ -1845,8 +1851,8 @@ TEST_P(tm_single_device_test_t, allocate_lba_conflict) { constexpr size_t SIZE = 4096; run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE); - laddr_t ADDR2 = laddr_t::from_byte_offset(0xFE * SIZE); + laddr_t ADDR = get_laddr_hint(0xFF * SIZE); + laddr_t ADDR2 = get_laddr_hint(0xFE * SIZE); auto t = create_transaction(); auto t2 = create_transaction(); @@ -1883,7 +1889,7 @@ TEST_P(tm_single_device_test_t, mutate_lba_conflict) for (unsigned i = 0; i < 300; ++i) { auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * SIZE), + get_laddr_hint(i * SIZE), SIZE); } check_mappings(t); @@ -1891,7 +1897,7 @@ TEST_P(tm_single_device_test_t, mutate_lba_conflict) check(); } - laddr_t ADDR = laddr_t::from_byte_offset(150 * SIZE); + laddr_t ADDR = get_laddr_hint(150 * SIZE); { auto t = create_transaction(); auto t2 = create_transaction(); @@ -1917,15 +1923,15 @@ TEST_P(tm_single_device_test_t, concurrent_mutate_lba_no_conflict) { constexpr size_t SIZE = 4096; constexpr size_t NUM = 500; - laddr_t addr = L_ADDR_MIN; - laddr_t addr2 = laddr_t::from_byte_offset(SIZE * (NUM - 1)); + laddr_t addr = get_laddr_hint(0); + laddr_t addr2 = get_laddr_hint(SIZE * (NUM - 1)); run_async([this, addr, addr2] { { auto t = create_transaction(); for (unsigned i = 0; i < NUM; ++i) { auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * SIZE), + get_laddr_hint(i * SIZE), SIZE); } submit_transaction(std::move(t)); @@ -1949,7 +1955,7 @@ TEST_P(tm_single_device_test_t, create_remove_same_transaction) { constexpr size_t SIZE = 4096; run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE); + laddr_t ADDR = get_laddr_hint(0xFF * SIZE); { auto t = create_transaction(); auto extent = alloc_extent( @@ -1985,7 +1991,7 @@ TEST_P(tm_single_device_test_t, split_merge_read_same_transaction) for (unsigned i = 0; i < 300; ++i) { auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * SIZE), + 
get_laddr_hint(i * SIZE), SIZE); } check_mappings(t); @@ -1997,7 +2003,7 @@ TEST_P(tm_single_device_test_t, split_merge_read_same_transaction) for (unsigned i = 0; i < 240; ++i) { dec_ref( t, - laddr_t::from_byte_offset(i * SIZE)); + get_laddr_hint(i * SIZE)); } check_mappings(t); submit_transaction(std::move(t)); @@ -2010,7 +2016,7 @@ TEST_P(tm_single_device_test_t, inc_dec_ref) { constexpr size_t SIZE = 4096; run_async([this] { - laddr_t ADDR = laddr_t::from_byte_offset(0xFF * SIZE); + laddr_t ADDR = get_laddr_hint(0xFF * SIZE); { auto t = create_transaction(); auto extent = alloc_extent( @@ -2061,10 +2067,10 @@ TEST_P(tm_single_device_test_t, cause_lba_split) auto t = create_transaction(); auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * SIZE), + get_laddr_hint(i * SIZE), SIZE, (char)(i & 0xFF)); - ASSERT_EQ(laddr_t::from_byte_offset(i * SIZE), extent->get_laddr()); + ASSERT_EQ(get_laddr_hint(i * SIZE), extent->get_laddr()); submit_transaction(std::move(t)); } check(); @@ -2082,9 +2088,9 @@ TEST_P(tm_single_device_test_t, random_writes) auto t = create_transaction(); auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * BSIZE), + get_laddr_hint(i * BSIZE), BSIZE); - ASSERT_EQ(laddr_t::from_byte_offset(i * BSIZE), extent->get_laddr()); + ASSERT_EQ(get_laddr_hint(i * BSIZE), extent->get_laddr()); submit_transaction(std::move(t)); } @@ -2100,7 +2106,7 @@ TEST_P(tm_single_device_test_t, random_writes) // pad out transaction auto paddings = alloc_extents( t, - laddr_t::from_byte_offset(TOTAL + (k * PADDING_SIZE)), + get_laddr_hint(TOTAL + (k * PADDING_SIZE)), PADDING_SIZE); for (auto &padding : paddings) { dec_ref(t, padding->get_laddr()); @@ -2133,7 +2139,7 @@ TEST_P(tm_single_device_test_t, find_hole_assert_trigger) TEST_P(tm_single_device_intergrity_check_test_t, remap_lazy_read) { - constexpr laddr_t offset = L_ADDR_MIN; + laddr_t offset = get_laddr_hint(0); constexpr size_t length = 256 << 10; run_async([this, offset] { 
disable_max_extent_size(); @@ -2186,7 +2192,7 @@ TEST_P(tm_single_device_test_t, invalid_lba_mapping_detect) for (int i = 0; i < LEAF_NODE_CAPACITY; i++) { auto extent = alloc_extent( t, - laddr_t::from_byte_offset(i * 4096), + get_laddr_hint(i * 4096), 4096, 'a'); } @@ -2195,12 +2201,12 @@ TEST_P(tm_single_device_test_t, invalid_lba_mapping_detect) { auto t = create_transaction(); - auto pin = get_pin(t, laddr_t::from_byte_offset((LEAF_NODE_CAPACITY - 1) * 4096)); + auto pin = get_pin(t, get_laddr_hint((LEAF_NODE_CAPACITY - 1) * 4096)); assert(pin->is_parent_viewable()); - auto extent = alloc_extent(t, laddr_t::from_byte_offset(LEAF_NODE_CAPACITY * 4096), 4096, 'a'); + auto extent = alloc_extent(t, get_laddr_hint(LEAF_NODE_CAPACITY * 4096), 4096, 'a'); assert(!pin->is_parent_viewable()); - pin = get_pin(t, laddr_t::from_byte_offset(LEAF_NODE_CAPACITY * 4096)); - std::ignore = alloc_extent(t, laddr_t::from_byte_offset((LEAF_NODE_CAPACITY + 1) * 4096), 4096, 'a'); + pin = get_pin(t, get_laddr_hint(LEAF_NODE_CAPACITY * 4096)); + std::ignore = alloc_extent(t, get_laddr_hint((LEAF_NODE_CAPACITY + 1) * 4096), 4096, 'a'); assert(pin->is_parent_viewable()); assert(pin->parent_modified()); pin->maybe_fix_pos(); diff --git a/src/test/crypto.cc b/src/test/crypto.cc index 819d41c7218..67fb440eeb9 100644 --- a/src/test/crypto.cc +++ b/src/test/crypto.cc @@ -1,6 +1,8 @@ #include <errno.h> #include <time.h> +#include <iostream> // for std::cout + #include <boost/container/small_vector.hpp> #include "gtest/gtest.h" diff --git a/src/test/daemon_config.cc b/src/test/daemon_config.cc index cdea3b05932..4c7abd70b20 100644 --- a/src/test/daemon_config.cc +++ b/src/test/daemon_config.cc @@ -21,6 +21,8 @@ #include "include/rados/librados.h" #include <errno.h> + +#include <iostream> // for std::cout #include <sstream> #include <string> #include <string.h> diff --git a/src/test/encoding.cc b/src/test/encoding.cc index 3c83716b048..3d508909d6d 100644 --- a/src/test/encoding.cc +++ 
b/src/test/encoding.cc @@ -4,6 +4,8 @@ #include <fmt/format.h> #include "gtest/gtest.h" +#include <iostream> // for std::cout + using namespace std; template < typename T > diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc index 2dc870411bb..062980d8655 100644 --- a/src/test/osd/types.cc +++ b/src/test/osd/types.cc @@ -23,6 +23,8 @@ #include "common/Thread.h" #include "include/stringify.h" #include "osd/ReplicatedBackend.h" + +#include <iostream> // for std::cout #include <sstream> using namespace std; diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc index 0bfdd48eb98..b32c5660112 100644 --- a/src/test/osdc/object_cacher_stress.cc +++ b/src/test/osdc/object_cacher_stress.cc @@ -23,6 +23,7 @@ #include "MemWriteback.h" #include <atomic> +#include <iostream> // for std::cout using namespace std; diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc index 7ab9561bc19..b75e6a50825 100644 --- a/src/test/perf_counters.cc +++ b/src/test/perf_counters.cc @@ -15,7 +15,7 @@ #include "include/types.h" // FIXME: ordering shouldn't be important, but right // now, this include has to come before the others. 
- +#include "include/utime.h" #include "common/perf_counters_key.h" #include "common/perf_counters_collection.h" #include "common/admin_socket_client.h" diff --git a/src/test/test_addrs.cc b/src/test/test_addrs.cc index 4062d0431c3..e70d234d743 100644 --- a/src/test/test_addrs.cc +++ b/src/test/test_addrs.cc @@ -17,6 +17,7 @@ #include "msg/msg_types.h" #include "gtest/gtest.h" +#include <iostream> // for std::cout #include <sstream> using namespace std; diff --git a/src/test/test_denc.cc b/src/test/test_denc.cc index 02dd1454ef8..c9144bd05bf 100644 --- a/src/test/test_denc.cc +++ b/src/test/test_denc.cc @@ -15,6 +15,8 @@ */ #include <stdio.h> + +#include <iostream> // for std::cout #include <numeric> #include "global/global_init.h" diff --git a/src/test/test_features.cc b/src/test/test_features.cc index 1ae758bfb34..bdd8838224b 100644 --- a/src/test/test_features.cc +++ b/src/test/test_features.cc @@ -2,6 +2,8 @@ // vim: ts=8 sw=2 smarttab #include <stdio.h> +#include <iostream> // for std::cout + #include "global/global_init.h" #include "common/ceph_argparse.h" #include "common/ceph_releases.h" diff --git a/src/test/test_mempool.cc b/src/test/test_mempool.cc index bb46b19aa4e..b806282d039 100644 --- a/src/test/test_mempool.cc +++ b/src/test/test_mempool.cc @@ -16,6 +16,8 @@ #include <stdio.h> +#include <iostream> // for std::cout + #include "global/global_init.h" #include "common/ceph_argparse.h" #include "global/global_context.h" diff --git a/src/test/test_perf_counters_cache.cc b/src/test/test_perf_counters_cache.cc index 1fa147ee273..fa2d541b7f7 100644 --- a/src/test/test_perf_counters_cache.cc +++ b/src/test/test_perf_counters_cache.cc @@ -4,6 +4,7 @@ #include "global/global_context.h" #include "global/global_init.h" #include "include/msgr.h" // for CEPH_ENTITY_TYPE_CLIENT +#include "include/utime.h" #include "gtest/gtest.h" using namespace ceph::perf_counters; diff --git a/src/test/test_rewrite_latency.cc b/src/test/test_rewrite_latency.cc index 
348c8dde5c6..48a95cf183b 100644 --- a/src/test/test_rewrite_latency.cc +++ b/src/test/test_rewrite_latency.cc @@ -1,5 +1,6 @@ #include <unistd.h> +#include <iostream> // for std::cout #include <map> #include <errno.h> diff --git a/src/test/test_snap_mapper.cc b/src/test/test_snap_mapper.cc index a47d2538c3a..7a9ac62defe 100644 --- a/src/test/test_snap_mapper.cc +++ b/src/test/test_snap_mapper.cc @@ -1,4 +1,5 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#include <iostream> // for std::cout #include <iterator> #include <map> #include <set> diff --git a/src/test/test_striper.cc b/src/test/test_striper.cc index ee70304ebc8..1e5f93a49be 100644 --- a/src/test/test_striper.cc +++ b/src/test/test_striper.cc @@ -3,6 +3,8 @@ #include "osdc/Striper.h" +#include <iostream> // for std::cout + using namespace std; TEST(Striper, Stripe1) diff --git a/src/test/test_utime.cc b/src/test/test_utime.cc index b1cee0e805c..de1d6f46878 100644 --- a/src/test/test_utime.cc +++ b/src/test/test_utime.cc @@ -1,4 +1,7 @@ #include "include/utime.h" + +#include <iostream> // for std::cout + #include "gtest/gtest.h" #include "include/stringify.h" #include "common/ceph_context.h" diff --git a/src/test/test_workqueue.cc b/src/test/test_workqueue.cc index 771b9d65952..5c2fc459da2 100644 --- a/src/test/test_workqueue.cc +++ b/src/test/test_workqueue.cc @@ -1,6 +1,9 @@ #include "gtest/gtest.h" #include "common/WorkQueue.h" + +#include <iostream> // for std::cout + #include "common/ceph_argparse.h" using namespace std; diff --git a/src/test/testcrypto.cc b/src/test/testcrypto.cc index 2efb9b219b9..8e3337babea 100644 --- a/src/test/testcrypto.cc +++ b/src/test/testcrypto.cc @@ -1,6 +1,8 @@ #include "auth/Crypto.h" -#include "common/Clock.h" +#include <iostream> // for std::cout + +#include "common/Clock.h" #include "common/config.h" #include "common/debug.h" diff --git a/src/test/testkeys.cc b/src/test/testkeys.cc index 85d0b56676f..dacddb08786 100644 --- 
a/src/test/testkeys.cc +++ b/src/test/testkeys.cc @@ -1,4 +1,7 @@ #include "auth/cephx/CephxKeyServer.h" + +#include <iostream> // for std::cout + #include "common/ceph_argparse.h" #include "global/global_init.h" #include "common/config.h" diff --git a/src/tools/radosacl.cc b/src/tools/radosacl.cc index 3bfef8fb157..a6c9b9f8dc4 100644 --- a/src/tools/radosacl.cc +++ b/src/tools/radosacl.cc @@ -16,6 +16,8 @@ #include <time.h> #include <errno.h> +#include <iostream> // for std::cerr + #include "include/types.h" #include "include/rados/librados.hpp" |