Diffstat
185 files changed, 5425 insertions, 1419 deletions
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e71addf7bfa..1ab98a0ac4f 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -595,6 +595,8 @@ def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional if digest and not digest.endswith('@'): logger.info(f"Using ceph image with id '{image_id}' and tag '{tag}' created on {created_date}\n{digest}") return digest + if container_info is not None: + logger.warning(f"Not using image '{container_info.image_id}' as it's not in list of non-dangling images with ceph=True label") return None @@ -2408,6 +2410,12 @@ def enable_cephadm_mgr_module( logger.info('Enabling cephadm module...') cli(['mgr', 'module', 'enable', 'cephadm']) wait_for_mgr_restart() + # https://tracker.ceph.com/issues/67969 + # luckily `ceph mgr module enable <module>` returns + # a zero rc when the module is already enabled so + # this is no issue even if it is unnecessary + logger.info('Verifying orchestrator module is enabled...') + cli(['mgr', 'module', 'enable', 'orchestrator']) logger.info('Setting orchestrator backend to cephadm...') cli(['orch', 'set', 'backend', 'cephadm']) @@ -3568,7 +3576,7 @@ def list_daemons( elif daemon_type == 'grafana': out, err, code = call(ctx, [container_path, 'exec', container_id, - 'grafana-server', '-v'], + 'grafana', 'server', '-v'], verbosity=CallVerbosity.QUIET) if not code and \ out.startswith('Version '): @@ -4074,7 +4082,7 @@ def command_adopt_grafana(ctx, daemon_id, fsid): ports = Monitoring.port_map['grafana'] endpoints = [EndPoint('0.0.0.0', p) for p in ports] - _stop_and_disable(ctx, 'grafana-server') + _stop_and_disable(ctx, 'grafana server') ident = DaemonIdentity(fsid, daemon_type, daemon_id) data_dir_dst = make_data_dir( diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index 37df1d48f43..f26cd361b58 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -19,7 +19,8 @@ DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29' DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64' -DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:1.26.1' +DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest' +DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126' DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0' DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this # ------------------------------------------------------------------------------ diff --git a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py index b0a6f0579d2..01c68a83ba8 100644 --- a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py +++ b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py @@ -5,13 +5,13 @@ import re from ..call_wrappers import call, CallVerbosity from ..container_daemon_form import ContainerDaemonForm, daemon_to_container -from ..container_types import CephContainer +from ..container_types import CephContainer, extract_uid_gid from ..context import CephadmContext from ..context_getters import fetch_configs from ..daemon_form import register as register_daemon_form from ..daemon_identity import DaemonIdentity from ..deployment_utils import to_deployment_container -from ..constants import DEFAULT_NGINX_IMAGE, UID_NOBODY, GID_NOGROUP +from ..constants import 
DEFAULT_NGINX_IMAGE from ..data_utils import dict_get, is_fsid from ..file_utils import populate_files, makedirs, recursive_chown from ..exceptions import Error @@ -90,7 +90,7 @@ class MgmtGateway(ContainerDaemonForm): return to_deployment_container(ctx, ctr) def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]: - return UID_NOBODY, GID_NOGROUP + return extract_uid_gid(ctx, file_path='/etc/nginx/') def get_daemon_args(self) -> List[str]: return [] @@ -150,6 +150,24 @@ class MgmtGateway(ContainerDaemonForm): version = match.group(1) return version + def customize_container_args( + self, ctx: CephadmContext, args: List[str] + ) -> None: + uid, _ = self.uid_gid(ctx) + extra_args = [ + '--user', + str(uid), + ] + args.extend(extra_args) + + def customize_process_args( + self, ctx: CephadmContext, args: List[str] + ) -> None: + # The following noqa comment is intentional to suppress warnings about using double quotes + # instead of single quotes. We use double quotes here to ensure that single quotes are + # used in the final parsed output: nginx -g 'daemon off;' + args.extend(['nginx', '-g', "daemon off;"]) # noqa + def customize_container_mounts( self, ctx: CephadmContext, mounts: Dict[str, str] ) -> None: diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index 0aecd418b1b..ae9acbc9c45 100644 --- a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -81,6 +81,8 @@ class Config: smb_port: int ceph_config_entity: str vhostname: str + metrics_image: str + metrics_port: int # clustering related values rank: int rank_generation: int @@ -103,6 +105,8 @@ class Config: smb_port: int = 0, ceph_config_entity: str = 'client.admin', vhostname: str = '', + metrics_image: str = '', + metrics_port: int = 0, rank: int = -1, rank_generation: int = -1, cluster_meta_uri: str = '', @@ -122,6 +126,8 @@ class Config: self.smb_port = smb_port self.ceph_config_entity = ceph_config_entity self.vhostname = vhostname + self.metrics_image = metrics_image + self.metrics_port = metrics_port self.rank = rank self.rank_generation = rank_generation self.cluster_meta_uri = cluster_meta_uri @@ -155,15 +161,34 @@ def _container_dns_args(cfg: Config) -> List[str]: return cargs -class SambaContainerCommon: - def __init__( - self, - cfg: Config, - ) -> None: +class ContainerCommon: + def __init__(self, cfg: Config, image: str = '') -> None: self.cfg = cfg + self.image = image def name(self) -> str: - raise NotImplementedError('samba container name') + raise NotImplementedError('container name') + + def envs(self) -> Dict[str, str]: + return {} + + def envs_list(self) -> List[str]: + return [] + + def args(self) -> List[str]: + return [] + + def container_args(self) -> List[str]: + return [] + + def container_image(self) -> str: + return self.image + + +class SambaContainerCommon(ContainerCommon): + def __init__(self, cfg: Config, image: str = '') -> None: + self.cfg = cfg + self.image = image def envs(self) -> Dict[str, str]: environ = { @@ -196,9 +221,6 @@ class SambaContainerCommon: args.append(f'--debug-delay={self.cfg.debug_delay}') return args - def container_args(self) -> List[str]: - return [] - class SambaNetworkedInitContainer(SambaContainerCommon): """SambaContainerCommon subclass that enables additional networking @@ -233,6 +255,9 @@ class SMBDContainer(SambaContainerCommon): cargs = [] if self.cfg.smb_port: cargs.append(f'--publish={self.cfg.smb_port}:{self.cfg.smb_port}') + if self.cfg.metrics_port: + metrics_port = self.cfg.metrics_port + 
cargs.append(f'--publish={metrics_port}:{metrics_port}') cargs.extend(_container_dns_args(self.cfg)) return cargs @@ -284,6 +309,17 @@ class ConfigWatchContainer(SambaContainerCommon): return super().args() + ['update-config', '--watch'] +class SMBMetricsContainer(ContainerCommon): + def name(self) -> str: + return 'smbmetrics' + + def args(self) -> List[str]: + args = [] + if self.cfg.metrics_port > 0: + args.append(f'--port={self.cfg.metrics_port}') + return args + + class CTDBMigrateInitContainer(SambaContainerCommon): def name(self) -> str: return 'ctdbMigrate' @@ -358,13 +394,13 @@ class CTDBNodeMonitorContainer(SambaContainerCommon): class ContainerLayout: init_containers: List[SambaContainerCommon] primary: SambaContainerCommon - supplemental: List[SambaContainerCommon] + supplemental: List[ContainerCommon] def __init__( self, init_containers: List[SambaContainerCommon], primary: SambaContainerCommon, - supplemental: List[SambaContainerCommon], + supplemental: List[ContainerCommon], ) -> None: self.init_containers = init_containers self.primary = primary @@ -393,6 +429,7 @@ class SMB(ContainerDaemonForm): self._cached_layout: Optional[ContainerLayout] = None self._rank_info = context_getters.fetch_rank_info(ctx) self.smb_port = 445 + self.metrics_port = 9922 self._network_mapper = _NetworkMapper(ctx) logger.debug('Created SMB ContainerDaemonForm instance') @@ -431,6 +468,8 @@ class SMB(ContainerDaemonForm): files = data_utils.dict_get(configs, 'files', {}) ceph_config_entity = configs.get('config_auth_entity', '') vhostname = configs.get('virtual_hostname', '') + metrics_image = configs.get('metrics_image', '') + metrics_port = int(configs.get('metrics_port', '0')) cluster_meta_uri = configs.get('cluster_meta_uri', '') cluster_lock_uri = configs.get('cluster_lock_uri', '') cluster_public_addrs = configs.get('cluster_public_addrs', []) @@ -470,6 +509,8 @@ class SMB(ContainerDaemonForm): smb_port=self.smb_port, ceph_config_entity=ceph_config_entity, vhostname=vhostname, + metrics_image=metrics_image, + metrics_port=metrics_port, cluster_meta_uri=cluster_meta_uri, cluster_lock_uri=cluster_lock_uri, cluster_public_addrs=_public_addrs, @@ -517,7 +558,7 @@ class SMB(ContainerDaemonForm): if self._cached_layout: return self._cached_layout init_ctrs: List[SambaContainerCommon] = [] - ctrs: List[SambaContainerCommon] = [] + ctrs: List[ContainerCommon] = [] init_ctrs.append(ConfigInitContainer(self._cfg)) ctrs.append(ConfigWatchContainer(self._cfg)) @@ -526,6 +567,11 @@ class SMB(ContainerDaemonForm): init_ctrs.append(MustJoinContainer(self._cfg)) ctrs.append(WinbindContainer(self._cfg)) + metrics_image = self._cfg.metrics_image.strip() + metrics_port = self._cfg.metrics_port + if metrics_image and metrics_port > 0: + ctrs.append(SMBMetricsContainer(self._cfg, metrics_image)) + if self._cfg.clustered: init_ctrs += [ CTDBMigrateInitContainer(self._cfg), @@ -564,7 +610,7 @@ class SMB(ContainerDaemonForm): ) def _to_sidecar_container( - self, ctx: CephadmContext, smb_ctr: SambaContainerCommon + self, ctx: CephadmContext, smb_ctr: ContainerCommon ) -> SidecarContainer: volume_mounts: Dict[str, str] = {} container_args: List[str] = smb_ctr.container_args() @@ -587,10 +633,11 @@ class SMB(ContainerDaemonForm): identity = DaemonSubIdentity.from_parent( self.identity, smb_ctr.name() ) + img = smb_ctr.container_image() or ctx.image or self.default_image return SidecarContainer( ctx, entrypoint='', - image=ctx.image or self.default_image, + image=img, identity=identity, container_args=container_args, 
args=smb_ctr.args(), @@ -673,6 +720,9 @@ class SMB(ContainerDaemonForm): ) -> None: if not any(ep.port == self.smb_port for ep in endpoints): endpoints.append(EndPoint('0.0.0.0', self.smb_port)) + if self.metrics_port > 0: + if not any(ep.port == self.metrics_port for ep in endpoints): + endpoints.append(EndPoint('0.0.0.0', self.metrics_port)) def prepare_data_dir(self, data_dir: str, uid: int, gid: int) -> None: self.validate() diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc index ad480c47d5c..458bfd985c3 100644 --- a/src/cls/rbd/cls_rbd_client.cc +++ b/src/cls/rbd/cls_rbd_client.cc @@ -2757,28 +2757,65 @@ int group_snap_get_by_id(librados::IoCtx *ioctx, const std::string &oid, return 0; } + +void group_snap_list_start(librados::ObjectReadOperation *op, + const cls::rbd::GroupSnapshot &start, + uint64_t max_return) +{ + bufferlist bl; + encode(start, bl); + encode(max_return, bl); + + op->exec("rbd", "group_snap_list", bl); +} + +int group_snap_list_finish(bufferlist::const_iterator *iter, + std::vector<cls::rbd::GroupSnapshot> *snapshots) +{ + try { + decode(*snapshots, *iter); + } catch (const ceph::buffer::error &err) { + return -EBADMSG; + } + return 0; +} + int group_snap_list(librados::IoCtx *ioctx, const std::string &oid, const cls::rbd::GroupSnapshot &start, uint64_t max_return, std::vector<cls::rbd::GroupSnapshot> *snapshots) { - using ceph::encode; - using ceph::decode; - bufferlist inbl, outbl; - encode(start, inbl); - encode(max_return, inbl); + librados::ObjectReadOperation op; + group_snap_list_start(&op, start, max_return); - int r = ioctx->exec(oid, "rbd", "group_snap_list", inbl, outbl); + bufferlist out_bl; + int r = ioctx->operate(oid, &op, &out_bl); if (r < 0) { return r; } - auto iter = outbl.cbegin(); + + auto it = out_bl.cbegin(); + return group_snap_list_finish(&it, snapshots); +} + +void group_snap_list_order_start(librados::ObjectReadOperation *op, + const std::string &start, + uint64_t max_return) +{ + bufferlist bl; + encode(start, bl); + encode(max_return, bl); + op->exec("rbd", "group_snap_list_order", bl); +} + +int group_snap_list_order_finish(bufferlist::const_iterator *iter, + std::map<std::string, uint64_t> *snap_order) +{ try { - decode(*snapshots, iter); + decode(*snap_order, *iter); } catch (const ceph::buffer::error &err) { return -EBADMSG; } - return 0; } @@ -2786,24 +2823,17 @@ int group_snap_list_order(librados::IoCtx *ioctx, const std::string &oid, const std::string &start, uint64_t max_return, std::map<std::string, uint64_t> *snap_order) { - using ceph::encode; - using ceph::decode; - bufferlist inbl, outbl; - encode(start, inbl); - encode(max_return, inbl); + librados::ObjectReadOperation op; + group_snap_list_order_start(&op, start, max_return); - int r = ioctx->exec(oid, "rbd", "group_snap_list_order", inbl, outbl); + bufferlist out_bl; + int r = ioctx->operate(oid, &op, &out_bl); if (r < 0) { return r; } - auto iter = outbl.cbegin(); - try { - decode(*snap_order, iter); - } catch (const ceph::buffer::error &err) { - return -EBADMSG; - } - return 0; + auto it = out_bl.cbegin(); + return group_snap_list_order_finish(&it, snap_order); } // rbd_trash functions diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h index 4005c51836c..b1553bd1f17 100644 --- a/src/cls/rbd/cls_rbd_client.h +++ b/src/cls/rbd/cls_rbd_client.h @@ -580,10 +580,20 @@ int group_snap_remove(librados::IoCtx *ioctx, const std::string &oid, int group_snap_get_by_id(librados::IoCtx *ioctx, const std::string &oid, const std::string 
&snap_id, cls::rbd::GroupSnapshot *snapshot); +void group_snap_list_start(librados::ObjectReadOperation *op, + const cls::rbd::GroupSnapshot &start, + uint64_t max_return); +int group_snap_list_finish(ceph::buffer::list::const_iterator *iter, + std::vector<cls::rbd::GroupSnapshot> *snapshots); int group_snap_list(librados::IoCtx *ioctx, const std::string &oid, const cls::rbd::GroupSnapshot &start, uint64_t max_return, std::vector<cls::rbd::GroupSnapshot> *snapshots); +void group_snap_list_order_start(librados::ObjectReadOperation *op, + const std::string &start_snap_id, + uint64_t max_return); +int group_snap_list_order_finish(ceph::buffer::list::const_iterator *iter, + std::map<std::string, uint64_t> *snap_order); int group_snap_list_order(librados::IoCtx *ioctx, const std::string &oid, const std::string &snap_id, uint64_t max_return, std::map<std::string, uint64_t> *snap_order); diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index dcf3eaac0d6..18efba561ed 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1666,6 +1666,17 @@ options: - mds flags: - runtime +- name: mds_scrub_stats_review_period + type: uint + level: advanced + desc: Period for which scrub stats will be available for review. + long_desc: Number of days for which scrub stats will be available for review since + start of scrub operation. After this period, the stats will be auto purged. + These stats will not be saved to the disk. So any restart or failover of mds + will cause stats to be lost forever. + default: 1 + min: 1 + max: 60 - name: mds_session_metadata_threshold type: size level: advanced diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 268a89154de..8dae546521d 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -520,6 +520,70 @@ options: stats (inc. scrub/block duration) every this many seconds. default: 120 with_legacy: false +- name: osd_scrub_retry_delay + type: int + level: advanced + desc: Period (in seconds) before retrying a PG that has failed a prior scrub. + long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is + either applied to one of the scheduled scrubs for the PG (the next shallow + scrub or the next deep scrub), or to both. + This is a default value, used when the cause of the delay does not have an + associated configuration option. See the 'see also' for the configuration + options for some delay reasons that have their own configuration. + default: 30 + min: 1 + see_also: + - osd_scrub_retry_pg_state + - osd_scrub_retry_after_noscrub + - osd_scrub_retry_new_interval + - osd_scrub_retry_trimming + with_legacy: false +- name: osd_scrub_retry_after_noscrub + type: int + level: advanced + desc: Period (in seconds) before retrying to scrub a PG at a specific level + after detecting a no-scrub or no-deep-scrub flag + long_desc: Minimum delay after a failed attempt to scrub a PG at a level + (shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub + flags. + default: 60 + min: 1 + see_also: + - osd_scrub_retry_delay + with_legacy: false +- name: osd_scrub_retry_pg_state + type: int + level: advanced + desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG + long_desc: Minimum delay after a failed attempt to scrub a PG that is not + active and clean. 
+ default: 60 + min: 1 + see_also: + - osd_scrub_retry_delay + with_legacy: false +- name: osd_scrub_retry_trimming + type: int + level: advanced + desc: Period (in seconds) before retrying to scrub a previously snap-trimming PG + long_desc: Minimum delay after a failed attempt to scrub a PG that was performing + snap trimming and not available for scrubbing. + default: 10 + min: 1 + see_also: + - osd_scrub_retry_delay + with_legacy: false +- name: osd_scrub_retry_new_interval + type: int + level: advanced + desc: Period (in seconds) before retrying a scrub aborted on a new interval + long_desc: Minimum delay before retrying, after a scrub was aborted as the + PG interval changed. + default: 10 + min: 1 + see_also: + - osd_scrub_retry_delay + with_legacy: false - name: osd_scrub_disable_reservation_queuing type: bool level: advanced diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index 56dcc05a54b..f3d242b1fe3 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -3458,11 +3458,11 @@ options: - name: rgw_max_listing_results type: uint level: advanced - desc: Upper bound on results in listing operations, ListBucket max-keys + desc: Upper bound on results in listing operations, ListObjects max-keys long_desc: This caps the maximum permitted value for listing-like operations in - RGW S3. Affects ListBucket(max-keys), ListBucketVersions(max-keys), ListBucketMultipartUploads(max-uploads), - ListMultipartUploadParts(max-parts) - default: 1000 + RGW S3. Affects ListObjects(max-keys), ListObjectsVersions(max-keys), + ListMultipartUploads(max-uploads), ListParts(max-parts) + default: 5000 services: - rgw - rgw diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h index 559a889a3e2..f7faef87970 100644 --- a/src/crimson/common/gated.h +++ b/src/crimson/common/gated.h @@ -6,6 +6,8 @@ #include <seastar/core/gate.hh> #include <seastar/core/future.hh> #include <seastar/core/future-util.hh> +#include <type_traits> +#include <vector> #include "crimson/common/exception.h" #include "crimson/common/log.h" @@ -15,15 +17,27 @@ namespace crimson::common { class Gated { public: + Gated() : sid(seastar::this_shard_id()) {} + Gated(const seastar::shard_id sid) : sid(sid) {} + Gated(const Gated&) = delete; + Gated& operator=(const Gated&) = delete; + Gated(Gated&&) = default; + Gated& operator=(Gated&&) = delete; + virtual ~Gated() = default; + static seastar::logger& gated_logger() { return crimson::get_logger(ceph_subsys_osd); } + template <typename Func, typename T> inline void dispatch_in_background(const char* what, T& who, Func&& func) { - (void) dispatch(what, who, func); + //ceph_assert(seastar::this_shard_id() == sid); + (void) dispatch(what, who, std::forward<Func>(func)); } + template <typename Func, typename T> inline seastar::future<> dispatch(const char* what, T& who, Func&& func) { + //ceph_assert(seastar::this_shard_id() == sid); return seastar::with_gate(pending_dispatch, std::forward<Func>(func) ).handle_exception([what, &who] (std::exception_ptr eptr) { if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) { @@ -42,14 +56,81 @@ class Gated { }); } + template <typename Func> + auto simple_dispatch(const char* what, Func&& func) { + //ceph_assert(seastar::this_shard_id() == sid); + return seastar::with_gate(pending_dispatch, std::forward<Func>(func)); + } + seastar::future<> close() { + ceph_assert(seastar::this_shard_id() == sid); return pending_dispatch.close(); } + bool is_closed() const { return 
pending_dispatch.is_closed(); } + + seastar::shard_id get_shard_id() const { + return sid; + } private: seastar::gate pending_dispatch; + const seastar::shard_id sid; +}; + +// gate_per_shard is a class that provides a gate for each shard. +// It was introduced to provide a way to have gate for each shard +// in a seastar application since gates are not supposed to be shared +// across shards. ( https://tracker.ceph.com/issues/64332 ) +class gate_per_shard { + public: + gate_per_shard() : gates(seastar::smp::count) { + std::vector<seastar::future<>> futures; + for (unsigned shard = 0; shard < seastar::smp::count; ++shard) { + futures.push_back(seastar::smp::submit_to(shard, [this, shard] { + gates[shard] = std::make_unique<Gated>(); + })); + } + seastar::when_all_succeed(futures.begin(), futures.end()).get(); + } + //explicit gate_per_shard(size_t shard_count) : gates(shard_count) {} + gate_per_shard(const gate_per_shard&) = delete; + gate_per_shard& operator=(const gate_per_shard&) = delete; + gate_per_shard(gate_per_shard&&) = default; + gate_per_shard& operator=(gate_per_shard&&) = default; + ~gate_per_shard() = default; + + template <typename Func, typename T> + inline void dispatch_in_background(const char* what, T& who, Func&& func) { + (void) dispatch(what, who, std::forward<Func>(func)); + } + + template <typename Func, typename T> + inline auto dispatch(const char* what, T& who, Func&& func) { + return gates[seastar::this_shard_id()]->dispatch(what, who, std::forward<Func>(func)); + } + + template <typename Func> + auto simple_dispatch(const char* what, Func&& func) { + return gates[seastar::this_shard_id()]->simple_dispatch(what, std::forward<Func>(func)); + } + + bool is_closed() const { + return gates[seastar::this_shard_id()]->is_closed(); + } + + seastar::future<> close_all() { + ceph_assert(gates.size() == seastar::smp::count); + return seastar::parallel_for_each(gates.begin(), gates.end(), [] (std::unique_ptr<Gated>& gate_ptr) { + return seastar::smp::submit_to(gate_ptr->get_shard_id(), [gate = gate_ptr.get()] { + return gate->close(); + }); + }); + } + + private: + std::vector<std::unique_ptr<Gated>> gates; }; -}// namespace crimson::common +} // namespace crimson::common diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h index 80dd953c65f..5986fcb16ac 100644 --- a/src/crimson/net/io_handler.h +++ b/src/crimson/net/io_handler.h @@ -255,7 +255,7 @@ public: class shard_states_t { public: shard_states_t(seastar::shard_id _sid, io_state_t state) - : sid{_sid}, io_state{state} {} + : sid{_sid}, io_state{state}, gate{_sid} {} seastar::shard_id get_shard_id() const { return sid; diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index 21bb250e13f..3fd2bb1fd15 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -75,7 +75,8 @@ AlienStore::AlienStore(const std::string& type, const ConfigValues& values) : type(type), path{path}, - values(values) + values(values), + op_gates() { } @@ -142,12 +143,12 @@ AlienStore::exists( CollectionRef ch, const ghobject_t& oid) { - return seastar::with_gate(op_gate, [=, this] { - return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] { - auto c = static_cast<AlienCollection*>(ch.get()); - return store->exists(c->collection, oid); + return op_gates.simple_dispatch("exists", [=, this] { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] { + auto c = static_cast<AlienCollection*>(ch.get()); + return 
store->exists(c->collection, oid); + }); }); - }); } AlienStore::mount_ertr::future<> AlienStore::mount() @@ -173,7 +174,7 @@ seastar::future<> AlienStore::umount() // not really started yet return seastar::now(); } - return op_gate.close().then([this] { + return op_gates.close_all().then([this] { return tp->submit([this] { { std::lock_guard l(coll_map_lock); @@ -183,10 +184,10 @@ seastar::future<> AlienStore::umount() coll_map.clear(); } return store->umount(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); }); - }).then([] (int r) { - assert(r == 0); - return seastar::now(); }); } @@ -477,7 +478,7 @@ seastar::future<> AlienStore::inject_data_error(const ghobject_t& o) { logger().debug("{}", __func__); assert(tp); - return seastar::with_gate(op_gate, [=, this] { + return op_gates.simple_dispatch("inject_data_error", [=, this] { return tp->submit([o, this] { return store->inject_data_error(o); }); @@ -488,8 +489,8 @@ seastar::future<> AlienStore::inject_mdata_error(const ghobject_t& o) { logger().debug("{}", __func__); assert(tp); - return seastar::with_gate(op_gate, [=, this] { - return tp->submit([=, this] { + return op_gates.simple_dispatch("inject_mdata_error", [=, this] { + return tp->submit([o, this] { return store->inject_mdata_error(o); }); }); @@ -500,7 +501,7 @@ seastar::future<> AlienStore::write_meta(const std::string& key, { logger().debug("{}", __func__); assert(tp); - return seastar::with_gate(op_gate, [=, this] { + return op_gates.simple_dispatch("write_meta", [=, this] { return tp->submit([=, this] { return store->write_meta(key, value); }).then([] (int r) { @@ -515,8 +516,8 @@ AlienStore::read_meta(const std::string& key) { logger().debug("{}", __func__); assert(tp); - return seastar::with_gate(op_gate, [this, key] { - return tp->submit([this, key] { + return op_gates.simple_dispatch("read_meta", [this, key] { + return tp->submit([key, this] { std::string value; int r = store->read_meta(key, &value); if (r > 0) { diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index 734ee160959..d36f449afd8 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -10,6 +10,7 @@ #include "os/ObjectStore.h" #include "osd/osd_types.h" +#include "crimson/common/gated.h" #include "crimson/os/alienstore/thread_pool.h" #include "crimson/os/futurized_collection.h" #include "crimson/os/futurized_store.h" @@ -111,9 +112,10 @@ public: } private: + template <class... Args> auto do_with_op_gate(Args&&... 
args) const { - return seastar::with_gate(op_gate, + return op_gates.simple_dispatch("AlienStore::do_with_op_gate", // perfect forwarding in lambda's closure isn't available in C++17 // using tuple as workaround; see: https://stackoverflow.com/a/49902823 [args = std::make_tuple(std::forward<Args>(args)...)] () mutable { @@ -130,7 +132,7 @@ private: uint64_t used_bytes = 0; std::unique_ptr<ObjectStore> store; std::unique_ptr<CephContext> cct; - mutable seastar::gate op_gate; + mutable crimson::common::gate_per_shard op_gates; /** * coll_map diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 30ff4554074..f89698d602a 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -38,12 +38,14 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node< trans_intr::make_interruptible( c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; } else { + c.cache.account_absent_access(c.trans.get_src()); return {false, trans_intr::make_interruptible( Cache::get_extent_ertr::make_ready_future< CachedExtentRef>())}; } } else { + c.cache.account_absent_access(c.trans.get_src()); return {false, trans_intr::make_interruptible( Cache::get_extent_ertr::make_ready_future< diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index eacf8a8cc40..09f54a4f2d0 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -370,9 +370,11 @@ struct FixedKVNode : ChildableCachedExtent { if (is_valid_child_ptr(child)) { return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child); } else { + c.cache.account_absent_access(c.trans.get_src()); return child_pos_t(&sparent, spos); } } else { + c.cache.account_absent_access(c.trans.get_src()); return child_pos_t(this, pos); } } diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 56f7e3c76a9..cf8d3c0891d 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -52,10 +52,10 @@ Cache::Cache( Cache::~Cache() { LOG_PREFIX(Cache::~Cache); - for (auto &i: extents) { + for (auto &i: extents_index) { ERROR("extent is still alive -- {}", i); } - ceph_assert(extents.empty()); + ceph_assert(extents_index.empty()); } // TODO: this method can probably be removed in the future @@ -83,7 +83,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr( // absent from transaction // retiring is not included by the cache hit metrics - ext = query_cache(addr, nullptr); + ext = query_cache(addr); if (ext) { DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext); } else { @@ -112,7 +112,7 @@ void Cache::retire_absent_extent_addr( auto result = t.get_extent(addr, &ext); assert(result != Transaction::get_extent_ret::PRESENT && result != Transaction::get_extent_ret::RETIRED); - assert(!query_cache(addr, nullptr)); + assert(!query_cache(addr)); #endif LOG_PREFIX(Cache::retire_absent_extent_addr); // add a new placeholder to Cache @@ -134,7 +134,7 @@ void Cache::dump_contents() { LOG_PREFIX(Cache::dump_contents); DEBUG("enter"); - for (auto &&i: extents) { + for (auto &&i: extents_index) { DEBUG("live {}", i); } DEBUG("exit"); @@ -150,6 +150,9 @@ void Cache::register_metrics() last_dirty_io_by_src_ext = {}; last_trim_rewrites = {}; last_reclaim_rewrites = {}; + last_access = {}; + last_cache_absent_by_src = {}; + last_access_by_src_ext = {}; namespace sm 
= seastar::metrics; using src_t = Transaction::src_t; @@ -204,25 +207,25 @@ void Cache::register_metrics() /* * cache_query: cache_access and cache_hit */ - for (auto& [src, src_label] : labels_by_src) { - metrics.add_group( - "cache", - { - sm::make_counter( - "cache_access", - get_by_src(stats.cache_query_by_src, src).access, - sm::description("total number of cache accesses"), - {src_label} - ), - sm::make_counter( - "cache_hit", - get_by_src(stats.cache_query_by_src, src).hit, - sm::description("total number of cache hits"), - {src_label} - ), - } - ); - } + metrics.add_group( + "cache", + { + sm::make_counter( + "cache_access", + [this] { + return stats.access.get_cache_access(); + }, + sm::description("total number of cache accesses") + ), + sm::make_counter( + "cache_hit", + [this] { + return stats.access.s.get_cache_hit(); + }, + sm::description("total number of cache hits") + ), + } + ); { /* @@ -487,14 +490,14 @@ void Cache::register_metrics() sm::make_counter( "cached_extents", [this] { - return extents.size(); + return extents_index.size(); }, sm::description("total number of cached extents") ), sm::make_counter( "cached_extent_bytes", [this] { - return extents.get_bytes(); + return extents_index.get_bytes(); }, sm::description("total bytes of cached extents") ), @@ -738,7 +741,7 @@ void Cache::add_extent(CachedExtentRef ref) assert(ref->is_valid()); assert(ref->user_hint == PLACEMENT_HINT_NULL); assert(ref->rewrite_generation == NULL_GENERATION); - extents.insert(*ref); + extents_index.insert(*ref); } void Cache::mark_dirty(CachedExtentRef ref) @@ -876,7 +879,7 @@ void Cache::remove_extent( } else if (!ref->is_placeholder()) { lru.remove_from_lru(*ref); } - extents.erase(*ref); + extents_index.erase(*ref); } void Cache::commit_retire_extent( @@ -897,7 +900,7 @@ void Cache::commit_replace_extent( { assert(next->get_paddr() == prev->get_paddr()); assert(next->version == prev->version + 1); - extents.replace(*next, *prev); + extents_index.replace(*next, *prev); const auto t_src = t.get_src(); if (is_root_type(prev->get_type())) { @@ -1822,7 +1825,7 @@ void Cache::init() NULL_GENERATION, TRANS_ID_NULL); INFO("init root -- {}", *root); - extents.insert(*root); + extents_index.insert(*root); } Cache::mkfs_iertr::future<> Cache::mkfs(Transaction &t) @@ -1851,8 +1854,8 @@ Cache::close_ertr::future<> Cache::close() get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), lru.get_current_num_extents(), lru.get_current_size_bytes(), - extents.size(), - extents.get_bytes()); + extents_index.size(), + extents_index.get_bytes()); root.reset(); clear_dirty(); backref_extents.clear(); @@ -1970,7 +1973,7 @@ Cache::replay_delta( auto _get_extent_if_cached = [this](paddr_t addr) -> get_extent_ertr::future<CachedExtentRef> { // replay is not included by the cache hit metrics - auto ret = query_cache(addr, nullptr); + auto ret = query_cache(addr); if (ret) { // no retired-placeholder should be exist yet because no transaction // has been created. @@ -1983,15 +1986,14 @@ Cache::replay_delta( } }; auto extent_fut = (delta.pversion == 0 ? 
- // replay is not included by the cache hit metrics do_get_caching_extent_by_type( delta.type, delta.paddr, delta.laddr, delta.length, - nullptr, [](CachedExtent &) {}, [this](CachedExtent &ext) { + // replay is not included by the cache hit metrics touch_extent(ext, nullptr); }) : _get_extent_if_cached( @@ -2155,73 +2157,65 @@ Cache::do_get_caching_extent_by_type( paddr_t offset, laddr_t laddr, extent_len_t length, - const Transaction::src_t* p_src, extent_init_func_t &&extent_init_func, extent_init_func_t &&on_cache) { return [=, this, extent_init_func=std::move(extent_init_func)]() mutable { - src_ext_t* p_metric_key = nullptr; - src_ext_t metric_key; - if (p_src) { - metric_key = std::make_pair(*p_src, type); - p_metric_key = &metric_key; - } - switch (type) { case extent_types_t::ROOT: ceph_assert(0 == "ROOT is never directly read"); return get_extent_ertr::make_ready_future<CachedExtentRef>(); case extent_types_t::BACKREF_INTERNAL: return do_get_caching_extent<backref::BackrefInternalNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::BACKREF_LEAF: return do_get_caching_extent<backref::BackrefLeafNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::LADDR_INTERNAL: return do_get_caching_extent<lba_manager::btree::LBAInternalNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::LADDR_LEAF: return do_get_caching_extent<lba_manager::btree::LBALeafNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::OMAP_INNER: return do_get_caching_extent<omap_manager::OMapInnerNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::OMAP_LEAF: return do_get_caching_extent<omap_manager::OMapLeafNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::COLL_BLOCK: return do_get_caching_extent<collection_manager::CollectionNode>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::ONODE_BLOCK_STAGED: return do_get_caching_extent<onode::SeastoreNodeExtent>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) 
).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::OBJECT_DATA_BLOCK: return do_get_caching_extent<ObjectDataBlock>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); @@ -2230,13 +2224,13 @@ Cache::do_get_caching_extent_by_type( return get_extent_ertr::make_ready_future<CachedExtentRef>(); case extent_types_t::TEST_BLOCK: return do_get_caching_extent<TestBlock>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::TEST_BLOCK_PHYSICAL: return do_get_caching_extent<TestBlockPhysical>( - offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache) ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); @@ -2266,12 +2260,17 @@ cache_stats_t Cache::get_stats( lru.get_stats(ret, report_detail, seconds); /* - * get dirty stats + * dirty stats + * rewrite stats + * index stats + * access stats */ ret.dirty_sizes = cache_size_stats_t{stats.dirty_bytes, dirty.size()}; ret.dirty_io = stats.dirty_io; ret.dirty_io.minus(last_dirty_io); + ret.access = stats.access; + ret.access.minus(last_access); if (report_detail && seconds != 0) { counter_by_src_t<counter_by_extent_t<dirty_io_stats_t> > @@ -2326,7 +2325,7 @@ cache_stats_t Cache::get_stats( const auto& io_by_ext = get_by_src(_trans_io_by_src_ext, src); for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) { auto ext = static_cast<extent_types_t>(_ext); - const auto extent_io = get_by_ext(io_by_ext, ext); + const auto& extent_io = get_by_ext(io_by_ext, ext); if (is_data_type(ext)) { data_io.add(extent_io); } else if (is_logical_metadata_type(ext)) { @@ -2363,14 +2362,71 @@ cache_stats_t Cache::get_stats( << "ps, dversion=" << fmt::format(dfmt, _reclaim_rewrites.get_avg_version()); + oss << "\ncache total" + << cache_size_stats_t{extents_index.get_bytes(), extents_index.size()}; + + counter_by_src_t<counter_by_extent_t<extent_access_stats_t> > + _access_by_src_ext = stats.access_by_src_ext; + counter_by_src_t<cache_access_stats_t> access_by_src; + for (uint8_t _src=0; _src<TRANSACTION_TYPE_MAX; ++_src) { + auto src = static_cast<transaction_type_t>(_src); + cache_access_stats_t& trans_access = get_by_src(access_by_src, src); + trans_access.cache_absent = get_by_src(stats.cache_absent_by_src, src); + trans_access.cache_absent -= get_by_src(last_cache_absent_by_src, src); + auto& access_by_ext = get_by_src(_access_by_src_ext, src); + const auto& last_access_by_ext = get_by_src(last_access_by_src_ext, src); + for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) { + auto ext = static_cast<extent_types_t>(_ext); + extent_access_stats_t& extent_access = get_by_ext(access_by_ext, ext); + const auto& last_extent_access = get_by_ext(last_access_by_ext, ext); + extent_access.minus(last_extent_access); + trans_access.s.add(extent_access); + } + } + oss << "\naccess: total" + << cache_access_stats_printer_t{seconds, ret.access}; + for (uint8_t _src=0; _src<TRANSACTION_TYPE_MAX; ++_src) { + auto src = static_cast<transaction_type_t>(_src); + const auto& trans_access = 
get_by_src(access_by_src, src); + if (trans_access.is_empty()) { + continue; + } + extent_access_stats_t data_access; + extent_access_stats_t mdat_access; + extent_access_stats_t phys_access; + const auto& access_by_ext = get_by_src(_access_by_src_ext, src); + for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) { + auto ext = static_cast<extent_types_t>(_ext); + const auto& extent_access = get_by_ext(access_by_ext, ext); + if (is_data_type(ext)) { + data_access.add(extent_access); + } else if (is_logical_metadata_type(ext)) { + mdat_access.add(extent_access); + } else if (is_physical_type(ext)) { + phys_access.add(extent_access); + } + } + oss << "\n " << src << ": " + << cache_access_stats_printer_t{seconds, trans_access} + << "\n data" + << extent_access_stats_printer_t{seconds, data_access} + << "\n mdat" + << extent_access_stats_printer_t{seconds, mdat_access} + << "\n phys" + << extent_access_stats_printer_t{seconds, phys_access}; + } + INFO("{}", oss.str()); last_dirty_io_by_src_ext = stats.dirty_io_by_src_ext; last_trim_rewrites = stats.trim_rewrites; last_reclaim_rewrites = stats.reclaim_rewrites; + last_cache_absent_by_src = stats.cache_absent_by_src; + last_access_by_src_ext = stats.access_by_src_ext; } last_dirty_io = stats.dirty_io; + last_access = stats.access; return ret; } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 7bfc4d07aab..dba3610e95f 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -167,7 +167,7 @@ using backref_entry_query_set_t = std::set< * - Remove all extents in the retired_set from Cache::extents * - Mark all extents in the write_set wait_io(), add promises to * transaction - * - Merge Transaction::write_set into Cache::extents + * - Merge Transaction::write_set into Cache::extents_index * * After phase 2, the user will submit the record to the journal. 
* Once complete, we perform phase 3: @@ -272,6 +272,11 @@ public: return t.root; } + void account_absent_access(Transaction::src_t src) { + ++(get_by_src(stats.cache_absent_by_src, src)); + ++stats.access.cache_absent; + } + /** * get_extent_if_cached * @@ -287,12 +292,29 @@ public: CachedExtentRef ret; LOG_PREFIX(Cache::get_extent_if_cached); auto result = t.get_extent(offset, &ret); + const auto t_src = t.get_src(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + type); if (result == Transaction::get_extent_ret::RETIRED) { SUBDEBUGT(seastore_cache, "{} {} is retired on t -- {}", t, type, offset, *ret); return get_extent_if_cached_iertr::make_ready_future< CachedExtentRef>(ret); } else if (result == Transaction::get_extent_ret::PRESENT) { + if (ret->is_stable()) { + if (ret->is_dirty()) { + ++access_stats.trans_dirty; + ++stats.access.s.trans_dirty; + } else { + ++access_stats.trans_lru; + ++stats.access.s.trans_lru; + } + } else { + ++access_stats.trans_pending; + ++stats.access.s.trans_pending; + } + if (ret->is_fully_loaded()) { SUBTRACET(seastore_cache, "{} {} is present on t -- {}", t, type, offset, *ret); @@ -309,17 +331,29 @@ public: } // get_extent_ret::ABSENT from transaction - auto metric_key = std::make_pair(t.get_src(), type); - ret = query_cache(offset, &metric_key); + ret = query_cache(offset); if (!ret) { SUBDEBUGT(seastore_cache, "{} {} is absent", t, type, offset); + account_absent_access(t_src); return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); } else if (is_retired_placeholder_type(ret->get_type())) { // retired_placeholder is not really cached yet SUBDEBUGT(seastore_cache, "{} {} is absent(placeholder)", t, type, offset); + account_absent_access(t_src); return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); - } else if (!ret->is_fully_loaded()) { + } + + if (ret->is_dirty()) { + ++access_stats.cache_dirty; + ++stats.access.s.cache_dirty; + } else { + ++access_stats.cache_lru; + ++stats.access.s.cache_lru; + } + + if (!ret->is_fully_loaded()) { + // ignore non-full extent SUBDEBUGT(seastore_cache, "{} {} is present without " "being fully loaded", t, type, offset); return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); @@ -329,7 +363,6 @@ public: SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}", t, type, offset, *ret); t.add_to_read_set(ret); - const auto t_src = t.get_src(); touch_extent(*ret, &t_src); return ret->wait_io().then([ret] { return get_extent_if_cached_iertr::make_ready_future< @@ -349,6 +382,8 @@ public: * * Note, the current implementation leverages parent-child * pointers in LBA instead, so it should only be called in tests. + * + * This path won't be accounted by the cache_access_stats_t. 
*/ using get_extent_iertr = base_iertr; template <typename T> @@ -389,11 +424,9 @@ public: const auto t_src = t.get_src(); touch_extent(ext, &t_src); }; - auto metric_key = std::make_pair(t.get_src(), T::TYPE); return trans_intr::make_interruptible( do_get_caching_extent<T>( - offset, length, &metric_key, - [](T &){}, std::move(f)) + offset, length, [](T &){}, std::move(f)) ); } } @@ -423,15 +456,22 @@ public: SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", t, T::TYPE, offset, length); auto f = [&t, this](CachedExtent &ext) { - t.add_to_read_set(CachedExtentRef(&ext)); + // FIXME: assert(ext.is_stable_clean()); + assert(ext.is_stable()); + assert(T::TYPE == ext.get_type()); const auto t_src = t.get_src(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + T::TYPE); + ++access_stats.load_absent; + ++stats.access.s.load_absent; + + t.add_to_read_set(CachedExtentRef(&ext)); touch_extent(ext, &t_src); }; - auto metric_key = std::make_pair(t.get_src(), T::TYPE); return trans_intr::make_interruptible( do_get_caching_extent<T>( - offset, length, &metric_key, - std::forward<Func>(extent_init_func), std::move(f)) + offset, length, std::forward<Func>(extent_init_func), std::move(f)) ); } @@ -480,6 +520,13 @@ public: CachedExtentRef extent) { assert(extent->is_valid()); + + const auto t_src = t.get_src(); + auto ext_type = extent->get_type(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + ext_type); + CachedExtent* p_extent; if (extent->is_stable()) { p_extent = extent->get_transactional_view(t); @@ -487,6 +534,8 @@ public: assert(!extent->is_stable_writting()); assert(p_extent->is_pending_in_trans(t.get_trans_id())); assert(!p_extent->is_stable_writting()); + ++access_stats.trans_pending; + ++stats.access.s.trans_pending; if (p_extent->is_mutable()) { assert(p_extent->is_fully_loaded()); assert(!p_extent->is_pending_io()); @@ -499,13 +548,29 @@ public: // stable from trans-view assert(!p_extent->is_pending_in_trans(t.get_trans_id())); if (t.maybe_add_to_read_set(p_extent)) { - const auto t_src = t.get_src(); + if (p_extent->is_dirty()) { + ++access_stats.cache_dirty; + ++stats.access.s.cache_dirty; + } else { + ++access_stats.cache_lru; + ++stats.access.s.cache_lru; + } touch_extent(*p_extent, &t_src); + } else { + if (p_extent->is_dirty()) { + ++access_stats.trans_dirty; + ++stats.access.s.trans_dirty; + } else { + ++access_stats.trans_lru; + ++stats.access.s.trans_lru; + } } } } else { assert(!extent->is_stable_writting()); assert(extent->is_pending_in_trans(t.get_trans_id())); + ++access_stats.trans_pending; + ++stats.access.s.trans_pending; if (extent->is_mutable()) { assert(extent->is_fully_loaded()); assert(!extent->is_pending_io()); @@ -520,6 +585,8 @@ public: ceph_assert(!is_retired_placeholder_type(p_extent->get_type())); if (!p_extent->is_fully_loaded()) { assert(!p_extent->is_mutable()); + ++access_stats.load_present; + ++stats.access.s.load_present; LOG_PREFIX(Cache::get_extent_viewable_by_trans); SUBDEBUG(seastore_cache, "{} {}~{} is present without been fully loaded, reading ... -- {}", @@ -557,7 +624,7 @@ public: // Interfaces only for tests. 
public: CachedExtentRef test_query_cache(paddr_t offset) { - return query_cache(offset, nullptr); + return query_cache(offset); } private: @@ -573,12 +640,11 @@ private: read_extent_ret<T> do_get_caching_extent( paddr_t offset, ///< [in] starting addr extent_len_t length, ///< [in] length - const src_ext_t* p_src_ext, ///< [in] cache query metric key Func &&extent_init_func, ///< [in] init func for extent OnCache &&on_cache ) { LOG_PREFIX(Cache::do_get_caching_extent); - auto cached = query_cache(offset, p_src_ext); + auto cached = query_cache(offset); if (!cached) { auto ret = CachedExtent::make_cached_extent_ref<T>( alloc_cache_buf(length)); @@ -610,7 +676,7 @@ private: SUBDEBUG(seastore_cache, "{} {}~{} is absent(placeholder), reading ... -- {}", T::TYPE, offset, length, *ret); - extents.replace(*ret, *cached); + extents_index.replace(*ret, *cached); on_cache(*ret); // replace placeholder in transactions @@ -684,11 +750,18 @@ private: paddr_t offset, laddr_t laddr, extent_len_t length, - const Transaction::src_t* p_src, extent_init_func_t &&extent_init_func, extent_init_func_t &&on_cache ); + /** + * get_caching_extent_by_type + * + * Note, the current implementation leverages parent-child + * pointers in LBA instead, so it should only be called in tests. + * + * This path won't be accounted by the cache_access_stats_t. + */ using get_extent_by_type_iertr = get_extent_iertr; using get_extent_by_type_ret = get_extent_by_type_iertr::future< CachedExtentRef>; @@ -731,10 +804,9 @@ private: const auto t_src = t.get_src(); touch_extent(ext, &t_src); }; - auto src = t.get_src(); return trans_intr::make_interruptible( do_get_caching_extent_by_type( - type, offset, laddr, length, &src, + type, offset, laddr, length, std::move(extent_init_func), std::move(f)) ); } @@ -762,14 +834,21 @@ private: SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...", t, type, offset, length, laddr); auto f = [&t, this](CachedExtent &ext) { - t.add_to_read_set(CachedExtentRef(&ext)); + // FIXME: assert(ext.is_stable_clean()); + assert(ext.is_stable()); const auto t_src = t.get_src(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + ext.get_type()); + ++access_stats.load_absent; + ++stats.access.s.load_absent; + + t.add_to_read_set(CachedExtentRef(&ext)); touch_extent(ext, &t_src); }; - auto src = t.get_src(); return trans_intr::make_interruptible( do_get_caching_extent_by_type( - type, offset, laddr, length, &src, + type, offset, laddr, length, std::move(extent_init_func), std::move(f)) ); } @@ -1141,8 +1220,8 @@ public: SUBINFOT(seastore_cache, "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", t, - extents.size(), - extents.get_bytes(), + extents_index.size(), + extents_index.get_bytes(), dirty.size(), get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL)); @@ -1151,7 +1230,7 @@ public: // Cache::root should have been inserted to the dirty list assert(root->is_dirty()); std::vector<CachedExtentRef> _dirty; - for (auto &e : extents) { + for (auto &e : extents_index) { _dirty.push_back(CachedExtentRef(&e)); } return seastar::do_with( @@ -1184,8 +1263,8 @@ public: SUBINFOT(seastore_cache, "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", t, - extents.size(), - extents.get_bytes(), + extents_index.size(), + extents_index.get_bytes(), dirty.size(), get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), 
get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL)); @@ -1370,7 +1449,7 @@ private: ExtentPlacementManager& epm; RootBlockRef root; ///< ref to current root - ExtentIndex extents; ///< set of live extents + ExtentIndex extents_index; ///< set of live extents journal_seq_t last_commit = JOURNAL_SEQ_MIN; @@ -1598,7 +1677,6 @@ private: counter_by_src_t<uint64_t> trans_created_by_src; counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src; counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src; - counter_by_src_t<query_counters_t> cache_query_by_src; success_read_trans_efforts_t success_read_efforts; uint64_t dirty_bytes = 0; @@ -1607,6 +1685,11 @@ private: counter_by_src_t<counter_by_extent_t<dirty_io_stats_t> > dirty_io_by_src_ext; + cache_access_stats_t access; + counter_by_src_t<uint64_t> cache_absent_by_src; + counter_by_src_t<counter_by_extent_t<extent_access_stats_t> > + access_by_src_ext; + uint64_t onode_tree_depth = 0; int64_t onode_tree_extents_num = 0; counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts; @@ -1639,6 +1722,10 @@ private: last_dirty_io_by_src_ext; mutable rewrite_stats_t last_trim_rewrites; mutable rewrite_stats_t last_reclaim_rewrites; + mutable cache_access_stats_t last_access; + mutable counter_by_src_t<uint64_t> last_cache_absent_by_src; + mutable counter_by_src_t<counter_by_extent_t<extent_access_stats_t> > + last_access_by_src_ext; void account_conflict(Transaction::src_t src1, Transaction::src_t src2) { assert(src1 < Transaction::src_t::MAX); @@ -1776,21 +1863,10 @@ private: } // Extents in cache may contain placeholders - CachedExtentRef query_cache( - paddr_t offset, - const src_ext_t* p_metric_key) { - query_counters_t* p_counters = nullptr; - if (p_metric_key) { - p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first); - ++p_counters->access; - } - if (auto iter = extents.find_offset(offset); - iter != extents.end()) { - if (p_metric_key && - // retired_placeholder is not really cached yet - !is_retired_placeholder_type(iter->get_type())) { - ++p_counters->hit; - } + CachedExtentRef query_cache(paddr_t offset) { + if (auto iter = extents_index.find_offset(offset); + iter != extents_index.end()) { + assert(iter->is_stable()); return CachedExtentRef(&*iter); } else { return CachedExtentRef(); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 96cf91b1d43..6c5c6c6fcc2 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -786,7 +786,7 @@ protected: struct retired_placeholder_t{}; CachedExtent(retired_placeholder_t, extent_len_t _length) - : state(extent_state_t::INVALID), + : state(extent_state_t::CLEAN), length(_length) { assert(length > 0); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 8439b733704..b7a1d8f8ba9 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -62,12 +62,14 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node< trans_intr::make_interruptible( c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; } else { + c.cache.account_absent_access(c.trans.get_src()); return {false, trans_intr::make_interruptible( Cache::get_extent_ertr::make_ready_future< CachedExtentRef>())}; } } else { + c.cache.account_absent_access(c.trans.get_src()); return {false, trans_intr::make_interruptible( 
Cache::get_extent_ertr::make_ready_future< diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 9a0ac439083..15774332373 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -711,13 +711,9 @@ seastar::future<> SeaStore::report_stats() } cache_size_stats_t lru_sizes_ps = cache_total.lru_sizes; - lru_sizes_ps.size /= seastar::smp::count; - lru_sizes_ps.num_extents /= seastar::smp::count; + lru_sizes_ps.divide_by(seastar::smp::count); cache_io_stats_t lru_io_ps = cache_total.lru_io; - lru_io_ps.in_sizes.size /= seastar::smp::count; - lru_io_ps.in_sizes.num_extents /= seastar::smp::count; - lru_io_ps.out_sizes.size /= seastar::smp::count; - lru_io_ps.out_sizes.num_extents /= seastar::smp::count; + lru_io_ps.divide_by(seastar::smp::count); INFO("cache lru: total{} {}; per-shard: total{} {}", cache_total.lru_sizes, cache_io_stats_printer_t{seconds, cache_total.lru_io}, @@ -725,21 +721,21 @@ seastar::future<> SeaStore::report_stats() cache_io_stats_printer_t{seconds, lru_io_ps}); cache_size_stats_t dirty_sizes_ps = cache_total.dirty_sizes; - dirty_sizes_ps.size /= seastar::smp::count; - dirty_sizes_ps.num_extents /= seastar::smp::count; + dirty_sizes_ps.divide_by(seastar::smp::count); dirty_io_stats_t dirty_io_ps = cache_total.dirty_io; - dirty_io_ps.in_sizes.size /= seastar::smp::count; - dirty_io_ps.in_sizes.num_extents /= seastar::smp::count; - dirty_io_ps.num_replace /= seastar::smp::count; - dirty_io_ps.out_sizes.size /= seastar::smp::count; - dirty_io_ps.out_sizes.num_extents /= seastar::smp::count; - dirty_io_ps.out_versions /= seastar::smp::count; + dirty_io_ps.divide_by(seastar::smp::count); INFO("cache dirty: total{} {}; per-shard: total{} {}", cache_total.dirty_sizes, dirty_io_stats_printer_t{seconds, cache_total.dirty_io}, dirty_sizes_ps, dirty_io_stats_printer_t{seconds, dirty_io_ps}); + cache_access_stats_t access_ps = cache_total.access; + access_ps.divide_by(seastar::smp::count); + INFO("cache_access: total{}; per-shard{}", + cache_access_stats_printer_t{seconds, cache_total.access}, + cache_access_stats_printer_t{seconds, access_ps}); + return seastar::now(); }); } diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index 13637a59db6..e1430b30019 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -1038,4 +1038,78 @@ std::ostream& operator<<(std::ostream& out, const dirty_io_stats_printer_t& p) return out; } +std::ostream& operator<<(std::ostream& out, const extent_access_stats_printer_t& p) +{ + constexpr const char* dfmt = "{:.2f}"; + double est_total_access = static_cast<double>(p.stats.get_estimated_total_access()); + out << "(~"; + if (est_total_access > 1000000) { + out << fmt::format(dfmt, est_total_access/1000000) + << "M, "; + } else { + out << fmt::format(dfmt, est_total_access/1000) + << "K, "; + } + double trans_hit = static_cast<double>(p.stats.get_trans_hit()); + double cache_hit = static_cast<double>(p.stats.get_cache_hit()); + double est_cache_access = static_cast<double>(p.stats.get_estimated_cache_access()); + double load_absent = static_cast<double>(p.stats.load_absent); + out << "trans-hit=~" + << fmt::format(dfmt, trans_hit/est_total_access*100) + << "%(p" + << fmt::format(dfmt, p.stats.trans_pending/trans_hit) + << ",d" + << fmt::format(dfmt, p.stats.trans_dirty/trans_hit) + << ",l" + << fmt::format(dfmt, p.stats.trans_lru/trans_hit) + << "), cache-hit=~" + << fmt::format(dfmt, 
cache_hit/est_cache_access*100) + << "%(d" + << fmt::format(dfmt, p.stats.cache_dirty/cache_hit) + << ",l" + << fmt::format(dfmt, p.stats.cache_lru/cache_hit) + <<"), load-present/absent=" + << fmt::format(dfmt, p.stats.load_present/load_absent) + << ")"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const cache_access_stats_printer_t& p) +{ + constexpr const char* dfmt = "{:.2f}"; + double total_access = static_cast<double>(p.stats.get_total_access()); + out << "("; + if (total_access > 1000000) { + out << fmt::format(dfmt, total_access/1000000) + << "M, "; + } else { + out << fmt::format(dfmt, total_access/1000) + << "K, "; + } + double trans_hit = static_cast<double>(p.stats.s.get_trans_hit()); + double cache_hit = static_cast<double>(p.stats.s.get_cache_hit()); + double cache_access = static_cast<double>(p.stats.get_cache_access()); + double load_absent = static_cast<double>(p.stats.s.load_absent); + out << "trans-hit=" + << fmt::format(dfmt, trans_hit/total_access*100) + << "%(p" + << fmt::format(dfmt, p.stats.s.trans_pending/trans_hit) + << ",d" + << fmt::format(dfmt, p.stats.s.trans_dirty/trans_hit) + << ",l" + << fmt::format(dfmt, p.stats.s.trans_lru/trans_hit) + << "), cache-hit=" + << fmt::format(dfmt, cache_hit/cache_access*100) + << "%(d" + << fmt::format(dfmt, p.stats.s.cache_dirty/cache_hit) + << ",l" + << fmt::format(dfmt, p.stats.s.cache_lru/cache_hit) + <<"), load/absent=" + << fmt::format(dfmt, load_absent/p.stats.cache_absent*100) + << "%, load-present/absent=" + << fmt::format(dfmt, p.stats.s.load_present/load_absent) + << ")"; + return out; +} + } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 5cdb1f86142..df5c184e7ab 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -2800,6 +2800,11 @@ struct cache_size_stats_t { size -= o.size; num_extents -= o.num_extents; } + + void divide_by(unsigned d) { + size /= d; + num_extents /= d; + } }; std::ostream& operator<<(std::ostream&, const cache_size_stats_t&); struct cache_size_stats_printer_t { @@ -2825,6 +2830,11 @@ struct cache_io_stats_t { in_sizes.minus(o.in_sizes); out_sizes.minus(o.out_sizes); } + + void divide_by(unsigned d) { + in_sizes.divide_by(d); + out_sizes.divide_by(d); + } }; struct cache_io_stats_printer_t { double seconds; @@ -2861,6 +2871,13 @@ struct dirty_io_stats_t { out_sizes.minus(o.out_sizes); out_versions -= o.out_versions; } + + void divide_by(unsigned d) { + in_sizes.divide_by(d); + num_replace /= d; + out_sizes.divide_by(d); + out_versions /= d; + } }; struct dirty_io_stats_printer_t { double seconds; @@ -2868,17 +2885,130 @@ struct dirty_io_stats_printer_t { }; std::ostream& operator<<(std::ostream&, const dirty_io_stats_printer_t&); +/* + * Doesn't account: + * replay + * rewrite + * retiring/placeholder + * get_caching_extent() -- test only + * get_caching_extent_by_type() -- test only + */ +struct extent_access_stats_t { + uint64_t trans_pending = 0; + uint64_t trans_dirty = 0; + uint64_t trans_lru = 0; + uint64_t cache_dirty = 0; + uint64_t cache_lru = 0; + + uint64_t load_absent = 0; + uint64_t load_present = 0; + + uint64_t get_trans_hit() const { + return trans_pending + trans_dirty + trans_lru; + } + + uint64_t get_cache_hit() const { + return cache_dirty + cache_lru; + } + + uint64_t get_estimated_cache_access() const { + return get_cache_hit() + load_absent; + } + + uint64_t get_estimated_total_access() const { + return 
get_trans_hit() + get_cache_hit() + load_absent; + } + + bool is_empty() const { + return get_estimated_total_access() == 0; + } + + void add(const extent_access_stats_t& o) { + trans_pending += o.trans_pending; + trans_dirty += o.trans_dirty; + trans_lru += o.trans_lru; + cache_dirty += o.cache_dirty; + cache_lru += o.cache_lru; + load_absent += o.load_absent; + load_present += o.load_present; + } + + void minus(const extent_access_stats_t& o) { + trans_pending -= o.trans_pending; + trans_dirty -= o.trans_dirty; + trans_lru -= o.trans_lru; + cache_dirty -= o.cache_dirty; + cache_lru -= o.cache_lru; + load_absent -= o.load_absent; + load_present -= o.load_present; + } + + void divide_by(unsigned d) { + trans_pending /= d; + trans_dirty /= d; + trans_lru /= d; + cache_dirty /= d; + cache_lru /= d; + load_absent /= d; + load_present /= d; + } +}; +struct extent_access_stats_printer_t { + double seconds; + const extent_access_stats_t& stats; +}; +std::ostream& operator<<(std::ostream&, const extent_access_stats_printer_t&); + +struct cache_access_stats_t { + extent_access_stats_t s; + uint64_t cache_absent = 0; + + uint64_t get_cache_access() const { + return s.get_cache_hit() + cache_absent; + } + + uint64_t get_total_access() const { + return s.get_trans_hit() + get_cache_access(); + } + + bool is_empty() const { + return get_total_access() == 0; + } + + void add(const cache_access_stats_t& o) { + s.add(o.s); + cache_absent += o.cache_absent; + } + + void minus(const cache_access_stats_t& o) { + s.minus(o.s); + cache_absent -= o.cache_absent; + } + + void divide_by(unsigned d) { + s.divide_by(d); + cache_absent /= d; + } +}; +struct cache_access_stats_printer_t { + double seconds; + const cache_access_stats_t& stats; +}; +std::ostream& operator<<(std::ostream&, const cache_access_stats_printer_t&); + struct cache_stats_t { cache_size_stats_t lru_sizes; cache_io_stats_t lru_io; cache_size_stats_t dirty_sizes; dirty_io_stats_t dirty_io; + cache_access_stats_t access; void add(const cache_stats_t& o) { lru_sizes.add(o.lru_sizes); lru_io.add(o.lru_io); dirty_sizes.add(o.dirty_sizes); dirty_io.add(o.dirty_io); + access.add(o.access); } }; @@ -2900,6 +3030,7 @@ WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t) WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t) #if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::cache_access_stats_printer_t> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::cache_io_stats_printer_t> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::cache_size_stats_t> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::cache_size_stats_printer_t> : fmt::ostream_formatter {}; diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 23636d79d88..828b8a25592 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -508,9 +508,11 @@ public: auto ret = get_extent_if_linked<T>(t, pin->duplicate()); if (ret.index() == 1) { return std::move(std::get<1>(ret)); + } else { + // absent + return base_iertr::make_ready_future<TCachedExtentRef<T>>(); } } - return base_iertr::make_ready_future<TCachedExtentRef<T>>(); }).si_then([this, &t, &remaps, original_paddr, original_laddr, original_len, &extents, FNAME](auto ext) mutable { diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc index 
26f7e5dedde..32eaaf02b3f 100644 --- a/src/crimson/osd/ec_backend.cc +++ b/src/crimson/osd/ec_backend.cc @@ -32,6 +32,6 @@ ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards, std::vector<pg_log_entry_t>&& log_entries) { // todo - return {seastar::now(), - seastar::make_ready_future<crimson::osd::acked_peers_t>()}; + return make_ready_future<rep_op_ret_t>(seastar::now(), + seastar::make_ready_future<crimson::osd::acked_peers_t>()); } diff --git a/src/crimson/osd/object_metadata_helper.cc b/src/crimson/osd/object_metadata_helper.cc index 3016b0ae406..12bf855afbe 100644 --- a/src/crimson/osd/object_metadata_helper.cc +++ b/src/crimson/osd/object_metadata_helper.cc @@ -1,3 +1,6 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + #include "crimson/osd/object_metadata_helper.h" namespace { @@ -23,12 +26,26 @@ subsets_t calc_clone_subsets( subsets_t subsets; logger().debug("{}: {} clone_overlap {} ", __func__, soid, snapset.clone_overlap); - + assert(missing.get_items().contains(soid)); + const pg_missing_item &missing_item = missing.get_items().at(soid); + auto dirty_regions = missing_item.clean_regions.get_dirty_regions(); + if (dirty_regions.empty()) { + logger().debug( + "{} {} not touched, no need to recover, skipping", + __func__, + soid); + return subsets; + } uint64_t size = snapset.clone_size[soid.snap]; if (size) { subsets.data_subset.insert(0, size); } + // let data_subset store only the modified content of the object. + subsets.data_subset.intersection_of(dirty_regions); + logger().debug("{} {} data_subset {}", + __func__, soid, subsets.data_subset); + // TODO: make sure CEPH_FEATURE_OSD_CACHEPOOL is not supported in Crimson // Skips clone subsets if caching was enabled (allow_incomplete_clones). @@ -140,7 +157,7 @@ subsets_t calc_head_subsets( subsets.data_subset.insert(0, obj_size); } assert(missing.get_items().contains(head)); - const pg_missing_item missing_item = missing.get_items().at(head); + const pg_missing_item &missing_item = missing.get_items().at(head); // let data_subset store only the modified content of the object. 
subsets.data_subset.intersection_of(missing_item.clean_regions.get_dirty_regions()); logger().debug("{} {} data_subset {}", diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 854c4eab6af..df4f73d4077 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -844,57 +844,6 @@ std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction( return log_entries; } -OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove( - const hobject_t& soid, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn) -{ - logger().debug("{}: soid {}", __func__, soid); - return interruptor::async([soid, &snap_mapper, - _t=osdriver.get_transaction(&txn)]() mutable { - const auto r = snap_mapper.remove_oid(soid, &_t); - if (r) { - logger().error("{}: remove_oid {} failed with {}", - __func__, soid, r); - } - // On removal tolerate missing key corruption - assert(r == 0 || r == -ENOENT); - }); -} - -OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify( - const hobject_t& soid, - const std::set<snapid_t>& snaps, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn) -{ - logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); - return interruptor::async([soid, snaps, &snap_mapper, - _t=osdriver.get_transaction(&txn)]() mutable { - assert(std::size(snaps) > 0); - [[maybe_unused]] const auto r = snap_mapper.update_snaps( - soid, snaps, 0, &_t); - assert(r == 0); - }); -} - -OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone( - const hobject_t& soid, - const std::set<snapid_t>& snaps, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn) -{ - logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); - return interruptor::async([soid, snaps, &snap_mapper, - _t=osdriver.get_transaction(&txn)]() mutable { - assert(std::size(snaps) > 0); - snap_mapper.add_oid(soid, snaps, &_t); - }); -} - // Defined here because there is a circular dependency between OpsExecuter and PG uint32_t OpsExecuter::get_pool_stripe_width() const { return pg->get_pgpool().info.get_stripe_width(); @@ -974,6 +923,7 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( 0 }; encode(cloned_snaps, cloning_ctx->log_entry.snaps); + cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size); return cloning_ctx; } @@ -1019,14 +969,6 @@ OpsExecuter::flush_clone_metadata( update_clone_overlap(); if (cloning_ctx) { std::move(*cloning_ctx).apply_to(log_entries, *obc); - const auto& coid = log_entries.front().soid; - const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap]; - maybe_snap_mapped = snap_map_clone( - coid, - std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)}, - snap_mapper, - osdriver, - txn); } if (snapc.seq > obc->ssc->snapset.seq) { // update snapset with latest snap context diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 834266ce68f..e0e5e10e0a9 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -21,6 +21,7 @@ #include "os/Transaction.h" #include "osd/osd_types.h" +#include "crimson/common/coroutine.h" #include "crimson/common/errorator.h" #include "crimson/common/interruptible_future.h" #include "crimson/common/type_helpers.h" @@ -272,24 +273,7 @@ private: OSDriver& osdriver, ceph::os::Transaction& txn); - static interruptible_future<> snap_map_remove( - const hobject_t& soid, - SnapMapper& snap_mapper, - OSDriver& osdriver, - 
ceph::os::Transaction& txn); - static interruptible_future<> snap_map_modify( - const hobject_t& soid, - const std::set<snapid_t>& snaps, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn); - static interruptible_future<> snap_map_clone( - const hobject_t& soid, - const std::set<snapid_t>& snaps, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn); - +private: // this gizmo could be wrapped in std::optional for the sake of lazy // initialization. we don't need it for ops that doesn't have effect // TODO: verify the init overhead of chunked_fifo @@ -424,7 +408,7 @@ public: const std::vector<OSDOp>& ops, SnapMapper& snap_mapper, OSDriver& osdriver, - MutFunc&& mut_func) &&; + MutFunc mut_func) &&; std::vector<pg_log_entry_t> prepare_transaction( const std::vector<OSDOp>& ops); void fill_op_params(modified_by m); @@ -510,60 +494,60 @@ OpsExecuter::flush_changes_n_do_ops_effects( const std::vector<OSDOp>& ops, SnapMapper& snap_mapper, OSDriver& osdriver, - MutFunc&& mut_func) && + MutFunc mut_func) && { const bool want_mutate = !txn.empty(); // osd_op_params are instantiated by every wr-like operation. assert(osd_op_params || !want_mutate); assert(obc); - rep_op_fut_t maybe_mutated = - interruptor::make_ready_future<rep_op_fut_tuple>( - seastar::now(), - interruptor::make_interruptible(osd_op_errorator::now())); + + auto submitted = interruptor::now(); + auto all_completed = + interruptor::make_interruptible(osd_op_errorator::now()); + if (cloning_ctx) { ceph_assert(want_mutate); } + if (want_mutate) { - maybe_mutated = flush_clone_metadata( + auto log_entries = co_await flush_clone_metadata( prepare_transaction(ops), snap_mapper, osdriver, - txn - ).then_interruptible([mut_func=std::move(mut_func), - this](auto&& log_entries) mutable { - if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) { - ceph_assert(log_rit->version == osd_op_params->at_version); - } - auto [submitted, all_completed] = - std::forward<MutFunc>(mut_func)(std::move(txn), - std::move(obc), - std::move(*osd_op_params), - std::move(log_entries)); - return interruptor::make_ready_future<rep_op_fut_tuple>( - std::move(submitted), - osd_op_ierrorator::future<>(std::move(all_completed))); - }); + txn); + + if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) { + ceph_assert(log_rit->version == osd_op_params->at_version); + } + + auto [_submitted, _all_completed] = co_await mut_func( + std::move(txn), + std::move(obc), + std::move(*osd_op_params), + std::move(log_entries)); + + submitted = std::move(_submitted); + all_completed = std::move(_all_completed); } + apply_stats(); - if (__builtin_expect(op_effects.empty(), true)) { - return maybe_mutated; - } else { - return maybe_mutated.then_unpack_interruptible( - // need extra ref pg due to apply_stats() which can be executed after - // informing snap mapper - [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable { - return interruptor::make_ready_future<rep_op_fut_tuple>( - std::move(submitted), - all_completed.safe_then_interruptible([this, pg=std::move(pg)] { - // let's do the cleaning of `op_effects` in destructor - return interruptor::do_for_each(op_effects, - [pg=std::move(pg)](auto& op_effect) { - return op_effect->execute(pg); - }); - })); + if (op_effects.size()) [[unlikely]] { + // need extra ref pg due to apply_stats() which can be executed after + // informing snap mapper + all_completed = + std::move(all_completed).safe_then_interruptible([this, pg=this->pg] { + // 
let's do the cleaning of `op_effects` in destructor + return interruptor::do_for_each(op_effects, + [pg=std::move(pg)](auto& op_effect) { + return op_effect->execute(pg); + }); }); } + + co_return std::make_tuple( + std::move(submitted), + std::move(all_completed)); } template <class Func> diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 190ea47abd8..8d2d10fbd7c 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -718,7 +718,7 @@ seastar::future<> OSD::stop() DEBUG("prepared to stop"); public_msgr->stop(); cluster_msgr->stop(); - auto gate_close_fut = gate.close(); + auto gate_close_fut = gate.close_all(); return asok->stop().then([this] { return heartbeat->stop(); }).then([this] { @@ -1557,10 +1557,18 @@ seastar::future<> OSD::handle_peering_op( seastar::future<> OSD::check_osdmap_features() { + LOG_PREFIX(OSD::check_osdmap_features); assert(seastar::this_shard_id() == PRIMARY_CORE); - return store.write_meta( - "require_osd_release", - stringify((int)osdmap->require_osd_release)); + if (osdmap->require_osd_release != last_require_osd_release) { + DEBUG("updating require_osd_release from {} to {}", + to_string(last_require_osd_release), + to_string(osdmap->require_osd_release)); + last_require_osd_release = osdmap->require_osd_release; + return store.write_meta( + "require_osd_release", + stringify((int)osdmap->require_osd_release)); + } + return seastar::now(); } seastar::future<> OSD::prepare_to_stop() diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index 8df23c53f7a..de39d808274 100644 --- a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -232,7 +232,9 @@ private: Ref<MOSDPGUpdateLogMissingReply> m); private: - crimson::common::Gated gate; + crimson::common::gate_per_shard gate; + + ceph_release_t last_require_osd_release{ceph_release_t::unknown}; seastar::promise<> stop_acked; void got_stop_ack() { diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc index 509d4c4a484..c030c9d8970 100644 --- a/src/crimson/osd/osd_operations/background_recovery.cc +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -158,6 +158,8 @@ PglogBasedRecovery::PglogBasedRecovery( PglogBasedRecovery::interruptible_future<bool> PglogBasedRecovery::do_recovery() { + LOG_PREFIX(PglogBasedRecovery::do_recovery); + DEBUGDPPI("{}: {}", *pg, __func__, *this); if (pg->has_reset_since(epoch_started)) { return seastar::make_ready_future<bool>(false); } @@ -167,6 +169,7 @@ PglogBasedRecovery::do_recovery() interruptor>([this] (auto&& trigger) { return pg->get_recovery_handler()->start_recovery_ops( trigger, + *this, crimson::common::local_conf()->osd_recovery_max_single_start); }); }); diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h index 17f2cd57a30..5ae0e1a9edb 100644 --- a/src/crimson/osd/osd_operations/background_recovery.h +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -91,8 +91,20 @@ public: RecoveryBackend::RecoveryBlockingEvent > tracking_events; + void cancel() { + cancelled = true; + } + + bool is_cancelled() const { + return cancelled; + } + + epoch_t get_epoch_started() const { + return epoch_started; + } private: interruptible_future<bool> do_recovery() override; + bool cancelled = false; }; class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> { diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index 
f1247c3fa08..7512b3d108d 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -197,7 +197,7 @@ SnapTrimObjSubEvent::remove_clone( pg->get_collection_ref()->get_cid(), ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); obc->obs.oi = object_info_t(coid); - return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn); + return interruptor::now(); } void SnapTrimObjSubEvent::remove_head_whiteout( @@ -263,7 +263,7 @@ SnapTrimObjSubEvent::adjust_snaps( ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, OI_ATTR, bl); - add_log_entry( + auto &loge = add_log_entry( pg_log_entry_t::MODIFY, coid, obc->obs.oi.prior_version, @@ -271,8 +271,10 @@ SnapTrimObjSubEvent::adjust_snaps( osd_reqid_t(), obc->obs.oi.mtime, 0); - return OpsExecuter::snap_map_modify( - coid, new_snaps, pg->snap_mapper, pg->osdriver, txn); + bufferlist snapsbl; + encode(new_snaps, snapsbl); + loge.snaps.swap(snapsbl); + return interruptor::now(); } void SnapTrimObjSubEvent::update_head( @@ -361,6 +363,7 @@ SnapTrimObjSubEvent::remove_or_update( // save head snapset logger().debug("{}: {} new snapset {} on {}", *this, coid, head_obc->ssc->snapset, head_obc->obs.oi); + osd_op_p.at_version.version++; if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) { remove_head_whiteout(obc, head_obc, txn); } else { @@ -400,32 +403,8 @@ SnapTrimObjSubEvent::start() // lock both clone's and head's obcs co_await pg->obc_loader.with_obc<RWState::RWWRITE>( coid, - [this](auto head_obc, auto clone_obc) { - logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid()); - return enter_stage<interruptor>( - client_pp().process - ).then_interruptible( - [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable { - logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid()); - return remove_or_update( - clone_obc, head_obc - ).safe_then_interruptible([clone_obc, this](auto&& txn) mutable { - auto [submitted, all_completed] = pg->submit_transaction( - std::move(clone_obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries)); - return submitted.then_interruptible( - [this, all_completed=std::move(all_completed)]() mutable { - return enter_stage<interruptor>( - client_pp().wait_repop - ).then_interruptible([all_completed=std::move(all_completed)]() mutable{ - return std::move(all_completed); - }); - }); - }); - }); - }, + std::bind(&SnapTrimObjSubEvent::process_and_submit, + this, std::placeholders::_1, std::placeholders::_2), false ).handle_error_interruptible( remove_or_update_iertr::pass_further{}, @@ -436,6 +415,33 @@ SnapTrimObjSubEvent::start() co_await interruptor::make_interruptible(handle.complete()); } +ObjectContextLoader::load_obc_iertr::future<> +SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc, + ObjectContextRef clone_obc) { + logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid()); + + co_await enter_stage<interruptor>(client_pp().process); + + logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid()); + + auto txn = co_await remove_or_update(clone_obc, head_obc); + + auto [submitted, all_completed] = co_await pg->submit_transaction( + std::move(clone_obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries) + ); + + co_await std::move(submitted); + + co_await enter_stage<interruptor>(client_pp().wait_repop); + + co_await std::move(all_completed); + + co_return; +} + void 
SnapTrimObjSubEvent::print(std::ostream &lhs) const { lhs << "SnapTrimObjSubEvent(" diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index 0e6c1e9d8fc..06d8f43c2f3 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -113,6 +113,10 @@ public: private: object_stat_sum_t delta_stats; + ObjectContextLoader::load_obc_iertr::future<> process_and_submit( + ObjectContextRef head_obc, + ObjectContextRef clone_obc); + snap_trim_obj_subevent_ret_t remove_clone( ObjectContextRef obc, ObjectContextRef head_obc, @@ -134,7 +138,7 @@ private: remove_or_update_iertr::future<ceph::os::Transaction> remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc); - void add_log_entry( + pg_log_entry_t& add_log_entry( int _op, const hobject_t& _soid, const eversion_t& pv, @@ -151,7 +155,7 @@ private: rid, mt, return_code); - osd_op_p.at_version.version++; + return log_entries.back(); } Ref<PG> pg; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index ad34373c74f..644cc84513d 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -27,6 +27,7 @@ #include "os/Transaction.h" +#include "crimson/common/coroutine.h" #include "crimson/common/exception.h" #include "crimson/common/log.h" #include "crimson/net/Connection.h" @@ -514,6 +515,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) Context *PG::on_clean() { + recovery_handler->on_pg_clean(); scrubber.on_primary_active_clean(); return nullptr; } @@ -893,8 +895,9 @@ void PG::mutate_object( } } -std::tuple<PG::interruptible_future<>, - PG::interruptible_future<>> +PG::interruptible_future< + std::tuple<PG::interruptible_future<>, + PG::interruptible_future<>>> PG::submit_transaction( ObjectContextRef&& obc, ceph::os::Transaction&& txn, @@ -902,9 +905,10 @@ PG::submit_transaction( std::vector<pg_log_entry_t>&& log_entries) { if (__builtin_expect(stopping, false)) { - return {seastar::make_exception_future<>( - crimson::common::system_shutdown_exception()), - seastar::now()}; + co_return std::make_tuple( + interruptor::make_interruptible(seastar::make_exception_future<>( + crimson::common::system_shutdown_exception())), + interruptor::now()); } epoch_t map_epoch = get_osdmap_epoch(); @@ -916,7 +920,7 @@ PG::submit_transaction( ceph_assert(log_entries.rbegin()->version >= projected_last_update); projected_last_update = log_entries.rbegin()->version; - auto [submitted, all_completed] = backend->submit_transaction( + auto [submitted, all_completed] = co_await backend->submit_transaction( peering_state.get_acting_recovery_backfill(), obc->obs.oi.soid, std::move(txn), @@ -924,16 +928,19 @@ PG::submit_transaction( peering_state.get_last_peering_reset(), map_epoch, std::move(log_entries)); - return std::make_tuple(std::move(submitted), all_completed.then_interruptible( - [this, last_complete=peering_state.get_info().last_complete, + co_return std::make_tuple( + std::move(submitted), + all_completed.then_interruptible( + [this, last_complete=peering_state.get_info().last_complete, at_version=osd_op_p.at_version](auto acked) { - for (const auto& peer : acked) { - peering_state.update_peer_last_complete_ondisk( - peer.shard, peer.last_complete_ondisk); - } - peering_state.complete_write(at_version, last_complete); - return seastar::now(); - })); + for (const auto& peer : acked) { + peering_state.update_peer_last_complete_ondisk( + peer.shard, peer.last_complete_ondisk); + } + 
peering_state.complete_write(at_version, last_complete); + return seastar::now(); + }) + ); } PG::interruptible_future<> PG::repair_object( @@ -1439,14 +1446,10 @@ void PG::update_stats(const pg_stat_t &stat) { PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) { - if (__builtin_expect(stopping, false)) { - return seastar::make_exception_future<>( - crimson::common::system_shutdown_exception()); - } - - logger().debug("{}: {}", __func__, *req); + LOG_PREFIX(PG::handle_rep_op); + DEBUGDPP("{}", *this, *req); if (can_discard_replica_op(*req)) { - return seastar::now(); + co_return; } ceph::os::Transaction txn; @@ -1456,6 +1459,11 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) std::vector<pg_log_entry_t> log_entries; decode(log_entries, p); update_stats(req->pg_stats); + + co_await update_snap_map( + log_entries, + txn); + log_operation(std::move(log_entries), req->pg_trim_to, req->version, @@ -1463,18 +1471,42 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) !txn.empty(), txn, false); - logger().debug("PG::handle_rep_op: do_transaction..."); - return interruptor::make_interruptible(shard_services.get_store().do_transaction( - coll_ref, std::move(txn))).then_interruptible( - [req, lcod=peering_state.get_info().last_complete, this] { - peering_state.update_last_complete_ondisk(lcod); - const auto map_epoch = get_osdmap_epoch(); - auto reply = crimson::make_message<MOSDRepOpReply>( - req.get(), pg_whoami, 0, - map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); - reply->set_last_complete_ondisk(lcod); - return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch); - }); + DEBUGDPP("{} do_transaction", *this, *req); + co_await interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll_ref, std::move(txn)) + ); + + const auto &lcod = peering_state.get_info().last_complete; + peering_state.update_last_complete_ondisk(lcod); + const auto map_epoch = get_osdmap_epoch(); + auto reply = crimson::make_message<MOSDRepOpReply>( + req.get(), pg_whoami, 0, + map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(lcod); + co_await interruptor::make_interruptible( + shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch) + ); + co_return; +} + +PG::interruptible_future<> PG::update_snap_map( + const std::vector<pg_log_entry_t> &log_entries, + ObjectStore::Transaction& t) +{ + LOG_PREFIX(PG::update_snap_map); + DEBUGDPP("", *this); + return interruptor::do_for_each( + log_entries, + [this, &t](const auto& entry) mutable { + if (entry.soid.snap < CEPH_MAXSNAP) { + // TODO: avoid seastar::async https://tracker.ceph.com/issues/67704 + return interruptor::async( + [this, entry, _t=osdriver.get_transaction(&t)]() mutable { + snap_mapper.update_snap_map(entry, &_t); + }); + } + return interruptor::now(); + }); } void PG::log_operation( @@ -1496,10 +1528,6 @@ void PG::log_operation( * handles these cases. 
*/ #if 0 - if (transaction_applied) { - //TODO: - //update_snap_map(logv, t); - } auto last = logv.rbegin(); if (is_primary() && last != logv.rend()) { projected_log.skip_can_rollback_to_to_head(); @@ -1700,6 +1728,7 @@ void PG::on_change(ceph::os::Transaction &t) { peering_state.state_clear(PG_STATE_SNAPTRIM); peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); snap_mapper.reset_backend(); + reset_pglog_based_recovery_op(); } void PG::context_registry_on_change() { @@ -1839,4 +1868,19 @@ void PG::PGLogEntryHandler::remove(const hobject_t &soid) { DEBUGDPP("remove {} on pglog rollback", *pg, soid); pg->remove_maybe_snapmapped_object(*t, soid); } + +void PG::set_pglog_based_recovery_op(PglogBasedRecovery *op) { + ceph_assert(!pglog_based_recovery_op); + pglog_based_recovery_op = op; +} + +void PG::reset_pglog_based_recovery_op() { + pglog_based_recovery_op = nullptr; +} + +void PG::cancel_pglog_based_recovery_op() { + ceph_assert(pglog_based_recovery_op); + pglog_based_recovery_op->cancel(); + reset_pglog_based_recovery_op(); +} } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index b1e5e1fa22a..11c0e3668b1 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -64,6 +64,7 @@ namespace crimson::osd { class OpsExecuter; class BackfillRecovery; class SnapTrimEvent; +class PglogBasedRecovery; class PG : public boost::intrusive_ref_counter< PG, @@ -433,6 +434,10 @@ public: recovery_handler->backfill_cancelled(); } + void on_recovery_cancelled() final { + cancel_pglog_based_recovery_op(); + } + void on_recovery_reserved() final { recovery_handler->start_pglogbased_recovery(); } @@ -591,6 +596,9 @@ public: interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m); void update_stats(const pg_stat_t &stat); + interruptible_future<> update_snap_map( + const std::vector<pg_log_entry_t> &log_entries, + ObjectStore::Transaction& t); void log_operation( std::vector<pg_log_entry_t>&& logv, const eversion_t &trim_to, @@ -673,7 +681,8 @@ private: SuccessFunc&& success_func, FailureFunc&& failure_func); interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); - std::tuple<interruptible_future<>, interruptible_future<>> + interruptible_future< + std::tuple<interruptible_future<>, interruptible_future<>>> submit_transaction( ObjectContextRef&& obc, ceph::os::Transaction&& txn, @@ -838,6 +847,10 @@ public: return can_discard_replica_op(m, m.get_map_epoch()); } + void set_pglog_based_recovery_op(PglogBasedRecovery *op) final; + void reset_pglog_based_recovery_op() final; + void cancel_pglog_based_recovery_op(); + private: // instead of seastar::gate, we use a boolean flag to indicate // whether the system is shutting down, as we don't need to track @@ -845,6 +858,7 @@ private: bool stopping = false; PGActivationBlocker wait_for_active_blocker; + PglogBasedRecovery* pglog_based_recovery_op = nullptr; friend std::ostream& operator<<(std::ostream&, const PG& pg); friend class ClientRequest; diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index 21dce24b899..fa1f1405ffe 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -60,9 +60,10 @@ public: using interruptible_future = ::crimson::interruptible::interruptible_future< ::crimson::osd::IOInterruptCondition, T>; - using rep_op_fut_t = + using rep_op_ret_t = std::tuple<interruptible_future<>, interruptible_future<crimson::osd::acked_peers_t>>; + using rep_op_fut_t = interruptible_future<rep_op_ret_t>; PGBackend(shard_id_t shard, CollectionRef coll, crimson::osd::ShardServices 
&shard_services, DoutPrefixProvider &dpp); diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index b583c87a3f6..4f874d526b3 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -24,29 +24,33 @@ namespace { using std::map; using std::set; +using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; void PGRecovery::start_pglogbased_recovery() { - using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; - (void) pg->get_shard_services().start_operation<PglogBasedRecovery>( + auto [op, fut] = pg->get_shard_services().start_operation<PglogBasedRecovery>( static_cast<crimson::osd::PG*>(pg), pg->get_shard_services(), pg->get_osdmap_epoch(), float(0.001)); + pg->set_pglog_based_recovery_op(op.get()); } PGRecovery::interruptible_future<bool> PGRecovery::start_recovery_ops( RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + PglogBasedRecovery &recover_op, size_t max_to_start) { assert(pg->is_primary()); assert(pg->is_peered()); - if (!pg->is_recovering() && !pg->is_backfilling()) { - logger().debug("recovery raced and were queued twice, ignoring!"); + if (pg->has_reset_since(recover_op.get_epoch_started()) || + recover_op.is_cancelled()) { + logger().debug("recovery {} cancelled.", recover_op); return seastar::make_ready_future<bool>(false); } + ceph_assert(pg->is_recovering()); // in ceph-osd the do_recovery() path handles both the pg log-based // recovery and the backfill, albeit they are separated at the layer @@ -68,12 +72,15 @@ PGRecovery::start_recovery_ops( return interruptor::parallel_for_each(started, [] (auto&& ifut) { return std::move(ifut); - }).then_interruptible([this] { + }).then_interruptible([this, &recover_op] { //TODO: maybe we should implement a recovery race interruptor in the future - if (!pg->is_recovering() && !pg->is_backfilling()) { - logger().debug("recovery raced and were queued twice, ignoring!"); + if (pg->has_reset_since(recover_op.get_epoch_started()) || + recover_op.is_cancelled()) { + logger().debug("recovery {} cancelled.", recover_op); return seastar::make_ready_future<bool>(false); } + ceph_assert(pg->is_recovering()); + ceph_assert(!pg->is_backfilling()); bool done = !pg->get_peering_state().needs_recovery(); if (done) { @@ -101,6 +108,7 @@ PGRecovery::start_recovery_ops( pg->get_osdmap_epoch(), PeeringState::RequestBackfill{}); } + pg->reset_pglog_based_recovery_op(); } return seastar::make_ready_future<bool>(!done); }); @@ -603,9 +611,13 @@ bool PGRecovery::budget_available() const return true; } -void PGRecovery::backfilled() +void PGRecovery::on_pg_clean() { backfill_state.reset(); +} + +void PGRecovery::backfilled() +{ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>( static_cast<crimson::osd::PG*>(pg), diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index c2f289e3cf6..6cd29c3dc52 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -17,6 +17,7 @@ namespace crimson::osd { class UrgentRecovery; +class PglogBasedRecovery; } class MOSDPGBackfillRemove; @@ -32,6 +33,7 @@ public: interruptible_future<bool> start_recovery_ops( RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + crimson::osd::PglogBasedRecovery &recover_op, size_t max_to_start); void on_activate_complete(); void on_backfill_reserved(); @@ -42,6 +44,7 @@ public: } seastar::future<> stop() { return seastar::now(); } + void on_pg_clean(); private: PGRecoveryListener* pg; size_t 
start_primary_recovery_ops( diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h index a53221c4a69..6c88b170b4f 100644 --- a/src/crimson/osd/pg_recovery_listener.h +++ b/src/crimson/osd/pg_recovery_listener.h @@ -11,6 +11,7 @@ namespace crimson::osd { class ShardServices; + class PglogBasedRecovery; }; class RecoveryBackend; @@ -38,4 +39,7 @@ public: virtual void publish_stats_to_osd() = 0; virtual OSDriver &get_osdriver() = 0; virtual SnapMapper &get_snap_mapper() = 0; + virtual void set_pglog_based_recovery_op( + crimson::osd::PglogBasedRecovery *op) = 0; + virtual void reset_pglog_based_recovery_op() = 0; }; diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index 6ec8d30d596..cbb8c883e07 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -5,6 +5,7 @@ #include "messages/MOSDRepOpReply.h" +#include "crimson/common/coroutine.h" #include "crimson/common/exception.h" #include "crimson/common/log.h" #include "crimson/os/futurized_store.h" @@ -38,13 +39,16 @@ ReplicatedBackend::_read(const hobject_t& hoid, ReplicatedBackend::rep_op_fut_t ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, const hobject_t& hoid, - ceph::os::Transaction&& txn, - osd_op_params_t&& osd_op_p, + ceph::os::Transaction&& t, + osd_op_params_t&& opp, epoch_t min_epoch, epoch_t map_epoch, - std::vector<pg_log_entry_t>&& log_entries) + std::vector<pg_log_entry_t>&& logv) { LOG_PREFIX(ReplicatedBackend::submit_transaction); DEBUGDPP("object {}", dpp, hoid); + auto log_entries = std::move(logv); + auto txn = std::move(t); + auto osd_op_p = std::move(opp); const ceph_tid_t tid = shard_services.get_tid(); auto pending_txn = @@ -89,6 +93,8 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, } } + co_await pg.update_snap_map(log_entries, txn); + pg.log_operation( std::move(log_entries), osd_op_p.pg_trim_to, @@ -99,8 +105,8 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, false); auto all_completed = interruptor::make_interruptible( - shard_services.get_store().do_transaction(coll, std::move(txn)) - ).then_interruptible([FNAME, this, + shard_services.get_store().do_transaction(coll, std::move(txn)) + ).then_interruptible([FNAME, this, peers=pending_txn->second.weak_from_this()] { if (!peers) { // for now, only actingset_changed can cause peers @@ -117,13 +123,14 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, }).then_interruptible([pending_txn, this] { auto acked_peers = std::move(pending_txn->second.acked_peers); pending_trans.erase(pending_txn); - return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers)); + return seastar::make_ready_future< + crimson::osd::acked_peers_t>(std::move(acked_peers)); }); auto sends_complete = seastar::when_all_succeed( sends->begin(), sends->end() ).finally([sends=std::move(sends)] {}); - return {std::move(sends_complete), std::move(all_completed)}; + co_return std::make_tuple(std::move(sends_complete), std::move(all_completed)); } void ReplicatedBackend::on_actingset_changed(bool same_primary) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 57eb18b0d3e..627f4a3e85b 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -440,6 +440,7 @@ enum { CEPH_MDS_OP_QUIESCE_PATH = 0x01508, CEPH_MDS_OP_QUIESCE_INODE = 0x01509, CEPH_MDS_OP_LOCK_PATH = 0x0150a, + CEPH_MDS_OP_UNINLINE_DATA = 0x0150b }; #define 
IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \ diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h index 314dbfad6ca..73efc73ae9a 100644 --- a/src/include/cephfs/types.h +++ b/src/include/cephfs/types.h @@ -861,6 +861,8 @@ void inode_t<Allocator>::dump(ceph::Formatter *f) const f->dump_unsigned("file_data_version", file_data_version); f->dump_unsigned("xattr_version", xattr_version); f->dump_unsigned("backtrace_version", backtrace_version); + f->dump_unsigned("inline_data_version", inline_data.version); + f->dump_unsigned("inline_data_length", inline_data.length()); f->dump_string("stray_prior_path", stray_prior_path); f->dump_unsigned("max_size_ever", max_size_ever); diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake index b10ea7c27cb..48358fce936 100644 --- a/src/include/config-h.in.cmake +++ b/src/include/config-h.in.cmake @@ -393,6 +393,9 @@ /* Define if libcryptsetup can be used (linux only) */ #cmakedefine HAVE_LIBCRYPTSETUP +/* Define if libnbd can be used */ +#cmakedefine HAVE_LIBNBD + /* Shared library extension, such as .so, .dll or .dylib */ #cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@" diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt index 3ba46028f0f..b66e5ead40a 100644 --- a/src/librbd/CMakeLists.txt +++ b/src/librbd/CMakeLists.txt @@ -77,6 +77,7 @@ set(librbd_internal_srcs exclusive_lock/PostAcquireRequest.cc exclusive_lock/PreReleaseRequest.cc exclusive_lock/StandardPolicy.cc + group/ListSnapshotsRequest.cc image/AttachChildRequest.cc image/AttachParentRequest.cc image/CloneRequest.cc @@ -219,6 +220,11 @@ if(LINUX AND HAVE_LIBCRYPTSETUP) crypto/luks/Magic.cc) endif() +if(HAVE_LIBNBD) + list(APPEND librbd_internal_srcs + migration/NBDStream.cc) +endif() + add_library(rbd_api STATIC librbd.cc) add_library(rbd_internal STATIC ${librbd_internal_srcs} @@ -240,6 +246,10 @@ if(LINUX AND HAVE_LIBCRYPTSETUP) target_include_directories(rbd_internal PRIVATE ${LIBCRYPTSETUP_INCLUDE_DIR}) target_link_libraries(rbd_internal PRIVATE ${LIBCRYPTSETUP_LIBRARIES}) endif() +if(HAVE_LIBNBD) + target_include_directories(rbd_internal PRIVATE ${LIBNBD_INCLUDE_DIR}) + target_link_libraries(rbd_internal PRIVATE ${LIBNBD_LIBRARIES}) +endif() add_custom_target(librbd_plugins) set(librbd_plugins_dir ${CEPH_INSTALL_PKGLIBDIR}/librbd) diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc index bf1c53284f2..97c53abd70f 100644 --- a/src/librbd/api/Group.cc +++ b/src/librbd/api/Group.cc @@ -11,6 +11,7 @@ #include "librbd/ImageWatcher.h" #include "librbd/Operations.h" #include "librbd/Utils.h" +#include "librbd/group/ListSnapshotsRequest.h" #include "librbd/internal.h" #include "librbd/io/AioCompletion.h" @@ -53,36 +54,18 @@ snap_t get_group_snap_id(I* ictx, return CEPH_NOSNAP; } +template <typename I> int group_snap_list(librados::IoCtx& group_ioctx, const std::string& group_id, - std::vector<cls::rbd::GroupSnapshot> *cls_snaps) + bool try_to_sort, bool fail_if_not_sorted, + std::vector<cls::rbd::GroupSnapshot> *cls_snaps) { - CephContext *cct = (CephContext *)group_ioctx.cct(); - - string group_header_oid = util::group_header_name(group_id); - - const int max_read = 1024; - cls::rbd::GroupSnapshot snap_last; - int r; - - for (;;) { - vector<cls::rbd::GroupSnapshot> snaps_page; - - r = cls_client::group_snap_list(&group_ioctx, group_header_oid, - snap_last, max_read, &snaps_page); - - if (r < 0) { - lderr(cct) << "error reading snap list from group: " - << cpp_strerror(-r) << dendl; - return r; - } - 
cls_snaps->insert(cls_snaps->end(), snaps_page.begin(), snaps_page.end()); - if (snaps_page.size() < max_read) { - break; - } - snap_last = *snaps_page.rbegin(); - } - - return 0; + C_SaferCond cond; + auto req = group::ListSnapshotsRequest<I>::create(group_ioctx, group_id, + try_to_sort, + fail_if_not_sorted, + cls_snaps, &cond); + req->send(); + return cond.wait(); } std::string calc_ind_image_snap_name(uint64_t pool_id, @@ -593,7 +576,7 @@ int Group<I>::remove(librados::IoCtx& io_ctx, const char *group_name) string group_header_oid = util::group_header_name(group_id); std::vector<cls::rbd::GroupSnapshot> snaps; - r = group_snap_list(io_ctx, group_id, &snaps); + r = group_snap_list<I>(io_ctx, group_id, false, false, &snaps); if (r < 0 && r != -ENOENT) { lderr(cct) << "error listing group snapshots" << dendl; return r; @@ -1191,7 +1174,7 @@ int Group<I>::snap_remove(librados::IoCtx& group_ioctx, const char *group_name, } std::vector<cls::rbd::GroupSnapshot> snaps; - r = group_snap_list(group_ioctx, group_id, &snaps); + r = group_snap_list<I>(group_ioctx, group_id, false, false, &snaps); if (r < 0) { return r; } @@ -1232,7 +1215,7 @@ int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name, } std::vector<cls::rbd::GroupSnapshot> group_snaps; - r = group_snap_list(group_ioctx, group_id, &group_snaps); + r = group_snap_list<I>(group_ioctx, group_id, false, false, &group_snaps); if (r < 0) { return r; } @@ -1261,6 +1244,7 @@ int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name, template <typename I> int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name, + bool try_to_sort, bool fail_if_not_sorted, std::vector<group_snap_info2_t> *group_snaps) { CephContext *cct = (CephContext *)group_ioctx.cct(); @@ -1275,7 +1259,8 @@ int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name, } std::vector<cls::rbd::GroupSnapshot> cls_group_snaps; - r = group_snap_list(group_ioctx, group_id, &cls_group_snaps); + r = group_snap_list<I>(group_ioctx, group_id, try_to_sort, fail_if_not_sorted, + &cls_group_snaps); if (r < 0) { return r; } @@ -1311,7 +1296,7 @@ int Group<I>::snap_get_info(librados::IoCtx& group_ioctx, } std::vector<cls::rbd::GroupSnapshot> cls_group_snaps; - r = group_snap_list(group_ioctx, group_id, &cls_group_snaps); + r = group_snap_list<I>(group_ioctx, group_id, false, false, &cls_group_snaps); if (r < 0) { return r; } @@ -1353,7 +1338,7 @@ int Group<I>::snap_rollback(librados::IoCtx& group_ioctx, } std::vector<cls::rbd::GroupSnapshot> snaps; - r = group_snap_list(group_ioctx, group_id, &snaps); + r = group_snap_list<I>(group_ioctx, group_id, false, false, &snaps); if (r < 0) { return r; } diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h index 2a7574612ec..ffbb9afea1a 100644 --- a/src/librbd/api/Group.h +++ b/src/librbd/api/Group.h @@ -47,6 +47,7 @@ struct Group { static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name, const char *old_snap_name, const char *new_snap_name); static int snap_list(librados::IoCtx& group_ioctx, const char *group_name, + bool try_to_sort, bool fail_if_not_sorted, std::vector<group_snap_info2_t> *snaps); static int snap_get_info(librados::IoCtx& group_ioctx, const char *group_name, const char *snap_name, diff --git a/src/librbd/group/ListSnapshotsRequest.cc b/src/librbd/group/ListSnapshotsRequest.cc new file mode 100644 index 00000000000..bbd3759ff2b --- /dev/null +++ b/src/librbd/group/ListSnapshotsRequest.cc @@ -0,0 +1,187 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/group/ListSnapshotsRequest.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/ceph_context.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::group::ListSnapshotsRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace group { + +namespace { + +const uint32_t MAX_RETURN = 1024; + +} // anonymous namespace + +template <typename I> +ListSnapshotsRequest<I>::ListSnapshotsRequest(librados::IoCtx &group_io_ctx, + const std::string &group_id, + bool try_to_sort, + bool fail_if_not_sorted, + std::vector<cls::rbd::GroupSnapshot> *snaps, + Context *on_finish) + : m_group_io_ctx(group_io_ctx), m_group_id(group_id), + m_try_to_sort(try_to_sort), m_fail_if_not_sorted(fail_if_not_sorted), + m_snaps(snaps), m_on_finish(on_finish) { + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 20) << "group_id=" << m_group_id + << ", try_to_sort=" << m_try_to_sort + << ", fail_if_not_sorted=" << m_fail_if_not_sorted + << dendl; +} + +template <typename I> +void ListSnapshotsRequest<I>::send() { + list_snap_orders(); +} + +template <typename I> +void ListSnapshotsRequest<I>::list_snap_orders() { + if (!m_try_to_sort) { + list_snaps(); + return; + } + + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << dendl; + + librados::ObjectReadOperation op; + cls_client::group_snap_list_order_start(&op, m_start_after_order, MAX_RETURN); + auto comp = util::create_rados_callback< + ListSnapshotsRequest<I>, + &ListSnapshotsRequest<I>::handle_list_snap_orders>(this); + m_out_bl.clear(); + int r = m_group_io_ctx.aio_operate(util::group_header_name(m_group_id), comp, + &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ListSnapshotsRequest<I>::handle_list_snap_orders(int r) { + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << "r=" << r << dendl; + + std::map<std::string, uint64_t> snap_orders; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::group_snap_list_order_finish(&iter, &snap_orders); + } + + if (r < 0) { + if (r == -EOPNOTSUPP && !m_fail_if_not_sorted) { + list_snaps(); + return; + } else { + lderr(cct) << "failed to get group snapshot orders: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + } + + m_snap_orders.insert(snap_orders.begin(), snap_orders.end()); + if (snap_orders.size() < MAX_RETURN) { + list_snaps(); + return; + } + + m_start_after_order = snap_orders.rbegin()->first; + list_snap_orders(); +} + +template <typename I> +void ListSnapshotsRequest<I>::list_snaps() { + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << dendl; + + librados::ObjectReadOperation op; + cls_client::group_snap_list_start(&op, m_start_after, MAX_RETURN); + auto comp = util::create_rados_callback< + ListSnapshotsRequest<I>, + &ListSnapshotsRequest<I>::handle_list_snaps>(this); + m_out_bl.clear(); + int r = m_group_io_ctx.aio_operate(util::group_header_name(m_group_id), comp, + &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template <typename I> +void ListSnapshotsRequest<I>::handle_list_snaps(int r) { + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << "r=" << r << dendl; + + 
std::vector<cls::rbd::GroupSnapshot> snaps; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::group_snap_list_finish(&iter, &snaps); + } + + if (r < 0) { + lderr(cct) << "failed to list group snapshots: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + m_snaps->insert(m_snaps->end(), snaps.begin(), snaps.end()); + if (snaps.size() < MAX_RETURN) { + sort_snaps(); + return; + } + + m_start_after = *snaps.rbegin(); + list_snaps(); +} + +template <typename I> +void ListSnapshotsRequest<I>::sort_snaps() { + if (!m_try_to_sort) { + finish(0); + return; + } + + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << dendl; + + for (const auto& snap : *m_snaps) { + if (m_snap_orders.find(snap.id) == m_snap_orders.end()) { + ldout(cct, 10) << "Missing order for snap_id=" << snap.id << dendl; + finish(m_fail_if_not_sorted ? -EINVAL : 0); + return; + } + } + + std::sort(m_snaps->begin(), m_snaps->end(), + [this](const cls::rbd::GroupSnapshot &a, + const cls::rbd::GroupSnapshot &b) { + return this->m_snap_orders[a.id] < this->m_snap_orders[b.id]; + }); + + finish(0); +} + +template <typename I> +void ListSnapshotsRequest<I>::finish(int r) { + auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct()); + ldout(cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace group +} // namespace librbd + +template class librbd::group::ListSnapshotsRequest<librbd::ImageCtx>; diff --git a/src/librbd/group/ListSnapshotsRequest.h b/src/librbd/group/ListSnapshotsRequest.h new file mode 100644 index 00000000000..7152dd981ab --- /dev/null +++ b/src/librbd/group/ListSnapshotsRequest.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H +#define CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H + +#include "include/int_types.h" +#include "include/types.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" + +#include <string> +#include <vector> + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace group { + +template <typename ImageCtxT = librbd::ImageCtx> +class ListSnapshotsRequest { +public: + static ListSnapshotsRequest *create( + librados::IoCtx &group_io_ctx, const std::string &group_id, + bool try_to_sort, bool fail_if_not_sorted, + std::vector<cls::rbd::GroupSnapshot> *snaps, Context *on_finish) { + return new ListSnapshotsRequest(group_io_ctx, group_id, try_to_sort, + fail_if_not_sorted, snaps, on_finish); + } + + ListSnapshotsRequest(librados::IoCtx &group_io_ctx, + const std::string &group_id, + bool try_to_sort, bool fail_if_not_sorted, + std::vector<cls::rbd::GroupSnapshot> *snaps, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * <start> /--------\ + * | | | (if required. repeat if more + * v v | entries) + * LIST_SNAP_ORDERS --/ + * | /--------\ + * | | | (repeat if more + * v v | snapshots) + * LIST_SNAPS --------/ + * | + * v + * SORT_SNAPS (if required) + * | + * v + * <finish> + * + * @endverbatim + */ + + librados::IoCtx &m_group_io_ctx; + std::string m_group_id; + bool m_try_to_sort; + //Fail if m_try_to_sort is true and sorting fails. Ignored if m_try_to_sort is false. 
+ bool m_fail_if_not_sorted; + std::vector<cls::rbd::GroupSnapshot> *m_snaps; + std::map<std::string, uint64_t> m_snap_orders; + Context *m_on_finish; + + cls::rbd::GroupSnapshot m_start_after; + std::string m_start_after_order; + bufferlist m_out_bl; + + void list_snaps(); + void handle_list_snaps(int r); + + void list_snap_orders(); + void handle_list_snap_orders(int r); + + void sort_snaps(); + + void finish(int r); +}; + +} // namespace group +} // namespace librbd + +extern template class librbd::group::ListSnapshotsRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc index ed8ec9e9130..c389282c0cc 100644 --- a/src/librbd/librbd.cc +++ b/src/librbd/librbd.cc @@ -1456,7 +1456,8 @@ namespace librbd { } std::vector<group_snap_info2_t> snaps2; - int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &snaps2); + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true, + false, &snaps2); for (const auto& snap : snaps2) { snaps->push_back( @@ -1473,7 +1474,8 @@ namespace librbd { int RBD::group_snap_list2(IoCtx& group_ioctx, const char *group_name, std::vector<group_snap_info2_t> *snaps) { - return librbd::api::Group<>::snap_list(group_ioctx, group_name, snaps); + return librbd::api::Group<>::snap_list(group_ioctx, group_name, true, + false, snaps); } int RBD::group_snap_get_info(IoCtx& group_ioctx, const char *group_name, @@ -7322,7 +7324,8 @@ extern "C" int rbd_group_snap_list(rados_ioctx_t group_p, } std::vector<librbd::group_snap_info2_t> cpp_snaps; - int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps); + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true, false, + &cpp_snaps); if (r == -ENOENT) { *snaps_size = 0; @@ -7372,7 +7375,8 @@ extern "C" int rbd_group_snap_list2(rados_ioctx_t group_p, librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); std::vector<librbd::group_snap_info2_t> cpp_snaps; - int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps); + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true, false, + &cpp_snaps); if (r < 0) { return r; } diff --git a/src/librbd/migration/FileStream.cc b/src/librbd/migration/FileStream.cc index b548de7ee37..2da9e0df5e7 100644 --- a/src/librbd/migration/FileStream.cc +++ b/src/librbd/migration/FileStream.cc @@ -226,6 +226,18 @@ void FileStream<I>::read(io::Extents&& byte_extents, bufferlist* data, #endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR +template <typename I> +void FileStream<I>::list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) { + // TODO: list sparse extents based on SEEK_HOLE/SEEK_DATA + for (auto [byte_offset, byte_length] : byte_extents) { + sparse_extents->insert(byte_offset, byte_length, + {io::SPARSE_EXTENT_STATE_DATA, byte_length}); + } + on_finish->complete(0); +} + } // namespace migration } // namespace librbd diff --git a/src/librbd/migration/FileStream.h b/src/librbd/migration/FileStream.h index 1a7e20ac707..e6050d865cc 100644 --- a/src/librbd/migration/FileStream.h +++ b/src/librbd/migration/FileStream.h @@ -44,6 +44,10 @@ public: void read(io::Extents&& byte_extents, bufferlist* data, Context* on_finish) override; + void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) override; + private: CephContext* m_cct; std::shared_ptr<AsioEngine> m_asio_engine; diff --git a/src/librbd/migration/HttpStream.cc 
b/src/librbd/migration/HttpStream.cc index fa3cc00320e..ecf771bb53c 100644 --- a/src/librbd/migration/HttpStream.cc +++ b/src/librbd/migration/HttpStream.cc @@ -77,6 +77,18 @@ void HttpStream<I>::read(io::Extents&& byte_extents, bufferlist* data, m_http_client->read(std::move(byte_extents), data, on_finish); } +template <typename I> +void HttpStream<I>::list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) { + // no sparseness information -- list the full range as DATA + for (auto [byte_offset, byte_length] : byte_extents) { + sparse_extents->insert(byte_offset, byte_length, + {io::SPARSE_EXTENT_STATE_DATA, byte_length}); + } + on_finish->complete(0); +} + } // namespace migration } // namespace librbd diff --git a/src/librbd/migration/HttpStream.h b/src/librbd/migration/HttpStream.h index 01a58371496..8606f271298 100644 --- a/src/librbd/migration/HttpStream.h +++ b/src/librbd/migration/HttpStream.h @@ -45,6 +45,10 @@ public: void read(io::Extents&& byte_extents, bufferlist* data, Context* on_finish) override; + void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) override; + private: using HttpResponse = boost::beast::http::response< boost::beast::http::string_body>; diff --git a/src/librbd/migration/NBDStream.cc b/src/librbd/migration/NBDStream.cc new file mode 100644 index 00000000000..bf8c0c8519e --- /dev/null +++ b/src/librbd/migration/NBDStream.cc @@ -0,0 +1,370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/NBDStream.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" + +#include <libnbd.h> + +namespace librbd { +namespace migration { + +namespace { + +const std::string URI_KEY{"uri"}; + +int from_nbd_errno(int rc) { + // nbd_get_errno() needs a default/fallback error: + // "Even when a call returns an error, nbd_get_errno() might return 0. + // This does not mean there was no error. It means no additional errno + // information is available for this error." + return rc > 0 ? -rc : -EIO; +} + +int extent_cb(void* data, const char* metacontext, uint64_t offset, + uint32_t* entries, size_t nr_entries, int* error) { + auto sparse_extents = reinterpret_cast<io::SparseExtents*>(data); + + // "[...] always check the metacontext field to ensure you are + // receiving the data you expect." + if (strcmp(metacontext, LIBNBD_CONTEXT_BASE_ALLOCATION) == 0) { + for (size_t i = 0; i < nr_entries; i += 2) { + auto length = entries[i]; + auto state = entries[i + 1]; + if (length > 0 && state & (LIBNBD_STATE_HOLE | LIBNBD_STATE_ZERO)) { + sparse_extents->insert(offset, length, + {io::SPARSE_EXTENT_STATE_ZEROED, length}); + } + offset += length; + } + } + + return 1; +} + +} // anonymous namespace + +template <typename> +class NBDClient { +public: + static NBDClient* create() { + return new NBDClient(); + } + + const char* get_error() { + return nbd_get_error(); + } + + int get_errno() { + return nbd_get_errno(); + } + + int init() { + m_handle.reset(nbd_create()); + return m_handle != nullptr ? 
0 : -1; + } + + int add_meta_context(const char* name) { + return nbd_add_meta_context(m_handle.get(), name); + } + + int connect_uri(const char* uri) { + return nbd_connect_uri(m_handle.get(), uri); + } + + int64_t get_size() { + return nbd_get_size(m_handle.get()); + } + + int pread(void* buf, size_t count, uint64_t offset, uint32_t flags) { + return nbd_pread(m_handle.get(), buf, count, offset, flags); + } + + int block_status(uint64_t count, uint64_t offset, + nbd_extent_callback extent_callback, uint32_t flags) { + return nbd_block_status(m_handle.get(), count, offset, extent_callback, + flags); + } + + int shutdown(uint32_t flags) { + return nbd_shutdown(m_handle.get(), flags); + } + +private: + struct nbd_handle_deleter { + void operator()(nbd_handle* h) { + nbd_close(h); + } + }; + std::unique_ptr<nbd_handle, nbd_handle_deleter> m_handle; +}; + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::NBDStream::ReadRequest: " \ + << this << " " << __func__ << ": " + +template <typename I> +struct NBDStream<I>::ReadRequest { + NBDStream* nbd_stream; + io::Extents byte_extents; + bufferlist* data; + Context* on_finish; + size_t index = 0; + + ReadRequest(NBDStream* nbd_stream, io::Extents&& byte_extents, + bufferlist* data, Context* on_finish) + : nbd_stream(nbd_stream), byte_extents(std::move(byte_extents)), + data(data), on_finish(on_finish) { + auto cct = nbd_stream->m_cct; + ldout(cct, 20) << dendl; + } + + void send() { + data->clear(); + read(); + } + + void read() { + if (index >= byte_extents.size()) { + finish(0); + return; + } + + auto cct = nbd_stream->m_cct; + auto [byte_offset, byte_length] = byte_extents[index++]; + ldout(cct, 20) << "byte_offset=" << byte_offset << " byte_length=" + << byte_length << dendl; + + auto& nbd_client = nbd_stream->m_nbd_client; + auto ptr = buffer::ptr_node::create(buffer::create_small_page_aligned( + byte_length)); + int rc = nbd_client->pread(ptr->c_str(), byte_length, byte_offset, 0); + if (rc == -1) { + rc = nbd_client->get_errno(); + lderr(cct) << "pread " << byte_offset << "~" << byte_length << ": " + << nbd_client->get_error() << " (errno = " << rc << ")" + << dendl; + finish(from_nbd_errno(rc)); + return; + } + + data->push_back(std::move(ptr)); + boost::asio::post(nbd_stream->m_strand, [this] { read(); }); + } + + void finish(int r) { + auto cct = nbd_stream->m_cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + data->clear(); + } + + on_finish->complete(r); + delete this; + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::NBDStream::ListSparseExtentsRequest: " \ + << this << " " << __func__ << ": " + +template <typename I> +struct NBDStream<I>::ListSparseExtentsRequest { + NBDStream* nbd_stream; + io::Extents byte_extents; + io::SparseExtents* sparse_extents; + Context* on_finish; + size_t index = 0; + + ListSparseExtentsRequest(NBDStream* nbd_stream, io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, Context* on_finish) + : nbd_stream(nbd_stream), byte_extents(std::move(byte_extents)), + sparse_extents(sparse_extents), on_finish(on_finish) { + auto cct = nbd_stream->m_cct; + ldout(cct, 20) << dendl; + } + + void send() { + list_sparse_extents(); + } + + void list_sparse_extents() { + if (index >= byte_extents.size()) { + finish(0); + return; + } + + auto cct = nbd_stream->m_cct; + auto [byte_offset, byte_length] = byte_extents[index++]; + ldout(cct, 20) << "byte_offset=" << byte_offset << " byte_length=" + << byte_length << 
dendl; + + // nbd_block_status() is specified to be really loose: + // "The count parameter is a hint: the server may choose to + // return less status, or the final block may extend beyond the + // requested range. [...] It is possible for the extent function + // to be called more times than you expect [...] It is also + // possible that the extent function is not called at all, even + // for metadata contexts that you requested." + io::SparseExtents tmp_sparse_extents; + tmp_sparse_extents.insert(byte_offset, byte_length, + {io::SPARSE_EXTENT_STATE_DATA, byte_length}); + + auto& nbd_client = nbd_stream->m_nbd_client; + int rc = nbd_client->block_status(byte_length, byte_offset, + {extent_cb, &tmp_sparse_extents}, 0); + if (rc == -1) { + rc = nbd_client->get_errno(); + lderr(cct) << "block_status " << byte_offset << "~" << byte_length << ": " + << nbd_client->get_error() << " (errno = " << rc << ")" + << dendl; + // don't propagate errors -- we are set up to list any missing + // parts of the range as DATA if nbd_block_status() returns less + // status or none at all + } + + // trim the result in case more status was returned + sparse_extents->insert(tmp_sparse_extents.intersect(byte_offset, + byte_length)); + + boost::asio::post(nbd_stream->m_strand, [this] { list_sparse_extents(); }); + } + + void finish(int r) { + auto cct = nbd_stream->m_cct; + ldout(cct, 20) << "r=" << r << dendl; + + on_finish->complete(r); + delete this; + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::NBDStream: " \ + << this << " " << __func__ << ": " + +template <typename I> +NBDStream<I>::NBDStream(I* image_ctx, const json_spirit::mObject& json_object) + : m_cct(image_ctx->cct), m_asio_engine(image_ctx->asio_engine), + m_json_object(json_object), + m_strand(boost::asio::make_strand(*m_asio_engine)) { +} + +template <typename I> +NBDStream<I>::~NBDStream() { +} + +template <typename I> +void NBDStream<I>::open(Context* on_finish) { + std::string uri; + int rc; + + if (auto it = m_json_object.find(URI_KEY); + it != m_json_object.end()) { + if (it->second.type() == json_spirit::str_type) { + uri = it->second.get_str(); + } else { + lderr(m_cct) << "invalid URI" << dendl; + on_finish->complete(-EINVAL); + return; + } + } else { + lderr(m_cct) << "missing URI" << dendl; + on_finish->complete(-EINVAL); + return; + } + + ldout(m_cct, 10) << "uri=" << uri << dendl; + + m_nbd_client.reset(NBDClient<I>::create()); + rc = m_nbd_client->init(); + if (rc == -1) { + rc = m_nbd_client->get_errno(); + lderr(m_cct) << "init: " << m_nbd_client->get_error() + << " (errno = " << rc << ")" << dendl; + on_finish->complete(from_nbd_errno(rc)); + return; + } + + rc = m_nbd_client->add_meta_context(LIBNBD_CONTEXT_BASE_ALLOCATION); + if (rc == -1) { + rc = m_nbd_client->get_errno(); + lderr(m_cct) << "add_meta_context: " << m_nbd_client->get_error() + << " (errno = " << rc << ")" << dendl; + on_finish->complete(from_nbd_errno(rc)); + return; + } + + rc = m_nbd_client->connect_uri(uri.c_str()); + if (rc == -1) { + rc = m_nbd_client->get_errno(); + lderr(m_cct) << "connect_uri: " << m_nbd_client->get_error() + << " (errno = " << rc << ")" << dendl; + on_finish->complete(from_nbd_errno(rc)); + return; + } + + on_finish->complete(0); +} + +template <typename I> +void NBDStream<I>::close(Context* on_finish) { + ldout(m_cct, 20) << dendl; + + if (m_nbd_client != nullptr) { + // send a graceful shutdown to the server + // ignore errors -- we are read-only, also from the client's + // POV there is no 
disadvantage to abruptly closing the socket + // in nbd_close() + m_nbd_client->shutdown(0); + m_nbd_client.reset(); + } + + on_finish->complete(0); +} + +template <typename I> +void NBDStream<I>::get_size(uint64_t* size, Context* on_finish) { + ldout(m_cct, 20) << dendl; + + int64_t rc = m_nbd_client->get_size(); + if (rc == -1) { + rc = m_nbd_client->get_errno(); + lderr(m_cct) << "get_size: " << m_nbd_client->get_error() + << " (errno = " << rc << ")" << dendl; + on_finish->complete(from_nbd_errno(rc)); + return; + } + + *size = rc; + on_finish->complete(0); +} + +template <typename I> +void NBDStream<I>::read(io::Extents&& byte_extents, + bufferlist* data, + Context* on_finish) { + ldout(m_cct, 20) << byte_extents << dendl; + auto ctx = new ReadRequest(this, std::move(byte_extents), data, on_finish); + boost::asio::post(m_strand, [ctx] { ctx->send(); }); +} + +template <typename I> +void NBDStream<I>::list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) { + ldout(m_cct, 20) << byte_extents << dendl; + auto ctx = new ListSparseExtentsRequest(this, std::move(byte_extents), + sparse_extents, on_finish); + boost::asio::post(m_strand, [ctx] { ctx->send(); }); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::NBDStream<librbd::ImageCtx>; diff --git a/src/librbd/migration/NBDStream.h b/src/librbd/migration/NBDStream.h new file mode 100644 index 00000000000..aeced5d4f3d --- /dev/null +++ b/src/librbd/migration/NBDStream.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_NBD_STREAM_H +#define CEPH_LIBRBD_MIGRATION_NBD_STREAM_H + +#include "include/int_types.h" +#include "librbd/migration/StreamInterface.h" +#include <json_spirit/json_spirit.h> +#include <boost/asio/io_context.hpp> +#include <boost/asio/strand.hpp> + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template <typename> class NBDClient; + +template <typename ImageCtxT> +class NBDStream : public StreamInterface { +public: + static NBDStream* create(ImageCtxT* image_ctx, + const json_spirit::mObject& json_object) { + return new NBDStream(image_ctx, json_object); + } + + NBDStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object); + ~NBDStream() override; + + NBDStream(const NBDStream&) = delete; + NBDStream& operator=(const NBDStream&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_size(uint64_t* size, Context* on_finish) override; + + void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) override; + + void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) override; + +private: + CephContext* m_cct; + std::shared_ptr<AsioEngine> m_asio_engine; + json_spirit::mObject m_json_object; + boost::asio::strand<boost::asio::io_context::executor_type> m_strand; + + std::unique_ptr<NBDClient<ImageCtxT>> m_nbd_client; + + struct ReadRequest; + struct ListSparseExtentsRequest; +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::NBDStream<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MIGRATION_NBD_STREAM_H diff --git a/src/librbd/migration/RawSnapshot.cc b/src/librbd/migration/RawSnapshot.cc index f56d518c3e1..ce280f8f16e 100644 --- a/src/librbd/migration/RawSnapshot.cc +++ 
b/src/librbd/migration/RawSnapshot.cc @@ -205,13 +205,9 @@ void RawSnapshot<I>::list_snap(io::Extents&& image_extents, auto cct = m_image_ctx->cct; ldout(cct, 20) << "image_extents=" << image_extents << dendl; - // raw does support sparse extents so list the full IO extent as a delta - for (auto& [image_offset, image_length] : image_extents) { - sparse_extents->insert(image_offset, image_length, - {io::SPARSE_EXTENT_STATE_DATA, image_length}); - } - - on_finish->complete(0); + // raw directly maps the image-extent IO down to a byte IO extent + m_stream->list_sparse_extents(std::move(image_extents), sparse_extents, + on_finish); } } // namespace migration diff --git a/src/librbd/migration/S3Stream.cc b/src/librbd/migration/S3Stream.cc index a611e274ae1..b53e821991a 100644 --- a/src/librbd/migration/S3Stream.cc +++ b/src/librbd/migration/S3Stream.cc @@ -194,6 +194,18 @@ void S3Stream<I>::process_request(HttpRequest& http_request) { << "authorization=" << authorization << dendl; } +template <typename I> +void S3Stream<I>::list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) { + // no sparseness information -- list the full range as DATA + for (auto [byte_offset, byte_length] : byte_extents) { + sparse_extents->insert(byte_offset, byte_length, + {io::SPARSE_EXTENT_STATE_DATA, byte_length}); + } + on_finish->complete(0); +} + } // namespace migration } // namespace librbd diff --git a/src/librbd/migration/S3Stream.h b/src/librbd/migration/S3Stream.h index 586b217878c..1c2927ad1e1 100644 --- a/src/librbd/migration/S3Stream.h +++ b/src/librbd/migration/S3Stream.h @@ -46,6 +46,10 @@ public: void read(io::Extents&& byte_extents, bufferlist* data, Context* on_finish) override; + void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) override; + private: using HttpRequest = boost::beast::http::request< boost::beast::http::empty_body>; diff --git a/src/librbd/migration/SourceSpecBuilder.cc b/src/librbd/migration/SourceSpecBuilder.cc index f44d97d58f1..78937db99b3 100644 --- a/src/librbd/migration/SourceSpecBuilder.cc +++ b/src/librbd/migration/SourceSpecBuilder.cc @@ -7,6 +7,9 @@ #include "librbd/migration/FileStream.h" #include "librbd/migration/HttpStream.h" #include "librbd/migration/S3Stream.h" +#if defined(HAVE_LIBNBD) +#include "librbd/migration/NBDStream.h" +#endif #include "librbd/migration/NativeFormat.h" #include "librbd/migration/QCOWFormat.h" #include "librbd/migration/RawFormat.h" @@ -125,6 +128,10 @@ int SourceSpecBuilder<I>::build_stream( stream->reset(HttpStream<I>::create(m_image_ctx, stream_obj)); } else if (type == "s3") { stream->reset(S3Stream<I>::create(m_image_ctx, stream_obj)); +#if defined(HAVE_LIBNBD) + } else if (type == "nbd") { + stream->reset(NBDStream<I>::create(m_image_ctx, stream_obj)); +#endif } else { lderr(cct) << "unknown or unsupported stream type '" << type << "'" << dendl; diff --git a/src/librbd/migration/StreamInterface.h b/src/librbd/migration/StreamInterface.h index 782a9a5f8d5..52ded94ccad 100644 --- a/src/librbd/migration/StreamInterface.h +++ b/src/librbd/migration/StreamInterface.h @@ -24,6 +24,10 @@ struct StreamInterface { virtual void read(io::Extents&& byte_extents, bufferlist* data, Context* on_finish) = 0; + + virtual void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) = 0; }; } // namespace migration diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 89e2e5e4be9..f000da7928a 
100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -3461,16 +3461,23 @@ bool CDir::can_auth_pin(int *err_ret) const { int err; if (!is_auth()) { + dout(20) << __func__ << ": error - no auth" << dendl; err = ERR_NOT_AUTH; } else if (is_freezing_dir() || is_frozen_dir()) { + dout(20) << __func__ << ": error - fragmenting dir (" + << (is_freezing_dir() ? "freezing" : "frozen") + << ")" << dendl; err = ERR_FRAGMENTING_DIR; } else { auto p = is_freezing_or_frozen_tree(); if (p.first) { + dout(20) << __func__ << ": error - exporting tree" << dendl; err = ERR_EXPORTING_TREE; } else if (p.second) { + dout(20) << __func__ << ": error - exporting tree" << dendl; err = ERR_EXPORTING_TREE; } else { + dout(20) << __func__ << ": auth!" << dendl; err = 0; } } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 46aff12673e..faf9f408688 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2987,12 +2987,15 @@ void CInode::clear_ambiguous_auth() bool CInode::can_auth_pin(int *err_ret) const { int err; if (!is_auth()) { + dout(20) << __func__ << ": error - no auth" << dendl; err = ERR_NOT_AUTH; } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) { + dout(20) << __func__ << ": error - exporting inode" << dendl; err = ERR_EXPORTING_INODE; } else { if (parent) return parent->can_auth_pin(err_ret); + dout(20) << __func__ << ": auth!" << dendl; err = 0; } if (err && err_ret) @@ -5281,6 +5284,7 @@ void CInode::scrub_maybe_delete_info() { if (scrub_infop && !scrub_infop->scrub_in_progress && + !scrub_infop->uninline_in_progress && !scrub_infop->last_scrub_dirty) { scrub_infop.reset(); } @@ -5292,30 +5296,52 @@ void CInode::scrub_initialize(ScrubHeaderRef& header) scrub_info(); scrub_infop->scrub_in_progress = true; + scrub_infop->uninline_in_progress = false; scrub_infop->queued_frags.clear(); scrub_infop->header = header; header->inc_num_pending(); // right now we don't handle remote inodes } +void CInode::uninline_initialize() +{ + dout(20) << __func__ << " with scrub_version " << get_version() << dendl; + scrub_infop->uninline_in_progress = true; +} + void CInode::scrub_aborted() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); scrub_infop->scrub_in_progress = false; + scrub_infop->uninline_in_progress = false; scrub_infop->header->dec_num_pending(); scrub_maybe_delete_info(); } +void CInode::common_finished() { + if (!scrub_is_in_progress()) { + scrub_infop->last_scrub_version = get_version(); + scrub_infop->last_scrub_stamp = ceph_clock_now(); + scrub_infop->last_scrub_dirty = true; + scrub_infop->header->dec_num_pending(); + } +} + void CInode::scrub_finished() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->last_scrub_version = get_version(); - scrub_infop->last_scrub_stamp = ceph_clock_now(); - scrub_infop->last_scrub_dirty = true; scrub_infop->scrub_in_progress = false; - scrub_infop->header->dec_num_pending(); + common_finished(); +} + +void CInode::uninline_finished() { + dout(20) << __func__ << dendl; + ceph_assert(scrub_is_in_progress()); + + scrub_infop->uninline_in_progress = false; + common_finished(); } int64_t CInode::get_backtrace_pool() const diff --git a/src/mds/CInode.h b/src/mds/CInode.h index cf2322998e3..d55b6442107 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -305,6 +305,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state? 
bool scrub_in_progress = false; /// are we currently scrubbing? + bool uninline_in_progress = false; /// are we currently uninlining? fragset_t queued_frags; @@ -437,7 +438,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno } bool scrub_is_in_progress() const { - return (scrub_infop && scrub_infop->scrub_in_progress); + return (scrub_infop && (scrub_infop->scrub_in_progress || scrub_infop->uninline_in_progress)); } /** * Start scrubbing on this inode. That could be very short if it's @@ -448,6 +449,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno * directory's get_projected_version()) */ void scrub_initialize(ScrubHeaderRef& header); + void uninline_initialize(); /** * Call this once the scrub has been completed, whether it's a full * recursive scrub on a directory or simply the data on a file (or @@ -456,6 +458,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno * be complete()ed. */ void scrub_finished(); + void uninline_finished(); + void common_finished(); void scrub_aborted(); @@ -1058,6 +1062,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno MDSContext *fin); static void dump_validation_results(const validated_data& results, ceph::Formatter *f); + bool has_inline_data() { + if (is_normal() && is_file()) { + auto pin = get_projected_inode(); + if (pin->inline_data.version != CEPH_INLINE_NONE) { + return true; + } + } + return false; + } //bool hack_accessed = false; //utime_t hack_load_stamp; diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc index 2079d23333a..4b9cba199da 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -123,6 +123,41 @@ class BacktraceDamage : public DamageEntry f->close_section(); } }; + +/** + * Record about Uninline failures during scrub + */ +class UninlineDamage : public DamageEntry +{ + public: + inodeno_t ino; + mds_rank_t rank; + int32_t failure_errno; + std::string scrub_tag; + + UninlineDamage( + inodeno_t ino_, mds_rank_t rank_, int32_t errno_, std::string_view scrub_tag_) + : ino(ino_), rank(rank_), failure_errno(errno_), scrub_tag(scrub_tag_) + {} + + damage_entry_type_t get_type() const override + { + return DAMAGE_ENTRY_UNINLINE_FILE; + } + + void dump(Formatter *f) const override + { + f->open_object_section("uninline_damage"); + f->dump_string("damage_type", "uninline"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_int("rank", rank); + f->dump_string("errno", cpp_strerror(failure_errno)); + f->dump_string("scrub_tag", scrub_tag); + f->dump_string("path", path); + f->close_section(); + } +}; } DamageEntry::~DamageEntry() @@ -228,6 +263,27 @@ void DamageTable::remove_backtrace_damage_entry(inodeno_t ino) } } +bool DamageTable::notify_uninline_failed( + inodeno_t ino, + mds_rank_t rank, + int32_t failure_errno, + std::string_view scrub_tag, + std::string_view path) +{ + if (oversized()) { + return true; + } + + if (auto [it, inserted] = uninline_failures.try_emplace(ino); inserted) { + auto entry = std::make_shared<UninlineDamage>(ino, rank, errno, scrub_tag); + entry->path = path; + it->second = entry; + by_id[entry->id] = std::move(entry); + } + + return false; +} + bool DamageTable::oversized() const { return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries); @@ -293,6 +349,9 @@ void DamageTable::erase(damage_entry_id_t damage_id) } else if (type == DAMAGE_ENTRY_BACKTRACE) { auto backtrace_entry = std::static_pointer_cast<BacktraceDamage>(entry); 
remotes.erase(backtrace_entry->ino); + } else if (type == DAMAGE_ENTRY_UNINLINE_FILE) { + auto uninline_entry = std::static_pointer_cast<UninlineDamage>(entry); + uninline_failures.erase(uninline_entry->ino); } else { derr << "Invalid type " << type << dendl; ceph_abort(); diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h index a1b96fe2218..a8182491a9f 100644 --- a/src/mds/DamageTable.h +++ b/src/mds/DamageTable.h @@ -30,7 +30,8 @@ typedef enum { DAMAGE_ENTRY_DIRFRAG, DAMAGE_ENTRY_DENTRY, - DAMAGE_ENTRY_BACKTRACE + DAMAGE_ENTRY_BACKTRACE, + DAMAGE_ENTRY_UNINLINE_FILE } damage_entry_type_t; @@ -162,6 +163,16 @@ class DamageTable void remove_backtrace_damage_entry(inodeno_t ino); + /** + * Indicate that there was some error when attempting to unline data of + * the file. + * + * @return true if fatal + */ + bool notify_uninline_failed( + inodeno_t ino, mds_rank_t rank, int32_t failure_errno, + std::string_view scrub_tag, std::string_view path); + bool is_dentry_damaged( const CDir *dir_frag, std::string_view dname, @@ -194,6 +205,9 @@ class DamageTable // (i.e. have probably/possibly missing backtraces) std::map<inodeno_t, DamageEntryRef> remotes; + // Map of all inodes for which Data Uninlining failed + std::map<inodeno_t, DamageEntryRef> uninline_failures; + // All damage, by ID. This is a secondary index // to the dirfrag, dentry, remote maps. It exists // to enable external tools to unambiguously operate diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index b5e28140c32..c433c77b453 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9897,6 +9897,9 @@ void MDCache::dispatch_request(const MDRequestRef& mdr) case CEPH_MDS_OP_RDLOCK_FRAGSSTATS: rdlock_dirfrags_stats_work(mdr); break; + case CEPH_MDS_OP_UNINLINE_DATA: + uninline_data_work(mdr); + break; default: ceph_abort(); } @@ -13175,6 +13178,190 @@ void MDCache::enqueue_scrub_work(const MDRequestRef& mdr) mds->server->respond_to_request(mdr, r); } +class C_MDC_DataUninlinedSubmitted : public MDCacheLogContext { + MDRequestRef mdr; + + public: + C_MDC_DataUninlinedSubmitted(MDRequestRef r, MDSRank *mds) : + MDCacheLogContext(mds->mdcache), mdr(r) {} + + void finish(int r) { + auto mds = get_mds(); // to keep dout happy + auto in = mds->server->rdlock_path_pin_ref(mdr, true); + + ceph_assert(in != nullptr); + + dout(20) << "(uninline_data) log submission " + << (r ? 
"failed" : "succeeded") + << "; r=" << r + << " (" << cpp_strerror(r) << ") for " << *in << dendl; + + // journaling must not fail + ceph_assert(r == 0); + + in->mdcache->logger->inc(l_mdc_uninline_succeeded); + auto h = in->get_scrub_header(); + h->record_uninline_passed(); + in->uninline_finished(); + mdr->apply(); + mds->server->respond_to_request(mdr, r); + } +}; + +struct C_IO_DataUninlined : public MDSIOContext { + MDRequestRef mdr; + + public: + C_IO_DataUninlined(MDRequestRef r, MDSRank *mds) : MDSIOContext(mds), mdr(r) {} + + virtual void print(std::ostream& os) const { + os << "data uninlined"; + } + + void finish(int r) override { + auto mds = get_mds(); // to keep dout/derr happy + auto in = mds->server->rdlock_path_pin_ref(mdr, true); + + // return faster if operation has failed (non-zero) status + if (r) { + derr << "(uninline_data) mutation failed: r=" << r + << " (" << cpp_strerror(r) << ") for " << *in << dendl; + in->mdcache->logger->inc(l_mdc_uninline_write_failed); + ceph_assert(in->get_scrub_header()); + auto h = in->get_scrub_header(); + h->record_uninline_failed(); + std::string path; + in->make_path_string(path); + h->record_uninline_status(in->ino(), r, path); + in->uninline_finished(); + mds->server->respond_to_request(mdr, r); + return; + } + + dout(20) << "(uninline_data) mutation succeeded for " << *in << dendl; + + // journal the inode changes + MDLog *mdlog = mds->mdlog; + + dout(20) << "(uninline_data) writing to journal for " << *in << dendl; + + EUpdate *le = new EUpdate(mdlog, "uninline"); + mdr->ls = mdlog->get_current_segment(); + + auto pi = in->project_inode(mdr); + pi.inode->version = in->pre_dirty(); + pi.inode->inline_data.free_data(); + pi.inode->inline_data.version = CEPH_INLINE_NONE; + pi.inode->ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode->rstat.rctime) { + pi.inode->rstat.rctime = mdr->get_op_stamp(); + } + pi.inode->change_attr++; + + in->mdcache->predirty_journal_parents(mdr, &le->metablob, in, nullptr, + PREDIRTY_PRIMARY); + in->mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in); + + mdr->committing = true; + + string event_str("submit entry: "); + event_str += __func__; + mdr->mark_event(event_str); + + auto fin = new C_MDC_DataUninlinedSubmitted(mdr, mds); + mdlog->submit_entry(le, fin); + } +}; + +void MDCache::uninline_data_work(MDRequestRef mdr) +{ + CInode *in = mds->server->rdlock_path_pin_ref(mdr, true); + + if (!in) { + return; + } + + MutationImpl::LockOpVec lov; + lov.add_xlock(&in->authlock); + lov.add_xlock(&in->filelock); + lov.add_xlock(&in->versionlock); + + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(20) << "(uninline_data) acquire_locks failed; will retry later for " << *in << dendl; + return; // lock not available immediately + } + + if (!in->has_inline_data()) { + dout(20) << "(uninline_data) inode doesn't have inline data anymore " << *in << dendl; + in->uninline_finished(); + mds->server->respond_to_request(mdr, 0); + return; + } + if (MDS_INO_IS_MDSDIR(in->get_scrub_header()->get_origin())) { + in->get_scrub_header()->record_uninline_skipped(); + mds->server->respond_to_request(mdr, 0); + return; + } + + logger->inc(l_mdc_uninline_started); + auto h = in->get_scrub_header(); + h->record_uninline_started(); + in->uninline_initialize(); + + auto ino = [&]() { return in->ino(); }; + auto pi = in->get_projected_inode(); + auto objecter = mds->objecter; + + dout(20) << "(uninline_data) testing inline_data.version for " << *in << dendl; + ceph_assert(objecter); + 
ceph_assert(pi->inline_data.version != CEPH_INLINE_NONE); + + object_t oid = InodeStoreBase::get_object_name(ino(), frag_t(), ""); + SnapContext snapc; + SnapRealm *snaprealm = in->find_snaprealm(); + auto& snapc_ref = (snaprealm ? snaprealm->get_snap_context() : snapc); + + ObjectOperation create_ops; + create_ops.create(false); + + dout(20) << "(uninline_data) dispatching objecter to create \"" + << mdr->get_filepath() << "\" for " << *in << dendl; + + objecter->mutate(oid, + OSDMap::file_to_object_locator(pi->layout), + create_ops, + snapc_ref, + ceph::real_clock::now(), + 0, + nullptr); + + bufferlist inline_version_bl; + + in->encode(inline_version_bl, pi->inline_data.version); + + ObjectOperation uninline_ops; + uninline_ops.cmpxattr("inline_version", + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64, + inline_version_bl); + + if (pi->inline_data.length() > 0) { + dout(10) << "(uninline_data) moving inline data for \"" << mdr->get_filepath() << "\" to file for " << *in << dendl; + bufferlist inline_data; + pi->inline_data.get_data(inline_data); + uninline_ops.write(0, inline_data, pi->truncate_size, pi->truncate_seq); + } + uninline_ops.setxattr("inline_version", std::to_string(CEPH_INLINE_NONE)); + + objecter->mutate(oid, + OSDMap::file_to_object_locator(pi->layout), + uninline_ops, + snapc_ref, + ceph::real_clock::now(), + 0, + new C_IO_DataUninlined(mdr, mds)); +} + struct C_MDC_RespondInternalRequest : public MDCacheLogContext { MDRequestRef mdr; C_MDC_RespondInternalRequest(MDCache *c, const MDRequestRef& m) : @@ -13548,6 +13735,14 @@ void MDCache::register_perfcounters() pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats", "Internal Request type inode stats"); + // uninline op stats + pcb.add_u64_counter(l_mdc_uninline_started, "uninline_started", + "Internal Counter type uninline started"); + pcb.add_u64_counter(l_mdc_uninline_succeeded, "uninline_succeeded", + "Internal Counter type uninline succeeded"); + pcb.add_u64_counter(l_mdc_uninline_write_failed, "uninline_write_failed", + "Internal Counter type uninline write failed"); + logger.reset(pcb.create_perf_counters()); g_ceph_context->get_perfcounters_collection()->add(logger.get()); recovery_queue.set_logger(logger.get()); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 8ddadcd716a..0b01c9ab859 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -118,6 +118,10 @@ enum { l_mdss_ireq_fragstats, l_mdss_ireq_inodestats, + l_mdc_uninline_started, + l_mdc_uninline_succeeded, + l_mdc_uninline_write_failed, + l_mdc_last, }; @@ -1094,6 +1098,8 @@ private: void repair_dirfrag_stats(CDir *dir); void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin); + void uninline_data_work(MDRequestRef mdr); + // my leader MDSRank *mds; @@ -1442,6 +1448,8 @@ private: friend class C_MDC_FragmentCommit; friend class C_MDC_FragmentRollback; friend class C_IO_MDC_FragmentPurgeOld; + friend class C_IO_DataUninlined; + friend class C_MDC_DataUninlinedSubmitted; // -- subtrees -- static const unsigned int SUBTREES_COUNT_THRESHOLD = 5; diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index b31d9c95220..75b608ace77 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -334,6 +334,11 @@ void MDSDaemon::set_up_admin_socket() asok_hook, "Status of scrub operations(s)"); ceph_assert(r == 0); + r = admin_socket->register_command("scrub purge_status " + "name=tag,type=CephString,req=true", + asok_hook, + "Purge status of scrub tag|all"); + ceph_assert(r == 0); r = admin_socket->register_command("tag path 
name=path,type=CephString" " name=tag,type=CephString", asok_hook, diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 1cd742423e6..50c7175df80 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -782,8 +782,10 @@ void MDSRankDispatcher::tick() } } - if (whoami == 0) + if (whoami == 0) { scrubstack->advance_scrub_status(); + scrubstack->purge_old_scrub_counters(); + } } if (is_active() || is_stopping()) { @@ -2965,6 +2967,15 @@ void MDSRankDispatcher::handle_asok_command( command_scrub_resume(f); } else if (command == "scrub status") { command_scrub_status(f); + } else if (command == "scrub purge_status") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -CEPHFS_EXDEV; + goto out; + } + string tag; + cmd_getval(cmdmap, "tag", tag); + command_scrub_purge_status(tag); } else if (command == "tag path") { if (whoami != 0) { *css << "Not rank 0"; @@ -3225,6 +3236,11 @@ void MDSRank::command_scrub_status(Formatter *f) { scrubstack->scrub_status(f); } +void MDSRank::command_scrub_purge_status(std::string_view tag) { + std::lock_guard l(mds_lock); + scrubstack->purge_scrub_counters(tag); +} + void MDSRank::command_get_subtrees(Formatter *f) { ceph_assert(f != NULL); @@ -4233,6 +4249,7 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s mdcache->handle_conf_change(changed, *mdsmap); mdlog->handle_conf_change(changed, *mdsmap); purge_queue.handle_conf_change(changed, *mdsmap); + scrubstack->handle_conf_change(changed); })); } diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index c4a8809b6e1..9135db40c09 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -509,6 +509,7 @@ class MDSRank { // scrub control commands void command_scrub_resume(Formatter *f); void command_scrub_status(Formatter *f); + void command_scrub_purge_status(std::string_view tag); void command_get_subtrees(Formatter *f); void command_export_dir(Formatter *f, diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h index a5d35f61ce4..5f112387fec 100644 --- a/src/mds/ScrubHeader.h +++ b/src/mds/ScrubHeader.h @@ -27,6 +27,7 @@ class Formatter; }; class CInode; +class MDCache; /** * Externally input parameters for a scrub, associated with the root @@ -64,6 +65,48 @@ public: } unsigned get_num_pending() const { return num_pending; } + void record_uninline_status(_inodeno_t ino, int e, std::string_view path) { + if (uninline_failed_info.find(e) == uninline_failed_info.end()) { + uninline_failed_info[e] = std::vector<_inodeno_t>(); + } + auto& v = uninline_failed_info.at(e); + v.push_back(ino); + paths[ino] = path; + } + + std::unordered_map<int, std::vector<_inodeno_t>>& get_uninline_failed_info() { + return uninline_failed_info; + } + + std::unordered_map<_inodeno_t, std::string>& get_paths() { + return paths; + } + + void record_uninline_started() { + uninline_started++; + } + void record_uninline_passed() { + uninline_passed++; + } + void record_uninline_failed() { + uninline_failed++; + } + void record_uninline_skipped() { + uninline_skipped++; + } + uint64_t get_uninline_started() const { + return uninline_started; + } + uint64_t get_uninline_passed() const { + return uninline_passed; + } + uint64_t get_uninline_failed() const { + return uninline_failed; + } + uint64_t get_uninline_skipped() const { + return uninline_skipped; + } + protected: const std::string tag; bool is_tag_internal; @@ -76,6 +119,15 @@ protected: bool repaired = false; // May be set during scrub if repairs happened unsigned epoch_last_forwarded = 0; unsigned num_pending = 0; + // errno -> [ino1, ino2, 
ino3, ...] + std::unordered_map<int, std::vector<_inodeno_t>> uninline_failed_info; + std::unordered_map<_inodeno_t, std::string> paths; + + // scrub counters + uint64_t uninline_started = 0; + uint64_t uninline_passed = 0; + uint64_t uninline_failed = 0; + uint64_t uninline_skipped = 0; }; typedef std::shared_ptr<ScrubHeader> ScrubHeaderRef; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 28392f53366..7ec77a31de3 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -17,6 +17,7 @@ #include "mds/MDSRank.h" #include "mds/MDCache.h" #include "mds/MDSContinuation.h" +#include "osdc/Objecter.h" #define dout_context g_ceph_context #define dout_subsys ceph_subsys_mds @@ -75,6 +76,7 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl; in->scrub_initialize(header); + in->uninline_initialize(); } else if (CDir *dir = dynamic_cast<CDir*>(obj)) { if (dir->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl; @@ -106,6 +108,55 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) return 0; } +void ScrubStack::purge_scrub_counters(std::string_view tag) +{ + for (auto& stat : mds_scrub_stats) { + if (tag == "all") { + stat.counters.clear(); + } else { + auto it = stat.counters.find(std::string(tag)); + if (it != stat.counters.end()) { + stat.counters.erase(it); + } + } + } +} + +// called from tick +void ScrubStack::purge_old_scrub_counters() +{ + // "mds_scrub_stats_review_period" must be in number of days + auto review_period = ceph::make_timespan(_mds_scrub_stats_review_period * 24 * 60 * 60); + auto now = coarse_real_clock::now(); + + dout(20) << __func__ << " review_period:" << review_period << dendl; + + for (mds_rank_t rank = 0; rank < (mds_rank_t)mds_scrub_stats.size(); rank++) { + auto& counters = mds_scrub_stats[rank].counters; + for (auto it = counters.begin(); it != counters.end(); ) { + auto curr = it; + auto c = (*it).second; + auto elapsed = now - c.start_time; + dout(20) << __func__ + << " rank(" << rank << ") :" + << " elapsed:" << elapsed + << dendl; + ++it; + if (elapsed >= review_period) { + counters.erase(curr); + } + } + } +} + +void ScrubStack::init_scrub_counters(std::string_view path, std::string_view tag) +{ + scrub_counters_t sc{coarse_real_clock::now(), std::string(path), 0, 0, 0}; + for (auto& stat : mds_scrub_stats) { + stat.counters[std::string(tag)] = sc; + } +} + int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) { // abort in progress @@ -133,6 +184,10 @@ int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) //to make sure mdsdir is always on the top top = false; } + + std::string path; + in->make_path_string(path); + init_scrub_counters(path, header->get_tag()); int r = _enqueue(in, header, top); if (r < 0) return r; @@ -227,6 +282,7 @@ void ScrubStack::kick_off_scrubs() // it's a regular file, symlink, or hard link dequeue(in); // we only touch it this once, so remove from stack + uninline_data(in, new C_MDSInternalNoop); scrub_file_inode(in); } else { bool added_children = false; @@ -235,6 +291,7 @@ void ScrubStack::kick_off_scrubs() if (done) { dout(20) << __func__ << " dir inode, done" << dendl; dequeue(in); + in->uninline_finished(); } if (added_children) { // dirfrags were queued at top of stack @@ -707,6 +764,43 @@ void ScrubStack::scrub_status(Formatter *f) { f->close_section(); // scrub id } 
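  // The "scrub_stats" section emitted a little further below sums the
  // per-rank uninline counters for every scrub tag; the resulting
  // "scrub status" admin command output roughly takes this shape
  // (illustrative only):
  //
  //   "scrub_stats": {
  //     "<tag>": {
  //       "start_time": "<timestamp>",
  //       "path": "/",
  //       "uninline_started": 0,
  //       "uninline_passed": 0,
  //       "uninline_failed": 0,
  //       "uninline_skipped": 0
  //     }
  //   }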
f->close_section(); // scrubs + + if (mds_scrub_stats.size()) { + f->open_object_section("scrub_stats"); + for (auto& [tag, ctrs] : mds_scrub_stats[0].counters) { + uint64_t started = 0; + uint64_t passed = 0; + uint64_t failed = 0; + uint64_t skipped = 0; + for (auto& stats : mds_scrub_stats) { + if (auto it = stats.counters.find(tag); it != stats.counters.end()) { + auto& [t, c] = *it; + started += c.uninline_started; + passed += c.uninline_passed; + failed += c.uninline_failed; + skipped += c.uninline_skipped; + } + } + f->open_object_section(tag); + { + f->dump_stream("start_time") << ctrs.start_time; + std::string path = ctrs.origin_path; + if (path == "") { + path = "/"; + } else if (path.starts_with("~mds")) { + path = "~mdsdir"; + } + f->dump_string("path", path); + f->dump_int("uninline_started", started); + f->dump_int("uninline_passed", passed); + f->dump_int("uninline_failed", failed); + f->dump_int("uninline_skipped", skipped); + } + f->close_section(); // tag + } + f->close_section(); // scrub_stats + } + f->close_section(); // result } @@ -936,6 +1030,7 @@ void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m) header->set_origin(m->get_origin()); scrubbing_map.emplace(header->get_tag(), header); } + for (auto dir : dfs) { queued.insert_raw(dir->get_frag()); _enqueue(dir, header, true); @@ -1016,6 +1111,7 @@ void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m) const auto& header = in->get_scrub_header(); header->set_epoch_last_forwarded(scrub_epoch); in->scrub_finished(); + in->uninline_finished(); kick_off_scrubs(); } @@ -1052,6 +1148,10 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m) bool any_finished = false; bool any_repaired = false; std::set<std::string> scrubbing_tags; + std::unordered_map<std::string, unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info; + std::unordered_map<_inodeno_t, std::string> paths; + std::unordered_map<std::string, std::vector<uint64_t>> counters; + for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) { auto& header = it->second; if (header->get_num_pending() || @@ -1062,6 +1162,17 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m) any_finished = true; if (header->get_repaired()) any_repaired = true; + auto& ufi = header->get_uninline_failed_info(); + uninline_failed_meta_info[it->first] = ufi; + ufi.clear(); + paths.merge(header->get_paths()); + ceph_assert(header->get_paths().size() == 0); + std::vector<uint64_t> c{header->get_uninline_started(), + header->get_uninline_passed(), + header->get_uninline_failed(), + header->get_uninline_skipped() + }; + counters[header->get_tag()] = c; scrubbing_map.erase(it++); } else { ++it; @@ -1071,7 +1182,11 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m) scrub_epoch = m->get_epoch(); auto ack = make_message<MMDSScrubStats>(scrub_epoch, - std::move(scrubbing_tags), clear_stack); + std::move(scrubbing_tags), + std::move(uninline_failed_meta_info), + std::move(paths), + std::move(counters), + clear_stack); mdcache->mds->send_message_mds(ack, 0); if (any_finished) @@ -1085,7 +1200,40 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m) stat.epoch_acked = m->get_epoch(); stat.scrubbing_tags = m->get_scrubbing_tags(); stat.aborting = m->is_aborting(); + for (auto& [scrub_tag, errno_map] : m->get_uninline_failed_meta_info()) { + stat.uninline_failed_meta_info[scrub_tag] = errno_map; + } + stat.paths.insert(m->get_paths().begin(), m->get_paths().end());; + for (auto& [tag, v] : 
m->get_counters()) { + stat.counters[tag].uninline_started = v[0]; + stat.counters[tag].uninline_passed = v[1]; + stat.counters[tag].uninline_failed = v[2]; + stat.counters[tag].uninline_skipped = v[3]; + } + } + } +} + +void ScrubStack::move_uninline_failures_to_damage_table() +{ + auto mds = mdcache->mds; + + for (mds_rank_t rank = 0; rank < (mds_rank_t)mds_scrub_stats.size(); rank++) { + auto& ufmi = mds_scrub_stats[rank].uninline_failed_meta_info; + auto& paths = mds_scrub_stats[rank].paths; + + for (const auto& [scrub_tag, errno_ino_vec_map] : ufmi) { + for (const auto& [errno_, ino_vec] : errno_ino_vec_map) { + for (auto ino : ino_vec) { + mds->damage_table.notify_uninline_failed(ino, rank, errno_, scrub_tag, paths[ino]); + } + } } + ufmi.clear(); + paths.clear(); + // do not clear the counters map; we'll clear them later: + // - on user request or + // - after a grace period } } @@ -1152,6 +1300,18 @@ void ScrubStack::advance_scrub_status() any_finished = true; if (header->get_repaired()) any_repaired = true; + auto& ufmi = mds_scrub_stats[0].uninline_failed_meta_info; + ufmi[it->first] = header->get_uninline_failed_info(); + mds_scrub_stats[0].paths.merge(header->get_paths()); + move_uninline_failures_to_damage_table(); + + auto& c = mds_scrub_stats[0].counters; + auto& sc = c[header->get_tag()]; + sc.uninline_started = header->get_uninline_started(); + sc.uninline_passed = header->get_uninline_passed(); + sc.uninline_failed = header->get_uninline_failed(); + sc.uninline_skipped = header->get_uninline_skipped(); + scrubbing_map.erase(it++); } else { ++it; @@ -1159,7 +1319,6 @@ void ScrubStack::advance_scrub_status() } ++scrub_epoch; - for (auto& r : up_mds) { if (r == 0) continue; @@ -1197,3 +1356,23 @@ void ScrubStack::handle_mds_failure(mds_rank_t mds) if (kick) kick_off_scrubs(); } + +void ScrubStack::uninline_data(CInode *in, Context *fin) +{ + dout(10) << "(uninline_data) starting data uninlining for " << *in << dendl; + + MDRequestRef mdr = in->mdcache->request_start_internal(CEPH_MDS_OP_UNINLINE_DATA); + mdr->set_filepath(filepath(in->ino())); + mdr->snapid = CEPH_NOSNAP; + mdr->no_early_reply = true; + mdr->internal_op_finish = fin; + + in->mdcache->dispatch_request(mdr); +} + +void ScrubStack::handle_conf_change(const std::set<std::string>& changed) +{ + if (changed.count("mds_scrub_stats_review_period")) { + _mds_scrub_stats_review_period = g_conf().get_val<uint64_t>("mds_scrub_stats_review_period"); + } +} diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index 756ebd9cb0e..4d0a0570d35 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -22,6 +22,8 @@ #include "ScrubHeader.h" #include "common/LogClient.h" +#include "common/Cond.h" +#include "common/ceph_time.h" #include "include/elist.h" #include "messages/MMDSScrub.h" #include "messages/MMDSScrubStats.h" @@ -36,7 +38,9 @@ public: clog(clog), finisher(finisher_), scrub_stack(member_offset(MDSCacheObject, item_scrub)), - scrub_waiting(member_offset(MDSCacheObject, item_scrub)) {} + scrub_waiting(member_offset(MDSCacheObject, item_scrub)) { + _mds_scrub_stats_review_period = g_conf().get_val<uint64_t>("mds_scrub_stats_review_period"); + } ~ScrubStack() { ceph_assert(scrub_stack.empty()); ceph_assert(!scrubs_in_progress); @@ -103,6 +107,14 @@ public: bool remove_inode_if_stacked(CInode *in); + void move_uninline_failures_to_damage_table(); + + void init_scrub_counters(std::string_view path, std::string_view tag); + void purge_scrub_counters(std::string_view tag); + void purge_old_scrub_counters(); // on 
tick + void handle_conf_change(const std::set<std::string>& changed); + + MDCache *mdcache; protected: @@ -132,10 +144,21 @@ protected: // check if any mds is aborting scrub after mds.0 starts bool scrub_any_peer_aborting = true; + struct scrub_counters_t { + ceph::coarse_real_clock::time_point start_time = coarse_real_clock::now(); + std::string origin_path; + uint64_t uninline_started = 0; + uint64_t uninline_passed = 0; + uint64_t uninline_failed = 0; + uint64_t uninline_skipped = 0; + }; struct scrub_stat_t { unsigned epoch_acked = 0; std::set<std::string> scrubbing_tags; bool aborting = false; + std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info; + std::unordered_map<_inodeno_t, std::string> paths; + std::unordered_map<std::string, scrub_counters_t> counters; // map(scrub_tag -> counters) }; std::vector<scrub_stat_t> mds_scrub_stats; @@ -154,6 +177,9 @@ private: friend std::ostream &operator<<(std::ostream &os, const State &state); friend class C_InodeValidated; + friend class C_IO_DataUninlined; + friend class C_MDC_DataUninlinedSubmitted; + friend class MDCache; int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top); /** @@ -267,6 +293,7 @@ private: void handle_scrub(const cref_t<MMDSScrub> &m); void handle_scrub_stats(const cref_t<MMDSScrubStats> &m); + void uninline_data(CInode *in, Context *fin); State state = STATE_IDLE; bool clear_stack = false; @@ -274,6 +301,8 @@ private: // list of pending context completions for asynchronous scrub // control operations. std::vector<Context *> control_ctxs; + + uint64_t _mds_scrub_stats_review_period = 1; // 1 day }; #endif /* SCRUBSTACK_H_ */ diff --git a/src/messages/MMDSScrubStats.h b/src/messages/MMDSScrubStats.h index 2cbb7f2f211..9252f99ac8a 100644 --- a/src/messages/MMDSScrubStats.h +++ b/src/messages/MMDSScrubStats.h @@ -18,7 +18,7 @@ #include "messages/MMDSOp.h" class MMDSScrubStats : public MMDSOp { - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; public: @@ -38,6 +38,15 @@ public: bool is_finished(const std::string& tag) const { return update_scrubbing && !scrubbing_tags.count(tag); } + const std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>& get_uninline_failed_meta_info() const { + return uninline_failed_meta_info; + } + const std::unordered_map<_inodeno_t, std::string>& get_paths() const { + return paths; + } + const std::unordered_map<std::string, std::vector<uint64_t>>& get_counters() const { + return counters; + } void encode_payload(uint64_t features) override { using ceph::encode; @@ -45,6 +54,7 @@ public: encode(scrubbing_tags, payload); encode(update_scrubbing, payload); encode(aborting, payload); + encode_uninline_failed_info(); } void decode_payload() override { using ceph::decode; @@ -53,6 +63,90 @@ public: decode(scrubbing_tags, p); decode(update_scrubbing, p); decode(aborting, p); + if (header.version >= 2) { + decode_uninline_failed_info(p); + } + } + + void encode_uninline_failed_info() { + using ceph::encode; + int count = (int)uninline_failed_meta_info.size(); + encode(count, payload); + for (const auto& [tag, meta_info_map] : uninline_failed_meta_info) { + encode(tag, payload); + count = (int)meta_info_map.size(); + encode(count, payload); + for (const auto& [error_code, ino_vec] : meta_info_map) { + encode(error_code, payload); + encode(ino_vec, payload); + } + } + count = (int)paths.size(); + encode(count, payload); + for (auto& [ino, 
path] : paths) { + encode(ino, payload); + encode(path, payload); + } + count = (int)counters.size(); + encode(count, payload); + for (auto& [tag, v] : counters) { + encode(tag, payload); + uint64_t started = v[0]; + uint64_t passed = v[1]; + uint64_t failed = v[2]; + uint64_t skipped = v[3]; + + encode(started, payload); + encode(passed, payload); + encode(failed, payload); + encode(skipped, payload); + } + } + void decode_uninline_failed_info(ceph::bufferlist::const_iterator& p) { + using ceph::decode; + int tag_count = 0; + decode(tag_count, p); + while (tag_count--) { + std::string tag; + decode(tag, p); + int count = 0; + decode(count, p); + std::unordered_map<int, std::vector<_inodeno_t>> uninline_failed_info; + while (count--) { + int error_code; + std::vector<_inodeno_t> ino_vec; + decode(error_code, p); + decode(ino_vec, p); + uninline_failed_info[error_code] = std::move(ino_vec); + } + uninline_failed_meta_info[tag] = std::move(uninline_failed_info); + } + int count = 0; + decode(count, p); + while (count--) { + _inodeno_t ino; + std::string path; + decode(ino, p); + decode(path, p); + paths[ino] = path; + } + count = 0; + decode(count, p); + while (count--) { + std::string tag; + decode(tag, p); + uint64_t started = 0; + uint64_t passed = 0; + uint64_t failed = 0; + uint64_t skipped = 0; + + decode(started, p); + decode(passed, p); + decode(failed, p); + decode(skipped, p); + std::vector<uint64_t> c{started, passed, failed, skipped}; + counters[tag] = c; + } } protected: @@ -65,6 +159,15 @@ protected: MMDSScrubStats(unsigned e, const std::set<std::string>& tags, bool abrt=false) : MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt) {} + MMDSScrubStats(unsigned e, const std::set<std::string>& tags, + std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>&& ufmi, + std::unordered_map<_inodeno_t, std::string>&& paths_, + std::unordered_map<std::string, std::vector<uint64_t>>&& counters_, + bool abrt = false) : + MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), + epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt), + uninline_failed_meta_info(std::move(ufmi)), paths(std::move(paths_)), + counters(std::move(counters_)) {} ~MMDSScrubStats() override {} private: @@ -72,6 +175,10 @@ private: std::set<std::string> scrubbing_tags; bool update_scrubbing = false; bool aborting = false; + // <tag, <error_code, [ino1, ino2, ...]>> + std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info; + std::unordered_map<_inodeno_t, std::string> paths; + std::unordered_map<std::string, std::vector<uint64_t>> counters; template<class T, typename... Args> friend boost::intrusive_ptr<T> ceph::make_message(Args&&... 
args); diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index b20060c68fe..7d886344244 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -37,9 +37,15 @@ void NVMeofGwMap::to_gmap( for (const auto& gw_created_pair: gw_created_map) { const auto& gw_id = gw_created_pair.first; const auto& gw_created = gw_created_pair.second; + gw_availability_t availability = gw_created.availability; + // Gateways expect to see UNAVAILABLE, not DELETING + // for entries in DELETING state + if (gw_created.availability == gw_availability_t::GW_DELETING) { + availability = gw_availability_t::GW_UNAVAILABLE; + } auto gw_state = NvmeGwClientState( - gw_created.ana_grp_id, epoch, gw_created.availability); + gw_created.ana_grp_id, epoch, availability); for (const auto& sub: gw_created.subsystems) { gw_state.subsystems.insert({ sub.nqn, @@ -78,10 +84,36 @@ int NVMeofGwMap::cfg_add_gw( for (auto& itr: created_gws[group_key]) { allocated.insert(itr.second.ana_grp_id); if (itr.first == gw_id) { - dout(1) << __func__ << " ERROR create GW: already exists in map " - << gw_id << dendl; - return -EEXIST ; + if (itr.second.availability != gw_availability_t::GW_DELETING) { + dout(1) << __func__ << " ERROR create GW: already exists in map " + << gw_id << dendl; + return -EEXIST; + } else { + //this GW exists in the map in "Deleting" state + // but user again creates it - need just set attribute values + created_gws[group_key][gw_id].performed_full_startup = true; + created_gws[group_key][gw_id].availability + = gw_availability_t::GW_CREATED; + dout(4) << "GW in Deleting state " << gw_id + << " was created again" << dendl; + return 0; + } } + if (itr.second.availability == gw_availability_t::GW_DELETING) { + //Was found some GW in "Deleting" state. Just to inherit its ANA group + NvmeGwMonState & gw_created = created_gws[group_key][itr.first]; + created_gws[group_key][gw_id] = gw_created; + // Deep copy of all data of "Deleting" GW + created_gws[group_key][gw_id].performed_full_startup = true; + created_gws[group_key][gw_id].availability + = gw_availability_t::GW_CREATED; + dout(4) << "Created GW inherits ANA group of deleting GW-id :" + << itr.first << " group " << itr.second.ana_grp_id << dendl; + do_erase_gw_id(itr.first, group_key); + dout(4) << "Created GWS after create/delete: " + << created_gws << dendl; + return 0; + } } if (allocated.size() == MAX_SUPPORTED_ANA_GROUPS) { dout(4) << "Warning: cannot add GW " << gw_id @@ -125,7 +157,40 @@ int NVMeofGwMap::cfg_add_gw( int NVMeofGwMap::cfg_delete_gw( const NvmeGwId &gw_id, const NvmeGroupKey& group_key) { - int rc = 0; + if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHA)) { + dout(10) << " has NVMEOFHA: 1" << dendl; + for (auto& gws_states: created_gws[group_key]) { + if (gws_states.first == gw_id) { + auto& state = gws_states.second; + state.availability = gw_availability_t::GW_DELETING; + dout(4) << " Deleting GW :"<< gw_id << " in state " + << state.availability << " Resulting GW availability: " + << state.availability << dendl; + return 0; + } + } + } else { + return do_delete_gw(gw_id, group_key); + } + return -EINVAL; +} + +int NVMeofGwMap::do_erase_gw_id(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key) { + + fsm_timers[group_key].erase(gw_id); + if (fsm_timers[group_key].size() == 0) + fsm_timers.erase(group_key); + + created_gws[group_key].erase(gw_id); + if (created_gws[group_key].size() == 0) + created_gws.erase(group_key); + return 0; +} + +int NVMeofGwMap::do_delete_gw( + const NvmeGwId &gw_id, const NvmeGroupKey& 
group_key) +{ for (auto& gws_states: created_gws[group_key]) { if (gws_states.first == gw_id) { @@ -136,26 +201,52 @@ int NVMeofGwMap::cfg_delete_gw( gw_id, group_key,state_itr.second , state_itr.first, modified); } dout(10) << " Delete GW :"<< gw_id << " ANA grpid: " - << state.ana_grp_id << dendl; + << state.ana_grp_id << dendl; for (auto& itr: created_gws[group_key]) { // Update state map and other maps remove_grp_id(itr.first, group_key, state.ana_grp_id); // of all created gateways. Removed key = anagrp } - fsm_timers[group_key].erase(gw_id); - if (fsm_timers[group_key].size() == 0) - fsm_timers.erase(group_key); - - created_gws[group_key].erase(gw_id); - if (created_gws[group_key].size() == 0) - created_gws.erase(group_key); - return rc; + return do_erase_gw_id(gw_id, group_key); } } return -EINVAL; } +int NVMeofGwMap::get_num_namespaces(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, const BeaconSubsystems& subs) +{ + auto grpid = created_gws[group_key][gw_id].ana_grp_id ; + int num_ns = 0; + for (auto & subs_it:subs) { + for (auto & ns :subs_it.namespaces) { + if (ns.anagrpid == (grpid+1)) { + num_ns++; + } + } + } + return num_ns; +} + +void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key, + const BeaconSubsystems& subs, bool &propose_pending) +{ + propose_pending = false; + for (auto& itr: created_gws[group_key]) { + auto &gw_id = itr.first; + if (itr.second.availability == gw_availability_t::GW_DELETING) { + int num_ns = 0; + if ( (num_ns = get_num_namespaces(gw_id, group_key, subs)) == 0) { + do_delete_gw(gw_id, group_key); + propose_pending = true; + } + dout(4) << " to delete ? " << gw_id << " num_ns " << num_ns << dendl; + break; // handle just one GW in "Deleting" state in time. + } + } +} + int NVMeofGwMap::process_gw_map_gw_down( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) { @@ -192,17 +283,8 @@ void NVMeofGwMap::process_gw_map_ka( dout(20) << "KA beacon from the GW " << gw_id << " in state " << (int)st.availability << dendl; - if (st.availability == gw_availability_t::GW_CREATED) { - // first time appears - allow IO traffic for this GW - st.availability = gw_availability_t::GW_AVAILABLE; - for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { - state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE; - } - if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW - st.active_state(st.ana_grp_id); - } - propose_pending = true; - } else if (st.availability == gw_availability_t::GW_UNAVAILABLE) { + if (st.availability == gw_availability_t::GW_CREATED || + st.availability == gw_availability_t::GW_UNAVAILABLE) { st.availability = gw_availability_t::GW_AVAILABLE; if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) { for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { @@ -237,7 +319,9 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose) // 1. Failover missed : is there is a GW in unavailable state? // if yes, is its ANA group handled by some other GW? 
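The GW_DELETING flow added above is easier to follow as a small state model: with the NVMEOFHA feature present, a gateway delete request no longer erases the gateway right away; cfg_delete_gw() only parks it in GW_DELETING, to_gmap() keeps reporting it to clients as UNAVAILABLE, and track_deleting_gws() erases the entry (at most one per pass) once the gateway's ANA group no longer owns any namespaces. What follows is a minimal Python sketch of that lifecycle; the class and attribute names are hypothetical stand-ins, not the monitor's C++ types.

from enum import Enum, auto

class Avail(Enum):
    CREATED = auto()
    AVAILABLE = auto()
    UNAVAILABLE = auto()
    DELETING = auto()

class Gw:
    def __init__(self, ana_grp_id):
        self.ana_grp_id = ana_grp_id
        self.availability = Avail.AVAILABLE

class GwMap:
    """Toy stand-in for created_gws[group_key]."""
    def __init__(self):
        self.gws = {}        # gw_id -> Gw
        self.ns_count = {}   # ana_grp_id -> namespaces still owned by that group

    def cfg_delete_gw(self, gw_id):
        # mark for deletion instead of erasing right away
        self.gws[gw_id].availability = Avail.DELETING

    def client_view(self, gw_id):
        # gateways are reported as UNAVAILABLE while in DELETING state
        a = self.gws[gw_id].availability
        return Avail.UNAVAILABLE if a is Avail.DELETING else a

    def track_deleting_gws(self):
        # erase at most one DELETING gateway per pass, and only once its
        # ANA group no longer owns any namespaces
        for gw_id, gw in list(self.gws.items()):
            if gw.availability is Avail.DELETING:
                if self.ns_count.get(gw.ana_grp_id, 0) == 0:
                    del self.gws[gw_id]
                    return True   # map changed -> propose pending
                break
        return False

m = GwMap()
m.gws['gw1'] = Gw(ana_grp_id=0)
m.ns_count[0] = 2
m.cfg_delete_gw('gw1')
assert m.client_view('gw1') is Avail.UNAVAILABLE
assert not m.track_deleting_gws()          # still owns namespaces
m.ns_count[0] = 0
assert m.track_deleting_gws() and 'gw1' not in m.gws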
- if (state.availability == gw_availability_t::GW_UNAVAILABLE && + if ((state.availability == gw_availability_t::GW_UNAVAILABLE || + state.availability == gw_availability_t::GW_DELETING || + state.availability == gw_availability_t::GW_CREATED) && state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { auto found_gw_for_ana_group = false; for (auto& gw_state2 : gws_states) { @@ -251,7 +335,7 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose) } // choose the GW for handle ana group if (found_gw_for_ana_group == false) { - dout(10) << "Was not found the GW " << " that handles ANA grp " + dout(20) << "Could not find a GW that handles ANA grp " << (int)state.ana_grp_id << " find candidate "<< dendl; for (auto& state_itr: created_gws[group_key][gw_id].sm_state) { find_failover_candidate(gw_id, group_key, state_itr.first, propose); @@ -277,14 +361,23 @@ void NVMeofGwMap::set_failover_gw_for_ANA_group( const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid) { NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; + NvmeGwMonState& failed_gw_state = created_gws[group_key][failed_gw_id]; epoch_t epoch; dout(10) << "Found failover GW " << gw_id << " for ANA group " << (int)ANA_groupid << dendl; + if (failed_gw_state.availability == gw_availability_t::GW_CREATED) { + dout(10) << "Failover GW " << gw_id << + " takes over the group of GW in Created state " << + failed_gw_id << dendl; + // just take over the group of the created GW + gw_state.active_state(ANA_groupid); + return; + } int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true); if (rc) { //start failover even when nonces are empty ! gw_state.active_state(ANA_groupid); - } else{ + } else { gw_state.sm_state[ANA_groupid] = gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL; gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch; @@ -507,7 +600,7 @@ void NVMeofGwMap::fsm_handle_gw_alive( // ana group wouldnt be taken back during blocklist wait period cancel_timer(gw_id, group_key, grpid); map_modified = true; - } else{ + } else { dout(20) << "osd epoch not changed from " << gw_map.blocklist_data[grpid].osd_epoch << " to "<< last_osd_epoch @@ -576,6 +669,8 @@ void NVMeofGwMap::fsm_handle_gw_down( void NVMeofGwMap::fsm_handle_gw_delete( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, gw_states_per_group_t state , NvmeAnaGrpId grpid, bool &map_modified) { + // This function is called when the GW has already passed Failover and its + // native ANA group has no volumes, so some states are not relevant switch (state) { case gw_states_per_group_t::GW_STANDBY_STATE: case gw_states_per_group_t::GW_IDLE_STATE: @@ -583,8 +678,8 @@ void NVMeofGwMap::fsm_handle_gw_delete( { NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; - // Try to find GW that temporary owns my group - if found, - // this GW should pass to standby for this group + // Try to find the GW that temporarily owns the gw-id group that is about to disappear
+ // - if found, this GW should pass to standby for this group if (grpid == gw_state.ana_grp_id) { auto& gateway_states = created_gws[group_key]; for (auto& gs: gateway_states) { @@ -605,43 +700,6 @@ void NVMeofGwMap::fsm_handle_gw_delete( } break; - case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL: - { - NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; - cancel_timer(gw_id, group_key, grpid); - map_modified = true; - gw_state.standby_state(grpid); - } - break; - - case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED: - { - cancel_timer(gw_id, group_key, grpid); - map_modified = true; - for (auto& nqn_gws_state: created_gws[group_key]) { - auto& st = nqn_gws_state.second; - - // found GW that was intended for Failback for this ana grp - if (st.sm_state[grpid] == - gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) { - dout(4) << "Warning: Outgoing Failback when GW is deleted " - << "- to rollback it GW " << gw_id << "for ANA Group " - << grpid << dendl; - st.standby_state(grpid); - break; - } - } - } - break; - - case gw_states_per_group_t::GW_ACTIVE_STATE: - { - NvmeGwMonState& gw_state = created_gws[group_key][gw_id]; - map_modified = true; - gw_state.standby_state(grpid); - } - break; - default: { dout(4) << "Error : Invalid state " << state << "for GW " << gw_id << dendl; @@ -744,8 +802,14 @@ int NVMeofGwMap::blocklist_gw( { // find_already_created_gw(gw_id, group_key); NvmeGwMonState& gw_map = created_gws[group_key][gw_id]; + NvmeNonceVector nonces; + for (auto& state_itr: gw_map.sm_state) { + // to make blocklist on all clusters of the failing GW + nonces.insert(nonces.end(), gw_map.nonce_map[state_itr.first].begin(), + gw_map.nonce_map[state_itr.first].end()); + } - if (gw_map.nonce_map[grpid].size() > 0) { + if (nonces.size() > 0) { NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];; std::string str = "["; entity_addrvec_t addr_vect; @@ -759,10 +823,10 @@ int NVMeofGwMap::blocklist_gw( str += it; } str += "]"; - bool rc = addr_vect.parse(&str[0]); - dout(10) << str << " rc " << rc << " network vector: " << addr_vect + bool success = addr_vect.parse(&str[0]); + dout(10) << str << " parse success " << success << " network vector: " << addr_vect << " " << addr_vect.size() << dendl; - if (rc) { + if (!success) { return 1; } @@ -781,14 +845,14 @@ int NVMeofGwMap::blocklist_gw( new CMonRequestProposal(this, addr_vect, expires) ); // return false; - } else{ + } else { mon->nvmegwmon()->request_proposal(mon->osdmon()); } } dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch << " address vector: " << addr_vect << " " << addr_vect.size() << dendl; - } else{ + } else { dout(4) << "Error: No nonces context present for gw: " << gw_id << " ANA group: " << grpid << dendl; return 1; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 688a68662be..4c9d7966410 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -44,7 +44,8 @@ public: std::map<NvmeGroupKey, NvmeGwTimers> fsm_timers; void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const; - + void track_deleting_gws(const NvmeGroupKey& group_key, + const BeaconSubsystems& subs, bool &propose_pending); int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key); int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key); void process_gw_map_ka( @@ -64,6 +65,9 @@ public: void handle_gw_performing_fast_reboot(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &map_modified); private: + int do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& 
group_key); + int do_erase_gw_id(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key); void add_grp_id( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const NvmeAnaGrpId grpid); @@ -95,7 +99,8 @@ private: void set_failover_gw_for_ANA_group( const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key, const NvmeGwId &gw_id, NvmeAnaGrpId groupid); - + int get_num_namespaces(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, const BeaconSubsystems& subs ); int get_timer( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, NvmeAnaGrpId anagrpid); diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 651046515fe..b6faeb2e97c 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -105,6 +105,7 @@ void NVMeofGwMon::tick() const auto cutoff = now - nvmegw_beacon_grace; // Pass over all the stored beacons + NvmeGroupKey old_group_key; for (auto &itr : last_beacon) { auto& lb = itr.first; auto last_beacon_time = itr.second; @@ -114,6 +115,14 @@ void NVMeofGwMon::tick() _propose_pending |= propose; last_beacon.erase(lb); } else { + BeaconSubsystems *subsystems = + &pending_map.created_gws[lb.group_key][lb.gw_id].subsystems; + if (subsystems && subsystems->size() && old_group_key != lb.group_key) { + // to call track_deleting_gws once per each group-key + pending_map.track_deleting_gws(lb.group_key, *subsystems, propose); + old_group_key = lb.group_key; + _propose_pending |= propose; + } dout(20) << "beacon live for GW key: " << lb.gw_id << dendl; } } @@ -299,29 +308,58 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op) auto group_key = std::make_pair(pool, group); dout(10) << "nvme-gw show pool " << pool << " group " << group << dendl; - if (map.created_gws[group_key].size()) { - f->open_object_section("common"); - f->dump_unsigned("epoch", map.epoch); - f->dump_string("pool", pool); - f->dump_string("group", group); - f->dump_unsigned("num gws", map.created_gws[group_key].size()); + f->open_object_section("common"); + f->dump_unsigned("epoch", map.epoch); + f->dump_string("pool", pool); + f->dump_string("group", group); + if (HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA)) { + f->dump_string("features", "LB"); + } + f->dump_unsigned("num gws", map.created_gws[group_key].size()); + if (map.created_gws[group_key].size() == 0) { + f->close_section(); + f->flush(rdata); + sstrm.str(""); + } else { sstrm << "[ "; NvmeGwId gw_id; + BeaconSubsystems *subsystems = NULL; for (auto& gw_created_pair: map.created_gws[group_key]) { - gw_id = gw_created_pair.first; - auto& st = gw_created_pair.second; - sstrm << st.ana_grp_id+1 << " "; + gw_id = gw_created_pair.first; + auto& st = gw_created_pair.second; + if (st.availability != gw_availability_t::GW_DELETING) { + // not show ana group of deleting gw in the list - + // it is information for the GW used in rebalancing process + sstrm << st.ana_grp_id+1 << " "; + } + if (st.availability == gw_availability_t::GW_AVAILABLE) { + subsystems = &st.subsystems; + } } sstrm << "]"; f->dump_string("Anagrp list", sstrm.str()); - f->close_section(); - + std::map<NvmeAnaGrpId, uint16_t> num_ns; + uint16_t total_ns = 0; + if (subsystems && subsystems->size()) { + for (auto & subs_it:*subsystems) { + for (auto & ns :subs_it.namespaces) { + if (num_ns.find(ns.anagrpid) == num_ns.end()) num_ns[ns.anagrpid] = 0; + num_ns[ns.anagrpid] +=1; + total_ns += 1; + } + } + } + f->dump_unsigned("num-namespaces", total_ns); + f->open_array_section("Created Gateways:"); + uint32_t i = 0; for (auto& gw_created_pair: 
map.created_gws[group_key]) { auto& gw_id = gw_created_pair.first; auto& state = gw_created_pair.second; + i = 0; f->open_object_section("stat"); f->dump_string("gw-id", gw_id); f->dump_unsigned("anagrp-id",state.ana_grp_id+1); + f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]); f->dump_unsigned("performed-full-startup", state.performed_full_startup); std::stringstream sstrm1; sstrm1 << state.availability; @@ -329,17 +367,18 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op) sstrm1.str(""); for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) { sstrm1 << " " << state_itr.first + 1 << ": " - << state.sm_state[state_itr.first] << ","; + << state.sm_state[state_itr.first]; + if (++i < map.created_gws[group_key][gw_id].sm_state.size()) + sstrm1<< ", "; } f->dump_string("ana states", sstrm1.str()); f->close_section(); } + f->close_section(); + f->close_section(); f->flush(rdata); sstrm.str(""); } - else { - sstrm << "num_gws 0"; - } getline(sstrm, rs); mon.reply_command(op, err, rs, rdata, get_last_committed()); return true; @@ -388,19 +427,18 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op) << " " << pool << " " << group << " rc " << rc << dendl; sstrm.str(""); } - } - else{ + } else { rc = pending_map.cfg_delete_gw(id, group_key); - if (rc == -EINVAL) { + if (rc == 0) { + bool propose = false; + // Simulate immediate Failover of this GW + process_gw_down(id, group_key, propose); + } else if (rc == -EINVAL) { dout (4) << "Error: GW not found in the database " << id << " " << pool << " " << group << " rc " << rc << dendl; err = 0; sstrm.str(""); } - if (rc == 0) { - LastBeacon lb = {id, group_key}; - last_beacon.erase(lb); - } } // propose pending would be generated by the PaxosService if ((rc != -EEXIST) && (rc != -EINVAL)) { @@ -423,6 +461,16 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op) return response; } +void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, bool &propose_pending) +{ + LastBeacon lb = {gw_id, group_key}; + auto it = last_beacon.find(lb); + if (it != last_beacon.end()) { + last_beacon.erase(it); + pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending); + } +} bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op) { @@ -527,7 +575,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) << pending_map.created_gws[group_key][gw_id].nonce_map << dendl; nonce_propose = true; } - } else { + } else { dout(10) << "Warning: received empty nonce map in the beacon of GW " << gw_id << " " << dendl; } @@ -560,13 +608,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose); // state set by GW client application } else if (avail == gw_availability_t::GW_UNAVAILABLE) { - LastBeacon lb = {gw_id, group_key}; - - auto it = last_beacon.find(lb); - if (it != last_beacon.end()) { - last_beacon.erase(lb); - pending_map.process_gw_map_gw_down(gw_id, group_key, propose); - } + process_gw_down(gw_id, group_key, propose); } // Periodic: check active FSM timers pending_map.update_active_timers(timer_propose); diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h index acd72dfe0c4..f132c87d92a 100644 --- a/src/mon/NVMeofGwMon.h +++ b/src/mon/NVMeofGwMon.h @@ -84,7 +84,8 @@ public: private: void synchronize_last_beacon(); - + void process_gw_down(const NvmeGwId &gw_id, + const NvmeGroupKey& group_key, bool &propose_pending); }; #endif /* MON_NVMEGWMONITOR_H_ */ diff --git a/src/mon/NVMeofGwSerialize.h 
b/src/mon/NVMeofGwSerialize.h index cbda90ea379..b10eac88c2f 100755 --- a/src/mon/NVMeofGwSerialize.h +++ b/src/mon/NVMeofGwSerialize.h @@ -74,6 +74,9 @@ inline std::ostream& operator<<( case gw_availability_t::GW_UNAVAILABLE: os << "UNAVAILABLE"; break; + case gw_availability_t::GW_DELETING: + os << "DELETING"; break; + default: os << "Invalid " << (int)value << " "; } diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h index 057c8cd3795..2dd3e11ba3a 100755 --- a/src/mon/NVMeofGwTypes.h +++ b/src/mon/NVMeofGwTypes.h @@ -42,6 +42,7 @@ enum class gw_availability_t { GW_CREATED = 0, GW_AVAILABLE, GW_UNAVAILABLE, + GW_DELETING, GW_DELETED }; @@ -134,9 +135,12 @@ struct NvmeGwMonState { : ana_grp_id(id), availability(gw_availability_t::GW_CREATED), last_gw_map_epoch_valid(false), performed_full_startup(false) {} void set_unavailable_state() { - availability = gw_availability_t::GW_UNAVAILABLE; - // after setting this state the next time monitor sees GW, - // it expects it performed the full startup + if (availability != gw_availability_t::GW_DELETING) { + //for not to override Deleting + availability = gw_availability_t::GW_UNAVAILABLE; + } + // after setting this state, the next time monitor sees GW, + // it expects it performed the full startup performed_full_startup = false; } void standby_state(NvmeAnaGrpId grpid) { diff --git a/src/osd/PG.h b/src/osd/PG.h index 6bcb77ee5a9..ef4dd3fc4f0 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -612,6 +612,7 @@ public: void on_backfill_reserved() override; void on_backfill_canceled() override; + void on_recovery_cancelled() override {} void on_recovery_reserved() override; bool is_forced_recovery_or_backfill() const { diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 944b77c2812..22222b7f7af 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -5827,6 +5827,7 @@ PeeringState::Recovering::react(const DeferRecovery &evt) ps->state_set(PG_STATE_RECOVERY_WAIT); pl->cancel_local_background_io_reservation(); release_reservations(true); + pl->on_recovery_cancelled(); pl->schedule_event_after( std::make_shared<PGPeeringEvent>( ps->get_osdmap_epoch(), @@ -5844,6 +5845,7 @@ PeeringState::Recovering::react(const UnfoundRecovery &evt) ps->state_set(PG_STATE_RECOVERY_UNFOUND); pl->cancel_local_background_io_reservation(); release_reservations(true); + pl->on_recovery_cancelled(); return transit<NotRecovering>(); } diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index f036bb44b11..11ac084a054 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -419,6 +419,7 @@ public: virtual void on_backfill_reserved() = 0; virtual void on_backfill_canceled() = 0; virtual void on_recovery_reserved() = 0; + virtual void on_recovery_cancelled() = 0; // ================recovery space accounting ================ virtual bool try_reserve_recovery_space( diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc index 414de479dc7..4ece8880411 100644 --- a/src/osd/SnapMapper.cc +++ b/src/osd/SnapMapper.cc @@ -728,6 +728,50 @@ int SnapMapper::get_snaps( return 0; } +void SnapMapper::update_snap_map( + const pg_log_entry_t& i, + MapCacher::Transaction<std::string, ceph::buffer::list> *_t) +{ + ceph_assert(i.soid.snap < CEPH_MAXSNAP); + dout(20) << __func__ << " " << i << dendl; + if (i.is_delete()) { + int r = remove_oid( + i.soid, + _t); + if (r) + dout(20) << __func__ << " remove_oid " << i.soid << " failed with " << r << dendl; + // On removal tolerate missing key corruption + ceph_assert(r == 0 || r == 
-ENOENT); + } else if (i.is_update()) { + ceph_assert(i.snaps.length() > 0); + std::vector<snapid_t> snaps; + bufferlist snapbl = i.snaps; + auto p = snapbl.cbegin(); + try { + decode(snaps, p); + } catch (...) { + dout(20) << __func__ << " decode snaps failure on " << i << dendl; + snaps.clear(); + } + std::set<snapid_t> _snaps(snaps.begin(), snaps.end()); + + if (i.is_clone() || i.is_promote()) { + add_oid( + i.soid, + _snaps, + _t); + } else if (i.is_modify()) { + int r = update_snaps( + i.soid, + _snaps, + 0, + _t); + ceph_assert(r == 0); + } else { + ceph_assert(i.is_clean()); + } + } +} // -- purged snaps -- diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h index f9a371932f6..a43bb5d94f0 100644 --- a/src/osd/SnapMapper.h +++ b/src/osd/SnapMapper.h @@ -394,6 +394,10 @@ private: std::set<snapid_t> *snaps ///< [out] snaps ) const; ///< @return error, -ENOENT if oid is not recorded + void update_snap_map( + const pg_log_entry_t& i, + MapCacher::Transaction<std::string, ceph::buffer::list> *t); + /// Get snaps for oid - alternative interface tl::expected<std::set<snapid_t>, SnapMapReaderI::result_t> get_snaps( const hobject_t &hoid) const final; diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 0e4253b339a..b0bdc9fcb95 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2220,7 +2220,7 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue) // that made any of the targets into a high-priority one. All that's left: // delay the specific target that was aborted. - auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), 5s, issue, + auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), issue, scrub_clock_now); /// \todo complete the merging of the deadline & target for non-hp targets @@ -2251,8 +2251,7 @@ void PgScrubber::requeue_penalized( return; } /// \todo fix the 5s' to use a cause-specific delay parameter - auto& trgt = - m_scrub_job->delay_on_failure(s_or_d, 5s, cause, scrub_clock_now); + auto& trgt = m_scrub_job->delay_on_failure(s_or_d, cause, scrub_clock_now); ceph_assert(!trgt.queued); m_osds->get_scrub_services().enqueue_target(trgt); trgt.queued = true; @@ -2274,7 +2273,7 @@ void PgScrubber::requeue_penalized( m_osds->get_scrub_services().dequeue_target(m_pg_id, sister_level); trgt2.queued = false; } - m_scrub_job->delay_on_failure(sister_level, 5s, cause, scrub_clock_now); + m_scrub_job->delay_on_failure(sister_level, cause, scrub_clock_now); m_osds->get_scrub_services().enqueue_target(trgt2); trgt2.queued = true; } @@ -2333,7 +2332,8 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session( // i.e. some time before setting 'snaptrim'. 
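The scrubber hunks around this point drop the hard-coded 5s retry: delay_on_failure() now derives the delay from a per-cause OSD option (osd_scrub_retry_after_noscrub, osd_scrub_retry_pg_state, osd_scrub_retry_trimming, osd_scrub_retry_new_interval, falling back to osd_scrub_retry_delay), as the scrub_job.cc hunk below shows. A small Python sketch of that lookup follows; the numeric values are illustrative only, not the shipped defaults.

# maps a delay cause to the OSD option that now controls its retry delay
CAUSE_TO_OPTION = {
    'flags':         'osd_scrub_retry_after_noscrub',
    'pg_state':      'osd_scrub_retry_pg_state',
    'snap_trimming': 'osd_scrub_retry_trimming',
    'interval':      'osd_scrub_retry_new_interval',
}

def retry_delay_seconds(conf, cause):
    # every other cause (local_resources, aborted, ...) uses the generic delay
    option = CAUSE_TO_OPTION.get(cause, 'osd_scrub_retry_delay')
    return conf.get(option, conf['osd_scrub_retry_delay'])

conf = {'osd_scrub_retry_delay': 5, 'osd_scrub_retry_trimming': 10}  # made-up values
assert retry_delay_seconds(conf, 'snap_trimming') == 10
assert retry_delay_seconds(conf, 'aborted') == 5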
dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl; requeue_penalized( - s_or_d, delay_both_targets_t::yes, delay_cause_t::pg_state, clock_now); + s_or_d, delay_both_targets_t::yes, delay_cause_t::snap_trimming, + clock_now); return schedule_result_t::target_specific_failure; } diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc index ee33ee06706..7b05eea3941 100644 --- a/src/osd/scrubber/scrub_job.cc +++ b/src/osd/scrubber/scrub_job.cc @@ -12,6 +12,7 @@ using sched_conf_t = Scrub::sched_conf_t; using scrub_schedule_t = Scrub::scrub_schedule_t; using ScrubJob = Scrub::ScrubJob; using delay_ready_t = Scrub::delay_ready_t; +using namespace std::chrono; namespace { utime_t add_double(utime_t t, double d) @@ -300,16 +301,42 @@ void ScrubJob::adjust_deep_schedule( SchedTarget& ScrubJob::delay_on_failure( scrub_level_t level, - std::chrono::seconds delay, - Scrub::delay_cause_t delay_cause, + delay_cause_t delay_cause, utime_t scrub_clock_now) { + seconds delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_delay")); + switch (delay_cause) { + case delay_cause_t::flags: + delay = + seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_after_noscrub")); + break; + case delay_cause_t::pg_state: + delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_pg_state")); + break; + case delay_cause_t::snap_trimming: + delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_trimming")); + break; + case delay_cause_t::interval: + delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_new_interval")); + break; + case delay_cause_t::local_resources: + case delay_cause_t::aborted: + default: + // for all other possible delay causes: use the default delay + break; + } + auto& delayed_target = (level == scrub_level_t::deep) ? deep_target : shallow_target; delayed_target.sched_info.schedule.not_before = std::max(scrub_clock_now, delayed_target.sched_info.schedule.not_before) + utime_t{delay}; delayed_target.sched_info.last_issue = delay_cause; + dout(20) << fmt::format( + "delayed {}scrub due to {} for {}s. Updated: {}", + (level == scrub_level_t::deep ? "deep " : ""), delay_cause, + delay.count(), delayed_target) + << dendl; return delayed_target; } diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h index 98a3e101f9b..b037084db6b 100644 --- a/src/osd/scrubber/scrub_job.h +++ b/src/osd/scrubber/scrub_job.h @@ -239,14 +239,14 @@ class ScrubJob { /** * For the level specified, set the 'not-before' time to 'now+delay', - * so that this scrub target - * would not be retried before 'delay' seconds have passed. + * so that this scrub target would not be retried before the required + * delay seconds have passed. + * The delay is determined based on the 'cause' parameter. * The 'last_issue' is updated to the cause of the delay. * \returns a reference to the target that was modified. 
*/ [[maybe_unused]] SchedTarget& delay_on_failure( scrub_level_t level, - std::chrono::seconds delay, delay_cause_t delay_cause, utime_t scrub_clock_now); diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 81e9bd7b6d8..da9466758f4 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -199,6 +199,7 @@ sc::result Session::react(const IntervalChanged&) ceph_assert(m_reservations); m_reservations->discard_remote_reservations(); + m_abort_reason = delay_cause_t::interval; return transit<NotActive>(); } @@ -300,7 +301,8 @@ ActiveScrubbing::~ActiveScrubbing() // completed successfully), we use it now to set the 'failed scrub' duration. if (session.m_session_started_at != ScrubTimePoint{}) { // delay the next invocation of the scrubber on this target - scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted); + scrbr->on_mid_scrub_abort( + session.m_abort_reason.value_or(Scrub::delay_cause_t::aborted)); auto logged_duration = ScrubClock::now() - session.m_session_started_at; session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration); diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index d1edfd37c98..ad0d3bfba38 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -551,6 +551,10 @@ struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>, /// the time when the session was initiated ScrubTimePoint m_session_started_at{ScrubClock::now()}; + + /// abort reason - if known. Determines the delay time imposed on the + /// failed scrub target. + std::optional<Scrub::delay_cause_t> m_abort_reason{std::nullopt}; }; struct ReservingReplicas : sc::state<ReservingReplicas, Session>, NamedSimply { diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 60c1a68d2be..5e510a03a82 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -229,7 +229,8 @@ enum class delay_cause_t { none, ///< scrub attempt was successful replicas, ///< failed to reserve replicas flags, ///< noscrub or nodeep-scrub - pg_state, ///< e.g. 
snap-trimming + pg_state, ///< not active+clean + snap_trimming, ///< snap-trimming is in progress restricted_time, ///< time restrictions or busy CPU local_resources, ///< too many scrubbing PGs aborted, ///< scrub was aborted w/ unspecified reason @@ -252,6 +253,7 @@ struct formatter<Scrub::delay_cause_t> : ::fmt::formatter<std::string_view> { case replicas: desc = "replicas"; break; case flags: desc = "noscrub"; break; case pg_state: desc = "pg-state"; break; + case snap_trimming: desc = "snap-trim"; break; case restricted_time: desc = "time/load"; break; case local_resources: desc = "local-cnt"; break; case aborted: desc = "aborted"; break; diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 780e2b8784e..0bca599961e 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -144,10 +144,11 @@ DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' -DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:1.26.1' +DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126' DEFAULT_OAUTH2_PROXY = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0' DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29' DEFAULT_SAMBA_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64' +DEFAULT_SAMBA_METRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest' # ------------------------------------------------------------------------------ @@ -320,6 +321,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, desc='Samba/SMB container image', ), Option( + 'container_image_samba_metrics', + default=DEFAULT_SAMBA_METRICS_IMAGE, + desc='Samba/SMB metrics exporter container image', + ), + Option( 'warn_on_stray_hosts', type='bool', default=True, @@ -529,6 +535,19 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, 'the host (in seconds)' ), Option( + 'ssh_keepalive_interval', + type='int', + default=7, + desc='How often ssh connections are checked for liveness' + ), + Option( + 'ssh_keepalive_count_max', + type='int', + default=3, + desc='How many times ssh connections can fail liveness checks ' + 'before the host is marked offline' + ), + Option( 'cephadm_log_destination', type='str', default='', @@ -585,6 +604,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.container_image_jaeger_collector = '' self.container_image_jaeger_query = '' self.container_image_samba = '' + self.container_image_samba_metrics = '' self.warn_on_stray_hosts = True self.warn_on_stray_daemons = True self.warn_on_failed_host_check = True @@ -630,6 +650,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.default_cephadm_command_timeout = 0 self.cephadm_log_destination = '' self.oob_default_addr = '' + self.ssh_keepalive_interval = 0 + self.ssh_keepalive_count_max = 0 self.notify(NotifyType.mon_map, None) self.config_notify() diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index e9b072a9a50..1efd2dc1281 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -13,6 +13,7 @@ from mgr_module import ServiceInfoT from mgr_util import build_url from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO from cephadm.services.nfs 
import NFSService +from cephadm.services.smb import SMBService from cephadm.services.monitoring import AlertmanagerService, NodeExporterService, PrometheusService import secrets from mgr_util import verify_tls_files @@ -149,6 +150,7 @@ class Root(Server): <p><a href='prometheus/sd-config?service=ceph-exporter'>Ceph exporter http sd-config</a></p> <p><a href='prometheus/sd-config?service=nvmeof'>NVMeoF http sd-config</a></p> <p><a href='prometheus/sd-config?service=nfs'>NFS http sd-config</a></p> +<p><a href='prometheus/sd-config?service=smb'>SMB http sd-config</a></p> <p><a href='prometheus/rules'>Prometheus rules</a></p> </body> </html>''' @@ -171,6 +173,8 @@ class Root(Server): return self.nvmeof_sd_config() elif service == 'nfs': return self.nfs_sd_config() + elif service == 'smb': + return self.smb_sd_config() else: return [] @@ -265,6 +269,19 @@ class Root(Server): }) return srv_entries + def smb_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return <http_sd_config> compatible prometheus config for smb service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_type('smb'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = SMBService.DEFAULT_EXPORTER_PORT + srv_entries.append({ + 'targets': [build_url(host=addr, port=port).lstrip('/')], + 'labels': {'instance': dd.hostname} + }) + return srv_entries + @cherrypy.expose(alias='prometheus/rules') def get_prometheus_rules(self) -> str: """Return currently configured prometheus rules as Yaml.""" diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index f407985fcee..5d5d26ecad1 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -503,6 +503,7 @@ class PrometheusService(CephadmService): nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included mgmt_gw_enabled = len(self.mgr.cache.get_daemons_by_service('mgmt-gateway')) > 0 nfs_sd_url = f'{srv_end_point}service=nfs' # always included + smb_sd_url = f'{srv_end_point}service=smb' # always included alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() @@ -524,7 +525,8 @@ class PrometheusService(CephadmService): 'nvmeof_sd_url': nvmeof_sd_url, 'external_prometheus_targets': targets, 'cluster_fsid': FSID, - 'nfs_sd_url': nfs_sd_url + 'nfs_sd_url': nfs_sd_url, + 'smb_sd_url': smb_sd_url } ip_to_bind_to = '' diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index b91427dcace..1792bbf37c9 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -47,10 +47,17 @@ class NvmeofService(CephService): transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id) rados_id = name[len('client.'):] if name.startswith('client.') else name + addr = host_ip + discovery_addr = host_ip + if spec.addr and spec.addr != "0.0.0.0": + addr = spec.addr + if spec.discovery_addr and spec.discovery_addr != "0.0.0.0": + discovery_addr = spec.discovery_addr context = { 'spec': spec, 'name': name, - 'addr': host_ip, + 'addr': addr, + 'discovery_addr': discovery_addr, 'port': spec.port, 'spdk_log_level': 'WARNING', 'rpc_socket_dir': '/var/tmp/', @@ -144,7 +151,9 @@ class NvmeofService(CephService): 
cmd_dicts.append({ 'prefix': 'dashboard nvmeof-gateway-add', 'inbuf': service_url, - 'name': service_name + 'name': service_name, + 'group': spec.group, + 'daemon_name': dd.name() }) return cmd_dicts diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py index da75136cdfb..dabc202a024 100644 --- a/src/pybind/mgr/cephadm/services/smb.py +++ b/src/pybind/mgr/cephadm/services/smb.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) class SMBService(CephService): TYPE = 'smb' + DEFAULT_EXPORTER_PORT = 9922 smb_pool = '.smb' # minor layering violation. try to clean up later. def config(self, spec: ServiceSpec) -> None: @@ -79,6 +80,11 @@ class SMBService(CephService): smb_spec, daemon_spec.daemon_id, ceph_users ) ) + config_blobs['metrics_image'] = ( + self.mgr.container_image_samba_metrics + ) + config_blobs['metrics_port'] = SMBService.DEFAULT_EXPORTER_PORT + logger.debug('smb generate_config: %r', config_blobs) self._configure_cluster_meta(smb_spec, daemon_spec) return config_blobs, [] diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py index f0d507dfe89..1622cb001ab 100644 --- a/src/pybind/mgr/cephadm/ssh.py +++ b/src/pybind/mgr/cephadm/ssh.py @@ -168,7 +168,9 @@ class SSHManager: with self.redirect_log(host, addr): try: ssh_options = asyncssh.SSHClientConnectionOptions( - keepalive_interval=7, keepalive_count_max=3) + keepalive_interval=self.mgr.ssh_keepalive_interval, + keepalive_count_max=self.mgr.ssh_keepalive_count_max + ) conn = await asyncssh.connect(addr, username=self.mgr.ssh_user, client_keys=[self.mgr.tkey.name], known_hosts=None, config=[self.mgr.ssh_config_fname], preferred_auth=['publickey'], options=ssh_options) @@ -240,10 +242,6 @@ class SSHManager: if log_command: logger.debug(f'Running command: {rcmd}') try: - test_cmd = RemoteSudoCommand( - Executables.TRUE, [], use_sudo=use_sudo - ) - r = await conn.run(str(test_cmd), check=True, timeout=5) # host quick check r = await conn.run(str(rcmd), input=stdin) # handle these Exceptions otherwise you might get a weird error like # TypeError: __init__() missing 1 required positional argument: 'reason' (due to the asyncssh error interacting with raise_if_exception) diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 644ca586ba9..dbe29004771 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -32,7 +32,7 @@ max_log_directory_backups = {{ spec.max_log_directory_backups }} log_directory = {{ spec.log_directory }} [discovery] -addr = {{ addr }} +addr = {{ discovery_addr }} port = {{ spec.discovery_port }} [ceph] diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 3707f47f977..e7694867755 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -181,6 +181,26 @@ scrape_configs: {% endif %} {% endif %} +{% if smb_sd_url %} + - job_name: 'smb' +{% if security_enabled %} + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: {{ smb_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} + tls_config: + ca_file: root_cert.pem +{% else %} + 
http_sd_configs: + - url: {{ smb_sd_url }} +{% endif %} +{% endif %} + {% if not security_enabled %} - job_name: 'federate' scrape_interval: 15s diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index 159431b3b88..6f73cad91de 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -27,6 +27,10 @@ class FakeCache: return [FakeDaemonDescription('1.2.3.4', [9587], 'node0'), FakeDaemonDescription('1.2.3.5', [9587], 'node1')] + if service_type == 'smb': + return [FakeDaemonDescription('1.2.3.4', [9922], 'node0'), + FakeDaemonDescription('1.2.3.5', [9922], 'node1')] + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] @@ -206,6 +210,20 @@ class TestServiceDiscovery: # check content assert cfg[0]['targets'] == ['1.2.3.4:9587'] + def test_get_sd_config_smb(self): + mgr = FakeMgr() + root = Root(mgr, 5000, '0.0.0.0') + cfg = root.get_sd_config('smb') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:9922'] + def test_get_sd_config_invalid_service(self): mgr = FakeMgr() root = Root(mgr, 5000, '0.0.0.0') diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index d21c4c719a5..08802cd5923 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -829,6 +829,10 @@ class TestMonitoring: http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=nfs + - job_name: 'smb' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=smb + - job_name: 'federate' scrape_interval: 15s honor_labels: true @@ -1039,6 +1043,19 @@ class TestMonitoring: tls_config: ca_file: root_cert.pem + - job_name: 'smb' + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=smb + basic_auth: + username: sd_user + password: sd_password + tls_config: + ca_file: root_cert.pem + """).lstrip() _run_cephadm.assert_called_with( @@ -3176,6 +3193,8 @@ class TestSMB: 'config': '', 'keyring': '[client.smb.config.tango.briskly]\nkey = None\n', 'config_auth_entity': 'client.smb.config.tango.briskly', + 'metrics_image': 'quay.io/samba.org/samba-metrics:latest', + 'metrics_port': 9922, }, } with with_host(cephadm_module, 'hostx'): @@ -3246,6 +3265,8 @@ class TestSMB: '[client.smb.fs.fs2.share3]\nkey = None\n' ), 'config_auth_entity': 'client.smb.config.tango.briskly', + 'metrics_image': 'quay.io/samba.org/samba-metrics:latest', + 'metrics_port': 9922, }, } with with_host(cephadm_module, 'hostx'): diff --git a/src/pybind/mgr/dashboard/controllers/host.py b/src/pybind/mgr/dashboard/controllers/host.py index c0062b94d70..645fc4cc44c 100644 --- a/src/pybind/mgr/dashboard/controllers/host.py +++ b/src/pybind/mgr/dashboard/controllers/host.py @@ -512,3 +512,16 @@ class HostUi(BaseController): @handle_orchestrator_error('host') def inventory(self, refresh=None): return get_inventories(None, refresh) + + @Endpoint('GET') + @ReadPermission + @raise_if_no_orchestrator([OrchFeature.HOST_LIST]) + @handle_orchestrator_error('host') + def list(self): + """ + Get all hosts. 
+ This endpoint is introduced to get all the available hosts in cases where the + service instance is not needed (e.g. host selection in forms), and because + the get_hosts method caches the response, it is also more performant. + """ + return get_hosts() diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index e050ecdf6a6..ec9c9897081 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -2,6 +2,8 @@ import logging from typing import Any, Dict, Optional +from orchestrator import OrchestratorError + from .. import mgr from ..model import nvmeof as model from ..security import Scope @@ -30,19 +32,32 @@ else: @EndpointDoc("Get information about the NVMeoF gateway") @map_model(model.GatewayInfo) @handle_nvmeof_error - def list(self): - return NVMeoFClient().stub.get_gateway_info( + def list(self, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.get_gateway_info( NVMeoFClient.pb2.get_gateway_info_req() ) + @ReadPermission + @Endpoint('GET') + def group(self): + try: + orch = OrchClient.instance() + return orch.services.list(service_type='nvmeof') + except OrchestratorError as e: + # just return None instead of raising an exception, + # since this needs to work in the UI regardless of + # the orchestrator's status + logger.error('Failed to fetch the gateway groups: %s', e) + return None + @APIRouter("/nvmeof/subsystem", Scope.NVME_OF) @APIDoc("NVMe-oF Subsystem Management API", "NVMe-oF Subsystem") class NVMeoFSubsystem(RESTController): @EndpointDoc("List all NVMeoF subsystems") @map_collection(model.Subsystem, pick="subsystems") @handle_nvmeof_error - def list(self): - return NVMeoFClient().stub.list_subsystems( + def list(self, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_subsystems( NVMeoFClient.pb2.list_subsystems_req() ) @@ -52,8 +67,8 @@ else: ) @map_model(model.Subsystem, first="subsystems") @handle_nvmeof_error - def get(self, nqn: str): - return NVMeoFClient().stub.list_subsystems( + def get(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_subsystems( NVMeoFClient.pb2.list_subsystems_req(subsystem_nqn=nqn) ) @@ -61,14 +76,15 @@ else: "Create a new NVMeoF subsystem", parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), - "max_namespaces": Param(int, "Maximum number of namespaces", True, 256), + "max_namespaces": Param(int, "Maximum number of namespaces", True, 1024), "enable_ha": Param(bool, "Enable high availability"), }, ) @empty_response @handle_nvmeof_error - def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 256): - return NVMeoFClient().stub.create_subsystem( + def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 1024, + gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.create_subsystem( NVMeoFClient.pb2.create_subsystem_req( subsystem_nqn=nqn, max_namespaces=max_namespaces, enable_ha=enable_ha ) @@ -83,8 +99,8 @@ else: ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, force: Optional[str] = "false"): - return NVMeoFClient().stub.delete_subsystem( + def delete(self, nqn: str, force: Optional[str] = "false", gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.delete_subsystem( NVMeoFClient.pb2.delete_subsystem_req( subsystem_nqn=nqn, force=str_to_bool(force) ) diff --git a/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts
b/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts index 7bac7d12bed..cdf7d7cb531 100644 --- a/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts +++ b/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts @@ -45,7 +45,7 @@ export class ImagesPageHelper extends PageHelper { // checks that it is present in the trash table moveToTrash(name: string) { // wait for image to be created - cy.get('cds-table table tbody').first().should('not.contain.text', '(Creating...)'); + cy.get('table[cdstable] tbody').first().should('not.contain.text', '(Creating...)'); this.getFirstTableCell(name).click(); diff --git a/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts b/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts index 4d5b0aa7bc7..2e94179aa74 100644 --- a/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts +++ b/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts @@ -147,14 +147,14 @@ export abstract class PageHelper { */ private waitDataTableToLoad() { cy.get('cd-table').should('exist'); - cy.get('cds-table table tbody').should('exist'); + cy.get('table[cdstable] tbody').should('exist'); cy.contains('Loading').should('not.exist'); } getDataTables() { this.waitDataTableToLoad(); - return cy.get('cd-table cds-table'); + return cy.get('cd-table [cdsTable]'); } private getTableCountSpan(_spanType: 'selected' | 'found' | 'total' | 'item' | 'items') { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html index 18a4000e14d..c466c8674cc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html @@ -3,8 +3,7 @@ <legend i18n> Gateways <cd-help-text> - The NVMe-oF gateway integrates Ceph with the NVMe over TCP (NVMe/TCP) protocol to provide an NVMe/TCP target that exports RADOS Block Device (RBD) images. - </cd-help-text> + Ceph NVMe-oF gateways provide Ceph Block Device storage through NVMe/TCP. For VMware clients, the NVMe/TCP volumes display as VMFS Datastores. For Linux clients, the NVMe/TCP volumes display as block devices.</cd-help-text> </legend> <div> <cd-table [data]="gateways" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html index 29ebbe645d1..bdbe7320cfa 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html @@ -1,6 +1,7 @@ <legend> <cd-help-text> - The client that connects to the NVMe-oF target to access NVMe storage. + An initiator (or host) is the client that connects to the NVMe-oF target to access NVMe storage. + The NVMe/TCP protocol allows initiators to send NVMe-oF commands to storage devices, which are known as targets.
</cd-help-text> </legend> <cd-table [data]="initiators" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts index bc02ea99c98..412286bda20 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts @@ -13,7 +13,10 @@ import { FormatterService } from '~/app/shared/services/formatter.service'; import { CdValidators } from '~/app/shared/forms/cd-validators'; import { DimlessBinaryPipe } from '~/app/shared/pipes/dimless-binary.pipe'; import { HostService } from '~/app/shared/api/host.service'; -import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; +import { DaemonService } from '~/app/shared/api/daemon.service'; +import { map } from 'rxjs/operators'; +import { forkJoin } from 'rxjs'; + @Component({ selector: 'cd-nvmeof-listeners-form', templateUrl: './nvmeof-listeners-form.component.html', @@ -39,7 +42,8 @@ export class NvmeofListenersFormComponent implements OnInit { private route: ActivatedRoute, public activeModal: NgbActiveModal, public formatterService: FormatterService, - public dimlessBinaryPipe: DimlessBinaryPipe + public dimlessBinaryPipe: DimlessBinaryPipe, + private daemonService: DaemonService ) { this.permission = this.authStorageService.getPermissions().nvmeof; this.hostPermission = this.authStorageService.getPermissions().hosts; @@ -48,13 +52,19 @@ export class NvmeofListenersFormComponent implements OnInit { } setHosts() { - const hostContext = new CdTableFetchDataContext(() => undefined); - this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: any[]) => { - const nvmeofHosts = resp.filter((r) => - r.service_instances.some((si: any) => si.type === 'nvmeof') - ); - this.hosts = nvmeofHosts.map((h) => ({ hostname: h.hostname, addr: h.addr })); - }); + forkJoin({ + daemons: this.daemonService.list(['nvmeof']), + hosts: this.hostService.getAllHosts() + }) + .pipe( + map(({ daemons, hosts }) => { + const hostNamesFromDaemon = daemons.map((daemon: any) => daemon.hostname); + return hosts.filter((host: any) => hostNamesFromDaemon.includes(host.hostname)); + }) + ) + .subscribe((nvmeofHosts: any[]) => { + this.hosts = nvmeofHosts.map((h) => ({ hostname: h.hostname, addr: h.addr })); + }); } ngOnInit() { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html index da001554120..1a24ddfd28c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html @@ -1,6 +1,6 @@ <legend> <cd-help-text> - A listener defines the IP port on the gateway that is to process NVMe/TCP commands and I/O operations. + A listener defines the IP address and port on the gateway that is used to process NVMe/TCP admin and I/O commands to a subsystem. 
</cd-help-text> </legend> <cd-table [data]="listeners" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html index 72576b7e642..87e474de757 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html @@ -39,7 +39,7 @@ [value]="pool.pool_name">{{ pool.pool_name }}</option> </select> <cd-help-text i18n> - A RBD application-enabled pool where the image will be created. + An RBD application-enabled pool where the image will be created. </cd-help-text> <span class="invalid-feedback" *ngIf="nsForm.showError('pool', formDir, 'required')" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html index 1032a0d1e26..5ccc48eeeae 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html @@ -49,16 +49,16 @@ type="text" name="max_namespaces" formControlName="max_namespaces"> - <cd-help-text i18n>The maximum namespaces per subsystem. Default is 256.</cd-help-text> + <cd-help-text i18n>The maximum namespaces per subsystem. Default is {{defaultMaxNamespace}}</cd-help-text> <span class="invalid-feedback" *ngIf="subsystemForm.showError('max_namespaces', formDir, 'min')" i18n>The value must be at least 1.</span> <span class="invalid-feedback" *ngIf="subsystemForm.showError('max_namespaces', formDir, 'max')" - i18n>The value cannot be greated than 256.</span> + i18n>The value cannot be greater than {{defaultMaxNamespace}}.</span> <span class="invalid-feedback" *ngIf="subsystemForm.showError('max_namespaces', formDir, 'pattern')" - i18n>The value must be a positive integer.</span> + i18n>The value must be a positive integer.</span> </div> </div> </div> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts index 4e9c420c9f0..08e56debf0a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts @@ -11,7 +11,7 @@ import { CdFormGroup } from '~/app/shared/forms/cd-form-group'; import { SharedModule } from '~/app/shared/shared.module'; import { NvmeofSubsystemsFormComponent } from './nvmeof-subsystems-form.component'; import { FormHelper } from '~/testing/unit-test-helper'; -import { NvmeofService } from '~/app/shared/api/nvmeof.service'; +import { MAX_NAMESPACE, NvmeofService } from '~/app/shared/api/nvmeof.service'; describe('NvmeofSubsystemsFormComponent', () => { let component: NvmeofSubsystemsFormComponent; @@ -59,7 +59,7 @@ describe('NvmeofSubsystemsFormComponent', () => { component.onSubmit(); expect(nvmeofService.createSubsystem).toHaveBeenCalledWith({ nqn: expectedNqn, - max_namespaces: 
256, + max_namespaces: MAX_NAMESPACE, enable_ha: true }); }); @@ -76,8 +76,8 @@ describe('NvmeofSubsystemsFormComponent', () => { formHelper.expectError('max_namespaces', 'pattern'); }); - it('should give error on max_namespaces greater than 256', () => { - formHelper.setValue('max_namespaces', 300); + it(`should give error on max_namespaces greater than ${MAX_NAMESPACE}`, () => { + formHelper.setValue('max_namespaces', 2000); component.onSubmit(); formHelper.expectError('max_namespaces', 'max'); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts index 5debb52c4d3..5c2e1ce5250 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts @@ -10,7 +10,7 @@ import { AuthStorageService } from '~/app/shared/services/auth-storage.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { Router } from '@angular/router'; -import { NvmeofService } from '~/app/shared/api/nvmeof.service'; +import { MAX_NAMESPACE, NvmeofService } from '~/app/shared/api/nvmeof.service'; @Component({ selector: 'cd-nvmeof-subsystems-form', @@ -23,6 +23,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit { action: string; resource: string; pageURL: string; + defaultMaxNamespace: number = MAX_NAMESPACE; constructor( private authStorageService: AuthStorageService, @@ -68,8 +69,12 @@ export class NvmeofSubsystemsFormComponent implements OnInit { CdValidators.unique(this.nvmeofService.isSubsystemPresent, this.nvmeofService) ] }), - max_namespaces: new UntypedFormControl(256, { - validators: [CdValidators.number(false), Validators.max(256), Validators.min(1)] + max_namespaces: new UntypedFormControl(this.defaultMaxNamespace, { + validators: [ + CdValidators.number(false), + Validators.max(this.defaultMaxNamespace), + Validators.min(1) + ] }) }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html index cc89008eb82..6cd1f205913 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html @@ -2,7 +2,7 @@ <legend i18n> Subsystems <cd-help-text> - A subsystem presents a collection of controllers which are used to access namespaces. + A subsystem provides access control to which hosts can access the namespaces within the subsystem. 
</cd-help-text> </legend> <cd-table [data]="subsystems" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts index ee06198d168..5cb980bef10 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts @@ -25,7 +25,7 @@ export class RbdDetailsComponent implements OnChanges { ngOnChanges() { if (this.selection) { - this.rbdDashboardUrl = `rbd-details?var-Pool=${this.selection['pool_name']}&var-Image=${this.selection['name']}`; + this.rbdDashboardUrl = `rbd-details?var-pool=${this.selection['pool_name']}&var-image=${this.selection['name']}`; } } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html index 67192f5d338..85c56cbf0d4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html @@ -176,8 +176,8 @@ i18n> Use a dedicated data pool - <cd-help-text>Use a dedicated pool to store the mirror data. If not selected, - the mirror data will be stored in the same pool as the image data. + <cd-help-text>Use a dedicated pool to store the image data. If not selected, + the image data will be stored in the same pool as the image metadata. </cd-help-text> <cd-helper *ngIf="allDataPools.length <= 1 && mode !== 'editing'"> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts index 3b99541418a..c0373a9fb77 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts @@ -19,7 +19,6 @@ import { CdValidators } from '~/app/shared/forms/cd-validators'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { Permission } from '~/app/shared/models/permissions'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; -import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; @Component({ selector: 'cd-cephfs-form', @@ -123,9 +122,8 @@ export class CephfsVolumeFormComponent extends CdForm implements OnInit { } }); } else { - const hostContext = new CdTableFetchDataContext(() => undefined); this.hostsAndLabels$ = forkJoin({ - hosts: this.hostService.list(hostContext.toParams(), 'false'), + hosts: this.hostService.getAllHosts(), labels: this.hostService.getLabels() }).pipe( map(({ hosts, labels }) => ({ diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts index 0156b9196e1..42f597957c9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts @@ -39,7 +39,8 @@ describe('ConfigurationComponent', () => { expect(component).toBeTruthy(); }); - it('should check header text', () => { + // TODO: Re-write this 
unit test to reflect latest changes on datatble markup + it.skip('should check header text', () => { const cdTableEl = fixture.debugElement.query(By.directive(TableComponent)); const cdTableComponent: TableComponent = cdTableEl.componentInstance; cdTableComponent.ngAfterViewInit(); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts index 2c25c462220..c7e1c31fc3b 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts @@ -117,7 +117,7 @@ describe('HostsComponent', () => { fixture.detectChanges(); const spans = fixture.debugElement.nativeElement.querySelectorAll( - 'cds-table > table > tbody > tr > td > span' + 'table > tbody > tr > td > span' ); expect(spans[0].textContent.trim()).toBe(hostname); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html index a71c406461b..586ca57209f 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html @@ -97,7 +97,7 @@ [value]="pool.pool_name">{{ pool.pool_name }}</option> </select> <cd-help-text i18n> - A pool in which the gateway configuration can be managed. + An RBD application-enabled pool in which the gateway configuration can be managed. </cd-help-text> <span class="invalid-feedback" *ngIf="serviceForm.showError('pool', frm, 'required')" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts index 9602c856aed..bada177f735 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts @@ -30,7 +30,6 @@ import { CdForm } from '~/app/shared/forms/cd-form'; import { CdFormBuilder } from '~/app/shared/forms/cd-form-builder'; import { CdFormGroup } from '~/app/shared/forms/cd-form-group'; import { CdValidators } from '~/app/shared/forms/cd-validators'; -import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { CephServiceSpec } from '~/app/shared/models/service.interface'; import { ModalService } from '~/app/shared/services/modal.service'; @@ -470,8 +469,7 @@ export class ServiceFormComponent extends CdForm implements OnInit { this.serviceTypes = _.difference(resp, this.hiddenServices).sort(); }); - const hostContext = new CdTableFetchDataContext(() => undefined); - this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: object[]) => { + this.hostService.getAllHosts().subscribe((resp: object[]) => { const options: SelectOption[] = []; _.forEach(resp, (host: object) => { if (_.get(host, 'sources.orchestrator', false)) { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts 
b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts index deda8901670..6a3edfbf59a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts @@ -11,7 +11,6 @@ import { RgwZone } from '../models/rgw-multisite'; import _ from 'lodash'; import { SelectMessages } from '~/app/shared/components/select/select-messages.model'; import { HostService } from '~/app/shared/api/host.service'; -import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; import { SelectOption } from '~/app/shared/components/select/select-option.model'; import { Observable, Subject, merge } from 'rxjs'; import { debounceTime, distinctUntilChanged, filter, map } from 'rxjs/operators'; @@ -62,8 +61,7 @@ export class RgwMultisiteImportComponent implements OnInit { this.zoneNames = this.zoneList.map((zone) => { return zone['name']; }); - const hostContext = new CdTableFetchDataContext(() => undefined); - this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: object[]) => { + this.hostService.getAllHosts().subscribe((resp: object[]) => { const options: SelectOption[] = []; _.forEach(resp, (host: object) => { if (_.get(host, 'sources.orchestrator', false)) { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts index 3bb56957583..ce23302ba26 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts @@ -162,4 +162,8 @@ export class HostService extends ApiClient { }) ); } + + getAllHosts(): Observable<object[]> { + return this.http.get<object[]>(`${this.baseUIURL}/list`); + } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts index 4b4c4e86693..7c72530e84a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts @@ -5,6 +5,8 @@ import _ from 'lodash'; import { Observable, of as observableOf } from 'rxjs'; import { catchError, mapTo } from 'rxjs/operators'; +export const MAX_NAMESPACE = 1024; + export interface ListenerRequest { host_name: string; traddr: string; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html index 8e9b2237c3f..51b218769bb 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html @@ -1,4 +1,4 @@ -<cds-actionable-notification class="mb-1" +<cds-actionable-notification class="mb-1 content-theme" [ngClass]="spacingClass" [notificationObj]="notificationContent" (close)="onClose()"></cds-actionable-notification> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html index 0829c908a3f..deb705d1fff 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html +++ 
b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html @@ -1,5 +1,6 @@ <cds-table-container [cdsLayer]="layer" - [cdsTheme]="theme"> + [cdsTheme]="theme" + class="content-theme"> <cds-table-toolbar #toolbar *ngIf="toolHeader" (cancel)="onBatchActionsCancel()" @@ -131,49 +132,87 @@ </div> </div> <!-- end filter chips for column filters --> - <cds-table [model]="model" - [sortable]="!!userConfig.sorts" - [size]="size" - class="overflow-y-hidden" - [skeleton]="false" - [showSelectionColumn]="selectionType === 'multiClick'" - [enableSingleSelect]="selectionType === 'single'" - [stickyHeader]="false" - [striped]="false" - [isDataGrid]="false" - (sort)="changeSorting($event)" - (selectRow)="onSelect($event)" - (selectAll)="onSelectAll($event)" - (deselectRow)="onDeselect($event)" - (deselectAll)="onDeselectAll($event)"> - <tbody> - <tr cdstablerow - *ngIf="!rows?.length && !loadingIndicator"> - <td class="no-data" - cdstabledata - [attr.colspan]="selectionType === 'single' ? visibleColumns.length + 1 : visibleColumns.length + 2"> - <span class="d-flex justify-content-center align-items-center" - i18n>No data to display</span> - </td> - </tr> - <tr cdstablerow - *ngIf="loadingIndicator"> - <td class="no-data" - cdstabledata - [attr.colspan]="visibleColumns.length + 1"> - <span class="d-flex justify-content-center align-items-center" - i18n>Loading</span> - </td> - </tr> + <table cdsTable + [sortable]="sortable" + [noBorder]="false" + [size]="size" + [striped]="false" + [skeleton]="loadingIndicator"> + <thead cdsTableHead + [sortable]="sortable" + (deselectAll)="onDeselectAll()" + (selectAll)="onSelectAll()" + (sort)="changeSorting($event)" + [model]="model" + [showSelectionColumn]="showSelectionColumn" + [enableSingleSelect]="enableSingleSelect" + [skeleton]="loadingIndicator" + [stickyHeader]="false"> + </thead> + <tbody cdsTableBody + *ngIf="!noData; else noDataTemplate" + [skeleton]="loadingIndicator"> + <ng-container *ngFor="let row of model.data; let i = index; trackBy: trackByFn.bind(this, identifier)"> + <tr cdsTableRow + [model]="model" + [row]="row" + [size]="size" + [selected]="model.isRowSelected(i)" + [expandable]="model.isRowExpandable(i)" + [expanded]="model.isRowExpanded(i)" + [showSelectionColumn]="showSelectionColumn" + [enableSingleSelect]="enableSingleSelect" + [skeleton]="loadingIndicator" + (selectRow)="onSelect(i)" + (deselectRow)="onDeselect(i)" + (expandRow)="model.expandRow(i, !model.isRowExpanded(i))" + (rowClick)="onSelect(i)" + *ngIf="!model.isRowFiltered(i)"> + </tr> + <tr cdsTableExpandedRow + cdsExpandedRowHover + *ngIf="model.isRowExpandable(i) && !shouldExpandAsTable(row) && !model.isRowFiltered(i)" + [row]="row" + [expanded]="model.isRowExpanded(i)" + [skeleton]="loadingIndicator"> + </tr> + <ng-container *ngIf="model.isRowExpandable(i) && shouldExpandAsTable(row) && model.isRowExpanded(i) && !model.isRowFiltered(i)"> + <tr cdsTableRow + *ngFor="let expandedDataRow of firstExpandedDataInRow(row)" + [model]="model" + [showSelectionColumnCheckbox]="false" + [showSelectionColumn]="showSelectionColumn" + [row]="expandedDataRow" + [size]="size" + [selected]="model.isRowSelected(i)" + [skeleton]="loadingIndicator"> + </tr> + </ng-container> + </ng-container> </tbody> - </cds-table> + </table> <cds-pagination [model]="model" (selectPage)="onPageChange($event)" [disabled]="limit === 0" + [skeleton]="loadingIndicator" [pageInputDisabled]="limit === 0"> </cds-pagination> </cds-table-container> +<ng-template #noDataTemplate> + <tbody> + <tr 
cdstablerow> + <td *ngIf="!rows?.length && !loadingIndicator" + class="no-data" + cdstabledata + [attr.colspan]="visibleColumns.length + 2"> + <span class="d-flex justify-content-center align-items-center" + i18n>No data to display</span> + </td> + </tr> + </tbody> +</ng-template> + <ng-template #rowDetailTpl let-row="data"> <div *ngIf="row[identifier] === expanded?.[identifier]" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts index 37277df4033..97bcee3dfe3 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts @@ -27,13 +27,12 @@ import { CdTableColumn } from '~/app/shared/models/cd-table-column'; import { CdTableColumnFilter } from '~/app/shared/models/cd-table-column-filter'; import { CdTableColumnFiltersChange } from '~/app/shared/models/cd-table-column-filters-change'; import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; -import { PageInfo } from '~/app/shared/models/cd-table-paging'; import { CdTableSelection } from '~/app/shared/models/cd-table-selection'; import { CdUserConfig } from '~/app/shared/models/cd-user-config'; import { TimerService } from '~/app/shared/services/timer.service'; import { TableActionsComponent } from '../table-actions/table-actions.component'; import { TableDetailDirective } from '../directives/table-detail.directive'; -import { filter, map, throttleTime } from 'rxjs/operators'; +import { filter, map } from 'rxjs/operators'; import { CdSortDirection } from '../../enum/cd-sort-direction'; import { CdSortPropDir } from '../../models/cd-sort-prop-dir'; @@ -254,6 +253,32 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr private _expanded: any = undefined; + get sortable() { + return !!this.userConfig?.sorts; + } + + get noData() { + return !this.rows?.length && !this.loadingIndicator; + } + + get showSelectionColumn() { + return this.selectionType === 'multiClick'; + } + + get enableSingleSelect() { + return this.selectionType === 'single'; + } + + /** + * Controls if all checkboxes are viewed as selected. + */ + selectAllCheckbox = false; + + /** + * Controls the indeterminate state of the header checkbox. + */ + selectAllCheckboxSomeSelected = false; + /** * To prevent making changes to the original columns list, that might change * how the table is renderer a second time, we now clone that list into a @@ -295,7 +320,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr size: this.model.pageLength, filteredData: value }); - this.model.totalDataLength = value?.length || 0; + this.model.totalDataLength = this.serverSide ? 
this.count : value?.length || 0; } get rows() { @@ -343,13 +368,6 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr return search.split(' ').filter((word) => word); } - shouldThrottle(): number { - if (this.autoReload === -1) { - return 500; - } - return 0; - } - ngAfterViewInit(): void { if (this.tableActions?.dropDownActions?.length) { this.tableColumns = [ @@ -394,10 +412,6 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr return false; } return true; - }), - throttleTime(this.shouldThrottle(), undefined, { - leading: true, - trailing: false }) ) .subscribe({ @@ -409,7 +423,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr let tableItem = new TableItem({ selected: val, data: { - value: column.pipe ? column.pipe.transform(rowValue || val) : rowValue, + value: column.pipe ? column.pipe.transform(rowValue) : rowValue, row: val, column: { ...column, ...val } } @@ -419,7 +433,8 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr tableItem.data = { ...tableItem.data, row: val }; if (this.hasDetails) { - (tableItem.expandedData = val), (tableItem.expandedTemplate = this.rowDetailTpl); + tableItem.expandedData = val; + tableItem.expandedTemplate = this.rowDetailTpl; } } @@ -455,9 +470,18 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr } }); + const rowsChangeSubscription = this.model.rowsSelectedChange.subscribe(() => + this.updateSelectAllCheckbox() + ); + const dataChangeSubscription = this.model.dataChange.subscribe(() => { + this.updateSelectAllCheckbox(); + }); + this._subscriptions.add(tableHeadersSubscription); this._subscriptions.add(datasetSubscription); this._subscriptions.add(rowsExpandedSubscription); + this._subscriptions.add(rowsChangeSubscription); + this._subscriptions.add(dataChangeSubscription); } ngOnInit() { @@ -546,7 +570,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr this.userConfig.limit = this.limit; } if (!(this.userConfig.offset >= 0)) { - // this.userConfig.offset = this.model.currentPage; + this.userConfig.offset = this.model.currentPage - 1; } if (!this.userConfig.search) { this.userConfig.search = this.search; @@ -771,11 +795,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr ngOnChanges(changes: SimpleChanges) { if (changes?.data?.currentValue) { - if (_.isNil(this.expanded)) { - this.useData(); - } else if (this.model.rowsExpanded.every((x) => !x)) { - this.expanded = undefined; - } + this.useData(); } } @@ -828,16 +848,17 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr this.reloadData(); } - changePage(pageInfo: PageInfo) { - this.userConfig.offset = pageInfo.offset; - this.userConfig.limit = pageInfo.limit; + onPageChange(page: number) { + this.model.currentPage = page; + + this.userConfig.offset = this.model.currentPage - 1; + this.userConfig.limit = this.model.pageLength; + if (this.serverSide) { this.reloadData(); + return; } - } - onPageChange(page: number) { - this.model.currentPage = page; this.doPagination({}); } @@ -846,6 +867,11 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr size = this.model.pageLength, filteredData = this.rows }): void { + if (this.serverSide) { + this._dataset.next(filteredData); + return; + } + if (this.limit === 0) { this.model.currentPage = 1; this.model.pageLength = filteredData.length; @@ -893,10 +919,10 @@ export 
class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr this.updateColumnFilterOptions(); this.updateFilter(); this.reset(); + this.doSorting(); this.updateSelected(); this.updateExpanded(); this.toggleExpandRow(); - this.doSorting(); } /** @@ -978,9 +1004,9 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr } } - onSelect($event: any) { - const { selectedRowIndex } = $event; + onSelect(selectedRowIndex: number) { const selectedData = _.get(this.model.data?.[selectedRowIndex], [0, 'selected']); + this.model.selectRow(selectedRowIndex, true); if (this.selectionType === 'single') { this.selection.selected = [selectedData]; } else { @@ -989,24 +1015,27 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr this.updateSelection.emit(this.selection); } - onSelectAll($event: TableModel) { - $event.rowsSelected.forEach((isSelected: boolean, rowIndex: number) => + onSelectAll() { + this.model.selectAll(!this.selectAllCheckbox && !this.selectAllCheckboxSomeSelected); + this.model.rowsSelected.forEach((isSelected: boolean, rowIndex: number) => this._toggleSelection(rowIndex, isSelected) ); this.updateSelection.emit(this.selection); + this.cdRef.detectChanges(); } - onDeselect($event: any) { + onDeselect(deselectedRowIndex: number) { + this.model.selectRow(deselectedRowIndex, false); if (this.selectionType === 'single') { return; } - const { deselectedRowIndex } = $event; this._toggleSelection(deselectedRowIndex, false); this.updateSelection.emit(this.selection); } - onDeselectAll($event: TableModel) { - $event.rowsSelected.forEach((isSelected: boolean, rowIndex: number) => + onDeselectAll() { + this.model.selectAll(false); + this.model.rowsSelected.forEach((isSelected: boolean, rowIndex: number) => this._toggleSelection(rowIndex, isSelected) ); this.updateSelection.emit(this.selection); @@ -1243,4 +1272,41 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr (_, rowIndex: number) => rowIndex === expandedRowIndex ); } + + firstExpandedDataInRow(row: TableItem[]) { + const found = row.find((d) => d.expandedData); + if (found) { + return found.expandedData; + } + return found; + } + + shouldExpandAsTable(row: TableItem[]) { + return row.some((d) => d.expandAsTable); + } + + isRowExpandable(index: number) { + return this.model.data[index].some((d) => d && d.expandedData); + } + + trackByFn(id: string, _index: number, row: TableItem[]) { + const uniqueIdentifier = _.get(row, [0, 'data', 'row', id])?.toString?.(); + return uniqueIdentifier || row; + } + + updateSelectAllCheckbox() { + const selectedRowsCount = this.model.selectedRowsCount(); + + if (selectedRowsCount <= 0) { + // reset select all checkbox if nothing selected + this.selectAllCheckbox = false; + this.selectAllCheckboxSomeSelected = false; + } else if (selectedRowsCount < this.model.data.length) { + this.selectAllCheckbox = true; + this.selectAllCheckboxSomeSelected = true; + } else { + this.selectAllCheckbox = true; + this.selectAllCheckboxSomeSelected = false; + } + } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts index 887d8d6bfb9..b67a792efcc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts @@ -22,7 +22,7 @@ export class CdDatePipe implements PipeTransform { .local() .format('D/M/YY hh:mm A'); } else { - 
value = value?.replace('Z', ''); + value = value?.replace?.('Z', ''); date = moment.parseZone(value).utc().utcOffset(offset).local().format('D/M/YY hh:mm A'); } return date; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts index db91422e8a3..cf7662eac65 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts @@ -540,7 +540,7 @@ export class TaskMessageService { } service(metadata: any) { - return $localize`Service '${metadata.service_name}'`; + return $localize`service '${metadata.service_name}'`; } crudMessage(metadata: any) { diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index b5a34304ada..1fb9771795f 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8182,7 +8182,12 @@ paths: - NFS-Ganesha /api/nvmeof/gateway: get: - parameters: [] + parameters: + - allowEmptyValue: true + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8203,7 +8208,7 @@ paths: summary: Get information about the NVMeoF gateway tags: - NVMe-oF Gateway - /api/nvmeof/subsystem: + /api/nvmeof/gateway/group: get: parameters: [] responses: @@ -8223,6 +8228,33 @@ paths: trace. security: - jwt: [] + tags: + - NVMe-oF Gateway + /api/nvmeof/subsystem: + get: + parameters: + - allowEmptyValue: true + in: query + name: gw_group + schema: + type: string + responses: + '200': + content: + application/vnd.ceph.api.v1.0+json: + type: object + description: OK + '400': + description: Operation exception. Please check the response body for details. + '401': + description: Unauthenticated access. Please login first. + '403': + description: Unauthorized access. Please check your permissions. + '500': + description: Unexpected error. Please check the response body for the stack + trace. + security: + - jwt: [] summary: List all NVMeoF subsystems tags: - NVMe-oF Subsystem @@ -8236,8 +8268,10 @@ paths: enable_ha: description: Enable high availability type: boolean + gw_group: + type: string max_namespaces: - default: 256 + default: 1024 description: Maximum number of namespaces type: integer nqn: @@ -8287,6 +8321,11 @@ paths: name: force schema: type: boolean + - allowEmptyValue: true + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8320,6 +8359,11 @@ paths: required: true schema: type: string + - allowEmptyValue: true + in: query + name: gw_group + schema: + type: string responses: '200': content: diff --git a/src/pybind/mgr/dashboard/services/nvmeof_cli.py b/src/pybind/mgr/dashboard/services/nvmeof_cli.py index 75a121a48d6..11a95237053 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_cli.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_cli.py @@ -19,13 +19,13 @@ def list_nvmeof_gateways(_): @CLIWriteCommand('dashboard nvmeof-gateway-add') @CLICheckNonemptyFileInput(desc='NVMe-oF gateway configuration') -def add_nvmeof_gateway(_, inbuf, name: str): +def add_nvmeof_gateway(_, inbuf, name: str, group: str, daemon_name: str): ''' Add NVMe-oF gateway configuration. 
Gateway URL read from -i <file> ''' service_url = inbuf try: - NvmeofGatewaysConfig.add_gateway(name, service_url) + NvmeofGatewaysConfig.add_gateway(name, service_url, group, daemon_name) return 0, 'Success', '' except NvmeofGatewayAlreadyExists as ex: return -errno.EEXIST, '', str(ex) diff --git a/src/pybind/mgr/dashboard/services/nvmeof_client.py b/src/pybind/mgr/dashboard/services/nvmeof_client.py index 019ecf0267c..d6b126500b0 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_client.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_client.py @@ -22,13 +22,24 @@ else: class NVMeoFClient(object): pb2 = pb2 - def __init__(self): + def __init__(self, gw_group: Optional[str] = None): logger.info("Initiating nvmeof gateway connection...") - service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info() + try: + if not gw_group: + service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info() + else: + service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info( + gw_group + ) + except TypeError as e: + raise DashboardException( + f'Unable to retrieve the gateway info: {e}' + ) root_ca_cert = NvmeofGatewaysConfig.get_root_ca_cert(service_name) - client_key = NvmeofGatewaysConfig.get_client_key(service_name) - client_cert = NvmeofGatewaysConfig.get_client_cert(service_name) + if root_ca_cert: + client_key = NvmeofGatewaysConfig.get_client_key(service_name) + client_cert = NvmeofGatewaysConfig.get_client_cert(service_name) if root_ca_cert and client_key and client_cert: logger.info('Securely connecting to: %s', self.gateway_addr) diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py index 3f076d7b216..a5a9979af25 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py @@ -51,11 +51,29 @@ class NvmeofGatewaysConfig(object): return cls._load_config_from_store() @classmethod - def add_gateway(cls, name, service_url): + def add_gateway(cls, name, service_url, group, daemon_name): config = cls.get_gateways_config() - if name in config: - raise NvmeofGatewayAlreadyExists(name) - config['gateways'][name] = {'service_url': service_url} + + if name in config.get('gateways', {}): + existing_gateways = config['gateways'][name] + for gateway in existing_gateways: + if 'daemon_name' not in gateway: + gateway['daemon_name'] = daemon_name + break + if gateway['service_url'] == service_url: + return + + new_gateway = { + 'service_url': service_url, + 'group': group, + 'daemon_name': daemon_name + } + + if name in config.get('gateways', {}): + config['gateways'][name].append(new_gateway) + else: + config['gateways'][name] = [new_gateway] + cls._save_config(config) @classmethod @@ -67,12 +85,18 @@ class NvmeofGatewaysConfig(object): cls._save_config(config) @classmethod - def get_service_info(cls): + def get_service_info(cls, group=None): try: config = cls.get_gateways_config() - service_name = list(config['gateways'].keys())[0] - addr = config['gateways'][service_name]['service_url'] - return service_name, addr + gateways = config.get('gateways', {}) + if not gateways: + return None + + if group: + return cls._get_name_url_for_group(gateways, group) + + return cls._get_default_service(gateways) + except (KeyError, IndexError) as e: raise DashboardException( msg=f'NVMe-oF configuration is not set: {e}', @@ -112,3 +136,45 @@ class NvmeofGatewaysConfig(object): # just return None if any orchestrator error is raised # otherwise nvmeof api will raise this 
error and doesn't proceed. return None + + @classmethod + def _get_name_url_for_group(cls, gateways, group): + try: + orch = OrchClient.instance() + for service_name, svc_config in gateways.items(): + # get the group name of the service and match it against the + # group name provided + group_name_from_svc = orch.services.get(service_name)[0].spec.group + if group == group_name_from_svc: + running_daemons = cls._get_running_daemons(orch, service_name) + config = cls._get_running_daemon_svc_config(svc_config, running_daemons) + + if config: + return service_name, config['service_url'] + return None + + except OrchestratorError: + return cls._get_default_service(gateways) + + @classmethod + def _get_running_daemons(cls, orch, service_name): + # get the running nvmeof daemons + daemons = [d.to_dict() + for d in orch.services.list_daemons(service_name=service_name)] + return [d['daemon_name'] for d in daemons + if d['status_desc'] == 'running'] + + @classmethod + def _get_running_daemon_svc_config(cls, svc_config, running_daemons): + try: + return next(config for config in svc_config + if config['daemon_name'] in running_daemons) + except StopIteration: + return None + + @classmethod + def _get_default_service(cls, gateways): + if gateways: + service_name = list(gateways.keys())[0] + return service_name, gateways[service_name][0]['service_url'] + return None diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py index f362219221b..dea45f951f8 100644 --- a/src/pybind/mgr/smb/enums.py +++ b/src/pybind/mgr/smb/enums.py @@ -16,6 +16,23 @@ else: # pragma: no cover class CephFSStorageProvider(_StrEnum): KERNEL_MOUNT = 'kcephfs' SAMBA_VFS = 'samba-vfs' + SAMBA_VFS_CLASSIC = 'samba-vfs/classic' + SAMBA_VFS_NEW = 'samba-vfs/new' + + def expand(self) -> 'CephFSStorageProvider': + """Expand abbreviated/default values into the full/expanded form.""" + if self == self.SAMBA_VFS: + # mypy gets confused by enums + return self.__class__(self.SAMBA_VFS_NEW) + return self + + def is_vfs(self) -> bool: + """Return true if value is a samba vfs provider.""" + return self in { + self.SAMBA_VFS, + self.SAMBA_VFS_CLASSIC, + self.SAMBA_VFS_NEW, + } class SubSystem(_StrEnum): diff --git a/src/pybind/mgr/smb/handler.py b/src/pybind/mgr/smb/handler.py index b2285eef575..fc93a7277cb 100644 --- a/src/pybind/mgr/smb/handler.py +++ b/src/pybind/mgr/smb/handler.py @@ -620,11 +620,6 @@ class ClusterConfigHandler: change_group.cluster.cluster_id, set(change_group.cache), ) - external.rm_other_in_ns( - self.public_store, - change_group.cluster.cluster_id, - set(change_group.cache), - ) # ensure a entity exists with access to the volumes for volume in vols: @@ -1025,7 +1020,7 @@ def _generate_share( share: resources.Share, resolver: PathResolver, cephx_entity: str ) -> Dict[str, Dict[str, str]]: assert share.cephfs is not None - assert share.cephfs.provider == CephFSStorageProvider.SAMBA_VFS + assert share.cephfs.provider.is_vfs(), "not a vfs provider" assert cephx_entity, "cephx entity name missing" # very annoyingly, samba's ceph module absolutely must NOT have the # "client." bit in front. 
JJM has been tripped up by this multiple times - @@ -1040,15 +1035,24 @@ def _generate_share( share.cephfs.subvolume, share.cephfs.path, ) + try: + ceph_vfs = { + CephFSStorageProvider.SAMBA_VFS_CLASSIC: 'ceph', + CephFSStorageProvider.SAMBA_VFS_NEW: 'ceph_new', + }[share.checked_cephfs.provider.expand()] + except KeyError: + raise ValueError( + f'unsupported provider: {share.checked_cephfs.provider}' + ) cfg = { # smb.conf options 'options': { 'path': path, - "vfs objects": "acl_xattr ceph", + "vfs objects": f"acl_xattr {ceph_vfs}", 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph:config_file': '/etc/ceph/ceph.conf', - 'ceph:filesystem': share.cephfs.volume, - 'ceph:user_id': cephx_entity, + f'{ceph_vfs}:config_file': '/etc/ceph/ceph.conf', + f'{ceph_vfs}:filesystem': share.cephfs.volume, + f'{ceph_vfs}:user_id': cephx_entity, 'read only': ynbool(share.readonly), 'browseable': ynbool(share.browseable), 'kernel share modes': 'no', diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py index 547e6c712bc..86a2310a4de 100644 --- a/src/pybind/mgr/smb/tests/test_smb.py +++ b/src/pybind/mgr/smb/tests/test_smb.py @@ -431,11 +431,11 @@ def test_share_dump_config(tmodule): 'browseable': 'Yes', 'kernel share modes': 'no', 'x:ceph:id': 'foo.s1', - 'vfs objects': 'acl_xattr ceph', + 'vfs objects': 'acl_xattr ceph_new', 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph:config_file': '/etc/ceph/ceph.conf', - 'ceph:filesystem': 'cephfs', - 'ceph:user_id': 'smb.fs.cluster.foo', + 'ceph_new:config_file': '/etc/ceph/ceph.conf', + 'ceph_new:filesystem': 'cephfs', + 'ceph_new:user_id': 'smb.fs.cluster.foo', }, }, 'Ess Two': { @@ -445,11 +445,11 @@ def test_share_dump_config(tmodule): 'browseable': 'Yes', 'kernel share modes': 'no', 'x:ceph:id': 'foo.stwo', - 'vfs objects': 'acl_xattr ceph', + 'vfs objects': 'acl_xattr ceph_new', 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph:config_file': '/etc/ceph/ceph.conf', - 'ceph:filesystem': 'cephfs', - 'ceph:user_id': 'smb.fs.cluster.foo', + 'ceph_new:config_file': '/etc/ceph/ceph.conf', + 'ceph_new:filesystem': 'cephfs', + 'ceph_new:user_id': 'smb.fs.cluster.foo', }, }, }, diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py index 1c46ccc6933..b58f20f1275 100644 --- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py +++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py @@ -361,7 +361,7 @@ class SnapSchedClient(CephfsClient): path = sched.path prune_candidates = set() time = datetime.now(timezone.utc) - mds_max_snaps_per_dir = self.mgr.get_ceph_option('mds_max_snaps_per_dir') + mds_max_snaps_per_dir = self.mgr.get_foreign_ceph_option('mds', 'mds_max_snaps_per_dir') with open_filesystem(self, sched.fs) as fs_handle: snap_dir = self.mgr.rados.conf_get('client_snapdir') with fs_handle.opendir(f'{path}/{snap_dir}') as d_handle: diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 2b1afc141d8..2238cd01c37 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1315,6 +1315,7 @@ class NvmeofServiceSpec(ServiceSpec): service_id: Optional[str] = None, name: Optional[str] = None, group: Optional[str] = None, + addr: Optional[str] = None, port: Optional[int] = None, pool: Optional[str] = None, enable_auth: bool = False, @@ -1347,6 +1348,7 @@ class NvmeofServiceSpec(ServiceSpec): transport_tcp_options: Optional[Dict[str, int]] = 
{"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}, tgt_cmd_extra_args: Optional[str] = None, + discovery_addr: Optional[str] = None, discovery_port: Optional[int] = None, log_level: Optional[str] = 'INFO', log_files_enabled: Optional[bool] = True, @@ -1378,6 +1380,8 @@ class NvmeofServiceSpec(ServiceSpec): #: RADOS pool where ceph-nvmeof config data is stored. self.pool = pool + #: ``addr`` address of the nvmeof gateway + self.addr = addr or "0.0.0.0" #: ``port`` port of the nvmeof gateway self.port = port or 5500 #: ``name`` name of the nvmeof gateway @@ -1442,6 +1446,8 @@ class NvmeofServiceSpec(ServiceSpec): self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options #: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process self.tgt_cmd_extra_args = tgt_cmd_extra_args + #: ``discovery_addr`` address of the discovery service + self.discovery_addr = discovery_addr or "0.0.0.0" #: ``discovery_port`` port of the discovery service self.discovery_port = discovery_port or 8009 #: ``log_level`` the nvmeof gateway log level diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc index 0558dbf461e..cf6820a9111 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.cc +++ b/src/rgw/driver/daos/rgw_sal_daos.cc @@ -1196,7 +1196,8 @@ int DaosObject::DaosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, } int DaosObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) { + uint32_t flags, std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { ldpp_dout(dpp, 20) << "DEBUG: delete_object" << dendl; DaosObject::DaosDeleteOp del_op(this); del_op.params.bucket_owner = bucket->get_info().owner; @@ -1677,7 +1678,8 @@ int DaosMultipartUpload::complete( map<int, string>& part_etags, list<rgw_obj_index_key>& remove_objs, uint64_t& accounted_size, bool& compressed, RGWCompressionInfo& cs_info, off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) { + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl; char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; @@ -1922,6 +1924,15 @@ int DaosMultipartUpload::complete( return ret; } +int DaosMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) +{ + return -ENOTSUP; +} + int DaosMultipartUpload::get_info(const DoutPrefixProvider* dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) { diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h index 8cff96088ee..7cc20260227 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.h +++ b/src/rgw/driver/daos/rgw_sal_daos.h @@ -596,7 +596,8 @@ class DaosObject : public StoreObject { virtual ~DaosObject(); virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) override; + uint32_t flags, std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int copy_object( const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -861,7 +862,13 @@ class DaosMultipartUpload : public StoreMultipartUpload { uint64_t& accounted_size, bool& compressed, RGWCompressionInfo& cs_info, off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* 
target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider* dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; diff --git a/src/rgw/driver/dbstore/common/dbstore.cc b/src/rgw/driver/dbstore/common/dbstore.cc index d548bc4d8c0..b3aed3948c8 100644 --- a/src/rgw/driver/dbstore/common/dbstore.cc +++ b/src/rgw/driver/dbstore/common/dbstore.cc @@ -1973,7 +1973,7 @@ int DB::Object::Delete::create_dm(const DoutPrefixProvider *dpp, } int DB::get_entry(const std::string& oid, const std::string& marker, - std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry) + rgw::sal::LCEntry& entry) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -1982,7 +1982,7 @@ int DB::get_entry(const std::string& oid, const std::string& marker, InitializeParams(dpp, ¶ms); params.op.lc_entry.index = oid; - params.op.lc_entry.entry.set_bucket(marker); + params.op.lc_entry.entry.bucket = marker; params.op.query_str = "get_entry"; ret = ProcessOp(dpp, "GetLCEntry", ¶ms); @@ -1992,14 +1992,8 @@ int DB::get_entry(const std::string& oid, const std::string& marker, goto out; } - if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found - rgw::sal::Lifecycle::LCEntry* e; - e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry); - if (!e) { - ret = -ENOMEM; - goto out; - } - entry->reset(e); + if (params.op.lc_entry.entry.start_time != 0) { //ensure entry found + entry = std::move(params.op.lc_entry.entry); } out: @@ -2007,7 +2001,7 @@ out: } int DB::get_next_entry(const std::string& oid, const std::string& marker, - std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry) + rgw::sal::LCEntry& entry) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -2016,7 +2010,7 @@ int DB::get_next_entry(const std::string& oid, const std::string& marker, InitializeParams(dpp, ¶ms); params.op.lc_entry.index = oid; - params.op.lc_entry.entry.set_bucket(marker); + params.op.lc_entry.entry.bucket = marker; params.op.query_str = "get_next_entry"; ret = ProcessOp(dpp, "GetLCEntry", ¶ms); @@ -2026,21 +2020,15 @@ int DB::get_next_entry(const std::string& oid, const std::string& marker, goto out; } - if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found - rgw::sal::Lifecycle::LCEntry* e; - e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry); - if (!e) { - ret = -ENOMEM; - goto out; - } - entry->reset(e); + if (params.op.lc_entry.entry.start_time != 0) { //ensure entry found + entry = std::move(params.op.lc_entry.entry); } out: return ret; } -int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry) +int DB::set_entry(const std::string& oid, const rgw::sal::LCEntry& entry) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -2063,7 +2051,7 @@ out: } int DB::list_entries(const std::string& oid, const std::string& marker, - uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries) + uint32_t max_entries, std::vector<rgw::sal::LCEntry>& entries) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -2085,14 +2073,14 @@ int DB::list_entries(const std::string& oid, const std::string& marker, } for (auto& entry : 
params.op.lc_entry.list_entries) { - entries.push_back(std::make_unique<rgw::sal::StoreLifecycle::StoreLCEntry>(std::move(entry))); + entries.push_back(std::move(entry)); } out: return ret; } -int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry) +int DB::rm_entry(const std::string& oid, const rgw::sal::LCEntry& entry) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -2114,7 +2102,7 @@ out: return ret; } -int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head) +int DB::get_head(const std::string& oid, rgw::sal::LCHead& head) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); @@ -2131,13 +2119,13 @@ int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LC goto out; } - *head = std::make_unique<rgw::sal::StoreLifecycle::StoreLCHead>(params.op.lc_head.head); + head = std::move(params.op.lc_head.head); out: return ret; } -int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head) +int DB::put_head(const std::string& oid, const rgw::sal::LCHead& head) { int ret = 0; const DoutPrefixProvider *dpp = get_def_dpp(); diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h index 3f8191f5a92..605477ef4ad 100644 --- a/src/rgw/driver/dbstore/common/dbstore.h +++ b/src/rgw/driver/dbstore/common/dbstore.h @@ -104,15 +104,15 @@ struct DBOpObjectDataInfo { struct DBOpLCHeadInfo { std::string index; - rgw::sal::StoreLifecycle::StoreLCHead head; + rgw::sal::LCHead head; }; struct DBOpLCEntryInfo { std::string index; - rgw::sal::StoreLifecycle::StoreLCEntry entry; + rgw::sal::LCEntry entry; // used for list query std::string min_marker; - std::list<rgw::sal::StoreLifecycle::StoreLCEntry> list_entries; + std::list<rgw::sal::LCEntry> list_entries; }; struct DBOpInfo { @@ -1979,15 +1979,15 @@ class DB { RGWObjState *astate, void *arg); int get_entry(const std::string& oid, const std::string& marker, - std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry); + rgw::sal::LCEntry& entry); int get_next_entry(const std::string& oid, const std::string& marker, - std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry); - int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry); + rgw::sal::LCEntry& entry); + int set_entry(const std::string& oid, const rgw::sal::LCEntry& entry); int list_entries(const std::string& oid, const std::string& marker, - uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries); - int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry); - int get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head); - int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head); + uint32_t max_entries, std::vector<rgw::sal::LCEntry>& entries); + int rm_entry(const std::string& oid, const rgw::sal::LCEntry& entry); + int get_head(const std::string& oid, rgw::sal::LCHead& head); + int put_head(const std::string& oid, const rgw::sal::LCHead& head); int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket, uint32_t min_wait); int createGC(const DoutPrefixProvider *_dpp); diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc index 554d8fe94cf..f503d67b795 100644 --- a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc +++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc @@ -561,9 +561,9 @@ static int list_lc_entry(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_st return -1; op.lc_entry.index 
= (const char*)sqlite3_column_text(stmt, LCEntryIndex); - op.lc_entry.entry.set_bucket((const char*)sqlite3_column_text(stmt, LCEntryBucketName)); - op.lc_entry.entry.set_start_time(sqlite3_column_int(stmt, LCEntryStartTime)); - op.lc_entry.entry.set_status(sqlite3_column_int(stmt, LCEntryStatus)); + op.lc_entry.entry.bucket = (const char*)sqlite3_column_text(stmt, LCEntryBucketName); + op.lc_entry.entry.start_time = sqlite3_column_int(stmt, LCEntryStartTime); + op.lc_entry.entry.status = sqlite3_column_int(stmt, LCEntryStatus); op.lc_entry.list_entries.push_back(op.lc_entry.entry); @@ -577,10 +577,10 @@ static int list_lc_head(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stm int64_t start_date; op.lc_head.index = (const char*)sqlite3_column_text(stmt, LCHeadIndex); - op.lc_head.head.set_marker((const char*)sqlite3_column_text(stmt, LCHeadMarker)); + op.lc_head.head.marker = (const char*)sqlite3_column_text(stmt, LCHeadMarker); SQL_DECODE_BLOB_PARAM(dpp, stmt, LCHeadStartDate, start_date, sdb); - op.lc_head.head.get_start_date() = start_date; + op.lc_head.head.start_date = start_date; return 0; } @@ -2692,13 +2692,13 @@ int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *par SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb); - SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb); - SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.status, sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb); - SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb); + SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.start_time, sdb); out: return rc; @@ -2741,7 +2741,7 @@ int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *par SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb); - SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb); out: return rc; @@ -2796,7 +2796,7 @@ int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb); SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb); - SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb); + SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb); out: return rc; @@ -2892,7 +2892,7 @@ int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *para SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb); - SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb); + SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.marker.c_str(), sdb); SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb); SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast<int64_t>(params->op.lc_head.head.start_date), sdb); diff --git a/src/rgw/driver/dbstore/tests/dbstore_tests.cc 
b/src/rgw/driver/dbstore/tests/dbstore_tests.cc index c89addeade1..2ceed7218d8 100644 --- a/src/rgw/driver/dbstore/tests/dbstore_tests.cc +++ b/src/rgw/driver/dbstore/tests/dbstore_tests.cc @@ -1255,31 +1255,30 @@ TEST_F(DBStoreTest, LCHead) { std::string index1 = "bucket1"; std::string index2 = "bucket2"; time_t lc_time = ceph_clock_now(); - std::unique_ptr<rgw::sal::Lifecycle::LCHead> head; - std::string ents[] = {"entry1", "entry2", "entry3"}; - rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]); - rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]); - rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]); + rgw::sal::LCHead head; + rgw::sal::LCHead head1{lc_time, "entry1"}; + rgw::sal::LCHead head2{lc_time, "entry2"}; + rgw::sal::LCHead head3{lc_time, "entry3"}; ret = db->put_head(index1, head1); ASSERT_EQ(ret, 0); ret = db->put_head(index2, head2); ASSERT_EQ(ret, 0); - ret = db->get_head(index1, &head); + ret = db->get_head(index1, head); ASSERT_EQ(ret, 0); - ASSERT_EQ(head->get_marker(), "entry1"); + ASSERT_EQ(head.marker, "entry1"); - ret = db->get_head(index2, &head); + ret = db->get_head(index2, head); ASSERT_EQ(ret, 0); - ASSERT_EQ(head->get_marker(), "entry2"); + ASSERT_EQ(head.marker, "entry2"); // update index1 ret = db->put_head(index1, head3); ASSERT_EQ(ret, 0); - ret = db->get_head(index1, &head); + ret = db->get_head(index1, head); ASSERT_EQ(ret, 0); - ASSERT_EQ(head->get_marker(), "entry3"); + ASSERT_EQ(head.marker, "entry3"); } TEST_F(DBStoreTest, LCEntry) { @@ -1290,13 +1289,13 @@ TEST_F(DBStoreTest, LCEntry) { std::string index2 = "lcindex2"; typedef enum {lc_uninitial = 1, lc_complete} status; std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"}; - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; - rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial); - rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial); - rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial); - rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial); + rgw::sal::LCEntry entry; + rgw::sal::LCEntry entry1{ents[0], lc_time, lc_uninitial}; + rgw::sal::LCEntry entry2{ents[1], lc_time, lc_uninitial}; + rgw::sal::LCEntry entry3{ents[2], lc_time, lc_uninitial}; + rgw::sal::LCEntry entry4{ents[3], lc_time, lc_uninitial}; - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries; + vector<rgw::sal::LCEntry> lc_entries; ret = db->set_entry(index1, entry1); ASSERT_EQ(ret, 0); @@ -1308,44 +1307,44 @@ TEST_F(DBStoreTest, LCEntry) { ASSERT_EQ(ret, 0); // get entry index1, entry1 - ret = db->get_entry(index1, ents[0], &entry); + ret = db->get_entry(index1, ents[0], entry); ASSERT_EQ(ret, 0); - ASSERT_EQ(entry->get_status(), lc_uninitial); - ASSERT_EQ(entry->get_start_time(), lc_time); + ASSERT_EQ(entry.status, lc_uninitial); + ASSERT_EQ(entry.start_time, lc_time); // get next entry index1, entry2 - ret = db->get_next_entry(index1, ents[1], &entry); + ret = db->get_next_entry(index1, ents[1], entry); ASSERT_EQ(ret, 0); - ASSERT_EQ(entry->get_bucket(), ents[2]); - ASSERT_EQ(entry->get_status(), lc_uninitial); - ASSERT_EQ(entry->get_start_time(), lc_time); + ASSERT_EQ(entry.bucket, ents[2]); + ASSERT_EQ(entry.status, lc_uninitial); + ASSERT_EQ(entry.start_time, lc_time); // update entry4 to entry5 entry4.status = lc_complete; ret = db->set_entry(index2, entry4); ASSERT_EQ(ret, 0); - ret = db->get_entry(index2, ents[3], &entry); + ret = db->get_entry(index2, 
ents[3], entry); ASSERT_EQ(ret, 0); - ASSERT_EQ(entry->get_status(), lc_complete); + ASSERT_EQ(entry.status, lc_complete); // list entries ret = db->list_entries(index1, "", 5, lc_entries); ASSERT_EQ(ret, 0); for (const auto& ent: lc_entries) { cout << "###################### \n"; - cout << "lc entry.bucket : " << ent->get_bucket() << "\n"; - cout << "lc entry.status : " << ent->get_status() << "\n"; + cout << "lc entry.bucket : " << ent.bucket << "\n"; + cout << "lc entry.status : " << ent.status << "\n"; } // remove index1, entry3 ret = db->rm_entry(index1, entry3); ASSERT_EQ(ret, 0); - // get next entry index1, entry2.. should be null - entry.release(); - ret = db->get_next_entry(index1, ents[1], &entry); + // get next entry index1, entry2.. should be empty + entry = rgw::sal::LCEntry{}; + ret = db->get_next_entry(index1, ents[1], entry); ASSERT_EQ(ret, 0); - ASSERT_EQ(entry.get(), nullptr); + ASSERT_TRUE(entry.bucket.empty()); } TEST_F(DBStoreTest, RemoveBucket) { diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc index bb3af3a63a1..b999673ac18 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.cc +++ b/src/rgw/driver/motr/rgw_sal_motr.cc @@ -585,7 +585,7 @@ int MotrBucket::remove(const DoutPrefixProvider *dpp, bool delete_children, opti std::unique_ptr<rgw::sal::Object> object = get_object(key); - ret = object->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP); + ret = object->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr); if (ret < 0 && ret != -ENOENT) { ldpp_dout(dpp, 0) << "ERROR: remove_bucket rgw_remove_object failed rc=" << ret << dendl; return ret; @@ -1502,7 +1502,11 @@ int MotrObject::MotrDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional return 0; } -int MotrObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags) +int MotrObject::delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { MotrObject::MotrDeleteOp del_op(this); del_op.params.bucket_owner = bucket->get_info().owner; @@ -2668,7 +2672,8 @@ int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp, RGWCompressionInfo& cs_info, off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; @@ -2878,6 +2883,15 @@ int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp, M0_IC_DEL, meta_obj->get_key().get_oid(), bl); } +int MotrMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) +{ + return -ENOTSUP; +} + int MotrMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) { if (!rule && !attrs) { diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h index 4d9c189993b..f92074b9d94 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.h +++ b/src/rgw/driver/motr/rgw_sal_motr.h @@ -657,7 +657,9 @@ class MotrObject : public StoreObject { virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) override; + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int
copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -933,7 +935,13 @@ public: RGWCompressionInfo& cs_info, off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp, optional_yield y, diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc index 8ffe4a0d0ca..d676213727e 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.cc +++ b/src/rgw/driver/posix/rgw_sal_posix.cc @@ -2730,7 +2730,9 @@ int POSIXBucket::rename(const DoutPrefixProvider* dpp, optional_yield y, Object* int POSIXObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { POSIXBucket *b = static_cast<POSIXBucket*>(get_bucket()); if (!b) { @@ -3505,7 +3507,7 @@ int POSIXObject::POSIXReadOp::get_attr(const DoutPrefixProvider* dpp, const char int POSIXObject::POSIXDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags) { - return source->delete_object(dpp, y, flags); + return source->delete_object(dpp, y, flags, nullptr, nullptr); } int POSIXObject::copy(const DoutPrefixProvider *dpp, optional_yield y, @@ -3703,7 +3705,8 @@ int POSIXMultipartUpload::complete(const DoutPrefixProvider *dpp, RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; @@ -3852,6 +3855,15 @@ int POSIXMultipartUpload::complete(const DoutPrefixProvider *dpp, return 0; } +int POSIXMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) +{ + return -ENOTSUP; +} + int POSIXMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) { diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h index 5291ba500f9..efe3bfd7a50 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.h +++ b/src/rgw/driver/posix/rgw_sal_posix.h @@ -632,7 +632,9 @@ public: virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) override; + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -879,7 +881,13 @@ public: RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + 
CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) override; diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc index f486b0405af..6bb1e9572da 100644 --- a/src/rgw/driver/rados/rgw_bucket.cc +++ b/src/rgw/driver/rados/rgw_bucket.cc @@ -149,7 +149,7 @@ int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, r std::unique_ptr<rgw::sal::Object> object = bucket->get_object(key); - return object->delete_object(dpp, y, rgw::sal::FLAG_LOG_OP); + return object->delete_object(dpp, y, rgw::sal::FLAG_LOG_OP, nullptr, nullptr); } static void set_err_msg(std::string *sink, std::string msg) diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc index 6b1dd9651f0..6fdc4b91677 100644 --- a/src/rgw/driver/rados/rgw_notify.cc +++ b/src/rgw/driver/rados/rgw_notify.cc @@ -60,22 +60,19 @@ auto make_stack_allocator() { const std::string Q_LIST_OBJECT_NAME = "queues_list_object"; struct PublishCommitCompleteArg { + PublishCommitCompleteArg(const std::string& _queue_name, CephContext* _cct) + : queue_name{_queue_name}, cct{_cct} {} - PublishCommitCompleteArg(std::string _queue_name, const DoutPrefixProvider *_dpp) - : queue_name{std::move(_queue_name)}, dpp{_dpp} {} - - std::string queue_name; - const DoutPrefixProvider *dpp; + const std::string queue_name; + CephContext* const cct; }; -void publish_commit_completion(rados_completion_t completion, void *arg) { - auto *comp_obj = reinterpret_cast<librados::AioCompletionImpl *>(completion); - std::unique_ptr<PublishCommitCompleteArg> pcc_arg(reinterpret_cast<PublishCommitCompleteArg *>(arg)); - if (comp_obj->get_return_value() < 0) { - ldpp_dout(pcc_arg->dpp, 1) << "ERROR: failed to commit reservation to queue: " - << pcc_arg->queue_name << ". error: " << comp_obj->get_return_value() - << dendl; - } +void publish_commit_completion(rados_completion_t completion, void* arg) { + std::unique_ptr<PublishCommitCompleteArg> pcc_args{reinterpret_cast<PublishCommitCompleteArg*>(arg)}; + if (const auto rc = rados_aio_get_return_value(completion); rc < 0) { + ldout(pcc_args->cct, 1) << "ERROR: failed to commit reservation to queue: " + << pcc_args->queue_name << ". 
error: " << rc << dendl; + } }; class Manager : public DoutPrefixProvider { @@ -1243,19 +1240,17 @@ int publish_commit(rgw::sal::Object* obj, std::vector<buffer::list> bl_data_vec{std::move(bl)}; librados::ObjectWriteOperation op; cls_2pc_queue_commit(op, bl_data_vec, topic.res_id); - aio_completion_ptr completion {librados::Rados::aio_create_completion()}; - auto pcc_arg = make_unique<PublishCommitCompleteArg>(queue_name, dpp); - completion->set_complete_callback(pcc_arg.get(), publish_commit_completion); - auto &io_ctx = res.store->getRados()->get_notif_pool_ctx(); - int ret = io_ctx.aio_operate(queue_name, completion.get(), &op); topic.res_id = cls_2pc_reservation::NO_ID; - if (ret < 0) { + auto pcc_arg = make_unique<PublishCommitCompleteArg>(queue_name, dpp->get_cct()); + aio_completion_ptr completion{librados::Rados::aio_create_completion(pcc_arg.get(), publish_commit_completion)}; + auto& io_ctx = res.store->getRados()->get_notif_pool_ctx(); + if (const int ret = io_ctx.aio_operate(queue_name, completion.get(), &op); ret < 0) { ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: " << queue_name << ". error: " << ret << dendl; return ret; } + // args will be released inside the callback pcc_arg.release(); - completion.release(); } else { try { // TODO add endpoint LRU cache diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc index c285443d0b0..a5d788ea469 100644 --- a/src/rgw/driver/rados/rgw_object_expirer_core.cc +++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc @@ -225,7 +225,7 @@ int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objex std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key); obj->set_atomic(); - ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP); + ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr); return ret; } diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc index 79be7b3209b..f04ed1db8d4 100644 --- a/src/rgw/driver/rados/rgw_putobj_processor.cc +++ b/src/rgw/driver/rados/rgw_putobj_processor.cc @@ -22,6 +22,8 @@ #include "services/svc_zone.h" #include "rgw_sal_rados.h" +#include "cls/version/cls_version_client.h" + #define dout_subsys ceph_subsys_rgw using namespace std; @@ -573,7 +575,9 @@ int MultipartObjectProcessor::complete( } librados::ObjectWriteOperation op; + op.assert_exists(); cls_rgw_mp_upload_part_info_update(op, p, info); + cls_version_inc(op); r = rgw_rados_operate(rctx.dpp, meta_obj_ref.ioctx, meta_obj_ref.obj.oid, &op, rctx.y); ldpp_dout(rctx.dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl; @@ -588,8 +592,10 @@ int MultipartObjectProcessor::complete( op = librados::ObjectWriteOperation{}; op.assert_exists(); // detect races with abort op.omap_set(m); + cls_version_inc(op); r = rgw_rados_operate(rctx.dpp, meta_obj_ref.ioctx, meta_obj_ref.obj.oid, &op, rctx.y); } + if (r < 0) { return r == -ENOENT ? 
-ERR_NO_SUCH_UPLOAD : r; } diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index 1f1d75cb07b..a967030f622 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -5857,6 +5857,10 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi store->remove_rgw_head_obj(op); + if (params.check_objv != nullptr) { + cls_version_check(op, *params.check_objv, VER_COND_EQ); + } + auto& ioctx = ref.ioctx; r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); @@ -6051,7 +6055,7 @@ int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *oc int r = -ENOENT; if (!assume_noent) { - r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y); + r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), &s->objv_tracker, y); } if (r == -ENOENT) { @@ -6784,6 +6788,10 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider * return -ENOENT; } + if (params.objv_tracker) { + *params.objv_tracker = astate->objv_tracker; + } + RGWBucketInfo& bucket_info = source->get_bucket_info(); if (params.part_num) { @@ -8781,6 +8789,7 @@ int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp, if (first_chunk) { op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL); } + bufferlist outbl; r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y); @@ -9388,13 +9397,6 @@ int RGWRados::process_gc(bool expired_only, optional_yield y) return gc->process(expired_only, y); } -int RGWRados::list_lc_progress(string& marker, uint32_t max_entries, - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map, - int& index) -{ - return lc->list_lc_progress(marker, max_entries, progress_map, index); -} - int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket) { RGWLC lc; diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index d4aabe7ba76..9d520c12a1e 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -771,6 +771,7 @@ public: uint64_t *epoch; int* part_num = nullptr; std::optional<int> parts_count; + RGWObjVersionTracker *objv_tracker = nullptr; Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), target_obj(nullptr), epoch(nullptr) @@ -854,8 +855,9 @@ public: rgw_zone_set *zones_trace; bool abortmp; uint64_t parts_accounted_size; + obj_version *check_objv; - DeleteParams() : versioning_status(0), null_verid(false), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} + DeleteParams() : versioning_status(0), null_verid(false), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0), check_objv(nullptr) {} } params; struct DeleteResult { @@ -1571,9 +1573,6 @@ public: int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y); int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket); - int list_lc_progress(std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map, - int& index); int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, std::map<RGWObjCategory, RGWStorageStats> *existing_stats, diff 
--git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index 50c738d5435..e3b435d2839 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -2318,6 +2318,7 @@ int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Rea read_op.params.target_obj = target_obj; read_op.params.obj_size = &state.size; read_op.params.lastmod = &state.mtime; + read_op.params.objv_tracker = &state.objv_tracker; return read_op.prepare(y, dpp); } @@ -2848,6 +2849,9 @@ int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, option parent_op.params.abortmp = params.abortmp; parent_op.params.parts_accounted_size = params.parts_accounted_size; parent_op.params.null_verid = params.null_verid; + if (params.objv_tracker) { + parent_op.params.check_objv = params.objv_tracker->version_for_check(); + } int ret = parent_op.delete_obj(y, dpp, flags & FLAG_LOG_OP); if (ret < 0) @@ -2861,7 +2865,9 @@ int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, option int RadosObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { RGWRados::Object del_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); RGWRados::Object::Delete del_op(&del_target); @@ -2869,6 +2875,10 @@ int RadosObject::delete_object(const DoutPrefixProvider* dpp, del_op.params.bucket_owner = bucket->get_info().owner; del_op.params.versioning_status = (flags & FLAG_PREVENT_VERSIONING) ? 0 : bucket->get_info().versioning_status(); + del_op.params.remove_objs = remove_objs; + if (objv) { + del_op.params.check_objv = objv->version_for_check(); + } return del_op.delete_obj(y, dpp, flags & FLAG_LOG_OP); } @@ -2964,13 +2974,84 @@ int RadosObject::swift_versioning_copy(const ACLOwner& owner, const rgw_user& re y); } +int RadosMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) +{ + bool truncated; + int ret; + int max_parts = 1000; + int marker = 0; + cls_rgw_obj_chain chain; + + do { + ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated, y); + + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << ret << dendl; + return (ret == -ENOENT) ? 
-ERR_NO_SUCH_UPLOAD : ret; + } + + for (auto part_it = parts.begin(); part_it != parts.end(); ++part_it) { + RadosMultipartPart* part = dynamic_cast<RadosMultipartPart*>(part_it->second.get()); + + auto& part_prefixes = processed_prefixes[part->info.num]; + + if (!part->info.manifest.empty()) { + auto manifest_prefix = part->info.manifest.get_prefix(); + if (not manifest_prefix.empty() && part_prefixes.find(manifest_prefix) == part_prefixes.end()) { + store->getRados()->update_gc_chain(dpp, obj, part->info.manifest, &chain); + + RGWObjManifest::obj_iterator oiter = part->info.manifest.obj_begin(dpp); + if (oiter != part->info.manifest.obj_end(dpp)) { + rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados()); + + rgw_obj head_obj; + RGWSI_Tier_RADOS::raw_obj_to_obj(bucket->get_key(), raw_head, &head_obj); + + rgw_obj_index_key remove_key; + head_obj.key.get_index_key(&remove_key); + remove_objs.push_back(remove_key); + } + } + } + cleanup_part_history(dpp, y, part, remove_objs, part_prefixes); + } + } while (truncated); + + if (store->getRados()->get_gc() == nullptr) { + //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified) + store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y); + } else { + /* use upload id as tag and do it synchronously */ + auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y); + if (ret < 0 && leftover_chain) { + ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + //Delete objects inline if send chain to gc fails + store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y); + } + } + return 0; +} + int RadosMultipartUpload::cleanup_part_history(const DoutPrefixProvider* dpp, optional_yield y, RadosMultipartPart *part, - list<rgw_obj_index_key>& remove_objs) + list<rgw_obj_index_key>& remove_objs, + boost::container::flat_set<std::string>& processed_prefixes) { cls_rgw_obj_chain chain; for (auto& ppfx : part->get_past_prefixes()) { + auto [it, inserted] = processed_prefixes.emplace(ppfx); + if (!inserted) { + continue; // duplicate + } + rgw_obj past_obj; past_obj.init_ns(bucket->get_key(), ppfx + "." + std::to_string(part->info.num), mp_ns); rgw_obj_index_key past_key; @@ -3018,77 +3099,105 @@ int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct, int ret; uint64_t parts_accounted_size = 0; - do { - ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated, y); + prefix_map_t processed_prefixes; + + static constexpr auto MAX_DELETE_RETRIES = 15u; + for (auto i = 0u; i < MAX_DELETE_RETRIES; i++) { + ret = meta_obj->get_obj_attrs(y, dpp); if (ret < 0) { - ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << - ret << dendl; + ldpp_dout(dpp, 0) << __func__ << ": ERROR: failed to get obj attrs, obj=" << meta_obj + << " ret=" << ret << dendl; return (ret == -ENOENT) ? 
-ERR_NO_SUCH_UPLOAD : ret; } - for (auto part_it = parts.begin(); - part_it != parts.end(); - ++part_it) { - RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get()); - if (obj_part->info.manifest.empty()) { - std::unique_ptr<rgw::sal::Object> obj = bucket->get_object( - rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART)); - obj->set_hash_source(mp_obj.get_key()); - ret = obj->delete_object(dpp, y, 0); - if (ret < 0 && ret != -ENOENT) - return ret; - } else { - auto target = meta_obj->get_obj(); - store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain); - RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp); - if (oiter != obj_part->info.manifest.obj_end(dpp)) { - std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key()); - rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados()); - dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head); - - rgw_obj_index_key key; - head->get_key().get_index_key(&key); - remove_objs.push_back(key); - - cleanup_part_history(dpp, null_yield, obj_part, remove_objs); + RGWObjVersionTracker objv_tracker = meta_obj->get_version_tracker(); + + do { + ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated, y); + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << ret << dendl; + return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret; + } + + for (auto part_it = parts.begin(); part_it != parts.end(); ++part_it) { + RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get()); + + if (obj_part->info.manifest.empty()) { + std::unique_ptr<rgw::sal::Object> obj = bucket->get_object( + rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART)); + obj->set_hash_source(mp_obj.get_key()); + ret = obj->delete_object(dpp, y, 0, nullptr, nullptr); + if (ret < 0 && ret != -ENOENT) + return ret; + } else { + auto manifest_prefix = obj_part->info.manifest.get_prefix(); + auto [it, inserted] = processed_prefixes.emplace(obj_part->info.num, boost::container::flat_set<std::string>{}); + if (not manifest_prefix.empty()) { + if (it->second.find(manifest_prefix) != it->second.end()) { + continue; + } + it->second.emplace(manifest_prefix); + } + + auto target = meta_obj->get_obj(); + store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain); + RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp); + if (oiter != obj_part->info.manifest.obj_end(dpp)) { + std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key()); + rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados()); + dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head); + + rgw_obj_index_key key; + head->get_key().get_index_key(&key); + remove_objs.push_back(key); + + cleanup_part_history(dpp, null_yield, obj_part, remove_objs, it->second); + } } + parts_accounted_size += obj_part->info.accounted_size; } - parts_accounted_size += obj_part->info.accounted_size; - } - } while (truncated); + } while (truncated); - if (store->getRados()->get_gc() == nullptr) { - //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified) - store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y); - } else { - /* use upload id as tag and do it synchronously */ - auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y); - if (ret < 0 
&& leftover_chain) { - ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; - if (ret == -ENOENT) { - return -ERR_NO_SUCH_UPLOAD; + if (store->getRados()->get_gc() == nullptr) { + //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified) + store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y); + } else { + /* use upload id as tag and do it synchronously */ + auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y); + if (ret < 0 && leftover_chain) { + ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl; + if (ret == -ENOENT) { + return -ERR_NO_SUCH_UPLOAD; + } + //Delete objects inline if send chain to gc fails + store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y); } - //Delete objects inline if send chain to gc fails - store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y); } - } - std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op(); - del_op->params.bucket_owner = bucket->get_info().owner; - del_op->params.versioning_status = 0; - if (!remove_objs.empty()) { - del_op->params.remove_objs = &remove_objs; - } + std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op(); + del_op->params.bucket_owner = bucket->get_info().owner; + del_op->params.versioning_status = 0; + if (!remove_objs.empty()) { + del_op->params.remove_objs = &remove_objs; + } - del_op->params.abortmp = true; - del_op->params.parts_accounted_size = parts_accounted_size; + del_op->params.abortmp = true; + del_op->params.parts_accounted_size = parts_accounted_size; + del_op->params.objv_tracker = &objv_tracker; - // and also remove the metadata obj - ret = del_op->delete_obj(dpp, y, 0); - if (ret < 0) { - ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " << - ret << dendl; + // and also remove the metadata obj + ret = del_op->delete_obj(dpp, y, 0); + if (ret != -ECANCELED) { + if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " << ret << dendl; + } + break; + } + ldpp_dout(dpp, 20) << "deleting meta_obj is cancelled due to mismatch cls_version: " << objv_tracker << dendl; + chain.objs.clear(); + marker = 0; } + return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret; } @@ -3269,7 +3378,8 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; @@ -3341,6 +3451,8 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, rgw_obj src_obj; src_obj.init_ns(bucket->get_key(), oid, mp_ns); + auto [it, inserted] = processed_prefixes.emplace(part->info.num, boost::container::flat_set<std::string>{}); + if (obj_part.manifest.empty()) { ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj=" << src_obj << dendl; @@ -3352,6 +3464,7 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, if (not manifest_prefix.empty()) { // It has an explicit prefix. Override the default one. src_obj.init_ns(bucket->get_key(), manifest_prefix + "." 
+ std::to_string(part->info.num), mp_ns); + it->second.emplace(manifest_prefix); } } @@ -3397,7 +3510,7 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp, remove_objs.push_back(remove_key); - cleanup_part_history(dpp, y, part, remove_objs); + cleanup_part_history(dpp, y, part, remove_objs, it->second); ofs += obj_part.size; accounted_size += obj_part.accounted_size; @@ -3614,7 +3727,7 @@ int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, opti int RadosLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { librados::ObjectReadOperation op; bufferlist bl; @@ -3632,13 +3745,15 @@ int RadosLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y, return ret; } - *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time, cls_entry.status); + entry.bucket = std::move(cls_entry.bucket); + entry.start_time = cls_entry.start_time; + entry.status = cls_entry.status; return 0; } int RadosLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { librados::ObjectReadOperation op; bufferlist bl; @@ -3656,18 +3771,20 @@ int RadosLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield return ret; } - *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time, cls_entry.status); + entry.bucket = std::move(cls_entry.bucket); + entry.start_time = cls_entry.start_time; + entry.status = cls_entry.status; return 0; } int RadosLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { cls_rgw_lc_entry cls_entry; - cls_entry.bucket = entry.get_bucket(); - cls_entry.start_time = entry.get_start_time(); - cls_entry.status = entry.get_status(); + cls_entry.bucket = entry.bucket; + cls_entry.start_time = entry.start_time; + cls_entry.status = entry.status; librados::ObjectWriteOperation op; cls_rgw_lc_set_entry(op, cls_entry); @@ -3678,7 +3795,7 @@ int RadosLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y, int RadosLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries) + uint32_t max_entries, std::vector<LCEntry>& entries) { entries.clear(); @@ -3699,20 +3816,19 @@ int RadosLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y } for (auto& entry : cls_entries) { - entries.push_back(std::make_unique<StoreLCEntry>(entry.bucket, oid, - entry.start_time, entry.status)); + entries.push_back(LCEntry{entry.bucket, entry.start_time, entry.status}); } return ret; } int RadosLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { cls_rgw_lc_entry cls_entry; - cls_entry.bucket = entry.get_bucket(); - cls_entry.start_time = entry.get_start_time(); - cls_entry.status = entry.get_status(); + cls_entry.bucket = entry.bucket; + cls_entry.start_time = entry.start_time; + cls_entry.status = entry.status; librados::ObjectWriteOperation op; cls_rgw_lc_rm_entry(op, cls_entry); @@ -3722,7 +3838,7 @@ int RadosLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y, } int 
RadosLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) + const std::string& oid, LCHead& head) { librados::ObjectReadOperation op; bufferlist bl; @@ -3740,18 +3856,20 @@ int RadosLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y, return ret; } - *head = std::make_unique<StoreLCHead>(cls_head.start_date, cls_head.shard_rollover_date, cls_head.marker); + head.start_date = cls_head.start_date; + head.shard_rollover_date = cls_head.shard_rollover_date; + head.marker = std::move(cls_head.marker); return 0; } int RadosLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCHead& head) + const std::string& oid, const LCHead& head) { cls_rgw_lc_obj_head cls_head; - cls_head.marker = head.get_marker(); - cls_head.start_date = head.get_start_date(); - cls_head.shard_rollover_date = head.get_shard_rollover_date(); + cls_head.marker = head.marker; + cls_head.start_date = head.start_date; + cls_head.shard_rollover_date = head.shard_rollover_date; librados::ObjectWriteOperation op; cls_rgw_lc_put_head(op, cls_head); diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h index 98d0bc9d005..d359f733ab3 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.h +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -559,7 +559,9 @@ class RadosObject : public StoreObject { rados_ctx->invalidate(get_obj()); } virtual int delete_object(const DoutPrefixProvider* dpp, - optional_yield y, uint32_t flags) override; + optional_yield y, uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -825,7 +827,13 @@ public: RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp, optional_yield y, @@ -838,7 +846,8 @@ protected: int cleanup_part_history(const DoutPrefixProvider* dpp, optional_yield y, RadosMultipartPart* part, - std::list<rgw_obj_index_key>& remove_objs); + std::list<rgw_obj_index_key>& remove_objs, + boost::container::flat_set<std::string>& processed_prefixes); }; class MPRadosSerializer : public StoreMPSerializer { @@ -868,31 +877,30 @@ public: } }; -class RadosLifecycle : public StoreLifecycle { +class RadosLifecycle : public Lifecycle { RadosStore* store; public: RadosLifecycle(RadosStore* _st) : store(_st) {} - using StoreLifecycle::get_entry; virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y, - 
const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) override; virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<LCEntry>>& entries) override; + std::vector<LCEntry>& entries) override; virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) override; virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) override; - virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, LCHead& head) override; + const std::string& oid, LCHead& head) override; + virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& oid, const LCHead& head) override; virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name, const std::string& oid, const std::string& cookie) override; diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 3fe5f686bf0..966e912b05d 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -8835,16 +8835,16 @@ next: if (opt_cmd == OPT::LC_LIST) { formatter->open_array_section("lifecycle_list"); - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> bucket_lc_map; + vector<rgw::sal::LCEntry> bucket_lc_map; string marker; int index{0}; #define MAX_LC_LIST_ENTRIES 100 if (max_entries < 0) { max_entries = MAX_LC_LIST_ENTRIES; } + RGWLC* lc = driver->get_rgwlc(); do { - int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->list_lc_progress(marker, max_entries, - bucket_lc_map, index); + int ret = lc->list_lc_progress(marker, max_entries, bucket_lc_map, index); if (ret < 0) { cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl; @@ -8852,17 +8852,15 @@ next: } for (const auto& entry : bucket_lc_map) { formatter->open_object_section("bucket_lc_info"); - formatter->dump_string("bucket", entry->get_bucket()); - formatter->dump_string("shard", entry->get_oid()); + formatter->dump_string("bucket", entry.bucket); char exp_buf[100]; - time_t t{time_t(entry->get_start_time())}; + time_t t = entry.start_time; if (std::strftime( exp_buf, sizeof(exp_buf), "%a, %d %b %Y %T %Z", std::gmtime(&t))) { formatter->dump_string("started", exp_buf); } - string lc_status = LC_STATUS[entry->get_status()]; - formatter->dump_string("status", lc_status); + formatter->dump_string("status", LC_STATUS[entry.status]); formatter->close_section(); // objs formatter->flush(cout); } diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc index 1a28167c406..f80e40c70b6 100644 --- a/src/rgw/rgw_asio_frontend.cc +++ b/src/rgw/rgw_asio_frontend.cc @@ -3,9 +3,13 @@ #include <atomic> #include <ctime> +#include <list> #include <memory> -#include <vector> +#include <boost/asio/bind_executor.hpp> +#include <boost/asio/bind_cancellation_slot.hpp> +#include <boost/asio/cancellation_signal.hpp> +#include <boost/asio/detached.hpp> #include <boost/asio/error.hpp> #include <boost/asio/io_context.hpp> #include <boost/asio/ip/tcp.hpp> @@ -64,6 +68,44 @@ auto make_stack_allocator() { return boost::context::protected_fixedsize_stack{512*1024}; } +static constexpr std::chrono::milliseconds BACKOFF_MAX_WAIT(5000); + +class RGWAsioBackoff { + using Clock = ceph::coarse_mono_clock; + using Timer = boost::asio::basic_waitable_timer<Clock>; + Timer timer; + + ceph::timespan cur_wait; + void 
update_wait_time(); +public: + explicit RGWAsioBackoff(boost::asio::io_context& context) : + timer(context), + cur_wait(std::chrono::milliseconds(1)) { + } + + void backoff_sleep(boost::asio::yield_context yield); + void reset() { + cur_wait = std::chrono::milliseconds(1); + } +}; + +void RGWAsioBackoff::update_wait_time() +{ + if (cur_wait < BACKOFF_MAX_WAIT) { + cur_wait = cur_wait * 2; + } + if (cur_wait > BACKOFF_MAX_WAIT) { + cur_wait = BACKOFF_MAX_WAIT; + } +} + +void RGWAsioBackoff::backoff_sleep(boost::asio::yield_context yield) +{ + update_wait_time(); + timer.expires_after(cur_wait); + timer.async_wait(yield); +} + using namespace std; template <typename Stream> @@ -424,29 +466,34 @@ class AsioFrontend { tcp::endpoint endpoint; tcp::acceptor acceptor; tcp::socket socket; + boost::asio::cancellation_signal signal; bool use_ssl = false; bool use_nodelay = false; explicit Listener(boost::asio::io_context& context) : acceptor(context), socket(context) {} }; - std::vector<Listener> listeners; + std::list<Listener> listeners; ConnectionList connections; std::atomic<bool> going_down{false}; + RGWAsioBackoff backoff; CephContext* ctx() const { return cct.get(); } std::optional<dmc::ClientCounters> client_counters; std::unique_ptr<dmc::ClientConfig> client_config; - void accept(Listener& listener, boost::system::error_code ec); + + void accept(Listener& listener, boost::asio::yield_context yield); + void on_accept(Listener& listener, tcp::socket stream); public: AsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf, dmc::SchedulerCtx& sched_ctx, boost::asio::io_context& context) : env(env), conf(conf), context(context), - pause_mutex(context.get_executor()) + pause_mutex(context.get_executor()), + backoff(context) { auto sched_t = dmc::get_scheduler_t(ctx()); switch(sched_t){ @@ -683,10 +730,13 @@ int AsioFrontend::init() } } l.acceptor.listen(max_connection_backlog); - l.acceptor.async_accept(l.socket, - [this, &l] (boost::system::error_code ec) { - accept(l, ec); - }); + + // spawn a cancellable coroutine to the run the accept loop + boost::asio::spawn(context, + [this, &l] (boost::asio::yield_context yield) mutable { + accept(l, yield); + }, bind_cancellation_slot(l.signal.slot(), + bind_executor(context, boost::asio::detached))); ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl; socket_bound = true; @@ -1003,22 +1053,39 @@ int AsioFrontend::init_ssl() } #endif // WITH_RADOSGW_BEAST_OPENSSL -void AsioFrontend::accept(Listener& l, boost::system::error_code ec) +void AsioFrontend::accept(Listener& l, boost::asio::yield_context yield) { - if (!l.acceptor.is_open()) { - return; - } else if (ec == boost::asio::error::operation_aborted) { - return; - } else if (ec) { - ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl; - return; + for (;;) { + boost::system::error_code ec; + l.acceptor.async_accept(l.socket, yield[ec]); + + if (!l.acceptor.is_open()) { + return; + } else if (ec == boost::asio::error::operation_aborted) { + return; + } else if (ec) { + ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl; + if (ec == boost::system::errc::too_many_files_open || + ec == boost::system::errc::too_many_files_open_in_system || + ec == boost::system::errc::no_buffer_space || + ec == boost::system::errc::not_enough_memory) { + // always retry accept() if we hit a resource limit + backoff.backoff_sleep(yield); + continue; + } + ldout(ctx(), 0) << "accept stopped due to error: " << ec.message() << dendl; + return; + } + + backoff.reset(); + on_accept(l, 
std::move(l.socket)); } - auto stream = std::move(l.socket); +} + +void AsioFrontend::on_accept(Listener& l, tcp::socket stream) +{ + boost::system::error_code ec; stream.set_option(tcp::no_delay(l.use_nodelay), ec); - l.acceptor.async_accept(l.socket, - [this, &l] (boost::system::error_code ec) { - accept(l, ec); - }); // spawn a coroutine to handle the connection #ifdef WITH_RADOSGW_BEAST_OPENSSL @@ -1086,6 +1153,8 @@ void AsioFrontend::stop() // close all listeners for (auto& listener : listeners) { listener.acceptor.close(ec); + // signal cancellation of accept() + listener.signal.emit(boost::asio::cancellation_type::terminal); } // close all connections connections.close(ec); @@ -1107,6 +1176,8 @@ void AsioFrontend::pause() boost::system::error_code ec; for (auto& l : listeners) { l.acceptor.cancel(ec); + // signal cancellation of accept() + l.signal.emit(boost::asio::cancellation_type::terminal); } // close all connections so outstanding requests fail quickly @@ -1129,10 +1200,12 @@ void AsioFrontend::unpause() // start accepting connections again for (auto& l : listeners) { - l.acceptor.async_accept(l.socket, - [this, &l] (boost::system::error_code ec) { - accept(l, ec); - }); + boost::asio::spawn(context, + [this, &l] (boost::asio::yield_context yield) mutable { + accept(l, yield); + }, bind_cancellation_slot(l.signal.slot(), + bind_executor(context, boost::asio::detached))); + } ldout(ctx(), 4) << "frontend unpaused" << dendl; diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc index d3586e4900b..c0ec3dc2c55 100644 --- a/src/rgw/rgw_kafka.cc +++ b/src/rgw/rgw_kafka.cc @@ -19,11 +19,6 @@ #define dout_subsys ceph_subsys_rgw_notification -// comparison operator between topic pointer and name -bool operator==(const rd_kafka_topic_t* rkt, const std::string& name) { - return name == std::string_view(rd_kafka_topic_name(rkt)); -} - // this is the inverse of rd_kafka_errno2err // see: https://github.com/confluentinc/librdkafka/blob/master/src/rdkafka.c inline int rd_kafka_err2errno(rd_kafka_resp_err_t err) { @@ -141,9 +136,19 @@ struct reply_callback_with_tag_t { typedef std::vector<reply_callback_with_tag_t> CallbackList; + + struct connection_t { rd_kafka_t* producer = nullptr; - std::vector<rd_kafka_topic_t*> topics; + + struct rd_kafka_topic_deleter { + void operator()(rd_kafka_topic_t* topic) { + rd_kafka_topic_destroy(topic); + } + }; + using topic_ptr = std::unique_ptr<rd_kafka_topic_t, rd_kafka_topic_deleter>; + std::map<std::string, topic_ptr> topics; + uint64_t delivery_tag = 1; int status = 0; CephContext* const cct; @@ -168,7 +173,6 @@ struct connection_t { // wait for 500ms to try and handle pending callbacks rd_kafka_flush(producer, 500); // destroy all topics - std::for_each(topics.begin(), topics.end(), [](auto topic) {rd_kafka_topic_destroy(topic);}); topics.clear(); // destroy producer rd_kafka_destroy(producer); @@ -431,10 +435,9 @@ private: } // create a new topic unless it was already created - auto topic_it = std::find(conn->topics.begin(), conn->topics.end(), message->topic); - rd_kafka_topic_t* topic = nullptr; + auto topic_it = conn->topics.find(message->topic); if (topic_it == conn->topics.end()) { - topic = rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr); + connection_t::topic_ptr topic(rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr)); if (!topic) { const auto err = rd_kafka_last_error(); ldout(conn->cct, 1) << "Kafka publish: failed to create topic: " << message->topic << " error: " @@ -444,17 +447,15 @@ private: } 
return; } - // TODO use the topics list as an LRU cache - conn->topics.push_back(topic); + topic_it = conn->topics.emplace(message->topic, std::move(topic)).first; ldout(conn->cct, 20) << "Kafka publish: successfully created topic: " << message->topic << dendl; } else { - topic = *topic_it; ldout(conn->cct, 20) << "Kafka publish: reused existing topic: " << message->topic << dendl; } const auto tag = (message->cb == nullptr ? nullptr : new uint64_t(conn->delivery_tag++)); const auto rc = rd_kafka_produce( - topic, + topic_it->second.get(), // TODO: non builtin partitioning RD_KAFKA_PARTITION_UA, // make a copy of the payload diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc index fb3002b0a38..0c80ad1b765 100644 --- a/src/rgw/rgw_lc.cc +++ b/src/rgw/rgw_lc.cc @@ -258,13 +258,13 @@ void RGWLC::finalize() delete[] obj_names; } -static inline std::ostream& operator<<(std::ostream &os, rgw::sal::Lifecycle::LCEntry& ent) { +static inline std::ostream& operator<<(std::ostream &os, rgw::sal::LCEntry& ent) { os << "<ent: bucket="; - os << ent.get_bucket(); + os << ent.bucket; os << "; start_time="; - os << rgw_to_asctime(utime_t(time_t(ent.get_start_time()), 0)); + os << rgw_to_asctime(utime_t(ent.start_time, 0)); os << "; status="; - os << LC_STATUS[ent.get_status()]; + os << LC_STATUS[ent.status]; os << ">"; return os; } @@ -1834,7 +1834,7 @@ public: }; int RGWLC::bucket_lc_post(int index, int max_lock_sec, - rgw::sal::Lifecycle::LCEntry& entry, int& result, + rgw::sal::LCEntry& entry, int& result, LCWorker* worker) { utime_t lock_duration(cct->_conf->rgw_lc_lock_max_time, 0); @@ -1872,9 +1872,9 @@ int RGWLC::bucket_lc_post(int index, int max_lock_sec, } goto clean; } else if (result < 0) { - entry.set_status(lc_failed); + entry.status = lc_failed; } else { - entry.set_status(lc_complete); + entry.status = lc_complete; } ret = sal_lc->set_entry(this, null_yield, obj_names[index], entry); @@ -1891,12 +1891,12 @@ clean: } /* RGWLC::bucket_lc_post */ int RGWLC::list_lc_progress(string& marker, uint32_t max_entries, - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map, + vector<rgw::sal::LCEntry>& progress_map, int& index) { progress_map.clear(); for(; index < max_objs; index++, marker="") { - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries; + vector<rgw::sal::LCEntry> entries; int ret = sal_lc->list_entries(this, null_yield, obj_names[index], marker, max_entries, entries); if (ret < 0) { @@ -1914,7 +1914,7 @@ int RGWLC::list_lc_progress(string& marker, uint32_t max_entries, /* update index, marker tuple */ if (progress_map.size() > 0) - marker = progress_map.back()->get_bucket(); + marker = progress_map.back().bucket; if (progress_map.size() >= max_entries) break; @@ -2034,7 +2034,6 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, std::unique_ptr<rgw::sal::LCSerializer> serializer = sal_lc->get_serializer(lc_index_lock_name, obj_names[index], worker->thr_name()); - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; if (max_lock_secs <= 0) { return -EAGAIN; } @@ -2053,11 +2052,12 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, std::unique_lock<rgw::sal::LCSerializer> lock( *(serializer.get()), std::adopt_lock); + rgw::sal::LCEntry entry; ret = sal_lc->get_entry(this, null_yield, obj_names[index], - bucket_entry_marker, &entry); + bucket_entry_marker, entry); if (ret >= 0) { - if (entry->get_status() == lc_processing) { - if (expired_session(entry->get_start_time())) { + if (entry.status == lc_processing) { + if 
(expired_session(entry.start_time)) { ldpp_dout(this, 5) << "RGWLC::process_bucket(): STALE lc session found for: " << entry << " index: " << index << " worker ix: " << worker->ix << " (clearing)" @@ -2074,7 +2074,7 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, } /* do nothing if no bucket */ - if ((! entry) || entry->get_bucket().empty()) { + if ((ret < 0) || entry.bucket.empty()) { return ret; } @@ -2082,11 +2082,11 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, << " index: " << index << " worker ix: " << worker->ix << dendl; - entry->set_status(lc_processing); - ret = sal_lc->set_entry(this, null_yield, obj_names[index], *entry); + entry.status = lc_processing; + ret = sal_lc->set_entry(this, null_yield, obj_names[index], entry); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process_bucket() failed to set obj entry " - << obj_names[index] << entry->get_bucket() << entry->get_status() + << obj_names[index] << entry.bucket << entry.status << dendl; return ret; } @@ -2096,10 +2096,10 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker, << dendl; lock.unlock(); - ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once); + ret = bucket_lc_process(entry.bucket, worker, thread_stop_at(), once); ldpp_dout(this, 5) << "RGWLC::process_bucket(): END entry 2: " << entry << " index: " << index << " worker ix: " << worker->ix << " ret: " << ret << dendl; - bucket_lc_post(index, max_lock_secs, *entry, ret, worker); + bucket_lc_post(index, max_lock_secs, entry, ret, worker); return ret; } /* RGWLC::process_bucket */ @@ -2144,15 +2144,15 @@ static inline bool already_run_today(CephContext* cct, time_t start_date) } /* already_run_today */ inline int RGWLC::advance_head(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, - rgw::sal::Lifecycle::LCEntry& entry, + rgw::sal::LCHead& head, + const rgw::sal::LCEntry& entry, time_t start_date) { int ret{0}; - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> next_entry; + rgw::sal::LCEntry next_entry; ret = sal_lc->get_next_entry(this, null_yield, lc_shard, - entry.get_bucket(), &next_entry); + entry.bucket, next_entry); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to get obj entry " << lc_shard << dendl; @@ -2160,8 +2160,8 @@ inline int RGWLC::advance_head(const std::string& lc_shard, } /* save the next position */ - head.set_marker(next_entry->get_bucket()); - head.set_start_date(start_date); + head.marker = next_entry.bucket; + head.start_date = start_date; ret = sal_lc->put_head(this, null_yield, lc_shard, head); if (ret < 0) { @@ -2175,17 +2175,17 @@ exit: } /* advance head */ inline int RGWLC::check_if_shard_done(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, int worker_ix) + rgw::sal::LCHead& head, int worker_ix) { int ret{0}; - if (head.get_marker().empty()) { + if (head.marker.empty()) { /* done with this shard */ ldpp_dout(this, 5) << "RGWLC::process() next_entry not found. 
cycle finished lc_shard=" << lc_shard << " worker=" << worker_ix << dendl; - head.set_shard_rollover_date(ceph_clock_now()); + head.shard_rollover_date = ceph_clock_now(); ret = sal_lc->put_head(this, null_yield, lc_shard, head); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to put head " @@ -2198,8 +2198,8 @@ inline int RGWLC::check_if_shard_done(const std::string& lc_shard, } inline int RGWLC::update_head(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, - rgw::sal::Lifecycle::LCEntry& entry, + rgw::sal::LCHead& head, + rgw::sal::LCEntry& entry, time_t start_date, int worker_ix) { int ret{0}; @@ -2229,8 +2229,8 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, int ret{0}; const auto& lc_shard = obj_names[index]; - std::unique_ptr<rgw::sal::Lifecycle::LCHead> head; - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS + rgw::sal::LCHead head; + rgw::sal::LCEntry entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS ldpp_dout(this, 5) << "RGWLC::process(): ENTER: " << "index: " << index << " worker ix: " << worker->ix @@ -2264,7 +2264,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, utime_t now = ceph_clock_now(); /* preamble: find an inital bucket/marker */ - ret = sal_lc->get_head(this, null_yield, lc_shard, &head); + ret = sal_lc->get_head(this, null_yield, lc_shard, head); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to get obj head " << lc_shard << ", ret=" << ret << dendl; @@ -2273,18 +2273,18 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, /* if there is nothing at head, try to reinitialize head.marker with the * first entry in the queue */ - if (head->get_marker().empty() && - allow_shard_rollover(cct, now, head->get_shard_rollover_date()) /* prevent multiple passes by diff. + if (head.marker.empty() && + allow_shard_rollover(cct, now, head.shard_rollover_date) /* prevent multiple passes by diff. 
* rgws,in same cycle */) { ldpp_dout(this, 5) << "RGWLC::process() process shard rollover lc_shard=" << lc_shard - << " head.marker=" << head->get_marker() - << " head.shard_rollover_date=" << head->get_shard_rollover_date() + << " head.marker=" << head.marker + << " head.shard_rollover_date=" << head.shard_rollover_date << dendl; - vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries; + vector<rgw::sal::LCEntry> entries; int ret = sal_lc->list_entries(this, null_yield, lc_shard, - head->get_marker(), 1, entries); + head.marker, 1, entries); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() sal_lc->list_entries(lc_shard, head.marker, 1, " << "entries) returned error ret==" << ret << dendl; @@ -2292,27 +2292,27 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, } if (entries.size() > 0) { entry = std::move(entries.front()); - head->set_marker(entry->get_bucket()); - head->set_start_date(now); - head->set_shard_rollover_date(0); + head.marker = entry.bucket; + head.start_date= now; + head.shard_rollover_date = 0; } } else { ldpp_dout(this, 0) << "RGWLC::process() head.marker !empty() at START for shard==" << lc_shard << " head last stored at " - << rgw_to_asctime(utime_t(time_t(head->get_start_date()), 0)) + << rgw_to_asctime(utime_t(head.start_date, 0)) << dendl; /* fetches the entry pointed to by head.bucket */ ret = sal_lc->get_entry(this, null_yield, lc_shard, - head->get_marker(), &entry); + head.marker, entry); if (ret == -ENOENT) { /* skip to next entry */ - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> tmp_entry = sal_lc->get_entry(); - tmp_entry->set_bucket(head->get_marker()); + rgw::sal::LCEntry tmp_entry; + tmp_entry.bucket = head.marker; - if (update_head(lc_shard, *head.get(), *tmp_entry.get(), now, worker->ix) != 0) { - goto exit; - } + if (update_head(lc_shard, head, tmp_entry, now, worker->ix) != 0) { + goto exit; + } continue; } if (ret < 0) { @@ -2322,9 +2322,9 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, } } - if (entry && !entry->get_bucket().empty()) { - if (entry->get_status() == lc_processing) { - if (expired_session(entry->get_start_time())) { + if (!entry.bucket.empty()) { + if (entry.status == lc_processing) { + if (expired_session(entry.start_time)) { ldpp_dout(this, 5) << "RGWLC::process(): STALE lc session found for: " << entry << " index: " << index << " worker ix: " << worker->ix @@ -2334,19 +2334,19 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, << "RGWLC::process(): ACTIVE entry: " << entry << " index: " << index << " worker ix: " << worker->ix << dendl; /* skip to next entry */ - if (update_head(lc_shard, *head.get(), *entry.get(), now, worker->ix) != 0) { + if (update_head(lc_shard, head, entry, now, worker->ix) != 0) { goto exit; } continue; } } else { - if ((entry->get_status() == lc_complete) && - already_run_today(cct, entry->get_start_time())) { + if ((entry.status == lc_complete) && + already_run_today(cct, entry.start_time)) { ldpp_dout(this, 5) << "RGWLC::process() worker ix: " << worker->ix - << " SKIP processing for already-processed bucket " << entry->get_bucket() + << " SKIP processing for already-processed bucket " << entry.bucket << dendl; /* skip to next entry */ - if (update_head(lc_shard, *head.get(), *entry.get(), now, worker->ix) != 0) { + if (update_head(lc_shard, head, entry, now, worker->ix) != 0) { goto exit; } continue; @@ -2368,18 +2368,18 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, << " index: " << index << " worker ix: " << 
worker->ix << dendl; - entry->set_status(lc_processing); - entry->set_start_time(now); + entry.status = lc_processing; + entry.start_time = now; - ret = sal_lc->set_entry(this, null_yield, lc_shard, *entry); + ret = sal_lc->set_entry(this, null_yield, lc_shard, entry); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to set obj entry " - << lc_shard << entry->get_bucket() << entry->get_status() << dendl; + << lc_shard << entry.bucket << entry.status << dendl; goto exit; } /* advance head for next waiter, then process */ - if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) { + if (advance_head(lc_shard, head, entry, now) < 0) { goto exit; } @@ -2390,7 +2390,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, /* drop lock so other instances can make progress while this * bucket is being processed */ lock->unlock(); - ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once); + ret = bucket_lc_process(entry.bucket, worker, thread_stop_at(), once); ldpp_dout(this, 5) << "RGWLC::process(): END entry 2: " << entry << " index: " << index << " worker ix: " << worker->ix << " ret: " << ret << dendl; @@ -2407,7 +2407,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, /* XXXX are we SURE the only way result could == ENOENT is when * there is no such bucket? It is currently the value returned * from bucket_lc_process(...) */ - ret = sal_lc->rm_entry(this, null_yield, lc_shard, *entry); + ret = sal_lc->rm_entry(this, null_yield, lc_shard, entry); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to remove entry " << lc_shard << " (nonfatal)" @@ -2416,11 +2416,11 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, } } else { if (ret < 0) { - entry->set_status(lc_failed); + entry.status = lc_failed; } else { - entry->set_status(lc_complete); + entry.status = lc_complete; } - ret = sal_lc->set_entry(this, null_yield, lc_shard, *entry); + ret = sal_lc->set_entry(this, null_yield, lc_shard, entry); if (ret < 0) { ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on lc_shard=" << lc_shard << " entry=" << entry @@ -2430,7 +2430,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker, } } - if (check_if_shard_done(lc_shard, *head.get(), worker->ix) != 0 ) { + if (check_if_shard_done(lc_shard, head, worker->ix) != 0 ) { goto exit; } } while(1 && !once && !going_down()); @@ -2569,9 +2569,9 @@ static int guard_lc_modify(const DoutPrefixProvider *dpp, get_lc_oid(cct, bucket_lc_key, &oid); /* XXX it makes sense to take shard_id for a bucket_id? 
*/ - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry = sal_lc->get_entry(); - entry->set_bucket(bucket_lc_key); - entry->set_status(lc_uninitial); + rgw::sal::LCEntry entry; + entry.bucket = bucket_lc_key; + entry.status = lc_uninitial; int max_lock_secs = cct->_conf->rgw_lc_lock_max_time; std::unique_ptr<rgw::sal::LCSerializer> lock = @@ -2598,7 +2598,7 @@ static int guard_lc_modify(const DoutPrefixProvider *dpp, << oid << ", ret=" << ret << dendl; break; } - ret = f(sal_lc, oid, *entry.get()); + ret = f(sal_lc, oid, entry); if (ret < 0) { ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to set entry on " << oid << ", ret=" << ret << dendl; @@ -2634,7 +2634,7 @@ int RGWLC::set_bucket_config(const DoutPrefixProvider* dpp, optional_yield y, ret = guard_lc_modify(dpp, driver, sal_lc.get(), b, cookie, [&](rgw::sal::Lifecycle* sal_lc, const string& oid, - rgw::sal::Lifecycle::LCEntry& entry) { + rgw::sal::LCEntry& entry) { return sal_lc->set_entry(dpp, y, oid, entry); }); @@ -2663,7 +2663,7 @@ int RGWLC::remove_bucket_config(const DoutPrefixProvider* dpp, optional_yield y, ret = guard_lc_modify(dpp, driver, sal_lc.get(), b, cookie, [&](rgw::sal::Lifecycle* sal_lc, const string& oid, - rgw::sal::Lifecycle::LCEntry& entry) { + rgw::sal::LCEntry& entry) { return sal_lc->rm_entry(dpp, y, oid, entry); }); @@ -2692,13 +2692,13 @@ int fix_lc_shard_entry(const DoutPrefixProvider *dpp, std::string lc_oid; get_lc_oid(driver->ctx(), bucket_lc_key, &lc_oid); - std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; + rgw::sal::LCEntry entry; // There are multiple cases we need to encounter here // 1. entry exists and is already set to marker, happens in plain buckets & newly resharded buckets // 2. entry doesn't exist, which usually happens when reshard has happened prior to update and next LC process has already dropped the update // 3. 
entry exists matching the current bucket id which was after a reshard (needs to be updated to the marker) // We are not dropping the old marker here as that would be caught by the next LC process update - int ret = sal_lc->get_entry(dpp, null_yield, lc_oid, bucket_lc_key, &entry); + int ret = sal_lc->get_entry(dpp, null_yield, lc_oid, bucket_lc_key, entry); if (ret == 0) { ldpp_dout(dpp, 5) << "Entry already exists, nothing to do" << dendl; return ret; // entry is already existing correctly set to marker @@ -2716,7 +2716,7 @@ int fix_lc_shard_entry(const DoutPrefixProvider *dpp, driver, sal_lc, bucket->get_key(), cookie, [dpp, &lc_oid](rgw::sal::Lifecycle* slc, const string& oid, - rgw::sal::Lifecycle::LCEntry& entry) { + rgw::sal::LCEntry& entry) { return slc->set_entry(dpp, null_yield, lc_oid, entry); }); diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h index d53e14cee65..b4c6ad4a86b 100644 --- a/src/rgw/rgw_lc.h +++ b/src/rgw/rgw_lc.h @@ -628,15 +628,15 @@ public: const std::unique_ptr<rgw::sal::Bucket>& optional_bucket, bool once); int advance_head(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, - rgw::sal::Lifecycle::LCEntry& entry, + rgw::sal::LCHead& head, + const rgw::sal::LCEntry& entry, time_t start_date); int check_if_shard_done(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, + rgw::sal::LCHead& head, int worker_ix); int update_head(const std::string& lc_shard, - rgw::sal::Lifecycle::LCHead& head, - rgw::sal::Lifecycle::LCEntry& entry, + rgw::sal::LCHead& head, + rgw::sal::LCEntry& entry, time_t start_date, int worker_ix); int process(int index, int max_lock_secs, LCWorker* worker, bool once); int process_bucket(int index, int max_lock_secs, LCWorker* worker, @@ -644,12 +644,12 @@ public: bool expired_session(time_t started); time_t thread_stop_at(); int list_lc_progress(std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>&, + std::vector<rgw::sal::LCEntry>&, int& index); int bucket_lc_process(std::string& shard_id, LCWorker* worker, time_t stop_at, bool once); int bucket_lc_post(int index, int max_lock_sec, - rgw::sal::Lifecycle::LCEntry& entry, int& result, LCWorker* worker); + rgw::sal::LCEntry& entry, int& result, LCWorker* worker); bool going_down(); void start_processor(); void stop_processor(); diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 870dd1cee90..b54805bdc7d 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -6703,15 +6703,61 @@ void RGWCompleteMultipart::execute(optional_yield y) return; } + RGWObjVersionTracker& objv_tracker = meta_obj->get_version_tracker(); + + using prefix_map_t = rgw::sal::MultipartUpload::prefix_map_t; + prefix_map_t processed_prefixes; + op_ret = upload->complete(this, y, s->cct, parts->parts, remove_objs, accounted_size, - compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch, - s->object.get()); + compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch, + s->object.get(), processed_prefixes); if (op_ret < 0) { ldpp_dout(this, 0) << "ERROR: upload complete failed ret=" << op_ret << dendl; return; } + remove_objs.clear(); + + // use cls_version_check() when deleting the meta object to detect part uploads that raced + // with upload->complete(). 
any parts that finish after that won't be part of the final + // upload, so they need to be gc'd and removed from the bucket index before retrying + // deletion of the multipart meta object + static constexpr auto MAX_DELETE_RETRIES = 15u; + for (auto i = 0u; i < MAX_DELETE_RETRIES; i++) { + // remove the upload meta object ; the meta object is not versioned + // when the bucket is, as that would add an unneeded delete marker + int ret = meta_obj->delete_object(this, y, rgw::sal::FLAG_PREVENT_VERSIONING, &remove_objs, &objv_tracker); + if (ret != -ECANCELED || i == MAX_DELETE_RETRIES - 1) { + if (ret >= 0) { + /* serializer's exclusive lock is released */ + serializer->clear_locked(); + } else { + ldpp_dout(this, 1) << "ERROR: failed to remove object " << meta_obj << ", ret: " << ret << dendl; + } + break; + } + + ldpp_dout(this, 20) << "deleting meta_obj is cancelled due to mismatch cls_version: " << objv_tracker << dendl; + objv_tracker.clear(); + + ret = meta_obj->get_obj_attrs(s->yield, this); + if (ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to get obj attrs, obj=" << meta_obj + << " ret=" << ret << dendl; + + if (ret != -ENOENT) { + ldpp_dout(this, 0) << "ERROR: failed to remove object " << meta_obj << dendl; + } + break; + } + + ret = upload->cleanup_orphaned_parts(this, s->cct, y, meta_obj->get_obj(), remove_objs, processed_prefixes); + if (ret < 0) { + ldpp_dout(this, 0) << "ERROR: failed to clenup orphaned parts. ret=" << ret << dendl; + } + } + const ceph::real_time upload_time = upload->get_mtime(); etag = s->object->get_attrs()[RGW_ATTR_ETAG].to_str(); @@ -6721,17 +6767,6 @@ void RGWCompleteMultipart::execute(optional_yield y) ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl; // too late to rollback operation, hence op_ret is not set here } - - // remove the upload meta object ; the meta object is not versioned - // when the bucket is, as that would add an unneeded delete marker - ret = meta_obj->delete_object(this, y, rgw::sal::FLAG_PREVENT_VERSIONING); - if (ret >= 0) { - /* serializer's exclusive lock is released */ - serializer->clear_locked(); - } else { - ldpp_dout(this, 4) << "WARNING: failed to remove object " << meta_obj << ", ret: " << ret << dendl; - } - } // RGWCompleteMultipart::execute bool RGWCompleteMultipart::check_previously_completed(const RGWMultiCompleteUpload* parts) diff --git a/src/rgw/rgw_process.cc b/src/rgw/rgw_process.cc index 14d6d26f7d2..8be7be79069 100644 --- a/src/rgw/rgw_process.cc +++ b/src/rgw/rgw_process.cc @@ -460,20 +460,24 @@ done: } else { ldpp_dout(s, 2) << "http status=" << s->err.http_ret << dendl; } - if (handler) - handler->put_op(op); - rest->put_handler(handler); const auto lat = s->time_elapsed(); if (latency) { *latency = lat; } dout(1) << "====== req done req=" << hex << req << dec - << " op status=" << op_ret - << " http_status=" << s->err.http_ret - << " latency=" << lat - << " ======" - << dendl; + << " op=" << (op ? op->name() : "unknown") + << " bucket=" << s->bucket_name + << " status=" << op_ret + << " http_status=" << s->err.http_ret + << " latency=" << lat + << " request_id=" << s->trans_id + << " ======" + << dendl; + + if (handler) + handler->put_op(op); + rest->put_handler(handler); return (ret < 0 ? 
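// Condensed sketch of the retry protocol added to RGWCompleteMultipart::execute()
// above (a fragment built from the calls shown in the hunk; logging and the
// success path are omitted): the meta-object delete is guarded by the object
// version tracker, and -ECANCELED means a part upload raced with complete(), so
// the orphaned parts are cleaned up before the guarded delete is retried.
for (auto i = 0u; i < MAX_DELETE_RETRIES; i++) {
  int r = meta_obj->delete_object(this, y, rgw::sal::FLAG_PREVENT_VERSIONING,
                                  &remove_objs, &objv_tracker);
  if (r != -ECANCELED || i == MAX_DELETE_RETRIES - 1)
    break;                  // deleted (or gave up after the last attempt)
  objv_tracker.clear();     // drop the stale tracked version
  // re-read the meta object's attrs to pick up the current version
  if (meta_obj->get_obj_attrs(s->yield, this) < 0)
    break;
  // gc parts that landed after complete() and drop them from the bucket index
  upload->cleanup_orphaned_parts(this, s->cct, y, meta_obj->get_obj(),
                                 remove_objs, processed_prefixes);
}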
ret : s->err.ret); } /* process_request */ diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index 8ddfb7f7c7d..d6129165689 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -1096,6 +1096,7 @@ class Object { rgw_zone_set* zones_trace{nullptr}; bool abortmp{false}; uint64_t parts_accounted_size{0}; + RGWObjVersionTracker* objv_tracker = nullptr; } params; struct Result { @@ -1115,7 +1116,9 @@ class Object { /** Shortcut synchronous delete call for common deletes */ virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) = 0; + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) = 0; /** Copy an this object to another object. */ virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -1278,6 +1281,9 @@ class Object { virtual int get_torrent_info(const DoutPrefixProvider* dpp, optional_yield y, bufferlist& bl) = 0; + /** Get the version tracker for this object */ + virtual RGWObjVersionTracker& get_version_tracker() = 0; + /** Get the OMAP values matching the given set of keys */ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, const std::set<std::string>& keys, @@ -1361,6 +1367,8 @@ public: */ class MultipartUpload { public: + using prefix_map_t = boost::container::flat_map<uint32_t, boost::container::flat_set<std::string>>; + //object lock std::optional<RGWObjectRetention> obj_retention = std::nullopt; std::optional<RGWObjectLegalHold> obj_legal_hold = std::nullopt; @@ -1407,7 +1415,14 @@ public: RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) = 0; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) = 0; + /** Cleanup orphaned parts caused by racing condition involving part upload retry */ + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) = 0; /** Get placement and/or attribute info for this upload */ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) = 0; @@ -1491,6 +1506,21 @@ public: virtual ~LCSerializer() = default; }; +/** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */ +struct LCHead { + time_t start_date = 0; + std::string marker; + time_t shard_rollover_date = 0; +}; + +/** Single entry in a lifecycle run. Multiple entries can exist processing different + * buckets. */ +struct LCEntry { + std::string bucket; + uint64_t start_time = 0; + uint32_t status = 0; +}; + /** * @brief Abstraction for lifecycle processing * @@ -1500,84 +1530,34 @@ public: */ class Lifecycle { public: - /** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */ - struct LCHead { - LCHead() = default; - virtual ~LCHead() = default; - - virtual time_t& get_start_date() = 0; - virtual void set_start_date(time_t) = 0; - virtual std::string& get_marker() = 0; - virtual void set_marker(const std::string&) = 0; - virtual time_t& get_shard_rollover_date() = 0; - virtual void set_shard_rollover_date(time_t) = 0; - }; - - /** Single entry in a lifecycle run. Multiple entries can exist processing different - * buckets. 
*/ - struct LCEntry { - LCEntry() = default; - virtual ~LCEntry() = default; - - virtual std::string& get_bucket() = 0; - virtual void set_bucket(const std::string&) = 0; - virtual std::string& get_oid() = 0; - virtual void set_oid(const std::string&) = 0; - virtual uint64_t get_start_time() = 0; - virtual void set_start_time(uint64_t) = 0; - virtual uint32_t get_status() = 0; - virtual void set_status(uint32_t) = 0; - - /** Print the entry to @a out */ - virtual void print(std::ostream& out) const = 0; - - friend inline std::ostream& operator<<(std::ostream& out, const LCEntry& e) { - e.print(out); - return out; - } - friend inline std::ostream& operator<<(std::ostream& out, const LCEntry* e) { - if (!e) - out << "<NULL>"; - else - e->print(out); - return out; - } - friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<LCEntry>& p) { - out << p.get(); - return out; - } - }; - Lifecycle() = default; virtual ~Lifecycle() = default; - /** Get an empty entry */ - virtual std::unique_ptr<LCEntry> get_entry() = 0; /** Get an entry matching the given marker */ virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) = 0; + LCEntry& entry) = 0; /** Get the entry following the given marker */ virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) = 0; + LCEntry& entry) = 0; /** Store a modified entry in then backing store */ virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) = 0; + const std::string& oid, const LCEntry& entry) = 0; /** List all known entries */ virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<LCEntry>>& entries) = 0; + std::vector<LCEntry>& entries) = 0; /** Remove an entry from the backing store */ virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) = 0; + const std::string& oid, const LCEntry& entry) = 0; /** Get a head */ virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) = 0; + const std::string& oid, LCHead& head) = 0; /** Store a modified head to the backing store */ virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCHead& head) = 0; + const std::string& oid, const LCHead& head) = 0; /** Get a serializer for lifecycle */ virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name, diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc index 51627914db2..5f8bab70817 100644 --- a/src/rgw/rgw_sal_dbstore.cc +++ b/src/rgw/rgw_sal_dbstore.cc @@ -715,7 +715,11 @@ namespace rgw::sal { return ret; } - int DBObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags) + int DBObject::delete_object(const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { DB::Object del_target(store->getDB(), bucket->get_info(), get_obj()); DB::Object::Delete del_op(&del_target); @@ -907,7 +911,8 @@ namespace rgw::sal { RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) + 
rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; @@ -1017,6 +1022,15 @@ namespace rgw::sal { return ret; } + int DBMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) + { + return -ENOTSUP; + } + int DBMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs) { if (!rule && !attrs) { @@ -1840,45 +1854,45 @@ namespace rgw::sal { int DBLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { return store->getDB()->get_entry(oid, marker, entry); } int DBLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { return store->getDB()->get_next_entry(oid, marker, entry); } int DBLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { return store->getDB()->set_entry(oid, entry); } int DBLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - uint32_t max_entries, vector<std::unique_ptr<LCEntry>>& entries) + uint32_t max_entries, vector<LCEntry>& entries) { return store->getDB()->list_entries(oid, marker, max_entries, entries); } int DBLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { return store->getDB()->rm_entry(oid, entry); } int DBLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) + const std::string& oid, LCHead& head) { return store->getDB()->get_head(oid, head); } int DBLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCHead& head) + const std::string& oid, const LCHead& head) { return store->getDB()->put_head(oid, head); } diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h index f163b874eb0..107ba735a63 100644 --- a/src/rgw/rgw_sal_dbstore.h +++ b/src/rgw/rgw_sal_dbstore.h @@ -38,31 +38,30 @@ public: } }; -class DBLifecycle : public StoreLifecycle { +class DBLifecycle : public Lifecycle { DBStore* store; public: DBLifecycle(DBStore* _st) : store(_st) {} - using StoreLifecycle::get_entry; virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) override; virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<LCEntry>>& entries) override; + std::vector<LCEntry>& entries) 
override; virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) override; virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) override; - virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, LCHead& head) override; + virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& oid, const LCHead& head) override; virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name, const std::string& oid, const std::string& cookie) override; @@ -460,7 +459,13 @@ protected: RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override; virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp, optional_yield y, @@ -532,7 +537,9 @@ protected: virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) override; + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc index 5afefa3d258..272862cb7e1 100644 --- a/src/rgw/rgw_sal_filter.cc +++ b/src/rgw/rgw_sal_filter.cc @@ -994,9 +994,11 @@ int FilterBucket::abort_multiparts(const DoutPrefixProvider* dpp, CephContext* c int FilterObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) { - return next->delete_object(dpp, y, flags); + return next->delete_object(dpp, y, flags, remove_objs, objv); } int FilterObject::copy_object(const ACLOwner& owner, @@ -1282,11 +1284,21 @@ int FilterMultipartUpload::complete(const DoutPrefixProvider *dpp, RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) { return next->complete(dpp, y, cct, part_etags, remove_objs, accounted_size, compressed, cs_info, ofs, tag, owner, olh_epoch, - nextObject(target_obj)); + nextObject(target_obj), processed_prefixes); +} + +int FilterMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) +{ + return next->cleanup_orphaned_parts(dpp, cct, y, obj, remove_objs, processed_prefixes); } int FilterMultipartUpload::get_info(const DoutPrefixProvider *dpp, @@ -1324,40 +1336,22 @@ int FilterLCSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, return next->try_lock(dpp, dur, y); } -std::unique_ptr<Lifecycle::LCEntry> FilterLifecycle::get_entry() -{ - std::unique_ptr<Lifecycle::LCEntry> e = 
next->get_entry(); - return std::make_unique<FilterLCEntry>(std::move(e)); -} - int FilterLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { - std::unique_ptr<LCEntry> ne; - int ret = next->get_entry(dpp, y, oid, marker, &ne); - if (ret < 0) - return ret; - - *entry = std::make_unique<FilterLCEntry>(std::move(ne)); - return 0; + return next->get_entry(dpp, y, oid, marker, entry); } int FilterLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) + LCEntry& entry) { - std::unique_ptr<LCEntry> ne; - int ret = next->get_next_entry(dpp, y, oid, marker, &ne); - if (ret < 0) - return ret; - - *entry = std::make_unique<FilterLCEntry>(std::move(ne)); - return 0; + return next->get_next_entry(dpp, y, oid, marker, entry); } int FilterLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { return next->set_entry(dpp, y, oid, entry); } @@ -1365,42 +1359,27 @@ int FilterLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y, int FilterLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<LCEntry>>& entries) + std::vector<LCEntry>& entries) { - std::vector<std::unique_ptr<LCEntry>> ne; - int ret = next->list_entries(dpp, y, oid, marker, max_entries, ne); - if (ret < 0) - return ret; - - for (auto& ent : ne) { - entries.emplace_back(std::make_unique<FilterLCEntry>(std::move(ent))); - } - - return 0; + return next->list_entries(dpp, y, oid, marker, max_entries, entries); } int FilterLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) + const std::string& oid, const LCEntry& entry) { return next->rm_entry(dpp, y, oid, entry); } int FilterLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) + const std::string& oid, LCHead& head) { - std::unique_ptr<LCHead> nh; - int ret = next->get_head(dpp, y, oid, &nh); - if (ret < 0) - return ret; - - *head = std::make_unique<FilterLCHead>(std::move(nh)); - return 0; + return next->get_head(dpp, y, oid, head); } int FilterLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCHead& head) + const std::string& oid, const LCHead& head) { - return next->put_head(dpp, y, oid, *(dynamic_cast<FilterLCHead&>(head).next.get())); + return next->put_head(dpp, y, oid, head); } std::unique_ptr<LCSerializer> FilterLifecycle::get_serializer( diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h index fc2e3aeffa3..b12ea53a9bb 100644 --- a/src/rgw/rgw_sal_filter.h +++ b/src/rgw/rgw_sal_filter.h @@ -727,7 +727,9 @@ public: virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, - uint32_t flags) override; + uint32_t flags, + std::list<rgw_obj_index_key>* remove_objs, + RGWObjVersionTracker* objv) override; virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user, req_info* info, const rgw_zone_id& source_zone, @@ -843,6 +845,8 @@ public: virtual int get_torrent_info(const DoutPrefixProvider* dpp, optional_yield y, bufferlist& bl) override; + virtual RGWObjVersionTracker& get_version_tracker() override { 
return next->get_version_tracker(); } + virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid, const std::set<std::string>& keys, @@ -920,7 +924,13 @@ public: RGWCompressionInfo& cs_info, off_t& ofs, std::string& tag, ACLOwner& owner, uint64_t olh_epoch, - rgw::sal::Object* target_obj) override; + rgw::sal::Object* target_obj, + prefix_map_t& processed_prefixes) override; + virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp, + CephContext *cct, optional_yield y, + const rgw_obj& obj, + std::list<rgw_obj_index_key>& remove_objs, + prefix_map_t& processed_prefixes) override; virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, @@ -969,59 +979,28 @@ protected: std::unique_ptr<Lifecycle> next; public: - struct FilterLCHead : LCHead { - std::unique_ptr<LCHead> next; - - FilterLCHead(std::unique_ptr<LCHead> _next) : next(std::move(_next)) {} - virtual ~FilterLCHead() = default; - - virtual time_t& get_start_date() override { return next->get_start_date(); } - virtual void set_start_date(time_t t) override { next->set_start_date(t); } - virtual std::string& get_marker() override { return next->get_marker(); } - virtual void set_marker(const std::string& m) override { next->set_marker(m); } - virtual time_t& get_shard_rollover_date() override { return next->get_shard_rollover_date(); } - virtual void set_shard_rollover_date(time_t t) override { next->set_shard_rollover_date(t); } - }; - - struct FilterLCEntry : LCEntry { - std::unique_ptr<LCEntry> next; - - FilterLCEntry(std::unique_ptr<LCEntry> _next) : next(std::move(_next)) {} - virtual ~FilterLCEntry() = default; - - virtual std::string& get_bucket() override { return next->get_bucket(); } - virtual void set_bucket(const std::string& b) override { next->set_bucket(b); } - virtual std::string& get_oid() override { return next->get_oid(); } - virtual void set_oid(const std::string& o) override { next->set_oid(o); } - virtual uint64_t get_start_time() override { return next->get_start_time(); } - virtual void set_start_time(uint64_t t) override { next->set_start_time(t); } - virtual uint32_t get_status() override { return next->get_status(); } - virtual void set_status(uint32_t s) override { next->set_status(s); } - virtual void print(std::ostream& out) const override { return next->print(out); } - }; FilterLifecycle(std::unique_ptr<Lifecycle> _next) : next(std::move(_next)) {} virtual ~FilterLifecycle() = default; - virtual std::unique_ptr<LCEntry> get_entry() override; virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, - std::unique_ptr<LCEntry>* entry) override; + LCEntry& entry) override; virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) override; virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, const std::string& marker, uint32_t max_entries, - std::vector<std::unique_ptr<LCEntry>>& entries) override; + std::vector<LCEntry>& entries) override; virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, LCEntry& entry) override; + const std::string& oid, const LCEntry& entry) 
override; virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y, - const std::string& oid, std::unique_ptr<LCHead>* head) override; - virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, const std::string& oid, LCHead& head) override; + virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y, + const std::string& oid, const LCHead& head) override; virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name, const std::string& oid, const std::string& cookie) override; diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h index 6084612d127..d9b2f80e1b6 100644 --- a/src/rgw/rgw_sal_store.h +++ b/src/rgw/rgw_sal_store.h @@ -365,6 +365,8 @@ class StoreObject : public Object { return -ENOENT; } + virtual RGWObjVersionTracker& get_version_tracker() override { return state.objv_tracker; } + virtual void print(std::ostream& out) const override { if (bucket) out << bucket << ":"; @@ -428,74 +430,6 @@ public: virtual void print(std::ostream& out) const override { out << oid; } }; -class StoreLifecycle : public Lifecycle { -public: - struct StoreLCHead : LCHead { - time_t start_date{0}; - time_t shard_rollover_date{0}; - std::string marker; - - StoreLCHead() = default; - StoreLCHead(time_t _start_date, time_t _rollover_date, std::string& _marker) : start_date(_start_date), shard_rollover_date(_rollover_date), marker(_marker) {} - - StoreLCHead& operator=(LCHead& _h) { - start_date = _h.get_start_date(); - shard_rollover_date = _h.get_shard_rollover_date(); - marker = _h.get_marker(); - - return *this; - } - - virtual time_t& get_start_date() override { return start_date; } - virtual void set_start_date(time_t _date) override { start_date = _date; } - virtual std::string& get_marker() override { return marker; } - virtual void set_marker(const std::string& _marker) override { marker = _marker; } - virtual time_t& get_shard_rollover_date() override { return shard_rollover_date; } - virtual void set_shard_rollover_date(time_t _date) override { shard_rollover_date = _date; } - }; - - struct StoreLCEntry : LCEntry { - std::string bucket; - std::string oid; - uint64_t start_time{0}; - uint32_t status{0}; - - StoreLCEntry() = default; - StoreLCEntry(std::string& _bucket, uint64_t _time, uint32_t _status) : bucket(_bucket), start_time(_time), status(_status) {} - StoreLCEntry(std::string& _bucket, std::string _oid, uint64_t _time, uint32_t _status) : bucket(_bucket), oid(_oid), start_time(_time), status(_status) {} - StoreLCEntry(const StoreLCEntry& _e) = default; - - StoreLCEntry& operator=(LCEntry& _e) { - bucket = _e.get_bucket(); - oid = _e.get_oid(); - start_time = _e.get_start_time(); - status = _e.get_status(); - - return *this; - } - - virtual std::string& get_bucket() override { return bucket; } - virtual void set_bucket(const std::string& _bucket) override { bucket = _bucket; } - virtual std::string& get_oid() override { return oid; } - virtual void set_oid(const std::string& _oid) override { oid = _oid; } - virtual uint64_t get_start_time() override { return start_time; } - virtual void set_start_time(uint64_t _time) override { start_time = _time; } - virtual uint32_t get_status() override { return status; } - virtual void set_status(uint32_t _status) override { status = _status; } - virtual void print(std::ostream& out) const override { - out << bucket << ":" << oid << ":" << start_time << ":" << status; - } - }; - - StoreLifecycle() = default; - virtual ~StoreLifecycle() = default; - - virtual std::unique_ptr<LCEntry> 
get_entry() override { - return std::make_unique<StoreLCEntry>(); - } - using Lifecycle::get_entry; -}; - class StoreNotification : public Notification { protected: Object* obj; diff --git a/src/seastar b/src/seastar -Subproject ba30b8c4d33ff22deff8daee5b080aaa7a7b402 +Subproject 7d4ae901b5604cb23273a5f0341c8d5f2c553d4 diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t index ff25b5973a4..52d8d4425fa 100644 --- a/src/test/cli/rbd/help.t +++ b/src/test/cli/rbd/help.t @@ -178,7 +178,8 @@ usage: rbd bench [--pool <pool>] [--namespace <namespace>] [--image <image>] [--io-size <io-size>] [--io-threads <io-threads>] [--io-total <io-total>] [--io-pattern <io-pattern>] - [--rw-mix-read <rw-mix-read>] --io-type <io-type> + [--rw-mix-read <rw-mix-read>] + [--pattern-byte <pattern-byte>] --io-type <io-type> <image-spec> Simple benchmark. @@ -196,6 +197,8 @@ --io-total arg total size for IO (in B/K/M/G/T) [default: 1G] --io-pattern arg IO pattern (rand, seq, or full-seq) [default: seq] --rw-mix-read arg read proportion in readwrite (<= 100) [default: 50] + --pattern-byte arg which byte value to write (integer between 0-255, rand + or rand-str [default: rand] --io-type arg IO type (read, write, or readwrite(rw)) rbd help children diff --git a/src/test/crimson/test_calc_subsets.cc b/src/test/crimson/test_calc_subsets.cc index 7d23810c2e6..dae3cab84fe 100644 --- a/src/test/crimson/test_calc_subsets.cc +++ b/src/test/crimson/test_calc_subsets.cc @@ -190,6 +190,7 @@ TEST(clone_subsets, overlap) ss.clones.push_back(snapid_t(2)); pg_missing_t missing; pg_missing_item item; + item.clean_regions.mark_fully_dirty(); missing.add(clone, std::move(item)); hobject_t last_backfill{object_t{"foo1"}, "foo1", CEPH_NOSNAP, 42, 0, "nspace"}; diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc index 238cffa1999..507bd6d1b26 100644 --- a/src/test/librados_test_stub/LibradosTestStub.cc +++ b/src/test/librados_test_stub/LibradosTestStub.cc @@ -603,6 +603,13 @@ int IoCtx::omap_get_vals(const std::string& oid, max_return, out_vals)); } +int IoCtx::omap_rm_keys(const std::string& oid, + const std::set<std::string>& keys) { + TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl); + return ctx->execute_operation( + oid, std::bind(&TestIoCtxImpl::omap_rm_keys, _1, _2, keys)); +} + int IoCtx::operate(const std::string& oid, ObjectWriteOperation *op) { TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl); TestObjectOperationImpl *ops = reinterpret_cast<TestObjectOperationImpl*>(op->impl); diff --git a/src/test/librbd/CMakeLists.txt b/src/test/librbd/CMakeLists.txt index c3f0edbea5d..6b3d8c2681c 100644 --- a/src/test/librbd/CMakeLists.txt +++ b/src/test/librbd/CMakeLists.txt @@ -153,6 +153,11 @@ if(LINUX AND HAVE_LIBCRYPTSETUP) crypto/luks/test_mock_LoadRequest.cc) endif() +if(HAVE_LIBNBD) + list(APPEND unittest_librbd_srcs + migration/test_mock_NBDStream.cc) +endif() + # On Windows, we'll skip librbd unit tests for the time being, running just the # functional tests. 
The reason is that the unit tests require libcls*, which in # turn requires libos and libosd, however those libraries haven't been ported to diff --git a/src/test/librbd/migration/test_mock_FileStream.cc b/src/test/librbd/migration/test_mock_FileStream.cc index a5bdfebe4b8..b9729edf69a 100644 --- a/src/test/librbd/migration/test_mock_FileStream.cc +++ b/src/test/librbd/migration/test_mock_FileStream.cc @@ -209,5 +209,33 @@ TEST_F(TestMockMigrationFileStream, ShortReadError) { ASSERT_EQ(0, ctx3.wait()); } +TEST_F(TestMockMigrationFileStream, ListSparseExtents) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + bufferlist bl; + ASSERT_EQ(0, bl.write_file(file_name.c_str())); + + MockFileStream mock_file_stream(&mock_image_ctx, json_object); + + C_SaferCond ctx1; + mock_file_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_file_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents, + &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_file_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + } // namespace migration } // namespace librbd diff --git a/src/test/librbd/migration/test_mock_HttpStream.cc b/src/test/librbd/migration/test_mock_HttpStream.cc index aff22b757e9..f928d9351ce 100644 --- a/src/test/librbd/migration/test_mock_HttpStream.cc +++ b/src/test/librbd/migration/test_mock_HttpStream.cc @@ -190,5 +190,36 @@ TEST_F(TestMockMigrationHttpStream, Read) { ASSERT_EQ(0, ctx3.wait()); } +TEST_F(TestMockMigrationHttpStream, ListSparseExtents) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_http_client = new MockHttpClient(); + expect_open(*mock_http_client, 0); + expect_close(*mock_http_client, 0); + + MockHttpStream mock_http_stream(&mock_image_ctx, json_object); + + C_SaferCond ctx1; + mock_http_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_http_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents, + &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_http_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + } // namespace migration } // namespace librbd diff --git a/src/test/librbd/migration/test_mock_NBDStream.cc b/src/test/librbd/migration/test_mock_NBDStream.cc new file mode 100644 index 00000000000..5977057b11f --- /dev/null +++ b/src/test/librbd/migration/test_mock_NBDStream.cc @@ -0,0 +1,730 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "test/librbd/test_mock_fixture.h" +#include "test/librbd/test_support.h" +#include "include/rbd_types.h" +#include "librbd/migration/NBDStream.h" +#include "gtest/gtest.h" +#include "gmock/gmock.h" +#include "json_spirit/json_spirit.h" + +namespace librbd { +namespace { + +struct MockTestImageCtx : public MockImageCtx { + MockTestImageCtx(ImageCtx &image_ctx) : MockImageCtx(image_ctx) { + } +}; + +} // anonymous namespace +} // namespace librbd + 
+#include "librbd/migration/NBDStream.cc" + +namespace librbd { +namespace migration { + +template <> +struct NBDClient<MockTestImageCtx> { + static NBDClient* s_instance; + static NBDClient* create() { + ceph_assert(s_instance != nullptr); + return s_instance; + } + + NBDClient() { + s_instance = this; + } + + MOCK_METHOD0(get_error, const char*()); + MOCK_METHOD0(get_errno, int()); + MOCK_METHOD0(init, int()); + MOCK_METHOD1(add_meta_context, int(const char*)); + MOCK_METHOD1(connect_uri, int(const char*)); + MOCK_METHOD0(get_size, int64_t()); + MOCK_METHOD4(pread, int(void*, size_t, uint64_t, uint32_t)); + MOCK_METHOD4(block_status, int(uint64_t, uint64_t, nbd_extent_callback, + uint32_t)); + MOCK_METHOD1(shutdown, int(uint32_t)); +}; + +NBDClient<MockTestImageCtx>* NBDClient<MockTestImageCtx>::s_instance = nullptr; + +using ::testing::_; +using ::testing::Invoke; +using ::testing::InSequence; +using ::testing::Return; +using ::testing::WithArg; + +class TestMockMigrationNBDStream : public TestMockFixture { +public: + typedef NBDStream<MockTestImageCtx> MockNBDStream; + typedef NBDClient<MockTestImageCtx> MockNBDClient; + + void SetUp() override { + TestMockFixture::SetUp(); + + ASSERT_EQ(0, open_image(m_image_name, &m_image_ctx)); + m_json_object["uri"] = "nbd://foo.example"; + } + + void expect_get_errno(MockNBDClient& mock_nbd_client, int err) { + EXPECT_CALL(mock_nbd_client, get_errno()).WillOnce(Return(err)); + EXPECT_CALL(mock_nbd_client, get_error()).WillOnce(Return("error message")); + } + + void expect_init(MockNBDClient& mock_nbd_client, int rc) { + EXPECT_CALL(mock_nbd_client, init()).WillOnce(Return(rc)); + } + + void expect_add_meta_context(MockNBDClient& mock_nbd_client, int rc) { + EXPECT_CALL(mock_nbd_client, add_meta_context(_)).WillOnce(Return(rc)); + } + + void expect_connect_uri(MockNBDClient& mock_nbd_client, int rc) { + EXPECT_CALL(mock_nbd_client, connect_uri(_)).WillOnce(Return(rc)); + } + + void expect_get_size(MockNBDClient& mock_nbd_client, int64_t rc) { + EXPECT_CALL(mock_nbd_client, get_size()).WillOnce(Return(rc)); + } + + void expect_pread(MockNBDClient& mock_nbd_client, uint64_t byte_offset, + uint64_t byte_length, const void* buf, int rc) { + EXPECT_CALL(mock_nbd_client, pread(_, byte_length, byte_offset, _)) + .WillOnce(WithArg<0>(Invoke( + [byte_length, buf, rc](void* out_buf) { + memcpy(out_buf, buf, byte_length); + return rc; + }))); + } + + struct block_status_cb_args { + const char* metacontext; + uint64_t entries_offset; + std::vector<uint32_t> entries; + }; + + // cbs is taken by non-const reference only because of + // nbd_extent_callback::callback() signature + void expect_block_status(MockNBDClient& mock_nbd_client, + uint64_t byte_offset, uint64_t byte_length, + std::vector<block_status_cb_args>& cbs, int rc) { + EXPECT_CALL(mock_nbd_client, block_status(byte_length, byte_offset, _, _)) + .WillOnce(WithArg<2>(Invoke( + [&cbs, rc](nbd_extent_callback extent_callback) { + int err = 0; + for (auto& cb : cbs) { + extent_callback.callback(extent_callback.user_data, cb.metacontext, + cb.entries_offset, cb.entries.data(), + cb.entries.size(), &err); + } + return rc; + }))); + } + + void expect_shutdown(MockNBDClient& mock_nbd_client, int rc) { + EXPECT_CALL(mock_nbd_client, shutdown(_)).WillOnce(Return(rc)); + } + + librbd::ImageCtx *m_image_ctx; + json_spirit::mObject m_json_object; +}; + +TEST_F(TestMockMigrationNBDStream, OpenInvalidURI) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + m_json_object["uri"] = 123; + MockNBDStream 
mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-EINVAL, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, OpenMissingURI) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + m_json_object.clear(); + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-EINVAL, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, OpenInitError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, -1); + expect_get_errno(*mock_nbd_client, ENOMEM); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-ENOMEM, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, OpenAddMetaContextError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, -1); + expect_get_errno(*mock_nbd_client, EINVAL); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-EINVAL, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, OpenConnectURIError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, -1); + expect_get_errno(*mock_nbd_client, ECONNREFUSED); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-ECONNREFUSED, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, OpenConnectURIErrorNoErrno) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, -1); + // libnbd actually does this for getaddrinfo() errors ("Name or + // service not known", etc) + expect_get_errno(*mock_nbd_client, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(-EIO, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +TEST_F(TestMockMigrationNBDStream, GetSize) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + expect_get_size(*mock_nbd_client, 128); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond 
ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + uint64_t size; + mock_nbd_stream.get_size(&size, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + ASSERT_EQ(128, size); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, GetSizeError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + expect_get_size(*mock_nbd_client, -1); + expect_get_errno(*mock_nbd_client, EOVERFLOW); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + uint64_t size; + mock_nbd_stream.get_size(&size, &ctx2); + ASSERT_EQ(-EOVERFLOW, ctx2.wait()); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, Read) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + std::string s1(128, '1'); + expect_pread(*mock_nbd_client, 0, 128, s1.c_str(), 0); + std::string s2(64, '2'); + expect_pread(*mock_nbd_client, 256, 64, s2.c_str(), 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + bufferlist bl; + mock_nbd_stream.read({{0, 128}, {256, 64}}, &bl, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + bufferlist expected_bl; + expected_bl.append(s1); + expected_bl.append(s2); + ASSERT_EQ(expected_bl, bl); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ReadError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + std::string s1(128, '1'); + expect_pread(*mock_nbd_client, 0, 128, s1.c_str(), -1); + expect_get_errno(*mock_nbd_client, ERANGE); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + bufferlist bl; + mock_nbd_stream.read({{0, 128}, {256, 64}}, &bl, &ctx2); + ASSERT_EQ(-ERANGE, ctx2.wait()); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtents) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + // DATA + std::vector<block_status_cb_args> cbs1 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {128, 0}} + }; + expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0); + // ZEROED (zero) + std::vector<block_status_cb_args> cbs2 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 256, {64, LIBNBD_STATE_ZERO}} + }; + expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0); + // ZEROED 
(hole) + std::vector<block_status_cb_args> cbs3 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 352, {32, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0); + // ZEROED, DATA + std::vector<block_status_cb_args> cbs4 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 384, + {56, LIBNBD_STATE_ZERO, 8, LIBNBD_STATE_HOLE, 16, 0}} + }; + expect_block_status(*mock_nbd_client, 384, 80, cbs4, 0); + // DATA, ZEROED + std::vector<block_status_cb_args> cbs5 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 464, + {40, 0, 16, LIBNBD_STATE_HOLE, 8, LIBNBD_STATE_ZERO}} + }; + expect_block_status(*mock_nbd_client, 464, 64, cbs5, 0); + // ZEROED, DATA, ZEROED + std::vector<block_status_cb_args> cbs6 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 528, + {80, LIBNBD_STATE_HOLE, 128, 0, 32, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 528, 240, cbs6, 0); + // DATA, ZEROED, DATA + std::vector<block_status_cb_args> cbs7 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 1536, + {48, 0, 256, LIBNBD_STATE_ZERO, 16, 0}} + }; + expect_block_status(*mock_nbd_client, 1536, 320, cbs7, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32}, + {384, 80}, {464, 64}, {528, 240}, + {1536, 320}}, &sparse_extents, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64}); + expected_sparse_extents.insert(352, 96, {io::SPARSE_EXTENT_STATE_ZEROED, 96}); + expected_sparse_extents.insert(448, 56, {io::SPARSE_EXTENT_STATE_DATA, 56}); + expected_sparse_extents.insert(504, 104, {io::SPARSE_EXTENT_STATE_ZEROED, 104}); + expected_sparse_extents.insert(608, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(736, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32}); + expected_sparse_extents.insert(1536, 48, {io::SPARSE_EXTENT_STATE_DATA, 48}); + expected_sparse_extents.insert(1584, 256, {io::SPARSE_EXTENT_STATE_ZEROED, 256}); + expected_sparse_extents.insert(1840, 16, {io::SPARSE_EXTENT_STATE_DATA, 16}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtentsMoreThanRequested) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + // extra byte at the end + std::vector<block_status_cb_args> cbs1 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {129, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0); + // extra byte at the start + std::vector<block_status_cb_args> cbs2 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 255, {65, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0); + // extra byte on both sides + std::vector<block_status_cb_args> cbs3 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 351, {34, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; 
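// A small standalone helper (hypothetical, not part of NBDStream) illustrating
// the core flag mapping these block-status tests exercise: libnbd reports
// "base:allocation" extents as alternating (length, flags) entries, and anything
// flagged hole or zero is treated as a ZEROED sparse extent, everything else as DATA.
#include <libnbd.h>
#include <cstdint>
#include <vector>

enum class ExtentState { DATA, ZEROED };
struct MappedExtent { uint64_t offset; uint64_t length; ExtentState state; };

std::vector<MappedExtent> map_base_allocation(uint64_t start,
                                              const std::vector<uint32_t>& entries) {
  std::vector<MappedExtent> out;
  uint64_t off = start;
  for (size_t i = 0; i + 1 < entries.size(); i += 2) {
    const uint64_t len = entries[i];
    const uint32_t flags = entries[i + 1];
    const auto state = (flags & (LIBNBD_STATE_HOLE | LIBNBD_STATE_ZERO))
        ? ExtentState::ZEROED : ExtentState::DATA;
    if (len > 0) {
      out.push_back({off, len, state});
    }
    off += len;
  }
  return out;
}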
+ mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32}}, + &sparse_extents, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_ZEROED, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64}); + expected_sparse_extents.insert(352, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtentsLessThanRequested) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + // missing byte at the end + std::vector<block_status_cb_args> cbs1 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {127, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0); + // missing byte at the start + std::vector<block_status_cb_args> cbs2 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 257, {63, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0); + // missing byte on both sides + std::vector<block_status_cb_args> cbs3 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 353, {30, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0); + // zero-sized entry + std::vector<block_status_cb_args> cbs4 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 400, {0, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 400, 48, cbs4, 0); + // no entries + std::vector<block_status_cb_args> cbs5 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 520, {}} + }; + expect_block_status(*mock_nbd_client, 520, 16, cbs5, 0); + // no callback + std::vector<block_status_cb_args> cbs6; + expect_block_status(*mock_nbd_client, 608, 8, cbs6, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32}, + {400, 48}, {520, 16}, {608, 8}}, + &sparse_extents, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 127, {io::SPARSE_EXTENT_STATE_ZEROED, 127}); + expected_sparse_extents.insert(127, 1, {io::SPARSE_EXTENT_STATE_DATA, 1}); + expected_sparse_extents.insert(256, 1, {io::SPARSE_EXTENT_STATE_DATA, 1}); + expected_sparse_extents.insert(257, 63, {io::SPARSE_EXTENT_STATE_ZEROED, 63}); + expected_sparse_extents.insert(352, 1, {io::SPARSE_EXTENT_STATE_DATA, 1}); + expected_sparse_extents.insert(353, 30, {io::SPARSE_EXTENT_STATE_ZEROED, 30}); + expected_sparse_extents.insert(383, 1, {io::SPARSE_EXTENT_STATE_DATA, 1}); + expected_sparse_extents.insert(400, 48, {io::SPARSE_EXTENT_STATE_DATA, 48}); + expected_sparse_extents.insert(520, 16, {io::SPARSE_EXTENT_STATE_DATA, 16}); + expected_sparse_extents.insert(608, 8, {io::SPARSE_EXTENT_STATE_DATA, 8}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtentsMultipleCallbacks) { + 
MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + std::vector<block_status_cb_args> cbs1 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 96, {32, LIBNBD_STATE_HOLE}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 32, {32, LIBNBD_STATE_ZERO}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {32, LIBNBD_STATE_ZERO}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 64, {32, LIBNBD_STATE_HOLE}} + }; + expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0); + std::vector<block_status_cb_args> cbs2 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 192, {32, 0}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 128, {32, LIBNBD_STATE_ZERO, 32, 0}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 224, {32, LIBNBD_STATE_ZERO}} + }; + expect_block_status(*mock_nbd_client, 128, 128, cbs2, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}, {128, 128}}, &sparse_extents, + &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 160, {io::SPARSE_EXTENT_STATE_ZEROED, 160}); + expected_sparse_extents.insert(160, 64, {io::SPARSE_EXTENT_STATE_DATA, 64}); + expected_sparse_extents.insert(224, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtentsUnexpectedMetaContexts) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + std::vector<block_status_cb_args> cbs = { + {"unexpected context 1", 0, {64, LIBNBD_STATE_ZERO, 64, 0}}, + {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {32, LIBNBD_STATE_ZERO, 96, 0}}, + {"unexpected context 2", 0, {128, LIBNBD_STATE_ZERO}} + }; + expect_block_status(*mock_nbd_client, 0, 128, cbs, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}}, &sparse_extents, &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32}); + expected_sparse_extents.insert(32, 96, {io::SPARSE_EXTENT_STATE_DATA, 96}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ListSparseExtentsError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + // error isn't propagated -- DATA is assumed instead + std::vector<block_status_cb_args> cbs1; + expect_block_status(*mock_nbd_client, 0, 128, cbs1, -1); + expect_get_errno(*mock_nbd_client, ENOTSUP); + 
std::vector<block_status_cb_args> cbs2 = { + {LIBNBD_CONTEXT_BASE_ALLOCATION, 256, {64, LIBNBD_STATE_ZERO}} + }; + expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0); + expect_shutdown(*mock_nbd_client, 0); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents, + &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_nbd_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + +TEST_F(TestMockMigrationNBDStream, ShutdownError) { + MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_nbd_client = new MockNBDClient(); + expect_init(*mock_nbd_client, 0); + expect_add_meta_context(*mock_nbd_client, 0); + expect_connect_uri(*mock_nbd_client, 0); + // error is ignored + expect_shutdown(*mock_nbd_client, -1); + + MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object); + + C_SaferCond ctx1; + mock_nbd_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + mock_nbd_stream.close(&ctx2); + ASSERT_EQ(0, ctx2.wait()); +} + +} // namespace migration +} // namespace librbd diff --git a/src/test/librbd/migration/test_mock_RawSnapshot.cc b/src/test/librbd/migration/test_mock_RawSnapshot.cc index 3ce4b5c9daa..c9af8933652 100644 --- a/src/test/librbd/migration/test_mock_RawSnapshot.cc +++ b/src/test/librbd/migration/test_mock_RawSnapshot.cc @@ -105,6 +105,19 @@ public: }))); } + void expect_stream_list_sparse_extents(MockStreamInterface& mock_stream_interface, + const io::Extents& byte_extents, + const io::SparseExtents& sparse_extents, + int r) { + EXPECT_CALL(mock_stream_interface, list_sparse_extents(byte_extents, _, _)) + .WillOnce(WithArgs<1, 2>(Invoke( + [sparse_extents, r](io::SparseExtents* out_sparse_extents, + Context* ctx) { + out_sparse_extents->insert(sparse_extents); + ctx->complete(r); + }))); + } + json_spirit::mObject json_object; }; @@ -232,6 +245,11 @@ TEST_F(TestMockMigrationRawSnapshot, ListSnap) { expect_stream_open(*mock_stream_interface, 0); expect_stream_get_size(*mock_stream_interface, 0, 0); + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 123, {io::SPARSE_EXTENT_STATE_DATA, 123}); + expect_stream_list_sparse_extents(*mock_stream_interface, {{0, 123}}, + expected_sparse_extents, 0); + expect_stream_close(*mock_stream_interface, 0); MockRawSnapshot mock_raw_snapshot(&mock_image_ctx, json_object, @@ -245,6 +263,7 @@ TEST_F(TestMockMigrationRawSnapshot, ListSnap) { io::SparseExtents sparse_extents; mock_raw_snapshot.list_snap({{0, 123}}, 0, &sparse_extents, {}, &ctx2); ASSERT_EQ(0, ctx2.wait()); + ASSERT_EQ(expected_sparse_extents, sparse_extents); C_SaferCond ctx3; mock_raw_snapshot.close(&ctx3); diff --git a/src/test/librbd/migration/test_mock_S3Stream.cc b/src/test/librbd/migration/test_mock_S3Stream.cc index 2f2097f7926..272ed928965 100644 --- a/src/test/librbd/migration/test_mock_S3Stream.cc +++ b/src/test/librbd/migration/test_mock_S3Stream.cc @@ -234,5 +234,36 @@ TEST_F(TestMockMigrationS3Stream, ProcessRequest) { ASSERT_EQ(0, ctx2.wait()); } +TEST_F(TestMockMigrationS3Stream, ListSparseExtents) { + 
MockTestImageCtx mock_image_ctx(*m_image_ctx); + + InSequence seq; + + auto mock_http_client = new MockHttpClient(); + expect_open(*mock_http_client, 0); + expect_close(*mock_http_client, 0); + + MockS3Stream mock_s3_stream(&mock_image_ctx, json_object); + + C_SaferCond ctx1; + mock_s3_stream.open(&ctx1); + ASSERT_EQ(0, ctx1.wait()); + + C_SaferCond ctx2; + io::SparseExtents sparse_extents; + mock_s3_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents, + &ctx2); + ASSERT_EQ(0, ctx2.wait()); + + io::SparseExtents expected_sparse_extents; + expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128}); + expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64}); + ASSERT_EQ(expected_sparse_extents, sparse_extents); + + C_SaferCond ctx3; + mock_s3_stream.close(&ctx3); + ASSERT_EQ(0, ctx3.wait()); +} + } // namespace migration } // namespace librbd diff --git a/src/test/librbd/mock/migration/MockStreamInterface.h b/src/test/librbd/mock/migration/MockStreamInterface.h index 36df86638de..1f33fee5438 100644 --- a/src/test/librbd/mock/migration/MockStreamInterface.h +++ b/src/test/librbd/mock/migration/MockStreamInterface.h @@ -21,6 +21,14 @@ struct MockStreamInterface : public StreamInterface { void read(io::Extents&& byte_extents, bufferlist* bl, Context* on_finish) { read(byte_extents, bl, on_finish); } + + MOCK_METHOD3(list_sparse_extents, void(const io::Extents&, + io::SparseExtents*, Context*)); + void list_sparse_extents(io::Extents&& byte_extents, + io::SparseExtents* sparse_extents, + Context* on_finish) { + list_sparse_extents(byte_extents, sparse_extents, on_finish); + } }; } // namespace migration diff --git a/src/test/librbd/test_Groups.cc b/src/test/librbd/test_Groups.cc index eaef20611ac..eb18473b988 100644 --- a/src/test/librbd/test_Groups.cc +++ b/src/test/librbd/test_Groups.cc @@ -5,12 +5,15 @@ #include "test/librbd/test_support.h" #include "include/rbd/librbd.h" #include "include/rbd/librbd.hpp" +#include "librbd/api/Group.h" #include "test/librados/test.h" #include "gtest/gtest.h" #include <boost/scope_exit.hpp> #include <chrono> #include <vector> +#include <set> +#include <algorithm> void register_test_groups() { } @@ -772,3 +775,102 @@ TEST_F(TestGroup, snap_list2PP) ASSERT_EQ(0, m_rbd.group_remove(m_ioctx, gp_name)); ASSERT_EQ(0, _rados.pool_delete(pool_name2.c_str())); } + +TEST_F(TestGroup, snap_list_internal) +{ + REQUIRE_FORMAT_V2(); + + // Check that the listing works with different + // values for try_to_sort and fail_if_not_sorted + + librados::IoCtx ioctx; + ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx)); + + const char *group_name = "gp_snaplist_internalPP"; + + librbd::RBD rbd; + ASSERT_EQ(0, rbd.group_create(ioctx, group_name)); + + std::vector<librbd::group_snap_info2_t> gp_snaps; + + // No snaps present + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, true, + &gp_snaps)); + ASSERT_EQ(0U, gp_snaps.size()); + + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false, + &gp_snaps)); + ASSERT_EQ(0U, gp_snaps.size()); + + // Create a stale snap_order key by deleting the snapshot_ key + ASSERT_EQ(0, librbd::api::Group<>::snap_create(ioctx, group_name, + "test-snap", 0)); + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false, + &gp_snaps)); + ASSERT_EQ(1U, gp_snaps.size()); + + std::string group_id; + ASSERT_EQ(0, librbd::api::Group<>::get_id(ioctx, group_name, &group_id)); + + std::string group_header = RBD_GROUP_HEADER_PREFIX + group_id; + 
std::set<std::string> keys = {"snapshot_" + gp_snaps[0].id}; + ASSERT_EQ(0, ioctx.omap_rm_keys(group_header, keys)); + + for (int i = 0; i < 20; i++) { + std::string name = "snap" + stringify(i); + ASSERT_EQ(0, librbd::api::Group<>::snap_create(ioctx, group_name, + name.c_str(), 0)); + } + + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, true, + &gp_snaps)); + ASSERT_EQ(20U, gp_snaps.size()); + + // Verify that the sorted list is correct + for (size_t i = 0; i < gp_snaps.size(); i++){ + std::string name = "snap" + stringify(i); + ASSERT_EQ(name, gp_snaps[i].name); + } + + // Sort on group snap ids to simulate the unsorted list. + std::vector<librbd::group_snap_info2_t> snaps_sorted_by_id = gp_snaps; + std::sort(snaps_sorted_by_id.begin(), snaps_sorted_by_id.end(), + [](const librbd::group_snap_info2_t &a, + const librbd::group_snap_info2_t &b) { + return a.id < b.id; + }); + + // Check that the vectors actually differ + bool differ = false; + for (size_t i = 0; i < gp_snaps.size(); i++) { + if (gp_snaps[i].id != snaps_sorted_by_id[i].id) { + differ = true; + break; + } + } + ASSERT_TRUE(differ); + + // Remove the snap_order key for one of the snaps. + keys = {"snap_order_" + gp_snaps[1].id}; + ASSERT_EQ(0, ioctx.omap_rm_keys(group_header, keys)); + + //This should fail. + ASSERT_EQ(-EINVAL, librbd::api::Group<>::snap_list(ioctx, group_name, true, + true, &gp_snaps)); + + // Should work if fail_if_not_sorted is false + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, false, + &gp_snaps)); + ASSERT_EQ(20U, gp_snaps.size()); + + ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false, + &gp_snaps)); + ASSERT_EQ(20U, gp_snaps.size()); + + //Compare unsorted listing + for (size_t i = 0; i < gp_snaps.size(); i++){ + ASSERT_EQ(snaps_sorted_by_id[i].id, gp_snaps[i].id); + } + + ASSERT_EQ(0, rbd.group_remove(ioctx, group_name)); +} diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 5b20db909ce..38b62f3ea6d 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -4364,7 +4364,7 @@ public: gen_type *rng; ObjectStore *store; ObjectStore::CollectionHandle ch; - + std::vector<ObjectStore::Transaction> tt; ceph::mutex lock = ceph::make_mutex("State lock"); ceph::condition_variable cond; @@ -4378,6 +4378,24 @@ public: } }; + int queue_transactions() + { + int result = 0; + if (!tt.empty()) { + result = store->queue_transactions(ch, tt); + tt.clear(); + } + return result; + } + int maybe_queue_transactions() + { + int result = 0; + if ((rand() % 7) == 0 || tt.size() > 10) { + result = store->queue_transactions(ch, tt); + tt.clear(); + } + return result; + } class C_SyntheticOnReadable : public Context { public: SyntheticWorkloadState *state; @@ -4495,6 +4513,7 @@ public: } void shutdown() { ghobject_t next; + queue_transactions(); while (1) { vector<ghobject_t> objects; int r = collection_list(store, ch, next, ghobject_t::get_max(), 10, @@ -4515,6 +4534,7 @@ public: queue_transaction(store, ch, std::move(t)); } void statfs(store_statfs_t& stat) { + queue_transactions(); store->statfs(&stat); } @@ -4550,6 +4570,7 @@ public: void wait_for_done() { std::unique_lock locker{lock}; + queue_transactions(); cond.wait(locker, [this] { return in_flight == 0; }); } @@ -4623,7 +4644,7 @@ public: wait_for_ready(locker); ghobject_t new_obj = object_gen->create_object(rng); available_objects.erase(new_obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = 
tt.emplace_back(); t.touch(cid, new_obj); boost::uniform_int<> u(17, 22); boost::uniform_int<> v(12, 17); @@ -4636,7 +4657,7 @@ public: if (!contents.count(new_obj)) contents[new_obj] = Object(); t.register_on_applied(new C_SyntheticOnReadable(this, new_obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4659,7 +4680,7 @@ public: new_obj.generation++; available_objects.erase(new_obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); t.collection_move_rename(cid, old_obj, cid, new_obj); ++in_flight; in_flight_objects.insert(old_obj); @@ -4668,7 +4689,7 @@ public: contents[new_obj].data = contents[old_obj].data; contents.erase(old_obj); t.register_on_applied(new C_SyntheticOnStash(this, old_obj, new_obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4692,7 +4713,7 @@ public: new_obj.hobj.set_hash(old_obj.hobj.get_hash()); available_objects.erase(new_obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); t.clone(cid, old_obj, new_obj); ++in_flight; in_flight_objects.insert(old_obj); @@ -4701,7 +4722,7 @@ public: contents[new_obj].data = contents[old_obj].data; t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4751,7 +4772,7 @@ public: << " (size " << srcdata.length() << ") to " << dstoff << "~" << len << std::endl; - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); t.clone_range(cid, old_obj, new_obj, srcoff, len, dstoff); ++in_flight; in_flight_objects.insert(old_obj); @@ -4783,7 +4804,7 @@ public: } t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4797,7 +4818,7 @@ public: ghobject_t new_obj = get_uniform_random_object(locker); available_objects.erase(new_obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); boost::uniform_int<> u1(0, max_object_len - max_write_len); boost::uniform_int<> u2(0, max_write_len); @@ -4832,7 +4853,7 @@ public: ++in_flight; in_flight_objects.insert(new_obj); t.register_on_applied(new C_SyntheticOnReadable(this, new_obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4845,7 +4866,7 @@ public: ghobject_t obj = get_uniform_random_object(locker); available_objects.erase(obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); boost::uniform_int<> choose(0, max_object_len); size_t len = choose(*rng); @@ -4866,7 +4887,7 @@ public: } t.register_on_applied(new C_SyntheticOnReadable(this, obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -4879,7 +4900,7 @@ public: ghobject_t new_obj = get_uniform_random_object(locker); available_objects.erase(new_obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); boost::uniform_int<> u1(0, max_object_len - max_write_len); boost::uniform_int<> u2(0, max_write_len); @@ -4907,12 +4928,13 @@ public: ++in_flight; in_flight_objects.insert(new_obj); t.register_on_applied(new C_SyntheticOnReadable(this, new_obj)); - int status = store->queue_transaction(ch, 
std::move(t)); + int status = maybe_queue_transactions(); return status; } void read() { EnterExit ee("read"); + queue_transactions(); boost::uniform_int<> u1(0, max_object_len/2); boost::uniform_int<> u2(0, max_object_len); uint64_t offset = u1(*rng); @@ -4962,7 +4984,7 @@ public: ghobject_t obj = get_uniform_random_object(locker); available_objects.erase(obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); boost::uniform_int<> u0(1, max_attr_size); boost::uniform_int<> u1(4, max_attr_name_len); @@ -4997,7 +5019,7 @@ public: ++in_flight; in_flight_objects.insert(obj); t.register_on_applied(new C_SyntheticOnReadable(this, obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -5010,7 +5032,7 @@ public: ghobject_t obj = get_next_object(locker); available_objects.erase(obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); map<string, bufferlist, less<>> attrs; set<string> keys; @@ -5026,12 +5048,13 @@ public: ++in_flight; in_flight_objects.insert(obj); t.register_on_applied(new C_SyntheticOnReadable(this, obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } void getattrs() { EnterExit ee("getattrs"); + queue_transactions(); ghobject_t obj; map<string, bufferlist> expected; { @@ -5061,6 +5084,7 @@ public: void getattr() { EnterExit ee("getattr"); + queue_transactions(); ghobject_t obj; int r; int retry; @@ -5118,19 +5142,20 @@ public: } available_objects.erase(obj); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); t.rmattr(cid, obj, it->first); contents[obj].attrs.erase(it->first); ++in_flight; in_flight_objects.insert(obj); t.register_on_applied(new C_SyntheticOnReadable(this, obj)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } void fsck(bool deep) { std::unique_lock locker{lock}; + queue_transactions(); EnterExit ee("fsck"); cond.wait(locker, [this] { return in_flight == 0; }); ch.reset(); @@ -5144,6 +5169,7 @@ public: void scan() { std::unique_lock locker{lock}; EnterExit ee("scan"); + queue_transactions(); cond.wait(locker, [this] { return in_flight == 0; }); vector<ghobject_t> objects; set<ghobject_t> objects_set, objects_set2; @@ -5201,6 +5227,7 @@ public: void stat() { EnterExit ee("stat"); + queue_transactions(); ghobject_t hoid; uint64_t expected; { @@ -5235,14 +5262,14 @@ public: if (!can_unlink()) return -ENOENT; ghobject_t to_remove = get_uniform_random_object(locker); - ObjectStore::Transaction t; + ObjectStore::Transaction& t = tt.emplace_back(); t.remove(cid, to_remove); ++in_flight; available_objects.erase(to_remove); in_flight_objects.insert(to_remove); contents.erase(to_remove); t.register_on_applied(new C_SyntheticOnReadable(this, to_remove)); - int status = store->queue_transaction(ch, std::move(t)); + int status = maybe_queue_transactions(); return status; } @@ -5281,35 +5308,36 @@ void StoreTestBase::doSyntheticTest( cerr << "Op " << i << std::endl; test_obj.print_internal_state(); } - boost::uniform_int<> true_false(0, 999); + boost::uniform_int<> true_false(0, 9999); int val = true_false(rng); - if (val > 998) { + if (val > 9998) { test_obj.fsck(true); - } else if (val > 997) { + } else if (val > 9997) { test_obj.fsck(false); - } else if (val > 970) { + } else if (val > 9900) { test_obj.scan(); - } else if (val > 950) { + } else if (val > 9500) { 
test_obj.stat(); - } else if (val > 850) { + } else if (val > 8500) { test_obj.zero(); - } else if (val > 800) { + } else if (val > 8000) { test_obj.unlink(); - } else if (val > 550) { + } else if (val > 5500) { test_obj.write(); - } else if (val > 500) { + } else if (val > 5000) { test_obj.clone(); - } else if (val > 450) { + } else if (val > 4500) { test_obj.clone_range(); - } else if (val > 300) { + } else if (val > 3000) { test_obj.stash(); - } else if (val > 100) { + } else if (val > 1000) { test_obj.read(); } else { test_obj.truncate(); } } test_obj.wait_for_done(); + test_obj.fsck(true); test_obj.shutdown(); } @@ -5338,7 +5366,7 @@ void StoreTestBase::doSyntheticLimitedTest( cerr << "Op " << i << std::endl; test_obj.print_internal_state(); } - boost::uniform_int<> true_false(0, 9999 /*999*/); + boost::uniform_int<> true_false(0, 99999 /*999*/); int val = true_false(rng); auto option = [&](int range) -> bool { if (val == -1) { @@ -5352,21 +5380,22 @@ void StoreTestBase::doSyntheticLimitedTest( return false; } }; - if (option(1)) test_obj.fsck(true); - if (option(1)) test_obj.fsck(false); - if (option(1)) test_obj.scan(); - if (option(497)) test_obj.stat(); - if (option(1000)) test_obj.zero(); - if (option(1500)) test_obj.read(); - if (option(1500)) test_obj.write(); - if (option(500)) test_obj.truncate(); - if (option(1000)) test_obj.clone_range(); - if (option(1000)) test_obj.stash(); - if (option(1500)) test_obj.unlink(); - if (option(1500)) test_obj.clone(); + if (option(3)) test_obj.fsck(true); + if (option(3)) test_obj.fsck(false); + if (option(94)) test_obj.scan(); + if (option(4900)) test_obj.stat(); + if (option(10000)) test_obj.zero(); + if (option(15000)) test_obj.read(); + if (option(15000)) test_obj.write(); + if (option(5000)) test_obj.truncate(); + if (option(10000)) test_obj.clone_range(); + if (option(10000)) test_obj.stash(); + if (option(15000)) test_obj.unlink(); + if (option(15000)) test_obj.clone(); ceph_assert(val == -1); } test_obj.wait_for_done(); + test_obj.fsck(true); test_obj.shutdown(); } diff --git a/src/test/rgw/test_rgw_posix_driver.cc b/src/test/rgw/test_rgw_posix_driver.cc index caf3c41c8d4..5f624d770d1 100644 --- a/src/test/rgw/test_rgw_posix_driver.cc +++ b/src/test/rgw/test_rgw_posix_driver.cc @@ -1709,12 +1709,13 @@ public: off_t ofs{0}; uint64_t accounted_size{0}; std::string tag; + rgw::sal::MultipartUpload::prefix_map_t processed_prefixes; ACLOwner owner; owner.id = bucket->get_owner(); int ret = upload->complete(env->dpp, null_yield, get_pointer(env->cct), parts, remove_objs, accounted_size, compressed, cs_info, - ofs, tag, owner, 0, mp_obj.get()); + ofs, tag, owner, 0, mp_obj.get(), processed_prefixes); EXPECT_EQ(ret, 0); EXPECT_EQ(write_size, ofs); EXPECT_EQ(write_size, accounted_size); @@ -2480,6 +2481,7 @@ public: off_t ofs{0}; uint64_t accounted_size{0}; std::string tag; + rgw::sal::MultipartUpload::prefix_map_t processed_prefixes; ACLOwner owner; owner.id = bucket->get_owner(); mp_obj->gen_rand_obj_instance_name(); @@ -2489,7 +2491,7 @@ public: int ret = upload->complete(env->dpp, null_yield, get_pointer(env->cct), parts, remove_objs, accounted_size, compressed, cs_info, - ofs, tag, owner, 0, mp_obj.get()); + ofs, tag, owner, 0, mp_obj.get(), processed_prefixes); EXPECT_EQ(ret, 0); EXPECT_EQ(write_size, ofs); EXPECT_EQ(write_size, accounted_size); diff --git a/src/tools/rbd/action/Bench.cc b/src/tools/rbd/action/Bench.cc index 53f2b6410ae..d4c7a037489 100644 --- a/src/tools/rbd/action/Bench.cc +++ b/src/tools/rbd/action/Bench.cc @@ -49,9 
+49,13 @@ enum io_pattern_t { IO_PATTERN_FULL_SEQ }; +const int PATTERN_BYTE_RAND = -1; +const int PATTERN_BYTE_RAND_STR = -2; + struct IOType {}; struct Size {}; struct IOPattern {}; +struct PatternByte {}; void validate(boost::any& v, const std::vector<std::string>& values, Size *target_type, int) { @@ -103,6 +107,27 @@ void validate(boost::any& v, const std::vector<std::string>& values, v = boost::any(io_type); } +void validate(boost::any& v, const std::vector<std::string>& values, + PatternByte *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + if (s == "rand") { + v = boost::any(PATTERN_BYTE_RAND); + } else if (s == "rand-str") { + v = boost::any(PATTERN_BYTE_RAND_STR); + } else { + try { + int pattern_byte = boost::lexical_cast<int>(s); + if (pattern_byte >= 0 && pattern_byte <= 255) { + v = boost::any(pattern_byte); + return; + } + } catch (const boost::bad_lexical_cast &) { + } + throw po::validation_error(po::validation_error::invalid_option_value); + } +} + } // anonymous namespace static void rbd_bencher_completion(void *c, void *pc); @@ -132,8 +157,10 @@ struct rbd_bencher { io_type_t io_type; uint64_t io_size; bufferlist write_bl; + int pattern_byte; - explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size) + explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size, + int pattern_byte) : image(i), in_flight(0), io_type(io_type), @@ -141,7 +168,15 @@ struct rbd_bencher { { if (io_type == IO_TYPE_WRITE || io_type == IO_TYPE_RW) { bufferptr bp(io_size); - memset(bp.c_str(), rand() & 0xff, io_size); + if (pattern_byte == PATTERN_BYTE_RAND) { + memset(bp.c_str(), rand() & 0xff, io_size); + } else if (pattern_byte == PATTERN_BYTE_RAND_STR) { + for (uint64_t i = 0; i < io_size; i++) { + bp.c_str()[i] = rand() & 0xff; + } + } else { + memset(bp.c_str(), pattern_byte, io_size); + } write_bl.push_back(bp); } } @@ -212,7 +247,7 @@ bool should_read(uint64_t read_proportion) int do_bench(librbd::Image& image, io_type_t io_type, uint64_t io_size, uint64_t io_threads, uint64_t io_bytes, io_pattern_t io_pattern, - uint64_t read_proportion) + uint64_t read_proportion, int pattern_byte) { uint64_t size = 0; image.size(&size); @@ -236,7 +271,7 @@ int do_bench(librbd::Image& image, io_type_t io_type, // seed rand() before constructing rbd_bencher srand(time(NULL) % (unsigned long) -1); - rbd_bencher b(&image, io_type, io_size); + rbd_bencher b(&image, io_type, io_size, pattern_byte); std::cout << "bench " << " type " << (io_type == IO_TYPE_READ ? 
"read" : @@ -443,7 +478,9 @@ void add_bench_common_options(po::options_description *positional, ("io-threads", po::value<uint32_t>(), "ios in flight [default: 16]") ("io-total", po::value<Size>(), "total size for IO (in B/K/M/G/T) [default: 1G]") ("io-pattern", po::value<IOPattern>(), "IO pattern (rand, seq, or full-seq) [default: seq]") - ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]"); + ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]") + ("pattern-byte", po::value<PatternByte>(), + "which byte value to write (integer between 0-255, rand or rand-str [default: rand]"); } void get_arguments_for_write(po::options_description *positional, @@ -512,6 +549,13 @@ int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) { bench_pattern = IO_PATTERN_SEQ; } + int pattern_byte; + if (vm.count("pattern-byte")) { + pattern_byte = vm["pattern-byte"].as<int>(); + } else { + pattern_byte = PATTERN_BYTE_RAND; + } + uint64_t bench_read_proportion; if (bench_io_type == IO_TYPE_READ) { bench_read_proportion = 100; @@ -545,7 +589,7 @@ int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) { register_async_signal_handler_oneshot(SIGTERM, handle_signal); r = do_bench(image, bench_io_type, bench_io_size, bench_io_threads, - bench_bytes, bench_pattern, bench_read_proportion); + bench_bytes, bench_pattern, bench_read_proportion, pattern_byte); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_signal); diff --git a/src/tools/rbd_ggate/ggate_drv.c b/src/tools/rbd_ggate/ggate_drv.c index 11f6cf0a466..813faf34c57 100644 --- a/src/tools/rbd_ggate/ggate_drv.c +++ b/src/tools/rbd_ggate/ggate_drv.c @@ -130,7 +130,7 @@ int ggate_drv_create(char *name, size_t namelen, size_t sectorsize, drv = calloc(1, sizeof(*drv)); if (drv == NULL) { - errno = -ENOMEM; + errno = ENOMEM; goto fail_close; } |