-rw-r--r--  .github/labeler.yml | 13
-rw-r--r--  .githubmap | 4
-rw-r--r--  .mailmap | 5
-rw-r--r--  .organizationmap | 3
-rw-r--r--  .peoplemap | 2
-rw-r--r--  README.md | 1
-rw-r--r--  SubmittingPatches-backports.rst | 51
-rw-r--r--  cmake/modules/BuildISAL.cmake | 42
-rw-r--r--  cmake/modules/BuildISALCrypto.cmake | 31
-rw-r--r--  container/Containerfile | 209
-rwxr-xr-x  container/build.sh | 175
-rwxr-xr-x  container/make-manifest-list.py | 164
-rw-r--r--  doc/cephadm/services/smb.rst | 2
-rw-r--r--  doc/cephfs/cephfs-journal-tool.rst | 8
-rw-r--r--  doc/cephfs/cephfs-mirroring.rst | 22
-rw-r--r--  doc/dev/cephfs-mirroring.rst | 6
-rw-r--r--  doc/dev/developer_guide/essentials.rst | 13
-rw-r--r--  doc/dev/radosgw/bucket_index.rst | 10
-rw-r--r--  doc/governance.rst | 19
-rw-r--r--  doc/man/8/mount.ceph.rst | 5
-rw-r--r--  doc/mgr/smb.rst | 5
-rw-r--r--  doc/radosgw/config-ref.rst | 1
-rw-r--r--  doc/radosgw/multisite.rst | 2
-rwxr-xr-x  make-dist | 2
-rw-r--r--  qa/README | 5
-rw-r--r--  qa/cephfs/overrides/ignorelist_health.yaml | 3
-rw-r--r--  qa/cephfs/overrides/pg_health.yaml | 2
-rwxr-xr-x  qa/standalone/scrub/osd-recovery-scrub.sh | 140
-rwxr-xr-x  qa/standalone/scrub/osd-scrub-repair.sh | 249
-rw-r--r--  qa/suites/crimson-rados/perf/deploy/ceph.yaml | 1
-rw-r--r--  qa/suites/fs/libcephfs/tasks/client.yaml | 1
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml | 4
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml | 4
-rw-r--r--  qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml | 4
-rw-r--r--  qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml | 4
l---------  qa/suites/rbd/iscsi/0-single-container-host.yaml | 1
-rw-r--r--  qa/suites/rbd/iscsi/base/install.yaml | 6
l---------  qa/suites/rbd/iscsi/supported-container-hosts$ | 1
-rw-r--r--  qa/suites/rgw/multifs/0-install.yaml | 5
-rw-r--r--  qa/suites/rgw/multifs/tasks/+ | 0
-rw-r--r--  qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml | 8
-rw-r--r--  qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml | 8
-rw-r--r--  qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml | 4
-rw-r--r--  qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml | 4
-rw-r--r--  qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml | 8
-rw-r--r--  qa/suites/upgrade/reef-x/parallel/0-start.yaml | 2
-rw-r--r--  qa/suites/upgrade/reef-x/parallel/1-tasks.yaml | 1
-rw-r--r--  qa/tasks/cbt.py | 131
-rw-r--r--  qa/tasks/ceph_iscsi_client.py | 11
-rw-r--r--  qa/tasks/cephfs/test_admin.py | 27
-rw-r--r--  qa/tasks/cephfs/test_mirroring.py | 105
-rw-r--r--  qa/tasks/cephfs/test_misc.py | 14
-rw-r--r--  qa/tasks/cephfs/test_nfs.py | 30
-rw-r--r--  qa/tasks/cephfs/test_volumes.py | 6
-rw-r--r--  qa/tasks/mgr/dashboard/test_osd.py | 5
-rw-r--r--  qa/tasks/tox.py | 2
-rwxr-xr-x  qa/workunits/client/test_oc_disabled.sh | 5
-rwxr-xr-x  qa/workunits/nvmeof/setup_subsystem.sh | 2
-rwxr-xr-x  qa/workunits/rbd/luks-encryption.sh | 91
-rw-r--r--  qa/workunits/rgw/s3_utilities.pm | 5
-rwxr-xr-x  qa/workunits/rgw/test_rgw_bucket_check.py | 1
-rw-r--r--  src/blk/kernel/KernelDevice.cc | 22
-rw-r--r--  src/blk/kernel/KernelDevice.h | 3
-rw-r--r--  src/ceph-volume/ceph_volume/api/lvm.py | 37
-rw-r--r--  src/ceph-volume/ceph_volume/tests/api/test_lvm.py | 12
-rw-r--r--  src/ceph-volume/ceph_volume/tests/util/test_disk.py | 105
-rw-r--r--  src/ceph-volume/ceph_volume/util/disk.py | 128
-rw-r--r--  src/ceph_osd.cc | 3
-rw-r--r--  src/cephadm/cephadmlib/constants.py | 8
-rw-r--r--  src/cephadm/cephadmlib/daemons/smb.py | 28
-rw-r--r--  src/cephadm/cephadmlib/data_utils.py | 12
-rw-r--r--  src/cephadm/samples/custom_container.json | 2
-rw-r--r--  src/cephadm/tests/build/test_cephadm_build.py | 4
-rw-r--r--  src/cephadm/tests/test_cephadm.py | 36
-rw-r--r--  src/cephadm/tests/test_custom_container.py | 2
-rw-r--r--  src/cephadm/tox.ini | 3
-rw-r--r--  src/client/Client.cc | 42
-rw-r--r--  src/client/Client.h | 15
-rw-r--r--  src/cls/user/cls_user.cc | 8
-rw-r--r--  src/common/CMakeLists.txt | 1
-rw-r--r--  src/common/Preforker.h | 2
-rw-r--r--  src/common/TrackedOp.cc | 2
-rw-r--r--  src/common/TrackedOp.h | 6
-rw-r--r--  src/common/ceph_mutex.h | 5
-rw-r--r--  src/common/cohort_lru.h | 6
-rw-r--r--  src/common/config_proxy.h | 1
-rw-r--r--  src/common/io_exerciser/CMakeLists.txt | 13
-rw-r--r--  src/common/io_exerciser/DataGenerator.cc | 753
-rw-r--r--  src/common/io_exerciser/DataGenerator.h | 171
-rw-r--r--  src/common/io_exerciser/IoOp.cc | 188
-rw-r--r--  src/common/io_exerciser/IoOp.h | 94
-rw-r--r--  src/common/io_exerciser/IoSequence.cc | 500
-rw-r--r--  src/common/io_exerciser/IoSequence.h | 223
-rw-r--r--  src/common/io_exerciser/Model.cc | 28
-rw-r--r--  src/common/io_exerciser/Model.h | 49
-rw-r--r--  src/common/io_exerciser/ObjectModel.cc | 174
-rw-r--r--  src/common/io_exerciser/ObjectModel.h | 53
-rw-r--r--  src/common/io_exerciser/RadosIo.cc | 288
-rw-r--r--  src/common/io_exerciser/RadosIo.h | 80
-rw-r--r--  src/common/map_cacher.hpp | 45
-rw-r--r--  src/common/mutex_debug.h | 22
-rw-r--r--  src/common/options/rgw.yaml.in | 21
-rw-r--r--  src/common/scrub_types.cc | 14
-rw-r--r--  src/common/scrub_types.h | 2
-rw-r--r--  src/crimson/os/futurized_store.h | 8
-rw-r--r--  src/crimson/os/seastore/cache.cc | 8
-rw-r--r--  src/crimson/os/seastore/cached_extent.cc | 10
-rw-r--r--  src/crimson/os/seastore/cached_extent.h | 2
-rw-r--r--  src/crimson/os/seastore/extent_placement_manager.cc | 14
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h | 8
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h | 2
-rw-r--r--  src/crimson/os/seastore/seastore.cc | 874
-rw-r--r--  src/crimson/os/seastore/seastore.h | 108
-rw-r--r--  src/crimson/os/seastore/seastore_types.cc | 22
-rw-r--r--  src/crimson/os/seastore/transaction.h | 19
-rw-r--r--  src/crimson/os/seastore/transaction_manager.cc | 116
-rw-r--r--  src/crimson/os/seastore/transaction_manager.h | 234
-rw-r--r--  src/crimson/osd/backfill_facades.h | 6
-rw-r--r--  src/crimson/osd/backfill_state.cc | 106
-rw-r--r--  src/crimson/osd/backfill_state.h | 9
-rw-r--r--  src/crimson/osd/osd.cc | 24
-rw-r--r--  src/crimson/osd/osd.h | 2
-rw-r--r--  src/crimson/osd/osd_operations/client_request_common.cc | 26
-rw-r--r--  src/crimson/osd/osd_operations/client_request_common.h | 3
-rw-r--r--  src/crimson/osd/osd_operations/internal_client_request.cc | 19
-rw-r--r--  src/crimson/osd/pg.cc | 21
-rw-r--r--  src/crimson/osd/pg.h | 12
-rw-r--r--  src/crimson/osd/pg_recovery.cc | 8
-rw-r--r--  src/crimson/osd/pg_recovery.h | 3
-rw-r--r--  src/crimson/osd/shard_services.cc | 14
-rw-r--r--  src/crypto/isa-l/CMakeLists.txt | 29
-rw-r--r--  src/doc/rgw/cloud-restore.md | 127
-rw-r--r--  src/erasure-code/isa/CMakeLists.txt | 115
-rw-r--r--  src/global/global_init.cc | 9
-rw-r--r--  src/librados/librados_asio.h | 44
-rw-r--r--  src/librbd/crypto/LoadRequest.cc | 20
-rw-r--r--  src/librbd/crypto/LoadRequest.h | 2
-rw-r--r--  src/mds/CInode.cc | 7
-rw-r--r--  src/mds/MDSRank.cc | 2
-rwxr-xr-x  src/mon/NVMeofGwMap.cc | 5
-rwxr-xr-x  src/mon/NVMeofGwMap.h | 2
-rw-r--r--  src/mon/NVMeofGwMon.cc | 22
-rw-r--r--  src/mypy-constrains.txt | 2
-rw-r--r--  src/os/bluestore/BlueFS.cc | 65
-rw-r--r--  src/os/bluestore/BlueRocksEnv.cc | 14
-rw-r--r--  src/osd/ECCommon.cc | 13
-rw-r--r--  src/osd/OSD.cc | 10
-rw-r--r--  src/osd/PG.cc | 42
-rw-r--r--  src/osd/PeeringState.cc | 4
-rw-r--r--  src/osd/osd_types_fmt.h | 2
-rw-r--r--  src/osd/scrubber/ScrubStore.cc | 472
-rw-r--r--  src/osd/scrubber/ScrubStore.h | 154
-rw-r--r--  src/osd/scrubber/pg_scrubber.cc | 43
-rw-r--r--  src/osd/scrubber/pg_scrubber.h | 10
-rw-r--r--  src/osdc/Journaler.h | 14
-rw-r--r--  src/pybind/mgr/cephadm/module.py | 9
-rw-r--r--  src/pybind/mgr/cephadm/serve.py | 4
-rw-r--r--  src/pybind/mgr/cephadm/services/cephadmservice.py | 9
-rw-r--r--  src/pybind/mgr/cephadm/services/nvmeof.py | 5
-rw-r--r--  src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 | 6
-rw-r--r--  src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 | 9
-rw-r--r--  src/pybind/mgr/cephadm/tests/test_cephadm.py | 2
-rw-r--r--  src/pybind/mgr/cephadm/tests/test_services.py | 6
-rw-r--r--  src/pybind/mgr/cephadm/tests/test_spec.py | 20
-rw-r--r--  src/pybind/mgr/cephadm/upgrade.py | 12
-rw-r--r--  src/pybind/mgr/dashboard/controllers/nvmeof.py | 86
-rw-r--r--  src/pybind/mgr/dashboard/controllers/osd.py | 28
-rwxr-xr-x  src/pybind/mgr/dashboard/controllers/rgw.py | 24
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts | 4
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts | 37
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts | 14
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html | 9
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts | 13
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts | 18
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts | 65
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html | 1
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts | 5
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html | 6
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts | 59
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts | 14
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts | 12
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html | 5
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts | 32
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html | 6
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts | 43
-rwxr-xr-x  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts | 7
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts | 16
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts | 4
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts | 6
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts | 11
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts | 23
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts | 15
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts | 4
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts | 2
-rw-r--r--  src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts | 49
-rw-r--r--  src/pybind/mgr/dashboard/openapi.yaml | 111
-rw-r--r--  src/pybind/mgr/dashboard/services/nvmeof_client.py | 19
-rw-r--r--  src/pybind/mgr/dashboard/services/nvmeof_conf.py | 94
-rw-r--r--  src/pybind/mgr/dashboard/services/orchestrator.py | 12
-rwxr-xr-x  src/pybind/mgr/dashboard/services/rgw_client.py | 67
-rw-r--r--  src/pybind/mgr/dashboard/tests/test_osd.py | 3
-rw-r--r--  src/pybind/mgr/dashboard/tools.py | 2
-rw-r--r--  src/pybind/mgr/mirroring/fs/snapshot_mirror.py | 14
-rw-r--r--  src/pybind/mgr/mirroring/module.py | 6
-rw-r--r--  src/pybind/mgr/orchestrator/tests/test_orchestrator.py | 2
-rw-r--r--  src/pybind/mgr/smb/enums.py | 6
-rw-r--r--  src/pybind/mgr/smb/handler.py | 30
-rw-r--r--  src/pybind/mgr/smb/module.py | 14
-rw-r--r--  src/pybind/mgr/status/module.py | 2
-rw-r--r--  src/pybind/mgr/telemetry/tox.ini | 1
-rw-r--r--  src/pybind/mgr/tox.ini | 3
-rw-r--r--  src/pybind/mgr/volumes/fs/operations/pin_util.py | 2
-rw-r--r--  src/python-common/ceph/deployment/service_spec.py | 2
-rw-r--r--  src/python-common/ceph/utils.py | 15
-rw-r--r--  src/rgw/driver/daos/rgw_sal_daos.cc | 16
-rw-r--r--  src/rgw/driver/daos/rgw_sal_daos.h | 12
-rw-r--r--  src/rgw/driver/posix/rgw_sal_posix.cc | 16
-rw-r--r--  src/rgw/driver/posix/rgw_sal_posix.h | 12
-rw-r--r--  src/rgw/driver/rados/rgw_data_sync.cc | 3
-rw-r--r--  src/rgw/driver/rados/rgw_datalog.h | 5
-rw-r--r--  src/rgw/driver/rados/rgw_lc_tier.cc | 93
-rw-r--r--  src/rgw/driver/rados/rgw_lc_tier.h | 6
-rw-r--r--  src/rgw/driver/rados/rgw_object_expirer_core.cc | 6
-rw-r--r--  src/rgw/driver/rados/rgw_rados.cc | 298
-rw-r--r--  src/rgw/driver/rados/rgw_rados.h | 15
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.cc | 222
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.h | 16
-rw-r--r--  src/rgw/driver/rados/rgw_tools.cc | 32
-rw-r--r--  src/rgw/driver/rados/rgw_tools.h | 6
-rw-r--r--  src/rgw/driver/rados/rgw_zone.cc | 20
-rw-r--r--  src/rgw/rgw_aio.cc | 4
-rw-r--r--  src/rgw/rgw_auth.cc | 4
-rw-r--r--  src/rgw/rgw_auth_s3.cc | 61
-rw-r--r--  src/rgw/rgw_auth_s3.h | 9
-rw-r--r--  src/rgw/rgw_common.h | 6
-rw-r--r--  src/rgw/rgw_lc.cc | 6
-rw-r--r--  src/rgw/rgw_lc.h | 2
-rw-r--r--  src/rgw/rgw_lua_background.h | 5
-rw-r--r--  src/rgw/rgw_op.cc | 216
-rw-r--r--  src/rgw/rgw_op.h | 18
-rw-r--r--  src/rgw/rgw_op_type.h | 1
-rw-r--r--  src/rgw/rgw_rest.h | 6
-rw-r--r--  src/rgw/rgw_rest_client.cc | 2
-rw-r--r--  src/rgw/rgw_rest_s3.cc | 124
-rw-r--r--  src/rgw/rgw_rest_s3.h | 10
-rw-r--r--  src/rgw/rgw_sal.h | 27
-rw-r--r--  src/rgw/rgw_sal_filter.cc | 17
-rw-r--r--  src/rgw/rgw_sal_filter.h | 12
-rw-r--r--  src/rgw/rgw_sal_store.h | 14
-rw-r--r--  src/rgw/rgw_zone.cc | 4
-rw-r--r--  src/rgw/rgw_zone_types.h | 14
-rw-r--r--  src/rgw/services/svc_sys_obj_core.cc | 10
-rw-r--r--  src/test/cli-integration/rbd/gwcli_create.t | 63
-rw-r--r--  src/test/cli-integration/rbd/gwcli_delete.t | 31
-rw-r--r--  src/test/cli-integration/rbd/iscsi_client.t | 2
-rw-r--r--  src/test/client/nonblocking.cc | 4
-rw-r--r--  src/test/common/test_mutex_debug.cc | 20
-rw-r--r--  src/test/crimson/test_backfill.cc | 13
-rw-r--r--  src/test/librados/asio.cc | 121
-rw-r--r--  src/test/osd/CMakeLists.txt | 8
-rw-r--r--  src/test/osd/TestECBackend.cc | 25
-rw-r--r--  src/test/osd/ceph_test_rados_io_sequence.cc | 695
-rw-r--r--  src/test/osd/ceph_test_rados_io_sequence.h | 293
-rw-r--r--  src/test/rgw/test_rgw_lua.cc | 178
-rw-r--r--  src/tools/cephfs_mirror/PeerReplayer.cc | 10
-rw-r--r--  src/tools/rados/rados.cc | 31
-rw-r--r--  src/tools/rbd/Utils.cc | 12
273 files changed, 9507 insertions, 2279 deletions
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 1b50ff7c5a3..9f2ed1e4790 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -263,6 +263,19 @@ rbd:
- systemd/rbdmap.service.in
- udev/50-rbd.rules
+nvmeof:
+ - qa/suites/nvmeof/**
+ - qa/tasks/nvmeof.py
+ - qa/workunits/nvmeof/**
+ - src/ceph_nvmeof_monitor_client.cc
+ - src/cephadm/cephadmlib/daemons/nvmeof.py
+ - src/messages/MNVMeofGw*
+ - src/mon/NVMeofGw*
+ - src/nvmeof/**
+ - src/pybind/mgr/cephadm/services/nvmeof.py
+ - src/pybind/mgr/cephadm/templates/services/nvmeof/**
+ - src/tools/ceph-dencoder/nvmeof*
+
rgw:
- qa/suites/rgw/**
- qa/tasks/rgw*
diff --git a/.githubmap b/.githubmap
index b93132cf1ee..01785190643 100644
--- a/.githubmap
+++ b/.githubmap
@@ -27,7 +27,7 @@ b-ranto Boris Ranto <branto@redhat.com>
badone Brad Hubbard <bhubbard@redhat.com>
baruza Barbora Ančincová <bara@redhat.com>
bassamtabbara Bassam Tabbara <bassam.tabbara@quantum.com>
-batrick Patrick Donnelly <pdonnell@redhat.com>
+batrick Patrick Donnelly <pdonnell@ibm.com>
bigjust Justin Caratzas <jcaratza@redhat.com>
bk201 Kiefer Chang <kiefer.chang@suse.com>
BlaineEXE Blaine Gardner <bgardner@suse.com>
@@ -123,6 +123,8 @@ Sarthak0702 Sarthak Gupta <sarthak.dev.0702@gmail.com>
saschagrunert Sascha Grunert <sgrunert@suse.com>
sebastian-philipp Sebastian Wagner <sewagner@redhat.com>
shraddhaag Shraddha Agrawal <shraddhaag@ibm.com>
+Kushal-deb Kushal Deb <Kushal.Deb@ibm.com>
+ShwetaBhosale1 Shweta Bhosale <Shweta.Bhosale1@ibm.com>
ShyamsundarR Shyamsundar R <srangana@redhat.com>
sidharthanup Sidharth Anupkrishnan <sanupkri@redhat.com>
smithfarm Nathan Cutler <ncutler@suse.com>
diff --git a/.mailmap b/.mailmap
index 8359b1473ae..9428951b391 100644
--- a/.mailmap
+++ b/.mailmap
@@ -544,7 +544,8 @@ Pan Liu <pan.liu@istuary.com> <liupan1111@gmail.com>
Parth Arora <paarora@redhat.com> parth-gr <paarora@redhat.com>
Pascal de Bruijn <pascal@unilogicnetworks.net>
Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
-Patrick Donnelly <pdonnell@redhat.com> <pdonell@redhat.com>
+Patrick Donnelly <pdonnell@ibm.com> <pdonnell@redhat.com>
+Patrick Donnelly <pdonnell@ibm.com> <batrick@batbytes.com>
Patrick McGarry <patrick@inktank.com>
Patrick McGarry <pmcgarry@redhat.com> <pmcgarry@gmail.com>
Patrick Seidensal <pseidensal@suse.com>
@@ -674,12 +675,14 @@ Shiqi <m13913886148@gmail.com> <1454927420@qq.com>
Shishir Gowda <shishir.gowda@sandisk.com>
Shotaro Kawaguchi <kawaguchi.s@jp.fujitsu.com>
Shraddha Agrawal <shraddhaag@ibm.com>
+Kushal Deb <Kushal.Deb@ibm.com>
Shreyansh Sancheti <ssanchet@redhat.com> shreyanshjain7174 <ssanchet@redhat.com>
Shu, Xinxin <xinxin.shu@intel.com>
Shuai Yong <yongshuai@sangfor.com.cn>
Shun Song <song.shun3@zte.com.cn>
Shun Song <song.shun3@zte.com.cn> <root@clove83.zte.com.cn>
Shun Song <song.shun3@zte.com.cn> <songshun134@126.com>
+Shweta Bhosale <Shweta.Bhosale1@ibm.com> <bhosaleshweta097@gmail.com>
Shyamsundar R <srangana@redhat.com>
Shylesh Kumar <shmohan@redhat.com> <shylesh.mohan@gmail.com>
Sibei Gao <gaosb@inspur.com>
diff --git a/.organizationmap b/.organizationmap
index bc194953d1b..42e639c274d 100644
--- a/.organizationmap
+++ b/.organizationmap
@@ -359,6 +359,9 @@ IBM <contact@IBM.com> Or Ozeri <oro@il.ibm.com>
IBM <contact@IBM.com> Paul Cuzner <pcuzner@ibm.com>
IBM <contact@IBM.com> Samuel Matzek <smatzek@us.ibm.com>
IBM <contact@IBM.com> Shraddha Agrawal <shraddhaag@ibm.com>
+IBM <contact@IBM.com> Kushal Deb <Kushal.Deb@ibm.com>
+IBM <contact@IBM.com> Shweta Bhosale <Shweta.Bhosale1@ibm.com>
+IBM <contact@IBM.com> Patrick Donnelly <pdonnell@ibm.com>
IBM <contact@IBM.com> Sunil Angadi <Sunil.Angadi@ibm.com>
IBM <contact@IBM.com> Teoman Onay <tonay@ibm.com>
IBM <contact@ibm.com> Ulrich Weigand <ulrich.weigand@de.ibm.com>
diff --git a/.peoplemap b/.peoplemap
index 507f50edb43..418e8505fb4 100644
--- a/.peoplemap
+++ b/.peoplemap
@@ -73,5 +73,5 @@ Yehuda Sadeh <ysadehwe@redhat.com> Yehuda Sadeh <yehuda@inktank.com>
Yuri Weinstein <yuriw@redhat.com> Yuri Weinstein <yuri.weinstein@inktank.com>
Zhi Zhang <zhangz.david@outlook.com> Zhi (David) Zhang <zhangz@yahoo-inc.com>
Zheng Yin <zhengyin@huayun.com> Zheng Yin <zhengyin@chinac.com>
-Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
+Patrick Donnelly <pdonnell@ibm.com> Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Myoungwon Oh <myoungwon.oh@samsung.com> Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <ohmyoungwon@gmail.com>
diff --git a/README.md b/README.md
index e51621ca8b8..56257697e9a 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@ See https://ceph.com/ for current information about Ceph.
## Status
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/2220/badge)](https://www.bestpractices.dev/projects/2220)
[![Issue Backporting](https://github.com/ceph/ceph/actions/workflows/create-backport-trackers.yml/badge.svg)](https://github.com/ceph/ceph/actions/workflows/create-backport-trackers.yml)
## Contributing Code
diff --git a/SubmittingPatches-backports.rst b/SubmittingPatches-backports.rst
index 0f96aec65c4..bb55088cb5f 100644
--- a/SubmittingPatches-backports.rst
+++ b/SubmittingPatches-backports.rst
@@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker
issue, just add a comment describing what changes you would like to make.
Someone with permissions will make the necessary modifications on your behalf.
-For straightforward backports, that's all that you (as the developer of the fix)
-need to do. Volunteers from the `Stable Releases and Backports team`_ will
-proceed to create Backport issues to track the necessary backports and stage the
-backports by opening GitHub PRs with the cherry-picks. If you don't want to
-wait, and provided you have sufficient permissions at https://tracker.ceph.com,
-you can `create Backport tracker issues` and `stage backports`_ yourself. In
-that case, read on.
-
+Authors of pull requests are responsible for creating associated backport pull
+requests. As long as you have sufficient permissions at
+https://tracker.ceph.com, you can `create Backport tracker issues` and `stage
+backports`_ yourself. Read these linked sections to learn how to create
+backport tracker issues and how to stage backports:
.. _`create backport tracker issues`:
.. _`backport tracker issue`:
@@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting
Under ordinary circumstances, the developer who merges the ``main`` PR will flag
the ``main`` branch tracker issue for backport by changing the Status to "Pending
-Backport", and volunteers from the `Stable Releases and Backports team`_
-periodically create backport tracker issues by running the
-``backport-create-issue`` script. They also do the actual backporting. But that
-does take time and you may not want to wait.
+Backport".
You might be tempted to forge ahead and create the backport issues yourself.
Please don't do that - it is difficult (bordering on impossible) to get all the
@@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the
Milestone tag to the stable release the backport PR is targeting. For example,
if the PR is targeting "nautilus", set the Milestone tag to "nautilus".
-If you don't have sufficient GitHub permissions to set the Milestone, don't
-worry. Members of the `Stable Releases and Backports team`_ periodically run
-a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable
-branches and automatically adds the correct Milestone tag if it is missing.
-
Next, check which component label was applied to the ``main`` PR corresponding to
this backport, and double-check that that label is applied to the backport PR as
well. For example, if the ``main`` PR carries the component label "core", the
backport PR should also get that label.
-In general, it is the responsibility of the `Stable Releases and Backports
-team`_ to ensure that backport PRs are properly labelled. If in doubt, just
-leave the labelling to them.
-
.. _`backport PR reviewing`:
.. _`backport PR testing`:
.. _`backport PR merging`:
@@ -381,9 +366,8 @@ leave the labelling to them.
Reviewing, testing, and merging of backport PRs
-----------------------------------------------
-Once your backport PR is open and the Milestone is set properly, the
-`Stable Releases and Backports team` will take care of getting the PR
-reviewed and tested. Once the PR is reviewed and tested, it will be merged.
+Once your backport PR is open, it will be reviewed and tested. When the PR has
+been reviewed and tested, it will be merged.
If you would like to facilitate this process, you can solicit reviews and run
integration tests on the PR. In this case, add comments to the PR describing the
@@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the
PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches
unnecessarily complicates the release preparation process, which is done by
volunteers.)
-
-
-Stable Releases and Backports team
-----------------------------------
-
-Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers,
-which is charged with maintaining the stable releases and backporting bugfixes
-from the ``main`` branch to them. (That team maintains a wiki, accessible by
-clicking the `Stable Releases and Backports`_ link, which describes various
-workflows in the backporting lifecycle.)
-
-.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
-
-Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker
-issue). The volunteers from the Stable Releases and Backports team will
-backport the fix, run regression tests on it, and include it in one or more
-future point releases.
-
-
diff --git a/cmake/modules/BuildISAL.cmake b/cmake/modules/BuildISAL.cmake
new file mode 100644
index 00000000000..6df15bc5bb8
--- /dev/null
+++ b/cmake/modules/BuildISAL.cmake
@@ -0,0 +1,42 @@
+# use an ExternalProject to build isa-l using its makefile
+function(build_isal)
+ set(isal_BINARY_DIR ${CMAKE_BINARY_DIR}/src/isa-l)
+ set(isal_INSTALL_DIR ${isal_BINARY_DIR}/install)
+ set(isal_INCLUDE_DIR "${isal_INSTALL_DIR}/include")
+ set(isal_LIBRARY "${isal_INSTALL_DIR}/lib/libisal.a")
+
+ # this include directory won't exist until the install step, but the
+ # imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES
+ file(MAKE_DIRECTORY "${isal_INCLUDE_DIR}")
+
+ set(configure_cmd env CC=${CMAKE_C_COMPILER} ./configure --prefix=${isal_INSTALL_DIR})
+ # build a static library with -fPIC that we can link into crypto/compressor plugins
+ list(APPEND configure_cmd --with-pic --enable-static --disable-shared)
+
+ # clear the DESTDIR environment variable from debian/rules,
+ # because it messes with the internal install paths of arrow's bundled deps
+ set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR)
+
+ include(ExternalProject)
+ ExternalProject_Add(isal_ext
+ SOURCE_DIR "${PROJECT_SOURCE_DIR}/src/isa-l"
+ CONFIGURE_COMMAND ./autogen.sh COMMAND ${configure_cmd}
+ BUILD_COMMAND ${NO_DESTDIR_COMMAND} make -j3
+ BUILD_IN_SOURCE 1
+ BUILD_BYPRODUCTS ${isal_LIBRARY}
+ INSTALL_COMMAND ${NO_DESTDIR_COMMAND} make install
+ UPDATE_COMMAND ""
+ LOG_CONFIGURE ON
+ LOG_BUILD ON
+ LOG_INSTALL ON
+ LOG_MERGED_STDOUTERR ON
+ LOG_OUTPUT_ON_FAILURE ON)
+
+ # add imported library target ISAL::Crypto
+ add_library(ISAL::ISAL STATIC IMPORTED GLOBAL)
+ add_dependencies(ISAL::ISAL isal_ext)
+ set_target_properties(ISAL::ISAL PROPERTIES
+ INTERFACE_INCLUDE_DIRECTORIES ${isal_INCLUDE_DIR}
+ IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+ IMPORTED_LOCATION ${isal_LIBRARY})
+endfunction()
diff --git a/cmake/modules/BuildISALCrypto.cmake b/cmake/modules/BuildISALCrypto.cmake
new file mode 100644
index 00000000000..26fb4a8f9cd
--- /dev/null
+++ b/cmake/modules/BuildISALCrypto.cmake
@@ -0,0 +1,31 @@
+# use an ExternalProject to build isa-l_crypto using its makefile
+function(build_isal_crypto)
+ set(ISAL_CRYPTO_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
+ set(ISAL_CRYPTO_INCLUDE_DIR "${ISAL_CRYPTO_SOURCE_DIR}/include")
+ set(ISAL_CRYPTO_LIBRARY "${ISAL_CRYPTO_SOURCE_DIR}/bin/isa-l_crypto.a")
+
+ include(FindMake)
+ find_make("MAKE_EXECUTABLE" "make_cmd")
+
+ include(ExternalProject)
+ ExternalProject_Add(isal_crypto_ext
+ SOURCE_DIR ${ISAL_CRYPTO_SOURCE_DIR}
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ${make_cmd} -f <SOURCE_DIR>/Makefile.unx
+ BUILD_IN_SOURCE 1
+ BUILD_BYPRODUCTS ${ISAL_CRYPTO_LIBRARY}
+ INSTALL_COMMAND ""
+ UPDATE_COMMAND ""
+ LOG_CONFIGURE ON
+ LOG_BUILD ON
+ LOG_MERGED_STDOUTERR ON
+ LOG_OUTPUT_ON_FAILURE ON)
+
+ # add imported library target ISAL::Crypto
+ add_library(ISAL::Crypto STATIC IMPORTED GLOBAL)
+ add_dependencies(ISAL::Crypto isal_crypto_ext)
+ set_target_properties(ISAL::Crypto PROPERTIES
+ INTERFACE_INCLUDE_DIRECTORIES ${ISAL_CRYPTO_INCLUDE_DIR}
+ IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+ IMPORTED_LOCATION ${ISAL_CRYPTO_LIBRARY})
+endfunction()
diff --git a/container/Containerfile b/container/Containerfile
new file mode 100644
index 00000000000..2f75c8c6ce6
--- /dev/null
+++ b/container/Containerfile
@@ -0,0 +1,209 @@
+ARG FROM_IMAGE="quay.io/centos/centos:stream9"
+FROM $FROM_IMAGE
+
+# allow FROM_IMAGE to be visible inside this stage
+ARG FROM_IMAGE
+
+# Ceph branch name
+ARG CEPH_REF="main"
+
+# Ceph SHA1
+ARG CEPH_SHA1
+
+# Ceph git repo (ceph-ci.git or ceph.git)
+ARG CEPH_GIT_REPO
+
+# (optional) Define the baseurl= for the ganesha.repo
+ARG GANESHA_REPO_BASEURL="https://buildlogs.centos.org/centos/\$releasever-stream/storage/\$basearch/nfsganesha-5/"
+
+# (optional) Set to "crimson" to install crimson packages.
+ARG OSD_FLAVOR="default"
+
+# (optional) Should be 'true' for CI builds (pull from shaman, etc.)
+ARG CI_CONTAINER="true"
+
+RUN /bin/echo -e "\
+FROM_IMAGE: ${FROM_IMAGE}\n\
+CEPH_REF: ${CEPH_REF}\n\
+GANESHA_REPO_BASEURL: ${GANESHA_REPO_BASEURL} \n\
+OSD_FLAVOR: ${OSD_FLAVOR} \n\
+CI_CONTAINER: ${CI_CONTAINER}"
+
+# Other labels are set automatically by container/build github action
+# See: https://github.com/opencontainers/image-spec/blob/main/annotations.md
+LABEL org.opencontainers.image.authors="Ceph Release Team <ceph-maintainers@ceph.io>" \
+ org.opencontainers.image.documentation="https://docs.ceph.com/"
+
+LABEL \
+FROM_IMAGE=${FROM_IMAGE} \
+CEPH_REF=${CEPH_REF} \
+CEPH_SHA1=${CEPH_SHA1} \
+CEPH_GIT_REPO=${CEPH_GIT_REPO} \
+GANESHA_REPO_BASEURL=${GANESHA_REPO_BASEURL} \
+OSD_FLAVOR=${OSD_FLAVOR}
+
+
+#===================================================================================================
+# Install ceph and dependencies, and clean up
+# IMPORTANT: in official builds, use '--squash' build option to keep image as small as possible
+# keeping run steps separate makes local rebuilds quick, but images are big without squash option
+#===================================================================================================
+
+# Pre-reqs
+RUN dnf install -y --setopt=install_weak_deps=False epel-release jq
+
+# Add NFS-Ganesha repo
+RUN \
+ echo "[ganesha]" > /etc/yum.repos.d/ganesha.repo && \
+ echo "name=ganesha" >> /etc/yum.repos.d/ganesha.repo && \
+ echo "baseurl=${GANESHA_REPO_BASEURL}" >> /etc/yum.repos.d/ganesha.repo && \
+ echo "gpgcheck=0" >> /etc/yum.repos.d/ganesha.repo && \
+ echo "enabled=1" >> /etc/yum.repos.d/ganesha.repo
+
+# ISCSI repo
+RUN set -x && \
+ curl -s -L https://shaman.ceph.com/api/repos/tcmu-runner/main/latest/centos/9/repo?arch=$(arch) -o /etc/yum.repos.d/tcmu-runner.repo && \
+ case "${CEPH_REF}" in \
+ quincy|reef) \
+ curl -s -L https://download.ceph.com/ceph-iscsi/3/rpm/el9/ceph-iscsi.repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
+ ;;\
+ main|*) \
+ curl -s -L https://shaman.ceph.com/api/repos/ceph-iscsi/main/latest/centos/9/repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
+ ;;\
+ esac
+
+# Ceph repo
+RUN set -x && \
+ rpm --import 'https://download.ceph.com/keys/release.asc' && \
+ ARCH=$(arch); if [ "${ARCH}" == "aarch64" ]; then ARCH="arm64"; fi ;\
+ IS_RELEASE=0 ;\
+ if [[ "${CI_CONTAINER}" == "true" ]] ; then \
+ # TODO: this can return different ceph builds (SHA1) for x86 vs. arm runs. is it important to fix?
+ REPO_URL=$(curl -s "https://shaman.ceph.com/api/search/?project=ceph&distros=centos/9/${ARCH}&flavor=${OSD_FLAVOR}&ref=${CEPH_REF}&sha1=latest" | jq -r .[0].url) ;\
+ else \
+ IS_RELEASE=1 ;\
+ REPO_URL="http://download.ceph.com/rpm-${CEPH_REF}/el9/" ;\
+ fi && \
+ rpm -Uvh "$REPO_URL/noarch/ceph-release-1-${IS_RELEASE}.el9.noarch.rpm"
+
+# Copr repos
+# scikit for mgr-diskprediction-local
+# ref: https://github.com/ceph/ceph-container/pull/1821
+RUN \
+ dnf install -y --setopt=install_weak_deps=False dnf-plugins-core && \
+ dnf copr enable -y tchaikov/python-scikit-learn
+
+# Update package mgr
+RUN dnf update -y --setopt=install_weak_deps=False
+
+# Define and install packages
+# General
+RUN echo "ca-certificates" > packages.txt
+# Ceph
+# TODO: remove lua-devel and luarocks once they are present in ceph.spec.in
+# ref: https://github.com/ceph/ceph/pull/54575#discussion_r1401199635
+RUN echo \
+"ceph-common \
+ceph-exporter \
+ceph-grafana-dashboards \
+ceph-immutable-object-cache \
+ceph-mds \
+ceph-mgr-cephadm \
+ceph-mgr-dashboard \
+ceph-mgr-diskprediction-local \
+ceph-mgr-k8sevents \
+ceph-mgr-rook \
+ceph-mgr \
+ceph-mon \
+ceph-osd \
+ceph-radosgw lua-devel luarocks \
+ceph-volume \
+cephfs-mirror \
+cephfs-top \
+kmod \
+libradosstriper1 \
+rbd-mirror" \
+>> packages.txt
+
+# Optional crimson package(s)
+RUN if [ "${OSD_FLAVOR}" == "crimson" ]; then \
+ echo "ceph-crimson-osd" >> packages.txt ; \
+fi
+
+# Ceph "Recommends"
+RUN echo "nvme-cli python3-saml smartmontools" >> packages.txt
+# NFS-Ganesha
+RUN echo "\
+dbus-daemon \
+nfs-ganesha-ceph \
+nfs-ganesha-rados-grace \
+nfs-ganesha-rados-urls \
+nfs-ganesha-rgw \
+nfs-ganesha \
+rpcbind \
+sssd-client" >> packages.txt
+
+# ISCSI
+RUN echo "ceph-iscsi tcmu-runner python3-rtslib" >> packages.txt
+
+# Ceph-CSI
+# TODO: coordinate with @Madhu-1 to have Ceph-CSI install these itself if unused by ceph
+# @adk3798 does cephadm use these?
+RUN echo "attr ceph-fuse rbd-nbd" >> packages.txt
+
+# Rook (only if packages must be in ceph container image)
+RUN echo "systemd-udev" >> packages.txt
+
+# Util packages (should be kept to only utils that are truly very useful)
+# 'sgdisk' (from gdisk) is used in docs and scripts for clearing disks (could be a risk? @travisn @guits @ktdreyer ?)
+# 'ps' (from procps-ng) and 'hostname' are very valuable for debugging and CI
+# TODO: remove sg3_utils once they are moved to ceph.spec.in with libstoragemgmt
+# ref: https://github.com/ceph/ceph-container/pull/2013#issuecomment-1248606472
+RUN echo "gdisk hostname procps-ng sg3_utils e2fsprogs lvm2 gcc" >> packages.txt
+
+# scikit
+RUN echo "python3-scikit-learn" >> packages.txt
+
+# ceph-node-proxy
+RUN echo "ceph-node-proxy" >> packages.txt
+
+RUN echo "=== PACKAGES TO BE INSTALLED ==="; cat packages.txt
+RUN echo "=== INSTALLING ===" ; \
+dnf install -y --setopt=install_weak_deps=False --setopt=skip_missing_names_on_install=False --enablerepo=crb $(cat packages.txt)
+
+# XXX why isn't this done in the ganesha package?
+RUN mkdir -p /var/run/ganesha
+
+# Disable sync with udev since the container can not contact udev
+RUN \
+ sed -i -e 's/udev_rules = 1/udev_rules = 0/' \
+ -e 's/udev_sync = 1/udev_sync = 0/' \
+ -e 's/obtain_device_list_from_udev = 1/obtain_device_list_from_udev = 0/' \
+ /etc/lvm/lvm.conf && \
+ # validate the sed command worked as expected
+ grep -sqo "udev_sync = 0" /etc/lvm/lvm.conf && \
+ grep -sqo "udev_rules = 0" /etc/lvm/lvm.conf && \
+ grep -sqo "obtain_device_list_from_udev = 0" /etc/lvm/lvm.conf
+
+# CLEAN UP!
+RUN set -x && \
+ dnf clean all && \
+ rm -rf /var/cache/dnf/* && \
+ rm -rf /var/lib/dnf/* && \
+ rm -f /var/lib/rpm/__db* && \
+ # remove unnecessary files with big impact
+ rm -rf /etc/selinux /usr/share/{doc,man,selinux} && \
+ # don't keep compiled python binaries
+ find / -xdev \( -name "*.pyc" -o -name "*.pyo" \) -delete
+
+# Verify that the packages installed haven't been accidentally cleaned, then
+# clean the package list and re-clean unnecessary RPM database files
+RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.txt
+
+#
+# Set some envs in the container for quickly inspecting details about the build at runtime
+ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \
+ CEPH_REF="${CEPH_REF}" \
+ CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \
+ FROM_IMAGE="${FROM_IMAGE}"
+
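
For CI builds, the Containerfile above resolves its ceph-release repo URL from the shaman API with curl and jq. For illustration only, a rough Python equivalent of that lookup (it assumes the third-party requests package; the ref, arch, and flavor defaults are just examples):

    import requests

    # Mirrors the curl + `jq -r .[0].url` step in the Containerfile's CI branch.
    def shaman_repo_url(ref="main", arch="x86_64", flavor="default"):
        url = ("https://shaman.ceph.com/api/search/"
               f"?project=ceph&distros=centos/9/{arch}"
               f"&flavor={flavor}&ref={ref}&sha1=latest")
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return resp.json()[0]["url"]

    print(shaman_repo_url())
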
diff --git a/container/build.sh b/container/build.sh
new file mode 100755
index 00000000000..5edf469d2d2
--- /dev/null
+++ b/container/build.sh
@@ -0,0 +1,175 @@
+#!/bin/bash -ex
+# vim: ts=4 sw=4 expandtab
+
+# repo auth with write perms must be present (this script does not log into
+# CONTAINER_REPO_HOSTNAME and CONTAINER_REPO_ORGANIZATION).
+# If NO_PUSH is set, no login is necessary
+
+
+CFILE=${1:-Containerfile}
+shift || true
+
+usage() {
+ cat << EOF
+$0 [containerfile] (defaults to 'Containerfile')
+For a CI build (from ceph-ci.git, built and pushed to shaman):
+CI_CONTAINER: must be 'true'
+FLAVOR (OSD flavor, default or crimson)
+BRANCH (of Ceph. <remote>/<ref>)
+CEPH_SHA1 (of Ceph)
+ARCH (of build host, and resulting container)
+CONTAINER_REPO_HOSTNAME (quay.ceph.io, for CI, for instance)
+CONTAINER_REPO_ORGANIZATION (ceph-ci, for CI, for instance)
+CONTAINER_REPO_USERNAME
+CONTAINER_REPO_PASSWORD
+
+For a release build: (from ceph.git, built and pushed to download.ceph.com)
+CI_CONTAINER: must be 'false'
+and you must also add
+VERSION (for instance, 19.1.0) for tagging the image
+
+You can avoid the push step (for testing) by setting NO_PUSH to anything
+EOF
+}
+
+CI_CONTAINER=${CI_CONTAINER:-false}
+FLAVOR=${FLAVOR:-default}
+# default: current checked-out branch
+BRANCH=${BRANCH:-$(git rev-parse --abbrev-ref HEAD)}
+# default: current checked-out branch
+CEPH_SHA1=${CEPH_SHA1:-$(git rev-parse HEAD)}
+# default: build host arch
+ARCH=${ARCH:-$(arch)}
+if [[ "${ARCH}" == "aarch64" ]] ; then ARCH=arm64; fi
+if [[ ${CI_CONTAINER} == "true" ]] ; then
+ CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io}
+ CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph/ceph-${ARCH}}
+else
+ CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.io}
+ CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph/ceph}
+ # default: most-recent annotated tag
+ VERSION=${VERSION:-$(git describe --abbrev=0)}
+fi
+
+# check for existence of all required variables
+: "${CI_CONTAINER:?}"
+: "${FLAVOR:?}"
+: "${BRANCH:?}"
+: "${CEPH_SHA1:?}"
+: "${ARCH:?}"
+: "${CONTAINER_REPO_HOSTNAME:?}"
+: "${CONTAINER_REPO_ORGANIZATION:?}"
+: "${CONTAINER_REPO_USERNAME:?}"
+: "${CONTAINER_REPO_PASSWORD:?}"
+if [[ ${CI_CONTAINER} != "true" ]] ; then : "${VERSION:?}"; fi
+
+# check for valid repo auth (if pushing)
+ORGURL=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}
+MINIMAL_IMAGE=${ORGURL}/ceph:minimal-test
+if [[ ${NO_PUSH} != "true" ]] ; then
+ podman rmi ${MINIMAL_IMAGE} || true
+ echo "FROM scratch" | podman build -f - -t ${MINIMAL_IMAGE}
+ if ! podman push ${MINIMAL_IMAGE} ; then
+ echo "Not authenticated to ${ORGURL}; need docker/podman login?"
+ exit 1
+ fi
+ podman rmi ${MINIMAL_IMAGE} | true
+fi
+
+if [[ -z "${CEPH_GIT_REPO}" ]] ; then
+ if [[ ${CI_CONTAINER} == "true" ]]; then
+ CEPH_GIT_REPO=https://github.com/ceph/ceph-ci.git
+ else
+ CEPH_GIT_REPO=https://github.com/ceph/ceph.git
+ fi
+fi
+
+# BRANCH will be, say, origin/main. remove <remote>/
+BRANCH=${BRANCH##*/}
+
+podman build --pull=newer --squash -f $CFILE -t build.sh.output \
+ --build-arg FROM_IMAGE=${FROM_IMAGE:-quay.io/centos/centos:stream9} \
+ --build-arg CEPH_SHA1=${CEPH_SHA1} \
+ --build-arg CEPH_GIT_REPO=${CEPH_GIT_REPO} \
+ --build-arg CEPH_REF=${BRANCH:-main} \
+ --build-arg OSD_FLAVOR=${FLAVOR:-default} \
+ --build-arg CI_CONTAINER=${CI_CONTAINER:-default} \
+ 2>&1
+
+image_id=$(podman image ls localhost/build.sh.output --format '{{.ID}}')
+
+# grab useful image attributes for building the tag
+#
+# the variable settings are prefixed with "export CEPH_CONTAINER_" so that
+# an eval or . can be used to put them into the environment
+#
+# PATH is removed from the output as it would cause problems for this
+# parent script and its children
+#
+# notes:
+#
+# we want .Architecture and everything in .Config.Env
+#
+# printf will not accept "\n" (is this a podman bug?)
+# so construct vars with two calls to podman inspect, joined by a newline,
+# so that vars will get the output of the first command, newline, output
+# of the second command
+#
+vars="$(podman inspect -f '{{printf "export CEPH_CONTAINER_ARCH=%v" .Architecture}}' ${image_id})
+$(podman inspect -f '{{range $index, $value := .Config.Env}}export CEPH_CONTAINER_{{$value}}{{println}}{{end}}' ${image_id})"
+vars="$(echo "${vars}" | grep -v PATH)"
+eval ${vars}
+
+# remove everything up to and including the last slash
+fromtag=${CEPH_CONTAINER_FROM_IMAGE##*/}
+# translate : to -
+fromtag=${fromtag/:/-}
+builddate=$(date +%Y%m%d)
+local_tag=${fromtag}-${CEPH_CONTAINER_CEPH_REF}-${CEPH_CONTAINER_ARCH}-${builddate}
+
+repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}
+
+if [[ ${CI_CONTAINER} == "true" ]] ; then
+ # ceph-ci conventions for remote tags:
+ # requires ARCH, BRANCH, CEPH_SHA1, FLAVOR
+ full_repo_tag=$repopath/ceph:${BRANCH}-${fromtag}-${ARCH}-devel
+ branch_repo_tag=$repopath/ceph:${BRANCH}
+ sha1_repo_tag=$repopath/ceph:${CEPH_SHA1}
+
+ if [[ "${ARCH}" == "arm64" ]] ; then
+ branch_repo_tag=${branch_repo_tag}-arm64
+ sha1_repo_tag=${sha1_repo_tag}-arm64
+ fi
+
+ podman tag ${image_id} ${full_repo_tag}
+ podman tag ${image_id} ${branch_repo_tag}
+ podman tag ${image_id} ${sha1_repo_tag}
+
+ if [[ ${FLAVOR} == "crimson" && ${ARCH} == "x86_64" ]] ; then
+ sha1_flavor_repo_tag=${sha1_repo_tag}-${FLAVOR}
+ podman tag ${image_id} ${sha1_flavor_repo_tag}
+ if [[ -z "${NO_PUSH}" ]] ; then
+ podman push ${sha1_flavor_repo_tag}
+ fi
+ exit
+ fi
+
+ if [[ -z "${NO_PUSH}" ]] ; then
+ podman push ${full_repo_tag}
+ podman push ${branch_repo_tag}
+ podman push ${sha1_repo_tag}
+ fi
+else
+ #
+ # non-CI build. Tags are like v19.1.0-20240701
+ # push to quay.ceph.io/ceph/prerelease
+ #
+ version_tag=${repopath}/prerelease/ceph-${ARCH}:${VERSION}-${builddate}
+
+ podman tag ${image_id} ${version_tag}
+ if [[ -z "${NO_PUSH}" ]] ; then
+ podman push ${image_id} ${version_tag}
+ fi
+fi
+
+
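
build.sh extracts .Architecture and everything in .Config.Env with two podman inspect template calls joined by a newline (since the Go template printf will not emit "\n"). Purely to clarify what those templates collect, here is a sketch of the same extraction via JSON output; the helper name is made up and it assumes podman is installed and the image id is valid:

    import json
    import subprocess

    def image_build_vars(image_id):
        # One `podman inspect` call, parsed as JSON instead of Go templates.
        out = subprocess.run(["podman", "inspect", image_id],
                             capture_output=True, text=True, check=True).stdout
        info = json.loads(out)[0]
        env = dict(item.split("=", 1) for item in info["Config"]["Env"])
        env.pop("PATH", None)              # PATH would clobber the caller's PATH
        env["ARCH"] = info["Architecture"]
        return {f"CEPH_CONTAINER_{key}": value for key, value in env.items()}
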
diff --git a/container/make-manifest-list.py b/container/make-manifest-list.py
new file mode 100755
index 00000000000..010dcaed2b7
--- /dev/null
+++ b/container/make-manifest-list.py
@@ -0,0 +1,164 @@
+#!/usr/bin/python3
+#
+# make a combined "manifest-list" container out of two arch-specific containers
+# searches for latest tags on HOST/{AMD,ARM}64_REPO, makes sure they refer
+# to the same Ceph SHA1, and creates a manifest-list ("fat") image on
+# MANIFEST_HOST/MANIFEST_REPO with the 'standard' set of tags.
+#
+# uses scratch local manifest LOCALMANIFEST, will be destroyed if present
+
+from datetime import datetime
+import functools
+import json
+import os
+import re
+import subprocess
+import sys
+
+# optional env vars (will default if not set)
+
+OPTIONAL_VARS = (
+ 'HOST',
+ 'AMD64_REPO',
+ 'ARM64_REPO',
+ 'MANIFEST_HOST',
+ 'MANIFEST_REPO',
+)
+
+# Manifest image. Will be destroyed if already present.
+LOCALMANIFEST = 'localhost/m'
+
+
+def dump_vars(names, vardict):
+ for name in names:
+ print(f'{name}: {vardict[name]}', file=sys.stderr)
+
+
+def run_command(args):
+ print(f'running {args}', file=sys.stderr)
+ if not isinstance(args, list):
+ args = args.split()
+ try:
+ result = subprocess.run(
+ args,
+ capture_output=True,
+ text=True,
+ check=True)
+ return True, result.stdout, result.stderr
+
+ except subprocess.CalledProcessError as e:
+ print(f"Command '{e.cmd}' returned {e.returncode}")
+ print("Error output:")
+ print(e.stderr)
+        return False, e.stdout, e.stderr
+
+
+def get_command_output(args):
+ success, stdout, stderr = run_command(args)
+ return (stdout if success else None)
+
+
+def run_command_show_failure(args):
+ success, stdout, stderr = run_command(args)
+ if not success:
+ print(f'{args} failed:', file=sys.stderr)
+ print(f'stdout:\n{stdout}')
+ print(f'stderr:\n{stderr}')
+ return success
+
+
+@functools.lru_cache
+def get_latest_tag(path):
+ latest_tag = json.loads(
+ get_command_output(f'skopeo list-tags docker://{path}')
+ )['Tags'][-1]
+ return latest_tag
+
+
+@functools.lru_cache
+def get_image_inspect(path):
+ info = json.loads(
+ get_command_output(f'skopeo inspect docker://{path}')
+ )
+ return info
+
+
+def get_sha1(info):
+ return info['Labels']['GIT_COMMIT']
+
+
+def main():
+ host = os.environ.get('HOST', 'quay.io')
+ amd64_repo = os.environ.get('AMD64_REPO', 'ceph/ceph-amd64')
+ arm64_repo = os.environ.get('ARM64_REPO', 'ceph/ceph-arm64')
+ manifest_host = os.environ.get('MANIFEST_HOST', host)
+ manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/ceph')
+ dump_vars(
+ ('host',
+ 'amd64_repo',
+ 'arm64_repo',
+ 'manifest_host',
+ 'manifest_repo',
+ ),
+ locals())
+
+ repopaths = (
+ f'{host}/{amd64_repo}',
+ f'{host}/{arm64_repo}',
+ )
+ tags = [get_latest_tag(p) for p in repopaths]
+ print(f'latest tags: amd64:{tags[0]} arm64:{tags[1]}')
+
+ # check that version of latest tag matches
+ version_re = \
+ r'v(?P<major>\d+)\.(?P<minor>\d+)\.(?P<micro>\d+)-(?P<date>\d+)'
+ versions = list()
+ for tag in tags:
+ mo = re.match(version_re, tag)
+ ver = f'{mo.group("major")}.{mo.group("minor")}.{mo.group("micro")}'
+ versions.append(ver)
+ if versions[0] != versions[1]:
+ print(
+ f'version mismatch: amd64:{versions[0]} arm64:{versions[1]}',
+ file=sys.stderr,
+ )
+ return(1)
+
+ major, minor, micro = mo.group(1), mo.group(2), mo.group(3)
+ print(f'Ceph version: {major}.{minor}.{micro}', file=sys.stderr)
+
+ # check that ceph sha1 of two arch images matches
+ paths_with_tags = [f'{p}:{t}' for (p, t) in zip(repopaths, tags)]
+ info = [get_image_inspect(p) for p in paths_with_tags]
+ sha1s = [get_sha1(i) for i in info]
+ if sha1s[0] != sha1s[1]:
+ print(
+ f'sha1 mismatch: amd64: {sha1s[0]} arm64: {sha1s[1]}',
+ file=sys.stderr,
+ )
+ builddate = [i['Created'] for i in info]
+ print(
+ f'Build dates: amd64: {builddate[0]} arm64: {builddate[1]}',
+ file=sys.stderr,
+ )
+ return(1)
+
+ # create manifest list image with the standard list of tags
+ # ignore failure on manifest rm
+ run_command(f'podman manifest rm localhost/m')
+ run_command_show_failure(f'podman manifest create localhost/m')
+ for p in paths_with_tags:
+ run_command_show_failure(f'podman manifest add m {p}')
+ base = f'{manifest_host}/{manifest_repo}'
+ for t in (
+ f'v{major}',
+ f'v{major}.{minor}',
+ f'v{major}.{minor}.{micro}',
+ f'v{major}.{minor}.{micro}-{datetime.today().strftime("%Y%m%d")}',
+ ):
+ run_command_show_failure(
+ f'podman manifest push localhost/m {base}:{t}')
+
+
+if (__name__ == '__main__'):
+ sys.exit(main())
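
To make the tag handling above concrete, the same regex and "standard set of tags" expansion can be walked through on a made-up per-arch tag; this is an illustration, not part of the script:

    import re
    from datetime import datetime

    version_re = r'v(?P<major>\d+)\.(?P<minor>\d+)\.(?P<micro>\d+)-(?P<date>\d+)'
    mo = re.match(version_re, 'v19.1.0-20240701')   # example input tag
    major, minor, micro = mo.group('major'), mo.group('minor'), mo.group('micro')
    today = datetime.today().strftime('%Y%m%d')
    print([f'v{major}',
           f'v{major}.{minor}',
           f'v{major}.{minor}.{micro}',
           f'v{major}.{minor}.{micro}-{today}'])
    # -> ['v19', 'v19.1', 'v19.1.0', 'v19.1.0-<today>']
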
diff --git a/doc/cephadm/services/smb.rst b/doc/cephadm/services/smb.rst
index abd3f4343f0..7b6a31f1c87 100644
--- a/doc/cephadm/services/smb.rst
+++ b/doc/cephadm/services/smb.rst
@@ -4,8 +4,6 @@
SMB Service
===========
-.. note:: Only the SMB3 protocol is supported.
-
.. warning::
SMB support is under active development and many features may be
diff --git a/doc/cephfs/cephfs-journal-tool.rst b/doc/cephfs/cephfs-journal-tool.rst
index 4ad7304481f..3ae1139ceac 100644
--- a/doc/cephfs/cephfs-journal-tool.rst
+++ b/doc/cephfs/cephfs-journal-tool.rst
@@ -105,12 +105,12 @@ Example: header get/set
"write_pos": 4274947,
"expire_pos": 4194304,
"trimmed_pos": 4194303,
+ "stream_format": 1,
"layout": { "stripe_unit": 4194304,
- "stripe_count": 4194304,
+ "stripe_count": 1,
"object_size": 4194304,
- "cas_hash": 4194304,
- "object_stripe_unit": 4194304,
- "pg_pool": 4194304}}
+ "pool_id": 2,
+ "pool_ns": ""}}
# cephfs-journal-tool header set trimmed_pos 4194303
Updating trimmed_pos 0x400000 -> 0x3fffff
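
Because the corrected example output above is JSON, it can also be consumed programmatically. A small sketch, assuming the tool is on PATH and prints the JSON shown in the example (newer releases may additionally require an explicit --rank argument):

    import json
    import subprocess

    # Read the journal header and report how far expire_pos trails write_pos.
    out = subprocess.run(["cephfs-journal-tool", "header", "get"],
                         capture_output=True, text=True, check=True).stdout
    header = json.loads(out)
    print("unexpired journal bytes:", header["write_pos"] - header["expire_pos"])
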
diff --git a/doc/cephfs/cephfs-mirroring.rst b/doc/cephfs/cephfs-mirroring.rst
index f54050514ed..1a157548281 100644
--- a/doc/cephfs/cephfs-mirroring.rst
+++ b/doc/cephfs/cephfs-mirroring.rst
@@ -189,6 +189,12 @@ To configure a directory for mirroring, run a command of the following form:
ceph fs snapshot mirror add <fs_name> <path>
+To list the configured directories, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph fs snapshot mirror ls <fs_name>
+
To stop mirroring directory snapshots, run a command of the following form:
.. prompt:: bash $
@@ -340,7 +346,7 @@ command is of format `filesystem-name@filesystem-id peer-uuid`::
"last_synced_snap": {
"id": 120,
"name": "snap1",
- "sync_duration": 0.079997898999999997,
+ "sync_duration": 3,
"sync_time_stamp": "274900.558797s",
"sync_bytes": 52428800
},
@@ -374,7 +380,7 @@ When a directory is currently being synchronized, the mirror daemon marks it as
"last_synced_snap": {
"id": 120,
"name": "snap1",
- "sync_duration": 0.079997898999999997,
+ "sync_duration": 3,
"sync_time_stamp": "274900.558797s",
"sync_bytes": 52428800
},
@@ -403,7 +409,7 @@ E.g., adding a regular file for synchronization would result in failed status::
"last_synced_snap": {
"id": 121,
"name": "snap2",
- "sync_duration": 300,
+ "sync_duration": 5,
"sync_time_stamp": "500900.600797s",
"sync_bytes": 78643200
},
@@ -439,7 +445,7 @@ In the remote filesystem::
"last_synced_snap": {
"id": 120,
"name": "snap1",
- "sync_duration": 0.079997898999999997,
+ "sync_duration": 3,
"sync_time_stamp": "274900.558797s"
},
"snaps_synced": 2,
@@ -513,16 +519,16 @@ CephFS exports mirroring metrics as :ref:`Labeled Perf Counters` which will be c
- The total number of snapshots renamed
* - avg_sync_time
- Gauge
- - The average time (ms) taken by all snapshot synchronizations
+ - The average time taken by all snapshot synchronizations
* - last_synced_start
- Gauge
- - The sync start time (ms) of the last synced snapshot
+ - The sync start time of the last synced snapshot
* - last_synced_end
- Gauge
- - The sync end time (ms) of the last synced snapshot
+ - The sync end time of the last synced snapshot
* - last_synced_duration
- Gauge
- - The time duration (ms) of the last synchronization
+ - The time duration of the last synchronization
* - last_synced_bytes
- counter
- The total bytes being synchronized for the last synced snapshot
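
The peer status output documented above is JSON keyed by mirrored directory. A short sketch that summarizes such a blob; the sample value below is copied from the example in the text, and the field names are assumed to match that example:

    import json

    status_json = '''{
      "/d0": {
        "state": "idle",
        "last_synced_snap": {
          "id": 120,
          "name": "snap1",
          "sync_duration": 3,
          "sync_time_stamp": "274900.558797s"
        },
        "snaps_synced": 2
      }
    }'''

    for path, info in json.loads(status_json).items():
        snap = info.get("last_synced_snap", {})
        print(f"{path}: state={info['state']} last_snap={snap.get('name')} "
              f"duration={snap.get('sync_duration')} synced={info.get('snaps_synced')}")
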
diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst
index a804a007599..e09fed213f2 100644
--- a/doc/dev/cephfs-mirroring.rst
+++ b/doc/dev/cephfs-mirroring.rst
@@ -17,12 +17,10 @@ Key Idea
--------
For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on
-readdir diff to identify changes in a directory tree. The diffs are applied to
+`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to
directory in the remote file system thereby only synchronizing files that have
changed between two snapshots.
-This feature is tracked here: https://tracker.ceph.com/issues/47034.
-
Currently, snapshot data is synchronized by bulk copying to the remote
filesystem.
@@ -407,3 +405,5 @@ Feature Status
--------------
`cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule).
+
+.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature
diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst
index cbde8779a66..7cce4c6f898 100644
--- a/doc/dev/developer_guide/essentials.rst
+++ b/doc/dev/developer_guide/essentials.rst
@@ -287,16 +287,13 @@ See :ref:`kubernetes-dev`
Backporting
-----------
-All bugfixes should be merged to the ``main`` branch before being
-backported. To flag a bugfix for backporting, make sure it has a
-`tracker issue`_ associated with it and set the ``Backport`` field to a
-comma-separated list of previous releases (e.g. "hammer,jewel") that you think
-need the backport.
-The rest (including the actual backporting) will be taken care of by the
-`Stable Releases and Backports`_ team.
+All bugfixes should be merged to the ``main`` branch before being backported.
+To flag a bugfix for backporting, make sure it has a `tracker issue`_
+associated with it and set the ``Backport`` field to a comma-separated list of
+previous releases (e.g. "hammer,jewel") that you think need the backport. You
+are responsible for the backporting of pull requests that you raise.
.. _`tracker issue`: http://tracker.ceph.com/
-.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
Dependabot
----------
diff --git a/doc/dev/radosgw/bucket_index.rst b/doc/dev/radosgw/bucket_index.rst
index 6764641e0f5..ceff57b58cf 100644
--- a/doc/dev/radosgw/bucket_index.rst
+++ b/doc/dev/radosgw/bucket_index.rst
@@ -32,7 +32,7 @@ For a given bucket, the index may be split into several rados objects, called bu
The default shard count for new buckets is 11, but can be overridden in the zonegroup's ``bucket_index_max_shards`` or ceph.conf's ``rgw_override_bucket_index_max_shards``. As the number of objects in a bucket grows, its index shard count will also increase as a result of dynamic resharding.
-Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/rgw_reshard.cc``.
+Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/driver/rados/rgw_reshard.cc``.
-----------------
Index Transaction
@@ -46,7 +46,7 @@ To keep the bucket index consistent, all object writes or deletes must also upda
Object writes and deletes may race with each other, so a given object may have more than one prepared transaction at a time. RGW considers an object entry to be 'pending' if there are any outstanding transactions, or 'completed' otherwise.
-This transaction is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``.
+This transaction is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``.
-------
Listing
@@ -56,7 +56,7 @@ When listing objects, RGW will read all entries (pending and completed) from the
If an RGW crashes in the middle of an `Index Transaction`_, an index entry may get stuck in this 'pending' state. When bucket listing encounters these pending entries, it also sends information from the head object back to the bucket index so it can update the entry and resolve its stale transactions. This message is called 'dir suggest', because the bucket index treats it as a hint or suggestion.
-Bucket listing is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``.
+Bucket listing is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``.
--------------------
S3 Object Versioning
@@ -66,9 +66,9 @@ For versioned buckets, the bucket index contains an entry for each object versio
RGW stores a head object in the rgw.buckets.data pool for each object version. This rados object's oid is a combination of the object name and its version id.
-In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::follow_olh()``.
+In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::follow_olh()``.
-To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index.
+To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/driver/rados/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index.
.. _ListObjectsV2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html
.. _ListObjectVersions: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html
diff --git a/doc/governance.rst b/doc/governance.rst
index 95e1c878028..bc88560f18a 100644
--- a/doc/governance.rst
+++ b/doc/governance.rst
@@ -29,7 +29,7 @@ Responsibilities
* Point of contact for the project
* Representatives for Ceph foundation board meetings
* Ensure things get done
-
+
Membership
----------
@@ -82,7 +82,7 @@ Current Members
* Casey Bodley <cbodley@redhat.com>
* Dan van der Ster <dan.vanderster@clyso.com>
* David Orman <ormandj@1111systems.com>
- * Ernesto Puerta <epuerta@redhat.com>
+ * Ernesto Puerta <epuertat@redhat.com>
* Gregory Farnum <gfarnum@redhat.com>
* Haomai Wang <haomai@xsky.com>
* Ilya Dryomov <idryomov@redhat.com>
@@ -96,14 +96,25 @@ Current Members
* Mike Perez <miperez@redhat.com>
* Myoungwon Oh <myoungwon.oh@samsung.com>
* Neha Ojha <nojha@redhat.com>
- * Patrick Donnelly <pdonnell@redhat.com>
+ * Patrick Donnelly <pdonnell@ibm.com>
* Sam Just <sjust@redhat.com>
* Vikhyat Umrao <vikhyat@redhat.com>
* Xie Xingguo <xie.xingguo@zte.com.cn>
* Yehuda Sadeh <yehuda@redhat.com>
* Yingxin Cheng <yingxin.cheng@intel.com>
* Yuri Weinstein <yweinste@redhat.com>
- * Zac Dover <zac.dover@gmail.com>
+ * Zac Dover <zac.dover@proton.me>
+ * Laura Flores <lflores@redhat.com>
+ * Venky Shankar <vshankar@redhat.com>
+ * Guillaume Abrioux <gabrioux@redhat.com>
+ * Anthony D'Atri <anthony.datri@gmail.com>
+ * Joseph Mundackal <jmundackal@bloomberg.net>
+ * Gaurav Sitlani <gsitlani@ibm.com>
+ * Afreen Misbah <afreen@ibm.com>
+ * Radoslaw Zarzynski <rzarzyns@redhat.com>
+ * Matan Breizman <mbreizma@redhat.com>
+ * Yaarit Hatuka <yhatuka@ibm.com>
+ * Adam C. Emerson <aemerson@redhat.com>
.. _ctl:
diff --git a/doc/man/8/mount.ceph.rst b/doc/man/8/mount.ceph.rst
index 7ecdeb5e852..553e190bdac 100644
--- a/doc/man/8/mount.ceph.rst
+++ b/doc/man/8/mount.ceph.rst
@@ -192,12 +192,13 @@ Advanced
:command:`wsync`
Execute all namespace operations synchronously. This ensures that the
namespace operation will only complete after receiving a reply from
- the MDS. This is the default.
+ the MDS.
:command:`nowsync`
Allow the client to do namespace operations asynchronously. When this
option is enabled, a namespace operation may complete before the MDS
- replies, if it has sufficient capabilities to do so.
+ replies, if it has sufficient capabilities to do so. This has been the
+ default since kernel version 5.16.
:command:`crush_location=x`
Specify the location of the client in terms of CRUSH hierarchy (since 5.8).
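A hedged sketch of selecting either ``wsync`` or ``nowsync`` behaviour on a kernel mount; the device string, filesystem name and mount point are placeholders:
    # Force synchronous namespace operations (the behaviour before kernel 5.16).
    mount -t ceph cephuser@.cephfs=/ /mnt/cephfs -o wsync
    # Explicitly request asynchronous namespace operations (the 5.16+ default).
    mount -t ceph cephuser@.cephfs=/ /mnt/cephfs -o nowsync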
diff --git a/doc/mgr/smb.rst b/doc/mgr/smb.rst
index 05e6369ddf1..3252c485a9a 100644
--- a/doc/mgr/smb.rst
+++ b/doc/mgr/smb.rst
@@ -96,6 +96,11 @@ clustering
enables clustering regardless of the placement count. A value of ``never``
disables clustering regardless of the placement count. If unspecified,
``default`` is assumed.
+public_addrs
+ Optional. A string in the form of <ipaddress/prefixlength>[%<destination interface>].
+  Supported only when using Samba's clustering. Assigns "virtual" IP
+  addresses that are managed by the clustering subsystem and may automatically
+  move between nodes running Samba containers.
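A minimal sketch of supplying such an address when creating a clustered instance; the flag spellings and the address below are assumptions for illustration, not a verified invocation:
    # Hypothetical example: one virtual IP on a /24, tied to interface eth0.
    ceph smb cluster create mycluster user --define-user-pass=user1%mypasswd \
        --clustering=always --public-addrs=192.168.1.101/24%eth0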
Remove Cluster
++++++++++++++
diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst
index c678784249f..edc6a90b0f9 100644
--- a/doc/radosgw/config-ref.rst
+++ b/doc/radosgw/config-ref.rst
@@ -149,7 +149,6 @@ file under each ``[client.radosgw.{instance-name}]`` instance.
.. confval:: rgw_run_sync_thread
.. confval:: rgw_data_log_window
.. confval:: rgw_data_log_changes_size
-.. confval:: rgw_data_log_obj_prefix
.. confval:: rgw_data_log_num_shards
.. confval:: rgw_md_log_max_shards
.. confval:: rgw_data_sync_poll_interval
diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst
index 6a21b7479e6..d6925c8ed9c 100644
--- a/doc/radosgw/multisite.rst
+++ b/doc/radosgw/multisite.rst
@@ -507,7 +507,7 @@ For example:
Updating the Period
-------------------
-After updating the master zone configuration, update the period:
+After updating the secondary zone configuration, update the period:
.. prompt:: bash #
diff --git a/make-dist b/make-dist
index e874436a5e7..033bedebd87 100755
--- a/make-dist
+++ b/make-dist
@@ -23,7 +23,7 @@ version=$1
[ -z "$version" ] && version=$(git describe --long --match 'v*' | sed 's/^v//')
if expr index $version '-' > /dev/null; then
rpm_version=$(echo $version | cut -d - -f 1-1)
- rpm_release=$(echo $version | cut -d - -f 2- | sed 's/-/./')
+ rpm_release=$(echo $version | cut -d - -f 2- | sed 's/-/./g')
else
rpm_version=$version
rpm_release=0
diff --git a/qa/README b/qa/README
index f9b8988c6f9..a6a95c479bc 100644
--- a/qa/README
+++ b/qa/README
@@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or
ubuntu, chosen randomly.
The teuthology code can be found in https://github.com/ceph/teuthology.git
+
+Note: The performance suites clone CBT from the master branch at https://github.com/ceph/cbt.git.
+CBT does not support cosbench beyond release tag v0.3, so no qa suite should use cosbench;
+cosbench support has been removed from qa/tasks/cbt.py.
+
diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml
index 678548fe2cc..94b42579777 100644
--- a/qa/cephfs/overrides/ignorelist_health.yaml
+++ b/qa/cephfs/overrides/ignorelist_health.yaml
@@ -21,3 +21,6 @@ overrides:
- overall HEALTH_
- Replacing daemon
- deprecated feature inline_data
+ - BLUESTORE_SLOW_OP_ALERT
+ - slow operation indications in BlueStore
+ - experiencing slow operations in BlueStore
diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml
index 1740134a2e0..07ca62e01fb 100644
--- a/qa/cephfs/overrides/pg_health.yaml
+++ b/qa/cephfs/overrides/pg_health.yaml
@@ -9,3 +9,5 @@ overrides:
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 4eac1106e8d..843e9b9901b 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -234,146 +234,6 @@ function wait_background_check() {
return $return_code
}
-# osd_scrub_during_recovery=true make sure scrub happens
-# update 26.8.24: the test should be redesigned. The current version is not
-# reliable, and playing around with the timeouts and such won't fix the
-# design issues.
-function TEST_recovery_scrub_2() {
- local dir=$1
- local poolname=test
- return 0
-
- TESTDATA="testdata.$$"
- OSDS=8
- PGS=32
- OBJECTS=40
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
- run_mgr $dir x --mgr_stats_period=1 || return 1
- local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 "
- ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
- ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
- ceph_osd_args+="--osd_stats_update_period_scrubbing=2 "
- ceph_osd_args+="--mgr_stats_period=1"
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=1 \
- $ceph_osd_args || return 1
- done
-
- # Create a pool with $PGS pgs
- create_pool $poolname $PGS $PGS
- wait_for_clean || return 1
- poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
- dd if=/dev/urandom of=$TESTDATA bs=1M count=50
- for i in $(seq 1 $OBJECTS)
- do
- rados -p $poolname put obj${i} $TESTDATA
- done
- rm -f $TESTDATA
-
- ceph osd pool set $poolname size 3
-
- ceph pg dump pgs
-
- # note that the following will be needed if the mclock scheduler is specified
- ceph tell osd.* config get osd_mclock_override_recovery_settings
-
- # the '_max_active' is expected to be 0
- ceph tell osd.1 config get osd_recovery_max_active
- # both next parameters are expected to be >=3
- ceph tell osd.1 config set osd_recovery_max_active_hdd 6
- ceph tell osd.1 config set osd_recovery_max_active_ssd 6
- ceph tell osd.1 config get osd_recovery_max_active_hdd
- ceph tell osd.1 config get osd_recovery_max_active_ssd
-
- # Wait for recovery to start
- count=0
- while(true)
- do
- #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
- ceph pg dump pgs
- if test $(ceph --format json pg dump pgs |
- jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
- then
- break
- fi
- sleep 2
- if test "$count" -eq "10"
- then
- echo "Not enough recovery started simultaneously"
- return 1
- fi
- count=$(expr $count + 1)
- done
- ceph pg dump pgs
-
- pids=""
- recov_scrub_count=0
- for pg in $(seq 0 $(expr $PGS - 1))
- do
- run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
- done
- wait_background_check pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- ERRORS=0
- if test $recov_scrub_count -eq 0
- then
- echo "No scrubs occurred while PG recovering"
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
- pid=$(cat $pidfile)
- if ! kill -0 $pid
- then
- echo "OSD crash occurred"
- #tail -100 $dir/osd.0.log
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- # Work around for http://tracker.ceph.com/issues/38195
- kill_daemons $dir #|| return 1
-
- declare -a err_strings
- ## we do not expect a refusal to scrub
- err_strings[0]="recovery in progress.*scrubs"
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- grep "recovery in progress.*scrubs" $dir/osd.${osd}.log
- done
- for err_string in "${err_strings[@]}"
- do
- found=false
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
- then
- found=true
- fi
- done
- if [ "$found" = "true" ]; then
- echo "Found log message not expected '$err_string'"
- ERRORS=$(expr $ERRORS + 1)
- fi
- done
-
- teardown $dir || return 1
-
- if [ $ERRORS != "0" ];
- then
- echo "TEST FAILED WITH $ERRORS ERRORS"
- return 1
- fi
-
- echo "TEST PASSED"
- return 0
-}
-
main osd-recovery-scrub "$@"
# Local Variables:
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 59564f7e37e..491e46603f7 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() {
['pool_name']="testpool"
['extras']=" --osd_scrub_auto_repair=true"
)
- local extr_dbg=3
standard_scrub_cluster $dir cluster_conf
local poolid=${cluster_conf['pool_id']}
local poolname=${cluster_conf['pool_name']}
@@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() {
grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1
}
+#
+# Testing the "split scrub store" feature: shallow scrubs do not
+# purge deep errors from the store.
+#
+# Corrupt one copy of a replicated pool, creating both shallow and deep errors.
+# Then shallow-scrub the pool and verify that the deep errors are still present.
+#
+function TEST_dual_store_replicated_cluster() {
+ local dir=$1
+ local poolname=csr_pool
+ local total_objs=19
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 "
+ for osd in $(seq 0 1)
+ do
+ run_osd $dir $osd $ceph_osd_args || return 1
+ done
+
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ create_pool foo 1 || return 1
+ create_pool $poolname 1 1 || return 1
+ wait_for_clean || return 1
+
+ ceph osd pool set $poolname noscrub 1
+ ceph osd pool set $poolname nodeep-scrub 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+ add_something $dir $poolname $objname || return 1
+
+ rados --pool $poolname setomapheader $objname hdr-$objname || return 1
+ rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1
+ done
+
+    # Increase the size of ROBJ19 to 1MB + 1KB
+ dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025
+ rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1
+ rm -f $dir/new.ROBJ19
+
+ local pg=$(get_pg $poolname ROBJ0)
+ local primary=$(get_primary $poolname ROBJ0)
+
+ # Compute an old omap digest and save oi
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ pg_deep_scrub $pg
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+
+ # Alternate corruption between osd.0 and osd.1
+ local osd=$(expr $i % 2)
+
+ case $i in
+ 1)
+ # Size (deep scrub data_digest too)
+ local payload=UVWXYZZZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 2)
+ # digest (deep scrub only)
+ local payload=UVWXYZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 3)
+ # missing
+ objectstore_tool $dir $osd $objname remove || return 1
+ ;;
+
+ 4)
+ # Modify omap value (deep scrub only)
+ objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1
+ ;;
+
+ 5)
+ # Delete omap key (deep scrub only)
+ objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1
+ ;;
+
+ 6)
+ # Add extra omap key (deep scrub only)
+ echo extra > $dir/extra-val
+ objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1
+ rm $dir/extra-val
+ ;;
+
+ 7)
+ # Modify omap header (deep scrub only)
+ echo -n newheader > $dir/hdr
+ objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1
+ rm $dir/hdr
+ ;;
+
+ 8)
+ rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
+ rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
+
+ # Break xattrs
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
+ objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
+ echo -n val3-$objname > $dir/newval
+ objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
+ rm $dir/bad-val $dir/newval
+ ;;
+
+ 9)
+ objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi
+ echo -n D > $dir/change
+ rados --pool $poolname put $objname $dir/change
+ objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi
+          rm $dir/robj9-oi $dir/change
+ ;;
+
+ # ROBJ10 must be handled after digests are re-computed by a deep scrub below
+ # ROBJ11 must be handled with config change before deep scrub
+ # ROBJ12 must be handled with config change before scrubs
+ # ROBJ13 must be handled before scrubs
+
+ 14)
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1
+ objectstore_tool $dir 1 $objname rm-attr _ || return 1
+ rm $dir/bad-val
+ ;;
+
+ 15)
+ objectstore_tool $dir $osd $objname rm-attr _ || return 1
+ ;;
+
+ 16)
+ objectstore_tool $dir 0 $objname rm-attr snapset || return 1
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1
+ ;;
+
+ 17)
+          # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ17
+ echo $payload > $dir/new.ROBJ17
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1
+ ;;
+
+ 18)
+          # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ18
+ echo $payload > $dir/new.ROBJ18
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1
+ # Make one replica have a different object info, so a full repair must happen too
+ objectstore_tool $dir $osd $objname corrupt-info || return 1
+ ;;
+
+ 19)
+ # Set osd-max-object-size smaller than this object's size
+
+ esac
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ ceph tell osd.\* injectargs -- --osd-max-object-size=1048576
+
+ inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+ inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+ inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+
+ # first sequence: the final shallow scrub should not override any of the deep errors
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_1b_s.json
+
+ pg_deep_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_2s.json
+
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_3s.json
+
+ diff -u $dir/dp_results.json $dir/sh2_results.json || return 1
+
+ # inject a read error, which is a special case: the scrub encountering the read error
+ # would override the previously collected shard info.
+ inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+
+ pg_deep_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json
+ # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s.json
+
+ pg_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \
+ $dir/sh2Part2_w13_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json
+
+ # the shallow scrub results should differ from the results of the deep
+ # scrub preceding it, but the difference should be limited to ROBJ13
+ diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1
+ diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1
+
+ ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it
+ return 0
+}
+
main osd-scrub-repair "$@"
diff --git a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
index 0f6021975a4..50d170f5022 100644
--- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml
+++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
@@ -10,3 +10,4 @@ tasks:
osd:
debug monc: 20
flavor: crimson
+- ssh_keys:
diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml
index da841373220..42ca9336c8e 100644
--- a/qa/suites/fs/libcephfs/tasks/client.yaml
+++ b/qa/suites/fs/libcephfs/tasks/client.yaml
@@ -12,3 +12,4 @@ tasks:
clients:
client.0:
- client/test.sh
+ - client/test_oc_disabled.sh
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
index 2e4741e8140..7c97edae552 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
index 2e873a04bab..9ef37004427 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
index 83d16e4cb2c..12cb50b408d 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
index 6db0c0d4e18..b4755a6433b 100644
--- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20 # each subsystem
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/rbd/iscsi/0-single-container-host.yaml b/qa/suites/rbd/iscsi/0-single-container-host.yaml
deleted file mode 120000
index 7406e749cf5..00000000000
--- a/qa/suites/rbd/iscsi/0-single-container-host.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/single-container-host.yaml \ No newline at end of file
diff --git a/qa/suites/rbd/iscsi/base/install.yaml b/qa/suites/rbd/iscsi/base/install.yaml
index 5c5a6c31f60..cca178cafe8 100644
--- a/qa/suites/rbd/iscsi/base/install.yaml
+++ b/qa/suites/rbd/iscsi/base/install.yaml
@@ -9,6 +9,10 @@ tasks:
- ceph orch host ls
- ceph orch device ls
- install:
- extra_packages:
+ extra_system_packages:
+ deb:
+ - open-iscsi
+ - multipath-tools
+ rpm:
- iscsi-initiator-utils
- device-mapper-multipath
diff --git a/qa/suites/rbd/iscsi/supported-container-hosts$ b/qa/suites/rbd/iscsi/supported-container-hosts$
new file mode 120000
index 00000000000..30a61f1575f
--- /dev/null
+++ b/qa/suites/rbd/iscsi/supported-container-hosts$
@@ -0,0 +1 @@
+.qa/distros/supported-container-hosts/ \ No newline at end of file
diff --git a/qa/suites/rgw/multifs/0-install.yaml b/qa/suites/rgw/multifs/0-install.yaml
new file mode 100644
index 00000000000..7e83140e64a
--- /dev/null
+++ b/qa/suites/rgw/multifs/0-install.yaml
@@ -0,0 +1,5 @@
+tasks:
+- install:
+- ceph:
+- rgw: [client.0]
+- tox: [client.0]
diff --git a/qa/suites/rgw/multifs/tasks/+ b/qa/suites/rgw/multifs/tasks/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rgw/multifs/tasks/+
diff --git a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
index e07c8b5ccfe..d9526c365c1 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_bucket_quota.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
index bac4f401626..ae32e928661 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_multipart_upload.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
index 66bdff817f5..184555660dc 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml
@@ -1,8 +1,4 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- tox: [client.0]
- ragweed:
client.0:
default-branch: ceph-master
diff --git a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
index 92355f04963..573cffbc30a 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml
@@ -1,8 +1,4 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
-- tox: [client.0]
- s3tests:
client.0:
rgw_server: client.0
diff --git a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
index 92c63d2e850..393180e5c17 100644
--- a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
+++ b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml
@@ -1,13 +1,5 @@
tasks:
-- install:
-- ceph:
-- rgw: [client.0]
- workunit:
clients:
client.0:
- rgw/s3_user_quota.pl
-overrides:
- ceph:
- conf:
- client:
- rgw relaxed s3 bucket names: true
diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
index 3814ea3efdb..146bd57960d 100644
--- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
@@ -31,3 +31,5 @@ overrides:
conf:
osd:
osd shutdown pgref assert: true
+ log-ignorelist:
+ - PG_DEGRADED
diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
index bf3005fad45..ce4e0cc228b 100644
--- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
@@ -6,6 +6,7 @@ overrides:
- MON_DOWN
- out of quorum
- PG_AVAILABILITY
+ - PG_DEGRADED
tasks:
- install:
branch: reef
diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py
index 84e096520b4..e6a9dc8223c 100644
--- a/qa/tasks/cbt.py
+++ b/qa/tasks/cbt.py
@@ -47,22 +47,11 @@ class CBT(Task):
benchmark_config = self.config.get('benchmarks')
benchmark_type = next(iter(benchmark_config.keys()))
+
if benchmark_type in ['librbdfio', 'fio']:
testdir = misc.get_testdir(self.ctx)
benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio')
- if benchmark_type == 'cosbench':
- # create cosbench_dir and cosbench_xml_dir
- testdir = misc.get_testdir(self.ctx)
- benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos')
- benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml')
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']])
- benchmark_config['cosbench']['controller'] = osd_hosts[0]
-
- # set auth details
- remotes_and_roles = self.ctx.cluster.remotes.items()
- ips = [host for (host, port) in
- (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
- benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0])
+
client_endpoints_config = self.config.get('client_endpoints', None)
monitoring_profiles = self.config.get('monitoring_profiles', {})
@@ -117,77 +106,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- # install cosbench
- self.log.info('install dependencies for cosbench')
- if system_type == 'rpm':
- cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl']
- else:
- cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl']
- self.first_mon.run(args=install_cmd + cosbench_depends)
- testdir = misc.get_testdir(self.ctx)
- cosbench_version = '0.4.2.c3'
- cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip'
- os_version = misc.get_system_type(self.first_mon, False, True)
-
- # additional requirements for bionic
- if os_version == '18.04':
- self.first_mon.run(
- args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*'])
- # use our own version of cosbench
- cosbench_version = 'cosbench-0.4.2.c3.1'
- # contains additional parameter "-N" to nc
- cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip'
- cosbench_dir = os.path.join(testdir, cosbench_version)
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir])
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version
- ]
- )
- else:
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version)
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'ln', '-s', cosbench_version, 'cos',
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'),
- 'chmod', '+x', run.Raw('*.sh'),
- ]
- )
-
- # start cosbench and check info
- self.log.info('start cosbench')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'start-all.sh'
- ]
- )
- self.log.info('check cosbench info')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'cli.sh', 'info'
- ]
- )
-
def checkout_cbt(self):
testdir = misc.get_testdir(self.ctx)
repo = self.config.get('repo', 'https://github.com/ceph/cbt.git')
@@ -269,51 +187,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- os_version = misc.get_system_type(self.first_mon, False, True)
- if os_version == '18.04':
- cosbench_version = 'cosbench-0.4.2.c3.1'
- else:
- cosbench_version = '0.4.2.c3'
- # note: stop-all requires 'nc'
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'stop-all.sh',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'sudo', 'killall', '-9', 'java',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/cos'.format(tdir=testdir),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/xml'.format(tdir=testdir),
- ]
- )
# Collect cbt performance data
cbt_performance = CBTperformance()
cbt_performance.collect(self.ctx, self.config)
diff --git a/qa/tasks/ceph_iscsi_client.py b/qa/tasks/ceph_iscsi_client.py
index 189b7fa31fe..0b0a355f925 100644
--- a/qa/tasks/ceph_iscsi_client.py
+++ b/qa/tasks/ceph_iscsi_client.py
@@ -31,8 +31,15 @@ def task(ctx, config):
remote.run(args=['sudo', 'systemctl', 'restart', 'iscsid'])
remote.run(args=['sudo', 'modprobe', 'dm_multipath'])
- remote.run(args=['sudo', 'mpathconf', '--enable'])
conf = dedent('''
+ defaults {
+ user_friendly_names yes
+ find_multipaths yes
+ }
+
+ blacklist {
+ }
+
devices {
device {
vendor "LIO-ORG"
@@ -50,7 +57,7 @@ def task(ctx, config):
}
''')
path = "/etc/multipath.conf"
- remote.sudo_write_file(path, conf, append=True)
+ remote.sudo_write_file(path, conf)
remote.run(args=['sudo', 'systemctl', 'start', 'multipathd'])
yield
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index ec06e38d78e..6a583cb4d0f 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -324,6 +324,8 @@ class TestFsStatus(TestAdminCommands):
Test "ceph fs status subcommand.
"""
+ MDSS_REQUIRED = 3
+
def test_fs_status(self):
"""
That `ceph fs status` command functions.
@@ -338,6 +340,31 @@ class TestFsStatus(TestAdminCommands):
mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"]
self.assertEqual(mdsmap[0]["state"], "active")
+ def test_fs_status_standby_replay(self):
+ """
+        That `ceph fs status` command functions with standby-replay daemons configured.
+ """
+
+ self.fs.set_allow_standby_replay(True)
+
+ s = self.get_ceph_cmd_stdout("fs", "status")
+ self.assertTrue("active" in s)
+ self.assertTrue("standby-replay" in s)
+ self.assertTrue("0-s" in s)
+ self.assertTrue("standby" in s)
+
+ mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json-pretty"))["mdsmap"]
+ self.assertEqual(mdsmap[0]["state"], "active")
+ self.assertEqual(mdsmap[1]["state"], "standby-replay")
+ self.assertEqual(mdsmap[1]["rank"], "0-s")
+ self.assertEqual(mdsmap[2]["state"], "standby")
+
+ mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"]
+ self.assertEqual(mdsmap[0]["state"], "active")
+ self.assertEqual(mdsmap[1]["state"], "standby-replay")
+ self.assertEqual(mdsmap[1]["rank"], "0-s")
+ self.assertEqual(mdsmap[2]["state"], "standby")
+
class TestAddDataPool(TestAdminCommands):
"""
diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py
index 55de1c7b928..078db6a4a6d 100644
--- a/qa/tasks/cephfs/test_mirroring.py
+++ b/qa/tasks/cephfs/test_mirroring.py
@@ -432,6 +432,34 @@ class TestMirroring(CephFSTestCase):
self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
self.mount_a.run_shell(["rmdir", "d1"])
+ def test_directory_command_ls(self):
+ dir1 = 'dls1'
+ dir2 = 'dls2'
+ self.mount_a.run_shell(["mkdir", dir1])
+ self.mount_a.run_shell(["mkdir", dir2])
+ self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
+ try:
+ self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}')
+ self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}')
+ time.sleep(10)
+ dirs_list = json.loads(self.get_ceph_cmd_stdout("fs", "snapshot", "mirror", "ls", self.primary_fs_name))
+ # verify via asok
+ res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}')
+ dir_count = res['snap_dirs']['dir_count']
+ self.assertTrue(len(dirs_list) == dir_count and f'/{dir1}' in dirs_list and f'/{dir2}' in dirs_list)
+ except CommandFailedError:
+ raise RuntimeError('Error listing directories')
+ except AssertionError:
+ raise RuntimeError('Wrong number of directories listed')
+ finally:
+ self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}')
+ self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}')
+
+ self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
+ self.mount_a.run_shell(["rmdir", dir1])
+ self.mount_a.run_shell(["rmdir", dir2])
+
def test_add_relative_directory_path(self):
self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
try:
@@ -560,7 +588,7 @@ class TestMirroring(CephFSTestCase):
# create a bunch of files in a directory to snap
self.mount_a.run_shell(["mkdir", "d0"])
- for i in range(50):
+ for i in range(100):
self.mount_a.write_n_mb(os.path.join('d0', f'file.{i}'), 1)
self.enable_mirroring(self.primary_fs_name, self.primary_fs_id)
@@ -574,7 +602,7 @@ class TestMirroring(CephFSTestCase):
# take a snapshot
self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- time.sleep(30)
+ time.sleep(60)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
"client.mirror_remote@ceph", '/d0', 'snap0', 1)
self.verify_snapshot('d0', 'snap0')
@@ -586,10 +614,10 @@ class TestMirroring(CephFSTestCase):
self.assertGreater(second["counters"]["last_synced_start"], first["counters"]["last_synced_start"])
self.assertGreater(second["counters"]["last_synced_end"], second["counters"]["last_synced_start"])
self.assertGreater(second["counters"]["last_synced_duration"], 0)
- self.assertEquals(second["counters"]["last_synced_bytes"], 52428800) # last_synced_bytes = 50 files of 1MB size each
+ self.assertEquals(second["counters"]["last_synced_bytes"], 104857600) # last_synced_bytes = 100 files of 1MB size each
# some more IO
- for i in range(75):
+ for i in range(150):
self.mount_a.write_n_mb(os.path.join('d0', f'more_file.{i}'), 1)
time.sleep(60)
@@ -597,7 +625,7 @@ class TestMirroring(CephFSTestCase):
# take another snapshot
self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"])
- time.sleep(60)
+ time.sleep(120)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
"client.mirror_remote@ceph", '/d0', 'snap1', 2)
self.verify_snapshot('d0', 'snap1')
@@ -609,7 +637,7 @@ class TestMirroring(CephFSTestCase):
self.assertGreater(third["counters"]["last_synced_start"], second["counters"]["last_synced_end"])
self.assertGreater(third["counters"]["last_synced_end"], third["counters"]["last_synced_start"])
self.assertGreater(third["counters"]["last_synced_duration"], 0)
- self.assertEquals(third["counters"]["last_synced_bytes"], 78643200) # last_synced_bytes = 75 files of 1MB size each
+ self.assertEquals(third["counters"]["last_synced_bytes"], 157286400) # last_synced_bytes = 150 files of 1MB size each
# delete a snapshot
self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
@@ -1372,7 +1400,7 @@ class TestMirroring(CephFSTestCase):
self.mount_b.umount_wait()
self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name)
- # create a bunch of files in a directory to snap
+ # create some large files in 3 directories to snap
self.mount_a.run_shell(["mkdir", "d0"])
self.mount_a.run_shell(["mkdir", "d1"])
self.mount_a.run_shell(["mkdir", "d2"])
@@ -1395,30 +1423,38 @@ class TestMirroring(CephFSTestCase):
vbefore = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0]
# take snapshots
log.debug('taking snapshots')
- self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+ snap_name = "snap0"
+ self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"])
- time.sleep(10)
log.debug('checking snap in progress')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d0', 'snap0')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d1', 'snap0')
- self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d2', 'snap0')
+ peer_spec = "client.mirror_remote@ceph"
+ peer_uuid = self.get_peer_uuid(peer_spec)
+ with safe_while(sleep=3, tries=100, action=f'wait for status: {peer_spec}') as proceed:
+ while proceed():
+ res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}',
+ 'fs', 'mirror', 'peer', 'status',
+ f'{self.primary_fs_name}@{self.primary_fs_id}',
+ peer_uuid)
+ if ('syncing' == res["/d0"]['state'] and 'syncing' == res["/d1"]['state'] and \
+ 'syncing' == res["/d2"]['state']):
+ break
- log.debug('removing directories 1')
+ log.debug('removing directory 1')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0')
- log.debug('removing directories 2')
+ log.debug('removing directory 2')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1')
- log.debug('removing directories 3')
+ log.debug('removing directory 3')
self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
+ # Wait a while for the sync backoff
+ time.sleep(500)
+
log.debug('removing snapshots')
- self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"])
+ self.mount_a.run_shell(["rmdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["rmdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["rmdir", f"d2/.snap/{snap_name}"])
for i in range(4):
filename = f'file.{i}'
@@ -1438,26 +1474,27 @@ class TestMirroring(CephFSTestCase):
self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2')
log.debug('creating new snapshots...')
- self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"])
- self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"])
+ self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"])
+ self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"])
+
+ # Wait for the threads to finish
+ time.sleep(500)
- time.sleep(60)
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d0', 'snap0', 1)
- self.verify_snapshot('d0', 'snap0')
+ "client.mirror_remote@ceph", '/d0', f'{snap_name}', 1)
+ self.verify_snapshot('d0', f'{snap_name}')
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d1', 'snap0', 1)
- self.verify_snapshot('d1', 'snap0')
+ "client.mirror_remote@ceph", '/d1', f'{snap_name}', 1)
+ self.verify_snapshot('d1', f'{snap_name}')
self.check_peer_status(self.primary_fs_name, self.primary_fs_id,
- "client.mirror_remote@ceph", '/d2', 'snap0', 1)
- self.verify_snapshot('d2', 'snap0')
+ "client.mirror_remote@ceph", '/d2', f'{snap_name}', 1)
+ self.verify_snapshot('d2', f'{snap_name}')
res = self.mirror_daemon_command(f'counter dump for fs: {self.primary_fs_name}', 'counter', 'dump')
vafter = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0]
self.assertGreater(vafter["counters"]["snaps_synced"], vbefore["counters"]["snaps_synced"])
- self.assertGreater(vafter["counters"]["snaps_deleted"], vbefore["counters"]["snaps_deleted"])
self.disable_mirroring(self.primary_fs_name, self.primary_fs_id)
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
index 7917bd9202f..14f54a784e7 100644
--- a/qa/tasks/cephfs/test_misc.py
+++ b/qa/tasks/cephfs/test_misc.py
@@ -558,16 +558,18 @@ class TestSessionClientEvict(CephFSTestCase):
self.assertEqual(ce.exception.exitstatus, errno.EINVAL)
def _evict_with_invalid_id(self, cmd):
+ info_initial = self.fs.rank_asok(cmd + ['ls'])
# with invalid id
- with self.assertRaises(CommandFailedError) as ce:
- self.fs.rank_tell(cmd + ['evict', 'id=1'])
- self.assertEqual(ce.exception.exitstatus, errno.ESRCH)
+ self.fs.rank_tell(cmd + ['evict', 'id=1'])
+ info = self.fs.rank_asok(cmd + ['ls'])
+ self.assertEqual(len(info), len(info_initial)) # session list is status-quo
def _evict_with_negative_id(self, cmd):
+ info_initial = self.fs.rank_asok(cmd + ['ls'])
# with negative id
- with self.assertRaises(CommandFailedError) as ce:
- self.fs.rank_tell(cmd + ['evict', 'id=-9'])
- self.assertEqual(ce.exception.exitstatus, errno.ESRCH)
+ self.fs.rank_tell(cmd + ['evict', 'id=-9'])
+ info = self.fs.rank_asok(cmd + ['ls'])
+ self.assertEqual(len(info), len(info_initial)) # session list is status-quo
def _evict_with_valid_id(self, cmd):
info_initial = self.fs.rank_asok(cmd + ['ls'])
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
index 932d504d47f..19076ea44b3 100644
--- a/qa/tasks/cephfs/test_nfs.py
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -8,6 +8,7 @@ from io import BytesIO, StringIO
from tasks.mgr.mgr_test_case import MgrTestCase
from teuthology import contextutil
from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import Raw
log = logging.getLogger(__name__)
@@ -319,7 +320,7 @@ class TestNFS(MgrTestCase):
else:
log.warning(f'{e}, retrying')
- def _test_mnt(self, pseudo_path, port, ip, check=True):
+ def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False):
'''
Test mounting of created exports
:param pseudo_path: It is the pseudo root name
@@ -347,12 +348,27 @@ class TestNFS(MgrTestCase):
self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
try:
+            # Clean up the volumes directory created by 'subvolume create' in some tests
+ self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes'])
self.ctx.cluster.run(args=['touch', '/mnt/test'])
out_mnt = self._sys_cmd(['ls', '/mnt'])
self.assertEqual(out_mnt, b'test\n')
+ if datarw:
+ self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1'])
+ out_test1 = self._sys_cmd(['cat', '/mnt/test1'])
+ self.assertEqual(out_test1, b'test data\n')
finally:
self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+ def _test_data_read_write(self, pseudo_path, port, ip):
+ '''
+ Check if read/write works fine
+ '''
+ try:
+ self._test_mnt(pseudo_path, port, ip, True, True)
+ except CommandFailedError as e:
+ self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}")
+
def _write_to_read_only_export(self, pseudo_path, port, ip):
'''
Check if write to read only export fails
@@ -599,6 +615,18 @@ class TestNFS(MgrTestCase):
self._write_to_read_only_export(self.pseudo_path, port, ip)
self._test_delete_cluster()
+ def test_data_read_write(self):
+ '''
+        Test data read and write on export.
+ '''
+ self._test_create_cluster()
+ self._create_export(export_id='1', create_fs=True,
+ extra_cmd=['--pseudo-path', self.pseudo_path])
+ port, ip = self._get_port_ip_info()
+ self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+ self._test_data_read_write(self.pseudo_path, port, ip)
+ self._test_delete_cluster()
+
def test_cluster_info(self):
'''
Test cluster info outputs correct ip and hostname
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
index 2baefd72c3f..9ca85ee67f9 100644
--- a/qa/tasks/cephfs/test_volumes.py
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -2388,7 +2388,7 @@ class TestSubvolumes(TestVolumesHelper):
self._fs_cmd("subvolume", "create", self.volname, subvolume)
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
# get earmark
@@ -2401,7 +2401,7 @@ class TestSubvolumes(TestVolumesHelper):
self._fs_cmd("subvolume", "create", self.volname, subvolume)
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
# remove earmark
@@ -2559,7 +2559,7 @@ class TestSubvolumes(TestVolumesHelper):
self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py
index 07c69ddc47c..be7afccf331 100644
--- a/qa/tasks/mgr/dashboard/test_osd.py
+++ b/qa/tasks/mgr/dashboard/test_osd.py
@@ -11,6 +11,7 @@ from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple,
class OsdTest(DashboardTestCase):
AUTH_ROLES = ['cluster-manager']
+ _VERSION = '1.1'
@classmethod
def setUpClass(cls):
@@ -24,7 +25,7 @@ class OsdTest(DashboardTestCase):
@DashboardTestCase.RunAs('test', 'test', ['block-manager'])
def test_access_permissions(self):
- self._get('/api/osd')
+ self._get('/api/osd', version=self._VERSION)
self.assertStatus(403)
self._get('/api/osd/0')
self.assertStatus(403)
@@ -33,7 +34,7 @@ class OsdTest(DashboardTestCase):
self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True))
def test_list(self):
- data = self._get('/api/osd')
+ data = self._get('/api/osd', version=self._VERSION)
self.assertStatus(200)
self.assertGreaterEqual(len(data), 1)
diff --git a/qa/tasks/tox.py b/qa/tasks/tox.py
index 61c5b7411b4..4e4dee966d5 100644
--- a/qa/tasks/tox.py
+++ b/qa/tasks/tox.py
@@ -35,7 +35,7 @@ def task(ctx, config):
ctx.cluster.only(client).run(args=[
'source', '{tvdir}/bin/activate'.format(tvdir=tvdir),
run.Raw('&&'),
- 'pip', 'install', 'tox==3.15.0'
+ 'pip', 'install', 'tox'
])
# export the path Keystone and Tempest
diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh
new file mode 100755
index 00000000000..88552aa50bd
--- /dev/null
+++ b/qa/workunits/client/test_oc_disabled.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+set -ex
+
+ceph_test_client --client_oc=false
diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh
index fb72e1d6402..cc4024323eb 100755
--- a/qa/workunits/nvmeof/setup_subsystem.sh
+++ b/qa/workunits/nvmeof/setup_subsystem.sh
@@ -29,7 +29,7 @@ list_subsystems () {
# add all subsystems
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
- sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append
done
list_subsystems
diff --git a/qa/workunits/rbd/luks-encryption.sh b/qa/workunits/rbd/luks-encryption.sh
index 97cb5a0fe87..b6305cb46c6 100755
--- a/qa/workunits/rbd/luks-encryption.sh
+++ b/qa/workunits/rbd/luks-encryption.sh
@@ -2,7 +2,7 @@
set -ex
CEPH_ID=${CEPH_ID:-admin}
-TMP_FILES="/tmp/passphrase /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2"
+TMP_FILES="/tmp/passphrase /tmp/passphrase1 /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2"
_sudo()
{
@@ -278,8 +278,7 @@ function test_migration_clone() {
rbd migration prepare testimg1 testimg2
# test reading
- # FIXME: https://tracker.ceph.com/issues/63184
- LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
cmp $LIBRBD_DEV /tmp/cmpdata
# trigger copyup for an unwritten area
@@ -297,8 +296,7 @@ function test_migration_clone() {
_sudo rbd device unmap -t nbd $LIBRBD_DEV
# test reading on a fresh mapping
- # FIXME: https://tracker.ceph.com/issues/63184
- LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase)
cmp $LIBRBD_DEV /tmp/cmpdata
_sudo rbd device unmap -t nbd $LIBRBD_DEV
@@ -320,6 +318,85 @@ function test_migration_clone() {
rbd rm testimg
}
+function test_migration_open_clone_chain() {
+ rbd create --size 32M testimg
+ rbd encryption format testimg luks1 /tmp/passphrase
+ rbd snap create testimg@snap
+ rbd snap protect testimg@snap
+
+ rbd clone testimg@snap testimg1
+ rbd encryption format testimg1 luks2 /tmp/passphrase1
+ rbd snap create testimg1@snap
+ rbd snap protect testimg1@snap
+
+ rbd clone testimg1@snap testimg2
+ rbd encryption format testimg2 luks1 /tmp/passphrase2
+
+ # 1. X <-- X <-- X
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+
+ # 2. X <-- X <-- migrating
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+
+ # 3. X <-- migrating <-- X
+ rbd migration prepare testimg1 testimg1
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg1
+
+ # 4. migrating <-- X <-- X
+ rbd migration prepare testimg testimg
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg
+
+ # 5. migrating <-- migrating <-- X
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg1 testimg1
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg1
+ rbd migration abort testimg
+
+ # 6. migrating <-- X <-- migrating
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+ rbd migration abort testimg
+
+ # 7. X <-- migrating <-- migrating
+ rbd migration prepare testimg1 testimg1
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+ rbd migration abort testimg2
+ rbd migration abort testimg1
+
+ # 8. migrating <-- migrating <-- migrating
+ rbd migration prepare testimg testimg
+ rbd migration prepare testimg1 testimg1
+ rbd migration prepare testimg2 testimg2
+ LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase)
+ _sudo rbd device unmap -t nbd $LIBRBD_DEV
+
+ rbd migration abort testimg2
+ rbd rm testimg2
+ rbd migration abort testimg1
+ rbd snap unprotect testimg1@snap
+ rbd snap rm testimg1@snap
+ rbd rm testimg1
+ rbd migration abort testimg
+ rbd snap unprotect testimg@snap
+ rbd snap rm testimg@snap
+ rbd rm testimg
+}
+
function get_nbd_device_paths {
rbd device list -t nbd | tail -n +2 | egrep "\s+rbd\s+testimg" | awk '{print $5;}'
}
@@ -343,6 +420,7 @@ function clean_up {
rbd snap unprotect testimg1@snap || true
rbd snap remove testimg1@snap || true
rbd remove testimg1 || true
+ rbd migration abort testimg || true
rbd snap remove testimg@snap2 || true
rbd snap remove testimg@snap1 || true
rbd snap unprotect testimg@snap || true
@@ -371,6 +449,7 @@ dd if=/dev/urandom of=/tmp/testdata2 bs=4M count=4
# create passphrase files
printf "pass\0word\n" > /tmp/passphrase
+printf " passwo\nrd 1,1" > /tmp/passphrase1
printf "\t password2 " > /tmp/passphrase2
# create an image
@@ -401,4 +480,6 @@ test_migration_clone luks1
rbd create --size 48M testimg
test_migration_clone luks2
+test_migration_open_clone_chain
+
echo OK
diff --git a/qa/workunits/rgw/s3_utilities.pm b/qa/workunits/rgw/s3_utilities.pm
index 3c3fae900e8..5a91db9d1fd 100644
--- a/qa/workunits/rgw/s3_utilities.pm
+++ b/qa/workunits/rgw/s3_utilities.pm
@@ -21,7 +21,7 @@ sub get_timestamp {
if ($min < 10) { $min = "0$min"; }
if ($sec < 10) { $sec = "0$sec"; }
$year=$year+1900;
- return $year . '_' . $mon . '_' . $mday . '_' . $hour . '_' . $min . '_' . $sec;
+ return $year . '-' . $mon . '-' . $mday . '-' . $hour . '-' . $min . '-' . $sec;
}
# Function to check if radosgw is already running
@@ -195,11 +195,12 @@ sub run_s3
host => $hostname,
secure => 0,
retry => 1,
+ dns_bucket_names => 0,
}
);
}
-our $bucketname = 'buck_'.get_timestamp();
+our $bucketname = 'buck-'.get_timestamp();
# create a new bucket (the test bucket)
our $bucket = $s3->add_bucket( { bucket => $bucketname } )
or die $s3->err. "bucket $bucketname create failed\n". $s3->errstr;
diff --git a/qa/workunits/rgw/test_rgw_bucket_check.py b/qa/workunits/rgw/test_rgw_bucket_check.py
index bfa6d65d6e7..33936df2401 100755
--- a/qa/workunits/rgw/test_rgw_bucket_check.py
+++ b/qa/workunits/rgw/test_rgw_bucket_check.py
@@ -173,6 +173,7 @@ def main():
exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}')
out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys')
json_out = json.loads(out)
+ log.info(f'"bucket check unlinked" returned {json_out}, expecting {unlinked_keys}')
assert len(json_out) == len(unlinked_keys)
bucket.object_versions.all().delete()
out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}')
diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc
index 1985c85435e..72921e6d9f0 100644
--- a/src/blk/kernel/KernelDevice.cc
+++ b/src/blk/kernel/KernelDevice.cc
@@ -65,7 +65,6 @@ KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, ai
discard_callback(d_cb),
discard_callback_priv(d_cbpriv),
aio_stop(false),
- discard_stop(false),
aio_thread(this),
injecting_crash(0)
{
@@ -548,7 +547,7 @@ void KernelDevice::_aio_stop()
}
}
-void KernelDevice::_discard_update_threads()
+void KernelDevice::_discard_update_threads(bool discard_stop)
{
std::unique_lock l(discard_lock);
@@ -570,28 +569,27 @@ void KernelDevice::_discard_update_threads()
}
// Decrease? Signal threads after telling them to stop
} else if (newcount < oldcount) {
+ std::vector<std::shared_ptr<DiscardThread>> discard_threads_to_stop;
dout(10) << __func__ << " stopping " << (oldcount - newcount) << " existing discard threads" << dendl;
// Signal the last threads to quit, and stop tracking them
- for(uint64_t i = oldcount; i > newcount; i--)
- {
+ for(uint64_t i = oldcount; i > newcount; i--) {
discard_threads[i-1]->stop = true;
- discard_threads[i-1]->detach();
+ discard_threads_to_stop.push_back(discard_threads[i-1]);
}
- discard_threads.resize(newcount);
-
discard_cond.notify_all();
+ discard_threads.resize(newcount);
+ l.unlock();
+ for (auto &t : discard_threads_to_stop) {
+ t->join();
+ }
}
}
void KernelDevice::_discard_stop()
{
dout(10) << __func__ << dendl;
-
- discard_stop = true;
- _discard_update_threads();
- discard_drain();
-
+ _discard_update_threads(true);
dout(10) << __func__ << " stopped" << dendl;
}
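
The reworked _discard_update_threads() above flags the surplus discard threads and notifies them while still holding discard_lock, then drops the lock before joining them (instead of detaching), since a stopping thread may itself need discard_lock before it can exit. A minimal Python sketch of that signal-under-lock / join-outside-lock pattern — illustrative only, with made-up names, not Ceph code:

import threading

class DiscardWorker(threading.Thread):
    """Toy stand-in for a discard thread: waits on the condition until told to stop."""
    def __init__(self, cond: threading.Condition) -> None:
        super().__init__()
        self.cond = cond
        self.stop = False

    def run(self) -> None:
        with self.cond:               # a worker needs the shared lock to observe 'stop'
            while not self.stop:
                self.cond.wait()

def shrink_pool(workers: list, cond: threading.Condition, newcount: int) -> None:
    with cond:                        # flag the surplus workers and wake everyone up...
        to_stop = workers[newcount:]
        for w in to_stop:
            w.stop = True
        cond.notify_all()
        del workers[newcount:]
    for w in to_stop:                 # ...but join only after dropping the lock,
        w.join()                      # so the stopping workers can reacquire it and exit

# cond = threading.Condition()
# workers = [DiscardWorker(cond) for _ in range(4)]
# for w in workers: w.start()
# shrink_pool(workers, cond, 2)
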
diff --git a/src/blk/kernel/KernelDevice.h b/src/blk/kernel/KernelDevice.h
index 42e542a6cc8..ac555cdd3da 100644
--- a/src/blk/kernel/KernelDevice.h
+++ b/src/blk/kernel/KernelDevice.h
@@ -58,7 +58,6 @@ private:
aio_callback_t discard_callback;
void *discard_callback_priv;
bool aio_stop;
- bool discard_stop;
std::unique_ptr<PerfCounters> logger;
ceph::mutex discard_lock = ceph::make_mutex("KernelDevice::discard_lock");
@@ -100,7 +99,7 @@ private:
int _aio_start();
void _aio_stop();
- void _discard_update_threads();
+ void _discard_update_threads(bool discard_stop = false);
void _discard_stop();
bool _discard_started();
diff --git a/src/ceph-volume/ceph_volume/api/lvm.py b/src/ceph-volume/ceph_volume/api/lvm.py
index dcc4f186272..16cbc08b262 100644
--- a/src/ceph-volume/ceph_volume/api/lvm.py
+++ b/src/ceph-volume/ceph_volume/api/lvm.py
@@ -6,7 +6,6 @@ set of utilities for interacting with LVM.
import logging
import os
import uuid
-import re
from itertools import repeat
from math import floor
from ceph_volume import process, util, conf
@@ -1210,39 +1209,3 @@ def get_lv_by_fullname(full_name):
except ValueError:
res_lv = None
return res_lv
-
-def get_lv_path_from_mapper(mapper):
- """
- This functions translates a given mapper device under the format:
- /dev/mapper/LV to the format /dev/VG/LV.
- eg:
- from:
- /dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec
- to:
- /dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec
- """
- results = re.split(r'^\/dev\/mapper\/(.+\w)-(\w.+)', mapper)
- results = list(filter(None, results))
-
- if len(results) != 2:
- return None
-
- return f"/dev/{results[0].replace('--', '-')}/{results[1].replace('--', '-')}"
-
-def get_mapper_from_lv_path(lv_path):
- """
- This functions translates a given lv path under the format:
- /dev/VG/LV to the format /dev/mapper/LV.
- eg:
- from:
- /dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec
- to:
- /dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec
- """
- results = re.split(r'^\/dev\/(.+\w)-(\w.+)', lv_path)
- results = list(filter(None, results))
-
- if len(results) != 2:
- return None
-
- return f"/dev/mapper/{results[0].replace('-', '--')}/{results[1].replace('-', '--')}"
diff --git a/src/ceph-volume/ceph_volume/tests/api/test_lvm.py b/src/ceph-volume/ceph_volume/tests/api/test_lvm.py
index 9ad2f701f12..6a5eee0e1b8 100644
--- a/src/ceph-volume/ceph_volume/tests/api/test_lvm.py
+++ b/src/ceph-volume/ceph_volume/tests/api/test_lvm.py
@@ -883,15 +883,3 @@ class TestGetSingleLV(object):
assert isinstance(lv_, api.Volume)
assert lv_.name == 'lv1'
-
-
-class TestHelpers:
- def test_get_lv_path_from_mapper(self):
- mapper = '/dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec'
- lv_path = api.get_lv_path_from_mapper(mapper)
- assert lv_path == '/dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec'
-
- def test_get_mapper_from_lv_path(self):
- lv_path = '/dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec'
- mapper = api.get_mapper_from_lv_path(lv_path)
- assert mapper == '/dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9/osd--block--32e8e896--172e--4a38--a06a/3702598510ec'
diff --git a/src/ceph-volume/ceph_volume/tests/util/test_disk.py b/src/ceph-volume/ceph_volume/tests/util/test_disk.py
index 368c2ec8469..8c27ce402fb 100644
--- a/src/ceph-volume/ceph_volume/tests/util/test_disk.py
+++ b/src/ceph-volume/ceph_volume/tests/util/test_disk.py
@@ -1,4 +1,5 @@
import pytest
+import stat
from ceph_volume.util import disk
from mock.mock import patch, Mock, MagicMock, mock_open
from pyfakefs.fake_filesystem_unittest import TestCase
@@ -640,3 +641,107 @@ class TestBlockSysFs(TestCase):
assert b.active_mappers()['dm-1']
assert b.active_mappers()['dm-1']['type'] == 'LVM'
assert b.active_mappers()['dm-1']['uuid'] == 'abcdef'
+
+
+class TestUdevData(TestCase):
+ def setUp(self) -> None:
+ udev_data_lv_device: str = """
+S:disk/by-id/dm-uuid-LVM-1f1RaxWlzQ61Sbc7oCIHRMdh0M8zRTSnU03ekuStqWuiA6eEDmwoGg3cWfFtE2li
+S:mapper/vg1-lv1
+S:disk/by-id/dm-name-vg1-lv1
+S:vg1/lv1
+I:837060642207
+E:DM_UDEV_DISABLE_OTHER_RULES_FLAG=
+E:DM_UDEV_DISABLE_LIBRARY_FALLBACK_FLAG=1
+E:DM_UDEV_PRIMARY_SOURCE_FLAG=1
+E:DM_UDEV_RULES_VSN=2
+E:DM_NAME=fake_vg1-fake-lv1
+E:DM_UUID=LVM-1f1RaxWlzQ61Sbc7oCIHRMdh0M8zRTSnU03ekuStqWuiA6eEDmwoGg3cWfFtE2li
+E:DM_SUSPENDED=0
+E:DM_VG_NAME=fake_vg1
+E:DM_LV_NAME=fake-lv1
+E:DM_LV_LAYER=
+E:NVME_HOST_IFACE=none
+E:SYSTEMD_READY=1
+G:systemd
+Q:systemd
+V:1"""
+ udev_data_bare_device: str = """
+S:disk/by-path/pci-0000:00:02.0
+S:disk/by-path/virtio-pci-0000:00:02.0
+S:disk/by-diskseq/1
+I:3037919
+E:ID_PATH=pci-0000:00:02.0
+E:ID_PATH_TAG=pci-0000_00_02_0
+E:ID_PART_TABLE_UUID=baefa409
+E:ID_PART_TABLE_TYPE=dos
+E:NVME_HOST_IFACE=none
+G:systemd
+Q:systemd
+V:1"""
+ self.fake_device: str = '/dev/cephtest'
+ self.setUpPyfakefs()
+ self.fs.create_file(self.fake_device, st_mode=(stat.S_IFBLK | 0o600))
+ self.fs.create_file('/run/udev/data/b999:0', create_missing_dirs=True, contents=udev_data_bare_device)
+ self.fs.create_file('/run/udev/data/b998:1', create_missing_dirs=True, contents=udev_data_lv_device)
+
+ def test_device_not_found(self) -> None:
+ self.fs.remove(self.fake_device)
+ with pytest.raises(RuntimeError):
+ disk.UdevData(self.fake_device)
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=999))
+ def test_no_data(self) -> None:
+ self.fs.remove('/run/udev/data/b999:0')
+ with pytest.raises(RuntimeError):
+ disk.UdevData(self.fake_device)
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=999))
+ def test_is_dm_false(self) -> None:
+ assert not disk.UdevData(self.fake_device).is_dm
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=998))
+ def test_is_dm_true(self) -> None:
+ assert disk.UdevData(self.fake_device).is_dm
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=998))
+ def test_is_lvm_true(self) -> None:
+ assert disk.UdevData(self.fake_device).is_dm
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=999))
+ def test_is_lvm_false(self) -> None:
+ assert not disk.UdevData(self.fake_device).is_dm
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=998))
+ def test_slashed_path_with_lvm(self) -> None:
+ assert disk.UdevData(self.fake_device).slashed_path == '/dev/fake_vg1/fake-lv1'
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=998))
+ def test_dashed_path_with_lvm(self) -> None:
+ assert disk.UdevData(self.fake_device).dashed_path == '/dev/mapper/fake_vg1-fake-lv1'
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=999))
+ def test_slashed_path_with_bare_device(self) -> None:
+ assert disk.UdevData(self.fake_device).slashed_path == '/dev/cephtest'
+
+ @patch('ceph_volume.util.disk.os.stat', MagicMock())
+ @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0))
+ @patch('ceph_volume.util.disk.os.major', Mock(return_value=999))
+ def test_dashed_path_with_bare_device(self) -> None:
+ assert disk.UdevData(self.fake_device).dashed_path == '/dev/cephtest'
\ No newline at end of file
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index d00a6cc2ec2..78c140597d6 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -818,7 +818,7 @@ def get_devices(_sys_block_path='/sys/block', device=''):
for block in block_devs:
metadata: Dict[str, Any] = {}
if block[2] == 'lvm':
- block[1] = lvm.get_lv_path_from_mapper(block[1])
+ block[1] = UdevData(block[1]).slashed_path
devname = os.path.basename(block[0])
diskname = block[1]
if block[2] not in block_types:
@@ -1262,3 +1262,129 @@ class BlockSysFs:
if mapper_type == 'LVM':
result[holder]['uuid'] = content_split[1]
return result
+
+class UdevData:
+ """
+ Class representing udev data for a specific device.
+ This class extracts and stores relevant information about the device from udev files.
+
+ Attributes:
+ -----------
+ path : str
+ The initial device path (e.g., /dev/sda).
+ realpath : str
+ The resolved real path of the device.
+ stats : os.stat_result
+ The result of the os.stat() call to retrieve device metadata.
+ major : int
+ The device's major number.
+ minor : int
+ The device's minor number.
+ udev_data_path : str
+ The path to the udev metadata for the device (e.g., /run/udev/data/b<major>:<minor>).
+ symlinks : List[str]
+ A list of symbolic links pointing to the device.
+ id : str
+ A unique identifier for the device.
+ environment : Dict[str, str]
+ A dictionary containing environment variables extracted from the udev data.
+ group : str
+ The group associated with the device.
+ queue : str
+ The queue associated with the device.
+ version : str
+ The version of the device or its metadata.
+ """
+ def __init__(self, path: str) -> None:
+ """Initialize an instance of the UdevData class and load udev information.
+
+ Args:
+ path (str): The path to the device to be analyzed (e.g., /dev/sda).
+
+ Raises:
+ RuntimeError: Raised if no udev data file is found for the specified device.
+ """
+ if not os.path.exists(path):
+ raise RuntimeError(f'{path} not found.')
+ self.path: str = path
+ self.realpath: str = os.path.realpath(self.path)
+ self.stats: os.stat_result = os.stat(self.realpath)
+ self.major: int = os.major(self.stats.st_rdev)
+ self.minor: int = os.minor(self.stats.st_rdev)
+ self.udev_data_path: str = f'/run/udev/data/b{self.major}:{self.minor}'
+ self.symlinks: List[str] = []
+ self.id: str = ''
+ self.environment: Dict[str, str] = {}
+ self.group: str = ''
+ self.queue: str = ''
+ self.version: str = ''
+
+ if not os.path.exists(self.udev_data_path):
+ raise RuntimeError(f'No udev data could be retrieved for {self.path}')
+
+ with open(self.udev_data_path, 'r') as f:
+ content: str = f.read().strip()
+ self.raw_data: List[str] = content.split('\n')
+
+ for line in self.raw_data:
+ data_type, data = line.split(':', 1)
+ if data_type == 'S':
+ self.symlinks.append(data)
+ if data_type == 'I':
+ self.id = data
+ if data_type == 'E':
+ key, value = data.split('=')
+ self.environment[key] = value
+ if data_type == 'G':
+ self.group = data
+ if data_type == 'Q':
+ self.queue = data
+ if data_type == 'V':
+ self.version = data
+
+ @property
+ def is_dm(self) -> bool:
+ """Check if the device is a device mapper (DM).
+
+ Returns:
+ bool: True if the device is a device mapper, otherwise False.
+ """
+ return 'DM_UUID' in self.environment.keys()
+
+ @property
+ def is_lvm(self) -> bool:
+ """Check if the device is a Logical Volume Manager (LVM) volume.
+
+ Returns:
+ bool: True if the device is an LVM volume, otherwise False.
+ """
+ return self.environment.get('DM_UUID', '').startswith('LVM')
+
+ @property
+ def slashed_path(self) -> str:
+ """Get the LVM path structured with slashes.
+
+ Returns:
+ str: A path using slashes if the device is an LVM volume (e.g., /dev/vgname/lvname),
+ otherwise the original path.
+ """
+ result: str = self.path
+ if self.is_lvm:
+ vg: str = self.environment.get('DM_VG_NAME')
+ lv: str = self.environment.get('DM_LV_NAME')
+ result = f'/dev/{vg}/{lv}'
+ return result
+
+ @property
+ def dashed_path(self) -> str:
+ """Get the LVM path structured with dashes.
+
+ Returns:
+ str: A path using dashes if the device is an LVM volume (e.g., /dev/mapper/vgname-lvname),
+ otherwise the original path.
+ """
+ result: str = self.path
+ if self.is_lvm:
+ name: str = self.environment.get('DM_NAME')
+ result = f'/dev/mapper/{name}'
+ return result
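
get_devices() now resolves the /dev/VG/LV form through the new UdevData class instead of the removed regex helpers in api/lvm.py. A small usage sketch, assuming a mapper device such as /dev/mapper/vg1-lv1 exists and has udev data under /run/udev/data/ (the device path here is hypothetical):

from ceph_volume.util.disk import UdevData

dev = UdevData('/dev/mapper/vg1-lv1')   # hypothetical mapper device
if dev.is_lvm:
    print(dev.slashed_path)             # -> /dev/<vg>/<lv>, e.g. /dev/vg1/lv1
    print(dev.dashed_path)              # -> /dev/mapper/<vg>-<lv>
else:
    print(dev.path)                     # non-LVM devices keep their original path
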
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index c0bd5b33ad4..52988843c83 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -375,8 +375,9 @@ int main(int argc, const char **argv)
<< " for osd." << whoami
<< " fsid " << g_conf().get_val<uuid_d>("fsid")
<< dendl;
+ forker.exit(0);
}
- if (mkfs || mkkey) {
+ if (mkkey) {
forker.exit(0);
}
if (mkjournal) {
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index d25eb1391e0..354c3782398 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -5,15 +5,15 @@ DEFAULT_IMAGE = 'quay.ceph.io/ceph-ci/ceph:main'
DEFAULT_IMAGE_IS_MAIN = True
DEFAULT_IMAGE_RELEASE = 'squid'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
-DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0'
+DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
+DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
+DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
@@ -22,7 +22,7 @@ DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
-DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this
+DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this
# ------------------------------------------------------------------------------
LATEST_STABLE_RELEASE = 'squid'
diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py
index ae9acbc9c45..a0e648e857c 100644
--- a/src/cephadm/cephadmlib/daemons/smb.py
+++ b/src/cephadm/cephadmlib/daemons/smb.py
@@ -370,6 +370,8 @@ class CTDBDaemonContainer(SambaContainerCommon):
# make conditional?
# CAP_NET_ADMIN is needed for event script to add public ips to iface
cargs.append('--cap-add=NET_ADMIN')
+ # CAP_NET_RAW is needed to send gratuitous ARPs/tickle ACKs via raw sockets
+ cargs.append('--cap-add=NET_RAW')
return cargs
@@ -714,6 +716,18 @@ class SMB(ContainerDaemonForm):
mounts[ctdb_run] = '/var/run/ctdb:z'
mounts[ctdb_volatile] = '/var/lib/ctdb/volatile:z'
mounts[ctdb_etc] = '/etc/ctdb:z'
+ # create a shared smb.conf file for our clustered instances.
+ # This is a HACK that substitutes for a bunch of architectural
+ # changes to sambacc *and* smbmetrics (container). In short,
+ # sambacc can set up the correct cluster enabled conf file for
+ # samba daemons (smbd, winbindd, etc) but not its own long-running
+ # tasks. Similarly, the smbmetrics container always uses the
+ # registry conf (non-clustered). Having cephadm create a stub
+ # config that will share the file across all containers is a
+ # stopgap that resolves the problem for now, but should eventually
+ # be replaced by a less "leaky" approach in the managed containers.
+ ctdb_smb_conf = str(data_dir / 'ctdb/smb.conf')
+ mounts[ctdb_smb_conf] = '/etc/samba/smb.conf:z'
def customize_container_endpoints(
self, endpoints: List[EndPoint], deployment_type: DeploymentType
@@ -739,6 +753,7 @@ class SMB(ContainerDaemonForm):
file_utils.makedirs(ddir / 'ctdb/volatile', uid, gid, 0o770)
file_utils.makedirs(ddir / 'ctdb/etc', uid, gid, 0o770)
self._write_ctdb_stub_config(etc_samba_ctr / 'ctdb.json')
+ self._write_smb_conf_stub(ddir / 'ctdb/smb.conf')
def _write_ctdb_stub_config(self, path: pathlib.Path) -> None:
reclock_cmd = ' '.join(_MUTEX_SUBCMD + [self._cfg.cluster_lock_uri])
@@ -758,6 +773,19 @@ class SMB(ContainerDaemonForm):
with file_utils.write_new(path) as fh:
json.dump(stub_config, fh)
+ def _write_smb_conf_stub(self, path: pathlib.Path) -> None:
+ """Initialize a stub smb conf that will be shared by the primary
+ and sidecar containers. This is expected to be overwritten by
+ sambacc.
+ """
+ _lines = [
+ '[global]',
+ 'config backend = registry',
+ ]
+ with file_utils.write_new(path) as fh:
+ for line in _lines:
+ fh.write(f'{line}\n')
+
class _NetworkMapper:
"""Helper class that maps between cephadm-friendly address-networks
diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py
index 2f4674752cc..0ab8b38d2b5 100644
--- a/src/cephadm/cephadmlib/data_utils.py
+++ b/src/cephadm/cephadmlib/data_utils.py
@@ -165,17 +165,17 @@ def is_fsid(s):
def normalize_image_digest(digest: str) -> str:
"""
Normal case:
- >>> normalize_image_digest('ceph/ceph', 'docker.io')
- 'docker.io/ceph/ceph'
+ >>> normalize_image_digest('ceph/ceph', 'quay.io')
+ 'quay.io/ceph/ceph'
No change:
- >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io')
+ >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io')
'quay.ceph.io/ceph/ceph'
- >>> normalize_image_digest('docker.io/ubuntu', 'docker.io')
- 'docker.io/ubuntu'
+ >>> normalize_image_digest('quay.io/ubuntu', 'quay.io')
+ 'quay.io/ubuntu'
- >>> normalize_image_digest('localhost/ceph', 'docker.io')
+ >>> normalize_image_digest('localhost/ceph', 'quay.io')
'localhost/ceph'
"""
known_shortnames = [
diff --git a/src/cephadm/samples/custom_container.json b/src/cephadm/samples/custom_container.json
index 194a44d2abb..210cf1e3e55 100644
--- a/src/cephadm/samples/custom_container.json
+++ b/src/cephadm/samples/custom_container.json
@@ -1,5 +1,5 @@
{
- "image": "docker.io/prom/alertmanager:v0.20.0",
+ "image": "quay.io/prometheus/alertmanager:v0.20.0",
"ports": [9093, 9094],
"args": [
"-p", "9093:9093",
diff --git a/src/cephadm/tests/build/test_cephadm_build.py b/src/cephadm/tests/build/test_cephadm_build.py
index 1465c2c5efe..c2995a76d4b 100644
--- a/src/cephadm/tests/build/test_cephadm_build.py
+++ b/src/cephadm/tests/build/test_cephadm_build.py
@@ -34,12 +34,12 @@ CONTAINERS = {
},
'ubuntu-20.04': {
'name': 'cephadm-build-test:ubuntu-20-04-py3',
- 'base_image': 'docker.io/library/ubuntu:20.04',
+ 'base_image': 'quay.io/library/ubuntu:20.04',
'script': 'apt update && apt install -y python3-venv',
},
'ubuntu-22.04': {
'name': 'cephadm-build-test:ubuntu-22-04-py3',
- 'base_image': 'docker.io/library/ubuntu:22.04',
+ 'base_image': 'quay.io/library/ubuntu:22.04',
'script': 'apt update && apt install -y python3-venv',
},
}
diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py
index 928982de70b..f27b9bcd362 100644
--- a/src/cephadm/tests/test_cephadm.py
+++ b/src/cephadm/tests/test_cephadm.py
@@ -533,12 +533,12 @@ class TestCephAdm(object):
def test_get_image_info_from_inspect(self):
# podman
- out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]"""
+ out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]"""
r = _cephadm.get_image_info_from_inspect(out, 'registry/ceph/ceph:latest')
print(r)
assert r == {
'image_id': '204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1',
- 'repo_digests': ['docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992']
+ 'repo_digests': ['quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992']
}
# docker
@@ -550,13 +550,13 @@ class TestCephAdm(object):
}
# multiple digests (podman)
- out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]"""
+ out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]"""
r = _cephadm.get_image_info_from_inspect(out, 'registry/prom/prometheus:latest')
assert r == {
'image_id': 'e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42',
'repo_digests': [
- 'docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4',
- 'docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a',
+ 'quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4',
+ 'quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a',
]
}
@@ -604,7 +604,7 @@ class TestCephAdm(object):
'')
out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC
quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=cinfo):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
@@ -613,7 +613,7 @@ class TestCephAdm(object):
# make sure first valid image is used when no container_info is found
out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC
quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=None):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
@@ -621,12 +621,12 @@ class TestCephAdm(object):
# make sure images without digest are discarded (no container_info is found)
out = '''quay.ceph.io/ceph-ci/ceph@|||
- docker.io/ceph/ceph@|||
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@|||
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=None):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
- assert image == 'docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508'
+ assert image == 'quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508'
@@ -2409,7 +2409,7 @@ class TestSNMPGateway:
def test_unit_run_V2c(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V2c_config)
ctx.fsid = fsid
@@ -2434,11 +2434,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl')
def test_unit_run_V3_noPriv(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V3_no_priv_config)
ctx.fsid = fsid
@@ -2463,11 +2463,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000')
def test_unit_run_V3_Priv(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V3_priv_config)
ctx.fsid = fsid
@@ -2492,11 +2492,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES')
def test_unit_run_no_dest(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.no_destination_config)
ctx.fsid = fsid
@@ -2512,7 +2512,7 @@ class TestSNMPGateway:
def test_unit_run_bad_version(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.bad_version_config)
ctx.fsid = fsid
diff --git a/src/cephadm/tests/test_custom_container.py b/src/cephadm/tests/test_custom_container.py
index c185b0908df..197ed38dca3 100644
--- a/src/cephadm/tests/test_custom_container.py
+++ b/src/cephadm/tests/test_custom_container.py
@@ -47,7 +47,7 @@ class TestCustomContainer(unittest.TestCase):
]
]
},
- image='docker.io/library/hello-world:latest'
+ image='quay.io/hello-world/hello-world:latest'
)
def test_entrypoint(self):
diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini
index 70e9a411238..20608c1681c 100644
--- a/src/cephadm/tox.ini
+++ b/src/cephadm/tox.ini
@@ -49,7 +49,8 @@ deps =
flake8-quotes
commands =
flake8 --config=tox.ini {posargs:cephadm.py cephadmlib}
- bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 11'
+ bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1'
+ bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25'
# Downstream distributions may choose to alter this "docker.io" number,
# to make sure no new references to docker.io are creeping in unnoticed.
diff --git a/src/client/Client.cc b/src/client/Client.cc
index e208cf76675..6577dd575f1 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -10798,7 +10798,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r)
goto success;
}
- clnt->put_cap_ref(in, CEPH_CAP_FILE_RD);
// reverify size
{
r = clnt->_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
@@ -10810,14 +10809,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r)
if ((uint64_t)pos >= in->size)
goto success;
- {
- int have_caps2 = 0;
- r = clnt->get_caps(f, CEPH_CAP_FILE_RD, have_caps, &have_caps2, -1);
- if (r < 0) {
- goto error;
- }
- }
-
wanted = left;
retry();
clnt->client_lock.unlock();
@@ -10971,6 +10962,20 @@ retry:
// branch below but in a non-blocking fashion. The code in _read_sync
// is duplicated and modified and exists in
// C_Read_Sync_NonBlocking::finish().
+
+ // trim read based on file size?
+ if ((offset >= in->size) || (size == 0)) {
+ // read is requested at the EOF or the read len is zero, therefore just
+ // release managed pointers and complete the C_Read_Finisher immediately with 0 bytes
+
+ Context *iof = iofinish.release();
+ crf.release();
+ iof->complete(0);
+
+ // Signal async completion
+ return 0;
+ }
+
C_Read_Sync_NonBlocking *crsa =
new C_Read_Sync_NonBlocking(this, iofinish.release(), f, in, f->pos,
offset, size, bl, filer.get(), have);
@@ -11399,10 +11404,18 @@ int64_t Client::_write_success(Fh *f, utime_t start, uint64_t fpos,
return r;
}
+void Client::C_Lock_Client_Finisher::finish(int r)
+{
+ std::scoped_lock lock(clnt->client_lock);
+ onfinish->complete(r);
+}
+
void Client::C_Write_Finisher::finish_io(int r)
{
bool fini;
+ ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock));
+
clnt->put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
if (r >= 0) {
@@ -11438,6 +11451,8 @@ void Client::C_Write_Finisher::finish_fsync(int r)
bool fini;
client_t const whoami = clnt->whoami; // For the benefit of ldout prefix
+ ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock));
+
ldout(clnt->cct, 3) << "finish_fsync r = " << r << dendl;
fsync_finished = true;
@@ -11598,6 +11613,7 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
std::unique_ptr<Context> iofinish = nullptr;
std::unique_ptr<C_Write_Finisher> cwf = nullptr;
+ std::unique_ptr<Context> filer_iofinish = nullptr;
if (in->inline_version < CEPH_INLINE_NONE) {
if (endoff > cct->_conf->client_max_inline_size ||
@@ -11709,7 +11725,10 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
if (onfinish == nullptr) {
// We need a safer condition to wait on.
cond_iofinish = new C_SaferCond();
- iofinish.reset(cond_iofinish);
+ filer_iofinish.reset(cond_iofinish);
+ } else {
+ // Register a wrapper callback for the C_Write_Finisher which takes 'client_lock'
+ filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get()));
}
get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
@@ -11717,11 +11736,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
offset, size, bl, ceph::real_clock::now(), 0,
in->truncate_size, in->truncate_seq,
- iofinish.get());
+ filer_iofinish.get());
if (onfinish) {
// handle non-blocking caller (onfinish != nullptr), we can now safely
// release all the managed pointers
+ filer_iofinish.release();
iofinish.release();
onuninline.release();
cwf.release();
diff --git a/src/client/Client.h b/src/client/Client.h
index 5a1e69394d0..f8c39e2fdd6 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -1409,6 +1409,21 @@ private:
void finish(int r) override;
};
+ // A wrapper callback which takes the 'client_lock' and finishes the context.
+ // One use case is filer->write_trunc, which does not hold client_lock
+ // in the callback it passes. Use this wrapper in such cases.
+ class C_Lock_Client_Finisher : public Context {
+ public:
+ C_Lock_Client_Finisher(Client *clnt, Context *onfinish)
+ : clnt(clnt), onfinish(onfinish) {}
+
+ private:
+ Client *clnt;
+ Context *onfinish;
+
+ void finish(int r) override;
+ };
+
class C_Write_Finisher : public Context {
public:
void finish_io(int r);
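
The wrapper exists because filer->write_trunc completes its callback without client_lock held, while C_Write_Finisher::finish_io() now asserts that the lock is held. A rough Python analogue of the same lock-then-delegate completion wrapper — purely illustrative, not Ceph code:

import threading

class LockingFinisher:
    """Take the shared lock, then run the wrapped completion (illustrative only)."""
    def __init__(self, lock: threading.Lock, on_finish) -> None:
        self.lock = lock
        self.on_finish = on_finish

    def complete(self, r: int) -> None:
        with self.lock:          # the wrapped callback runs with the lock held
            self.on_finish(r)

# client_lock = threading.Lock()
# finisher = LockingFinisher(client_lock, lambda r: print('write done', r))
# finisher.complete(0)
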
diff --git a/src/cls/user/cls_user.cc b/src/cls/user/cls_user.cc
index 0447bf33a2c..592f304fc71 100644
--- a/src/cls/user/cls_user.cc
+++ b/src/cls/user/cls_user.cc
@@ -482,10 +482,6 @@ static int cls_user_reset_stats2(cls_method_context_t hctx,
add_header_stats(&ret.acc_stats, e);
}
- /* try-update marker */
- if(!keys.empty())
- ret.marker = (--keys.cend())->first;
-
if (! ret.truncated) {
buffer::list bl;
header.last_stats_update = op.time;
@@ -500,6 +496,10 @@ static int cls_user_reset_stats2(cls_method_context_t hctx,
return rc;
}
+ /* try-update marker */
+ if(!keys.empty())
+ ret.marker = (--keys.cend())->first;
+
/* return partial result */
encode(ret, *out);
return 0;
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 8b9f3339e38..ea3cce16609 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -12,6 +12,7 @@ if(WIN32)
add_library(dlfcn_win32 STATIC win32/dlfcn.cc win32/errno.cc)
endif()
+add_subdirectory(io_exerciser)
add_subdirectory(options)
set(common_srcs
diff --git a/src/common/Preforker.h b/src/common/Preforker.h
index d34179b4020..d25d5dd5ada 100644
--- a/src/common/Preforker.h
+++ b/src/common/Preforker.h
@@ -126,7 +126,7 @@ public:
}
return r;
}
- void exit(int r) {
+ [[noreturn]] void exit(int r) {
if (is_child())
signal_exit(r);
::exit(r);
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index a6467bcaaca..b888d933480 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -204,7 +204,7 @@ void OpHistory::dump_slow_ops(utime_t now, Formatter *f, set<string> filters)
cleanup(now);
f->open_object_section("OpHistory slow ops");
f->dump_int("num to keep", history_slow_op_size.load());
- f->dump_int("threshold to keep", history_slow_op_threshold.load());
+ f->dump_float("threshold to keep", history_slow_op_threshold.load());
{
f->open_array_section("Ops");
for ([[maybe_unused]] const auto& [t, op] : slow_op) {
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 681301252de..f4cf8902dac 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -68,7 +68,7 @@ class OpHistory {
std::atomic_size_t history_size{0};
std::atomic_uint32_t history_duration{0};
std::atomic_size_t history_slow_op_size{0};
- std::atomic_uint32_t history_slow_op_threshold{0};
+ std::atomic<float> history_slow_op_threshold{0};
std::atomic_bool shutdown{false};
OpHistoryServiceThread opsvc;
friend class OpHistoryServiceThread;
@@ -113,7 +113,7 @@ public:
history_size = new_size;
history_duration = new_duration;
}
- void set_slow_op_size_and_threshold(size_t new_size, uint32_t new_threshold) {
+ void set_slow_op_size_and_threshold(size_t new_size, float new_threshold) {
history_slow_op_size = new_size;
history_slow_op_threshold = new_threshold;
}
@@ -144,7 +144,7 @@ public:
void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
history.set_size_and_duration(new_size, new_duration);
}
- void set_history_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) {
+ void set_history_slow_op_size_and_threshold(uint32_t new_size, float new_threshold) {
history.set_slow_op_size_and_threshold(new_size, new_threshold);
}
bool is_tracking() const {
diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h
index 059d81f2ac3..6ed8c56d5da 100644
--- a/src/common/ceph_mutex.h
+++ b/src/common/ceph_mutex.h
@@ -83,7 +83,6 @@ namespace ceph {
return {};
}
- static constexpr bool mutex_debugging = false;
#define ceph_mutex_is_locked(m) true
#define ceph_mutex_is_locked_by_me(m) true
}
@@ -131,8 +130,6 @@ namespace ceph {
return {std::forward<Args>(args)...};
}
- static constexpr bool mutex_debugging = true;
-
// debug methods
#define ceph_mutex_is_locked(m) ((m).is_locked())
#define ceph_mutex_is_not_locked(m) (!(m).is_locked())
@@ -186,8 +183,6 @@ namespace ceph {
return {};
}
- static constexpr bool mutex_debugging = false;
-
// debug methods. Note that these can blindly return true
// because any code that does anything other than assert these
// are true is broken.
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h
index af2baaa5c67..86ced8d183c 100644
--- a/src/common/cohort_lru.h
+++ b/src/common/cohort_lru.h
@@ -15,6 +15,12 @@
#include <boost/intrusive/list.hpp>
#include <boost/intrusive/slist.hpp>
+#include <cstdint>
+#include <atomic>
+#include <mutex>
+#include <algorithm>
+#include <functional>
+#include <vector>
#ifdef __CEPH__
# include "include/ceph_assert.h"
diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h
index b9b47d9cef4..12a273b8c84 100644
--- a/src/common/config_proxy.h
+++ b/src/common/config_proxy.h
@@ -31,7 +31,6 @@ class ConfigProxy {
using rev_obs_map_t = ObsMgr::rev_obs_map;
void _call_observers(rev_obs_map_t& rev_obs) {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
for (auto& [obs, keys] : rev_obs) {
(*obs)->handle_conf_change(*this, keys);
}
diff --git a/src/common/io_exerciser/CMakeLists.txt b/src/common/io_exerciser/CMakeLists.txt
new file mode 100644
index 00000000000..07091df86e1
--- /dev/null
+++ b/src/common/io_exerciser/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_library(object_io_exerciser STATIC
+ DataGenerator.cc
+ IoOp.cc
+ IoSequence.cc
+ Model.cc
+ ObjectModel.cc
+ RadosIo.cc
+)
+
+target_link_libraries(object_io_exerciser
+ librados
+ global
+)
\ No newline at end of file
diff --git a/src/common/io_exerciser/DataGenerator.cc b/src/common/io_exerciser/DataGenerator.cc
new file mode 100644
index 00000000000..9aa77eeb6e9
--- /dev/null
+++ b/src/common/io_exerciser/DataGenerator.cc
@@ -0,0 +1,753 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "DataGenerator.h"
+
+#include "ObjectModel.h"
+
+#include "common/debug.h"
+#include "common/dout.h"
+
+#include "fmt/format.h"
+#include "fmt/ranges.h"
+
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+
+#define dout_subsys ceph_subsys_rados
+#define dout_context g_ceph_context
+
+using DataGenerator = ceph::io_exerciser::data_generation::DataGenerator;
+using SeededRandomGenerator = ceph::io_exerciser::data_generation
+ ::SeededRandomGenerator;
+using HeaderedSeededRandomGenerator = ceph::io_exerciser::data_generation
+ ::HeaderedSeededRandomGenerator;
+
+std::unique_ptr<DataGenerator> DataGenerator::create_generator(
+ GenerationType generationType, const ObjectModel& model)
+{
+ switch(generationType)
+ {
+ case GenerationType::SeededRandom:
+ return std::make_unique<SeededRandomGenerator>(model);
+ case GenerationType::HeaderedSeededRandom:
+ return std::make_unique<HeaderedSeededRandomGenerator>(model);
+ default:
+ throw std::invalid_argument("Not yet implemented");
+ }
+
+ return nullptr;
+}
+
+bufferlist DataGenerator::generate_wrong_data(uint64_t offset, uint64_t length)
+{
+ bufferlist retlist;
+ uint64_t block_size = m_model.get_block_size();
+ char buffer[block_size];
+ for (uint64_t block_offset = offset;
+ block_offset < offset + length;
+ block_offset++)
+ {
+ std::memset(buffer, 0, block_size);
+ retlist.append(ceph::bufferptr(buffer, block_size));
+ }
+ return retlist;
+}
+
+bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, uint64_t length)
+{
+ return bufferlist.contents_equal(generate_data(offset, length));
+}
+
+ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset)
+{
+ uint64_t block_size = m_model.get_block_size();
+ char buffer[block_size];
+
+ std::mt19937_64 random_generator(m_model.get_seed(block_offset));
+ uint64_t rand1 = random_generator();
+ uint64_t rand2 = random_generator();
+
+ constexpr size_t generation_length = sizeof(uint64_t);
+
+ for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--)
+ {
+ std::memcpy(buffer + i, &rand1, generation_length);
+ std::memcpy(buffer + i + generation_length, &rand2, generation_length);
+ }
+
+ size_t remainingBytes = block_size % (generation_length * 2);
+ if (remainingBytes > generation_length)
+ {
+ size_t remainingBytes2 = remainingBytes - generation_length;
+ std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+ std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2);
+ }
+ else if (remainingBytes > 0)
+ {
+ std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+ }
+
+ return ceph::bufferptr(buffer, block_size);
+}
+
+ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offset)
+{
+ uint64_t block_size = m_model.get_block_size();
+ char buffer[block_size];
+
+ std::mt19937_64 random_generator(m_model.get_seed(block_offset));
+ uint64_t rand1 = random_generator() - 1;
+ uint64_t rand2 = random_generator() + 1;
+
+ constexpr size_t generation_length = sizeof(uint64_t);
+
+ for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--)
+ {
+ std::memcpy(buffer + i, &rand1, generation_length);
+ std::memcpy(buffer + i + generation_length, &rand2, generation_length);
+ }
+
+ size_t remainingBytes = block_size % (generation_length * 2);
+ if (remainingBytes > generation_length)
+ {
+ size_t remainingBytes2 = remainingBytes - generation_length;
+ std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+ std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2);
+ }
+ else if (remainingBytes > 0)
+ {
+ std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+ }
+
+ return ceph::bufferptr(buffer, block_size);
+}
+
+bufferlist SeededRandomGenerator::generate_data(uint64_t offset, uint64_t length)
+{
+ bufferlist retlist;
+
+ for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
+ {
+ retlist.append(generate_block(block_offset));
+ }
+
+ return retlist;
+}
+
+bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, uint64_t length)
+{
+ bufferlist retlist;
+
+ for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
+ {
+ retlist.append(generate_wrong_block(block_offset));
+ }
+
+ return retlist;
+}
+
+HeaderedSeededRandomGenerator
+ ::HeaderedSeededRandomGenerator(const ObjectModel& model,
+ std::optional<uint64_t> unique_run_id) :
+ SeededRandomGenerator(model),
+ unique_run_id(unique_run_id.value_or(generate_unique_run_id()))
+{
+
+}
+
+uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id()
+{
+ std::mt19937_64 random_generator =
+ std::mt19937_64(duration_cast<std::chrono::milliseconds>(
+ std::chrono::system_clock::now().time_since_epoch()).count());
+
+ return random_generator();
+}
+
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(uint64_t block_offset)
+{
+ SeedBytes seed = m_model.get_seed(block_offset);
+ TimeBytes current_time = duration_cast<std::chrono::milliseconds>(
+ std::chrono::system_clock::now().time_since_epoch()).count();
+
+ ceph::bufferptr bufferptr = SeededRandomGenerator::generate_block(block_offset);
+
+ std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, uniqueIdLength());
+ std::memcpy(bufferptr.c_str() + seedStart(), &seed, seedLength());
+ std::memcpy(bufferptr.c_str() + timeStart(), &current_time, timeLength());
+
+ return bufferptr;
+}
+
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(uint64_t block_offset)
+{
+ return HeaderedSeededRandomGenerator::generate_block(block_offset % 8);
+}
+
+const HeaderedSeededRandomGenerator::UniqueIdBytes
+ HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset,
+ const bufferlist& bufferlist)
+{
+ UniqueIdBytes read_unique_run_id = 0;
+ std::memcpy(&read_unique_run_id,
+ &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()],
+ uniqueIdLength());
+ return read_unique_run_id;
+}
+
+const HeaderedSeededRandomGenerator::SeedBytes
+ HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset,
+ const bufferlist& bufferlist)
+{
+ SeedBytes read_seed = 0;
+ std::memcpy(&read_seed,
+ &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()],
+ seedLength());
+ return read_seed;
+}
+
+const HeaderedSeededRandomGenerator::TimeBytes
+ HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset,
+ const bufferlist& bufferlist)
+{
+ TimeBytes read_time = 0;
+ std::memcpy(&read_time,
+ &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()],
+ timeLength());
+ return read_time;
+}
+
+bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist,
+ uint64_t offset, uint64_t length)
+{
+ std::vector<uint64_t> invalid_block_offsets;
+
+ for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
+ {
+ bool valid_block
+ = validate_block(block_offset,
+ (bufferlist.c_str() + ((block_offset - offset) *
+ m_model.get_block_size())));
+ if (!valid_block)
+ {
+ invalid_block_offsets.push_back(block_offset);
+ }
+ }
+
+ if (!invalid_block_offsets.empty())
+ {
+ printDebugInformationForOffsets(offset, invalid_block_offsets, bufferlist);
+ }
+
+ return invalid_block_offsets.empty();
+}
+
+bool HeaderedSeededRandomGenerator::validate_block(uint64_t block_offset,
+ const char* buffer_start)
+{
+ // We validate the block matches what we generate byte for byte
+ // however we ignore the time section of the header
+ ceph::bufferptr bufferptr = generate_block(block_offset);
+ bool valid = strncmp(bufferptr.c_str(), buffer_start, timeStart()) == 0;
+ valid = valid ? strncmp(bufferptr.c_str() + timeEnd(),
+ buffer_start + timeEnd(),
+ m_model.get_block_size() - timeEnd()) == 0 : valid;
+ return valid;
+}
+
+const HeaderedSeededRandomGenerator::ErrorType
+ HeaderedSeededRandomGenerator::getErrorTypeForBlock(uint64_t read_offset,
+ uint64_t block_offset,
+ const bufferlist& bufferlist)
+{
+ try
+ {
+ UniqueIdBytes read_unique_run_id = readUniqueRunId(block_offset - read_offset,
+ bufferlist);
+ if (unique_run_id != read_unique_run_id)
+ {
+ return ErrorType::RUN_ID_MISMATCH;
+ }
+
+ SeedBytes read_seed = readSeed(block_offset - read_offset, bufferlist);
+ if (m_model.get_seed(block_offset) != read_seed)
+ {
+ return ErrorType::SEED_MISMATCH;
+ }
+
+ if (std::strncmp(&bufferlist[((block_offset - read_offset) *
+ m_model.get_block_size()) + bodyStart()],
+ generate_block(block_offset).c_str() + bodyStart(),
+ m_model.get_block_size() - bodyStart()) != 0)
+ {
+ return ErrorType::DATA_MISMATCH;
+ }
+ }
+ catch(const std::exception& e)
+ {
+ return ErrorType::DATA_NOT_FOUND;
+ }
+
+ return ErrorType::UNKNOWN;
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationForBlock(uint64_t read_offset, uint64_t block_offset,
+ const bufferlist& bufferlist)
+{
+ ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, bufferlist);
+
+ TimeBytes read_time = 0;
+ std::time_t ttp;
+
+ char read_bytes[m_model.get_block_size()];
+ char generated_bytes[m_model.get_block_size()];
+
+ if (blockError == ErrorType::DATA_MISMATCH || blockError == ErrorType::UNKNOWN)
+ {
+ read_time = readDateTime(block_offset - read_offset, bufferlist);
+ std::chrono::system_clock::time_point time_point{std::chrono::milliseconds{read_time}};
+ ttp = std::chrono::system_clock::to_time_t(time_point);
+
+ std::memcpy(&read_bytes,
+ &bufferlist[((block_offset - read_offset) * m_model.get_block_size())],
+ m_model.get_block_size() - bodyStart());
+ std::memcpy(&generated_bytes,
+ generate_block(block_offset).c_str(),
+ m_model.get_block_size() - bodyStart());
+ }
+
+ std::string error_string;
+ switch(blockError)
+ {
+ case ErrorType::RUN_ID_MISMATCH:
+ {
+ UniqueIdBytes read_unique_run_id = readUniqueRunId((block_offset - read_offset),
+ bufferlist);
+ error_string = fmt::format("Header (Run ID) mismatch detected at block {} "
+ "(byte offset {}) Header expected run id {} but found id {}. "
+ "Block data corrupt or not written from this instance of this application.",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ unique_run_id,
+ read_unique_run_id);
+ }
+ break;
+
+ case ErrorType::SEED_MISMATCH:
+ {
+ SeedBytes read_seed = readSeed((block_offset - read_offset), bufferlist);
+
+ if (m_model.get_seed_offsets(read_seed).size() == 0)
+ {
+ error_string = fmt::format("Data (Seed) mismatch detected at block {}"
+ " (byte offset {}). Header expected seed {} but found seed {}. "
+ "Read data was not from any other recognised block in the object.",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ m_model.get_seed(block_offset),
+ read_seed);
+ }
+ else
+ {
+ std::vector<int> seed_offsets = m_model.get_seed_offsets(read_seed);
+ error_string = fmt::format("Data (Seed) mismatch detected at block {}"
+ " (byte offset {}). Header expected seed {} but found seed {}."
+ " Read data was from a different block(s): {}",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ m_model.get_seed(block_offset),
+ read_seed,
+ fmt::join(seed_offsets.begin(), seed_offsets.end(), ""));
+ }
+ }
+ break;
+
+ case ErrorType::DATA_MISMATCH:
+ {
+ error_string = fmt::format("Data (Body) mismatch detected at block {}"
+ " (byte offset {}). Header data matches, data body does not."
+ " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ std::ctime(&ttp),
+ fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""),
+ fmt::join(read_bytes, read_bytes + m_model.get_block_size(), ""));
+ }
+ break;
+
+ case ErrorType::DATA_NOT_FOUND:
+ {
+ uint64_t bufferlist_length = bufferlist.to_str().size();
+ error_string = fmt::format("Data (Body) could not be read at block {}"
+ " (byte offset {}) offset in bufferlist returned from read: {}"
+ " ({} bytes). Returned bufferlist length: {}.",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ (block_offset - read_offset),
+ (block_offset - read_offset) * m_model.get_block_size(),
+ bufferlist_length);
+ }
+ break;
+
+ case ErrorType::UNKNOWN:
+ [[ fallthrough ]];
+
+ default:
+ {
+ error_string = fmt::format("Data mismatch detected at block {}"
+ " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}",
+ block_offset,
+ block_offset * m_model.get_block_size(),
+ fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""),
+ fmt::join(read_bytes, read_bytes + m_model.get_block_size(), ""));
+ }
+ break;
+ }
+ dout(0) << error_string << dendl;
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationForRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ ErrorType rangeError,
+ const bufferlist& bufferlist)
+{
+ switch(rangeError)
+ {
+ case ErrorType::RUN_ID_MISMATCH:
+ printDebugInformationForRunIdMismatchRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::SEED_MISMATCH:
+ printDebugInformationForSeedMismatchRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::DATA_MISMATCH:
+ printDebugInformationDataBodyMismatchRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::DATA_NOT_FOUND:
+ printDebugInformationDataNotFoundRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::UNKNOWN:
+ [[ fallthrough ]];
+ default:
+ printDebugInformationCorruptRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
+ }
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationForRunIdMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist)
+{
+ uint64_t range_start = start_block_offset;
+ uint64_t range_length = 0;
+ UniqueIdBytes initial_read_unique_run_id = readUniqueRunId(start_block_offset - read_offset,
+ bufferlist);
+ for (uint64_t i = start_block_offset;
+ i < start_block_offset + range_length_in_blocks; i++)
+ {
+ ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist)
+ == ErrorType::RUN_ID_MISMATCH);
+
+ UniqueIdBytes read_unique_run_id = readUniqueRunId(i - read_offset, bufferlist);
+ if (initial_read_unique_run_id != read_unique_run_id ||
+ i == (start_block_offset + range_length_in_blocks - 1))
+ {
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset, i, bufferlist);
+ }
+ else if (range_length > 1)
+ {
+ dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {} ({} bytes)"
+ " and spanning a range of {} blocks ({} bytes). "
+ "Expected run id {} for range but found id {}"
+ " for all blocks in range. "
+ "Block data corrupt or not written from this instance of this application.",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size(),
+ unique_run_id,
+ initial_read_unique_run_id) << dendl;
+ }
+
+ range_start = i;
+ range_length = 1;
+ initial_read_unique_run_id = read_unique_run_id;
+ }
+ else
+ {
+ range_length++;
+ }
+ }
+
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset,
+ start_block_offset + range_length_in_blocks - 1,
+ bufferlist);
+ }
+ else if (range_length > 1)
+ {
+ dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+                           "Expected run id {} for range but found id {} for all blocks in range. "
+ "Block data corrupt or not written from this instance of this application.",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size(),
+ unique_run_id,
+ initial_read_unique_run_id)
+ << dendl;
+ }
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationForSeedMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist)
+{
+ uint64_t range_start = start_block_offset;
+ uint64_t range_length = 0;
+
+ // Assert here if needed, as we can't support values
+ // that can't be converted to a signed integer.
+ ceph_assert(m_model.get_block_size() < (std::numeric_limits<uint64_t>::max() / 2));
+ std::optional<int64_t> range_offset = 0;
+
+ for (uint64_t i = start_block_offset;
+ i < start_block_offset + range_length_in_blocks; i++)
+ {
+ ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist)
+ == ErrorType::SEED_MISMATCH);
+ SeedBytes read_seed = readSeed(i - read_offset, bufferlist);
+
+ std::vector<int> seed_found_offsets = m_model.get_seed_offsets(read_seed);
+
+ if ((seed_found_offsets.size() == 1 &&
+ (static_cast<int64_t>(seed_found_offsets.front() - i) == range_offset)) ||
+ range_length == 0)
+ {
+ if (range_length == 0)
+ {
+ range_start = i;
+ if (seed_found_offsets.size() > 0)
+ {
+ range_offset = seed_found_offsets.front() - i;
+ }
+ else
+ {
+ range_offset = std::nullopt;
+ }
+ }
+ range_length++;
+ }
+ else
+ {
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset, i - 1, bufferlist);
+ }
+ else if (range_length > 1 && range_offset.has_value())
+ {
+ dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+ "Returned data located starting from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes).",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ static_cast<uint64_t>(*range_offset) + range_start,
+ (static_cast<uint64_t>(*range_offset) + range_start)
+ * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size())
+ << dendl;
+ }
+ else
+ {
+ dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+ "Data seed mismatch spanning a range of {} blocks ({} bytes).",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size())
+ << dendl;
+ }
+ range_length = 1;
+ range_start = i;
+ if (seed_found_offsets.size() > 0)
+ {
+ range_offset = seed_found_offsets.front() - i;
+ }
+ else
+ {
+ range_offset = std::nullopt;
+ }
+ }
+ }
+
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset,
+ start_block_offset + range_length_in_blocks - 1,
+ bufferlist);
+ }
+ else if (range_length > 1 && range_offset.has_value())
+ {
+ dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes). "
+ "Returned data located starting from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes).",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size(),
+ *range_offset + range_start,
+ (*range_offset + range_start) * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size())
+ << dendl;
+ }
+ else
+ {
+ dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes). "
+                           "Data seed mismatch spanning a range of {} blocks ({} bytes).",
+ range_start,
+ range_start * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size(),
+ range_length,
+ range_length * m_model.get_block_size())
+ << dendl;
+ }
+}
+
+void HeaderedSeededRandomGenerator
+::printDebugInformationDataBodyMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist)
+{
+ dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. "
+ "Headers look as expected for range, "
+ "but generated data body does not match. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
+ << dendl;
+
+ for (uint64_t i = start_block_offset;
+ i < start_block_offset + range_length_in_blocks; i++)
+ {
+ printDebugInformationForBlock(read_offset, i, bufferlist);
+ }
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationCorruptRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist)
+{
+ dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. "
+                         "Headers and data body do not match expected values for this range. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
+ << dendl;
+
+ for (uint64_t i = start_block_offset;
+ i < start_block_offset + range_length_in_blocks; i++)
+ {
+ printDebugInformationForBlock(read_offset, i, bufferlist);
+ }
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationDataNotFoundRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist)
+{
+ dout(0) << fmt::format("Data not found for blocks from {} to {}. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
+ << dendl;
+
+ for (uint64_t i = start_block_offset; i < start_block_offset + range_length_in_blocks; i++)
+ {
+ printDebugInformationForBlock(read_offset, i, bufferlist);
+ }
+}
+
+void HeaderedSeededRandomGenerator
+ ::printDebugInformationForOffsets(uint64_t read_offset,
+ std::vector<uint64_t> offsets,
+ const bufferlist& bufferlist)
+{
+ uint64_t range_start = 0;
+ uint64_t range_length = 0;
+ ErrorType rangeError = ErrorType::UNKNOWN;
+
+ for (const uint64_t& block_offset : offsets)
+ {
+ ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset,
+ bufferlist);
+
+ if (range_start == 0 && range_length == 0)
+ {
+ range_start = block_offset;
+ range_length = 1;
+ rangeError = blockError;
+ }
+ else if (blockError == rangeError &&
+ range_start + range_length == block_offset)
+    {
+ range_length++;
+ }
+ else
+ {
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset, range_start, bufferlist);
+ }
+ else if (range_length > 1)
+ {
+ printDebugInformationForRange(read_offset, range_start, range_length,
+ rangeError, bufferlist);
+ }
+
+ range_start = block_offset;
+ range_length = 1;
+ rangeError = blockError;
+ }
+ }
+
+ if (range_length == 1)
+ {
+ printDebugInformationForBlock(read_offset, range_start, bufferlist);
+ }
+ else if (range_length > 1)
+ {
+ printDebugInformationForRange(read_offset, range_start, range_length,
+ rangeError, bufferlist);
+ }
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/DataGenerator.h b/src/common/io_exerciser/DataGenerator.h
new file mode 100644
index 00000000000..1e5784a54cc
--- /dev/null
+++ b/src/common/io_exerciser/DataGenerator.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include <memory>
+#include <random>
+
+#include "include/buffer.h"
+#include "ObjectModel.h"
+
+/* Overview
+ *
+ * class DataGenerator
+ * Generates data buffers for write I/Os using state queried
+ * from ObjectModel. Validates data buffers for read I/Os
+ * against the state in the ObjectModel. If a data miscompare
+ *    is detected, it provides debug information about the state of
+ *    the object, the buffer that was read and the expected buffer.
+ *
+ *
+ * class SeededRandomGenerator
+ * Inherits from DataGenerator. Generates entirely random patterns
+ *    based on the seed retrieved from the model.
+ *
+ *
+ * class HeaderedSeededRandomGenerator
+ *    Inherits from SeededRandomGenerator. Generates entirely random patterns
+ *    based on the seed retrieved from the model, but also adds a
+ *    header to the start of each block. This generator also provides
+ *    a range of verbose debug options to help diagnose a miscompare
+ * whenever it detects unexpected data.
+ */
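+
+/* Example (illustrative sketch only, not compiled as part of this header;
+ * the oid, block size and seed values below are arbitrary, and names are
+ * used as declared in the namespaces that follow):
+ *
+ *   ObjectModel model("test_object", 2048, 123);
+ *   auto gen = data_generation::DataGenerator::create_generator(
+ *       data_generation::GenerationType::HeaderedSeededRandom, model);
+ *   // generate_data/validate work in units of blocks (see declarations below)
+ *   ceph::bufferlist bl = gen->generate_data(1, 0);
+ *   bool ok = gen->validate(bl, 0, 1);
+ */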
+
+namespace ceph {
+ namespace io_exerciser {
+ namespace data_generation {
+ enum class GenerationType {
+ SeededRandom,
+ HeaderedSeededRandom
+ // CompressedGenerator
+ // MixedGenerator
+ };
+
+ class DataGenerator {
+ public:
+ virtual ~DataGenerator() = default;
+ static std::unique_ptr<DataGenerator>
+ create_generator(GenerationType generatorType,
+ const ObjectModel& model);
+ virtual bufferlist generate_data(uint64_t length, uint64_t offset)=0;
+ virtual bool validate(bufferlist& bufferlist, uint64_t offset,
+ uint64_t length);
+
+ // Used for testing debug outputs from data generation
+ virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length);
+
+ protected:
+ const ObjectModel& m_model;
+
+ DataGenerator(const ObjectModel& model) : m_model(model) {}
+ };
+
+ class SeededRandomGenerator : public DataGenerator
+ {
+ public:
+ SeededRandomGenerator(const ObjectModel& model)
+ : DataGenerator(model) {}
+
+ virtual bufferptr generate_block(uint64_t offset);
+ virtual bufferlist generate_data(uint64_t length, uint64_t offset);
+ virtual bufferptr generate_wrong_block(uint64_t offset);
+ virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length) override;
+ };
+
+ class HeaderedSeededRandomGenerator : public SeededRandomGenerator
+ {
+ public:
+ HeaderedSeededRandomGenerator(const ObjectModel& model,
+ std::optional<uint64_t> unique_run_id = std::nullopt);
+
+ bufferptr generate_block(uint64_t offset) override;
+ bufferptr generate_wrong_block(uint64_t offset) override;
+ bool validate(bufferlist& bufferlist, uint64_t offset,
+ uint64_t length) override;
+
+ private:
+ using UniqueIdBytes = uint64_t;
+ using SeedBytes = int;
+ using TimeBytes = uint64_t;
+
+ enum class ErrorType {
+ RUN_ID_MISMATCH,
+ SEED_MISMATCH,
+ DATA_MISMATCH,
+ DATA_NOT_FOUND,
+ UNKNOWN
+ };
+
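+      // Each generated block is laid out as a small header followed by a
+      // random body: [unique run id][seed][write time][seeded random data].
+      // The offsets below are derived from the sizes of the header fields.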
+ constexpr uint8_t headerStart() const
+ { return 0; };
+ constexpr uint8_t uniqueIdStart() const
+ { return headerStart(); };
+ constexpr uint8_t uniqueIdLength() const
+ { return sizeof(UniqueIdBytes); };
+ constexpr uint8_t seedStart() const
+ { return uniqueIdStart() + uniqueIdLength(); };
+ constexpr uint8_t seedLength() const
+ { return sizeof(SeedBytes); };
+ constexpr uint8_t timeStart() const
+ { return seedStart() + seedLength(); };
+ constexpr uint8_t timeLength() const
+ { return sizeof(TimeBytes); };
+ constexpr uint8_t timeEnd() const
+ { return timeStart() + timeLength(); };
+ constexpr uint8_t headerLength() const
+ { return uniqueIdLength() + seedLength() + timeLength(); };
+ constexpr uint8_t bodyStart() const
+ { return headerStart() + headerLength(); };
+
+ const UniqueIdBytes readUniqueRunId(uint64_t block_offset,
+ const bufferlist& bufferlist);
+ const SeedBytes readSeed(uint64_t block_offset,
+ const bufferlist& bufferlist);
+ const TimeBytes readDateTime(uint64_t block_offset,
+ const bufferlist& bufferlist);
+
+ const UniqueIdBytes unique_run_id;
+
+ uint64_t generate_unique_run_id();
+
+ bool validate_block(uint64_t block_offset, const char* buffer_start);
+
+ const ErrorType getErrorTypeForBlock(uint64_t read_offset,
+ uint64_t block_offset,
+ const bufferlist& bufferlist);
+
+ void printDebugInformationForBlock(uint64_t read_offset,
+ uint64_t block_offset,
+ const bufferlist& bufferlist);
+ void printDebugInformationForRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ ErrorType rangeError,
+ const bufferlist& bufferlist);
+
+ void printDebugInformationForRunIdMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+ void printDebugInformationForSeedMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+ void printDebugInformationDataBodyMismatchRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+      void printDebugInformationDataNotFoundRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+ void printDebugInformationCorruptRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+
+ void printDebugInformationForOffsets(uint64_t read_offset,
+ std::vector<uint64_t> offsets,
+ const bufferlist& bufferlist);
+ };
+ }
+ }
+}
diff --git a/src/common/io_exerciser/IoOp.cc b/src/common/io_exerciser/IoOp.cc
new file mode 100644
index 00000000000..cd855ba6fff
--- /dev/null
+++ b/src/common/io_exerciser/IoOp.cc
@@ -0,0 +1,188 @@
+#include "IoOp.h"
+
+using IoOp = ceph::io_exerciser::IoOp;
+
+IoOp::IoOp( OpType op,
+ uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) :
+ op(op),
+ offset1(offset1), length1(length1),
+ offset2(offset2), length2(length2),
+ offset3(offset3), length3(length3)
+{
+
+}
+
+std::string IoOp::value_to_string(uint64_t v) const
+{
+ if (v < 1024 || (v % 1024) != 0) {
+ return std::to_string(v);
+  } else if (v < 1024*1024 || (v % (1024 * 1024)) != 0) {
+    return std::to_string(v / 1024) + "K";
+  } else {
+ return std::to_string(v / 1024 / 1024) + "M";
+ }
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_done() {
+
+ return std::make_unique<IoOp>(OpType::Done);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_barrier() {
+
+ return std::make_unique<IoOp>(OpType::BARRIER);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_create(uint64_t size) {
+
+ return std::make_unique<IoOp>(OpType::CREATE,0,size);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_remove() {
+
+ return std::make_unique<IoOp>(OpType::REMOVE);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_read(uint64_t offset, uint64_t length) {
+
+ return std::make_unique<IoOp>(OpType::READ, offset, length);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_read2(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2) {
+
+ if (offset1 < offset2) {
+ ceph_assert( offset1 + length1 <= offset2 );
+ } else {
+ ceph_assert( offset2 + length2 <= offset1 );
+ }
+
+ return std::make_unique<IoOp>(OpType::READ2,
+ offset1, length1,
+ offset2, length2);
+}
+
+std::unique_ptr<IoOp> IoOp
+ ::generate_read3(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) {
+
+ if (offset1 < offset2) {
+ ceph_assert( offset1 + length1 <= offset2 );
+ } else {
+ ceph_assert( offset2 + length2 <= offset1 );
+ }
+ if (offset1 < offset3) {
+ ceph_assert( offset1 + length1 <= offset3 );
+ } else {
+ ceph_assert( offset3 + length3 <= offset1 );
+ }
+ if (offset2 < offset3) {
+ ceph_assert( offset2 + length2 <= offset3 );
+ } else {
+ ceph_assert( offset3 + length3 <= offset2 );
+ }
+ return std::make_unique<IoOp>(OpType::READ3,
+ offset1, length1,
+ offset2, length2,
+ offset3, length3);
+}
+
+std::unique_ptr<IoOp> IoOp::generate_write(uint64_t offset, uint64_t length) {
+ return std::make_unique<IoOp>(OpType::WRITE, offset, length);
+}
+
+std::unique_ptr<IoOp> IoOp::generate_write2(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2) {
+ if (offset1 < offset2) {
+ ceph_assert( offset1 + length1 <= offset2 );
+ } else {
+ ceph_assert( offset2 + length2 <= offset1 );
+ }
+ return std::make_unique<IoOp>(OpType::WRITE2,
+ offset1, length1,
+ offset2, length2);
+}
+
+std::unique_ptr<IoOp> IoOp::generate_write3(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) {
+ if (offset1 < offset2) {
+ ceph_assert( offset1 + length1 <= offset2 );
+ } else {
+ ceph_assert( offset2 + length2 <= offset1 );
+ }
+ if (offset1 < offset3) {
+ ceph_assert( offset1 + length1 <= offset3 );
+ } else {
+ ceph_assert( offset3 + length3 <= offset1 );
+ }
+ if (offset2 < offset3) {
+ ceph_assert( offset2 + length2 <= offset3 );
+ } else {
+ ceph_assert( offset3 + length3 <= offset2 );
+ }
+ return std::make_unique<IoOp>(OpType::WRITE3,
+ offset1, length1,
+ offset2, length2,
+ offset3, length3);
+}
+
+bool IoOp::done() {
+ return (op == OpType::Done);
+}
+
+std::string IoOp::to_string(uint64_t block_size) const
+{
+ switch (op) {
+ case OpType::Done:
+ return "Done";
+ case OpType::BARRIER:
+ return "Barrier";
+ case OpType::CREATE:
+ return "Create (size=" + value_to_string(length1 * block_size) + ")";
+ case OpType::REMOVE:
+ return "Remove";
+ case OpType::READ:
+ return "Read (offset=" + value_to_string(offset1 * block_size) +
+ ",length=" + value_to_string(length1 * block_size) + ")";
+ case OpType::READ2:
+ return "Read2 (offset1=" + value_to_string(offset1 * block_size) +
+ ",length1=" + value_to_string(length1 * block_size) +
+ ",offset2=" + value_to_string(offset2 * block_size) +
+ ",length2=" + value_to_string(length2 * block_size) + ")";
+ case OpType::READ3:
+ return "Read3 (offset1=" + value_to_string(offset1 * block_size) +
+ ",length1=" + value_to_string(length1 * block_size) +
+ ",offset2=" + value_to_string(offset2 * block_size) +
+ ",length2=" + value_to_string(length2 * block_size) +
+ ",offset3=" + value_to_string(offset3 * block_size) +
+ ",length3=" + value_to_string(length3 * block_size) + ")";
+ case OpType::WRITE:
+ return "Write (offset=" + value_to_string(offset1 * block_size) +
+ ",length=" + value_to_string(length1 * block_size) + ")";
+ case OpType::WRITE2:
+ return "Write2 (offset1=" + value_to_string(offset1 * block_size) +
+ ",length1=" + value_to_string(length1 * block_size) +
+ ",offset2=" + value_to_string(offset2 * block_size) +
+ ",length2=" + value_to_string(length2 * block_size) + ")";
+ case OpType::WRITE3:
+ return "Write3 (offset1=" + value_to_string(offset1 * block_size) +
+ ",length1=" + value_to_string(length1 * block_size) +
+ ",offset2=" + value_to_string(offset2 * block_size) +
+ ",length2=" + value_to_string(length2 * block_size) +
+ ",offset3=" + value_to_string(offset3 * block_size) +
+ ",length3=" + value_to_string(length3 * block_size) + ")";
+ default:
+ break;
+ }
+ return "Unknown";
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/IoOp.h b/src/common/io_exerciser/IoOp.h
new file mode 100644
index 00000000000..60c02a93d4e
--- /dev/null
+++ b/src/common/io_exerciser/IoOp.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include <string>
+#include <memory>
+#include "include/ceph_assert.h"
+
+/* Overview
+ *
+ * enum OpType
+ * Enumeration of different types of I/O operation
+ *
+ * class IoOp
+ * Stores details for an I/O operation. Generated by IoSequences
+ * and applied by IoExerciser's
+ */
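+
+/* Example (rough sketch of constructing and printing ops, not compiled here;
+ * offsets and lengths are in blocks and the values are arbitrary):
+ *
+ *   auto read = IoOp::generate_read(4, 2);
+ *   auto write = IoOp::generate_write2(0, 1, 6, 1);
+ *   // With a 4K block size this prints "Read (offset=16K,length=8K)"
+ *   std::cout << read->to_string(4096) << std::endl;
+ */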
+
+namespace ceph {
+ namespace io_exerciser {
+
+ enum class OpType {
+ Done, // End of I/O sequence
+ BARRIER, // Barrier - all prior I/Os must complete
+ CREATE, // Create object and pattern with data
+ REMOVE, // Remove object
+ READ, // Read
+ READ2, // 2 Reads in one op
+ READ3, // 3 Reads in one op
+ WRITE, // Write
+ WRITE2, // 2 Writes in one op
+ WRITE3 // 3 Writes in one op
+ };
+
+ class IoOp {
+ protected:
+ std::string value_to_string(uint64_t v) const;
+
+ public:
+ OpType op;
+ uint64_t offset1;
+ uint64_t length1;
+ uint64_t offset2;
+ uint64_t length2;
+ uint64_t offset3;
+ uint64_t length3;
+
+ IoOp( OpType op,
+ uint64_t offset1 = 0, uint64_t length1 = 0,
+ uint64_t offset2 = 0, uint64_t length2 = 0,
+ uint64_t offset3 = 0, uint64_t length3 = 0 );
+
+ static std::unique_ptr<IoOp> generate_done();
+
+ static std::unique_ptr<IoOp> generate_barrier();
+
+ static std::unique_ptr<IoOp> generate_create(uint64_t size);
+
+ static std::unique_ptr<IoOp> generate_remove();
+
+ static std::unique_ptr<IoOp> generate_read(uint64_t offset,
+ uint64_t length);
+
+ static std::unique_ptr<IoOp> generate_read2(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2);
+
+ static std::unique_ptr<IoOp> generate_read3(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2,
+ uint64_t offset3,
+ uint64_t length3);
+
+ static std::unique_ptr<IoOp> generate_write(uint64_t offset,
+ uint64_t length);
+
+ static std::unique_ptr<IoOp> generate_write2(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2);
+
+ static std::unique_ptr<IoOp> generate_write3(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2,
+ uint64_t offset3,
+ uint64_t length3);
+
+ bool done();
+
+ std::string to_string(uint64_t block_size) const;
+ };
+ }
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/IoSequence.cc b/src/common/io_exerciser/IoSequence.cc
new file mode 100644
index 00000000000..4a7ca0593d1
--- /dev/null
+++ b/src/common/io_exerciser/IoSequence.cc
@@ -0,0 +1,500 @@
+#include "IoSequence.h"
+
+using Sequence = ceph::io_exerciser::Sequence;
+using IoSequence = ceph::io_exerciser::IoSequence;
+
+std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& seq)
+{
+ switch (seq)
+ {
+ case Sequence::SEQUENCE_SEQ0:
+ os << "SEQUENCE_SEQ0";
+ break;
+ case Sequence::SEQUENCE_SEQ1:
+ os << "SEQUENCE_SEQ1";
+ break;
+ case Sequence::SEQUENCE_SEQ2:
+ os << "SEQUENCE_SEQ2";
+ break;
+ case Sequence::SEQUENCE_SEQ3:
+ os << "SEQUENCE_SEQ3";
+ break;
+ case Sequence::SEQUENCE_SEQ4:
+ os << "SEQUENCE_SEQ4";
+ break;
+ case Sequence::SEQUENCE_SEQ5:
+ os << "SEQUENCE_SEQ5";
+ break;
+ case Sequence::SEQUENCE_SEQ6:
+ os << "SEQUENCE_SEQ6";
+ break;
+ case Sequence::SEQUENCE_SEQ7:
+ os << "SEQUENCE_SEQ7";
+ break;
+ case Sequence::SEQUENCE_SEQ8:
+ os << "SEQUENCE_SEQ8";
+ break;
+ case Sequence::SEQUENCE_SEQ9:
+ os << "SEQUENCE_SEQ9";
+ break;
+ case Sequence::SEQUENCE_END:
+ os << "SEQUENCE_END";
+ break;
+ }
+ return os;
+}
+
+IoSequence::IoSequence(std::pair<int,int> obj_size_range,
+ int seed) :
+ min_obj_size(obj_size_range.first), max_obj_size(obj_size_range.second),
+ create(true), barrier(false), done(false), remove(false),
+ obj_size(min_obj_size), step(-1), seed(seed)
+{
+ rng.seed(seed);
+}
+
+std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s,
+ std::pair<int,int> obj_size_range,
+ int seed)
+{
+ switch (s) {
+ case Sequence::SEQUENCE_SEQ0:
+ return std::make_unique<Seq0>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ1:
+ return std::make_unique<Seq1>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ2:
+ return std::make_unique<Seq2>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ3:
+ return std::make_unique<Seq3>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ4:
+ return std::make_unique<Seq4>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ5:
+ return std::make_unique<Seq5>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ6:
+ return std::make_unique<Seq6>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ7:
+ return std::make_unique<Seq7>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ8:
+ return std::make_unique<Seq8>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ9:
+ return std::make_unique<Seq9>(obj_size_range, seed);
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+int IoSequence::get_step() const
+{
+ return step;
+}
+
+int IoSequence::get_seed() const
+{
+ return seed;
+}
+
+void IoSequence::set_min_object_size(uint64_t size)
+{
+ min_obj_size = size;
+ if (obj_size < size) {
+ obj_size = size;
+ if (obj_size > max_obj_size) {
+ done = true;
+ }
+ }
+}
+
+void IoSequence::set_max_object_size(uint64_t size)
+{
+ max_obj_size = size;
+ if (obj_size > size) {
+ done = true;
+ }
+}
+
+void IoSequence::select_random_object_size()
+{
+ if (max_obj_size != min_obj_size) {
+ obj_size = min_obj_size + rng(max_obj_size - min_obj_size);
+ }
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size()
+{
+ obj_size++;
+ if (obj_size > max_obj_size) {
+ done = true;
+ }
+ create = true;
+ barrier = true;
+ remove = true;
+ return IoOp::generate_barrier();
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::next()
+{
+ step++;
+ if (remove) {
+ remove = false;
+ return IoOp::generate_remove();
+ }
+ if (barrier) {
+ barrier = false;
+ return IoOp::generate_barrier();
+ }
+ if (done) {
+ return IoOp::generate_done();
+ }
+ if (create) {
+ create = false;
+ barrier = true;
+ return IoOp::generate_create(obj_size);
+ }
+ return _next();
+}
+
+
+
+ceph::io_exerciser::Seq0::Seq0(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset(0)
+{
+ select_random_object_size();
+ length = 1 + rng(obj_size - 1);
+}
+
+std::string ceph::io_exerciser::Seq0::get_name() const
+{
+ return "Sequential reads of length " + std::to_string(length) +
+ " with queue depth 1 (seqseed " + std::to_string(get_seed()) + ")";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next()
+{
+ std::unique_ptr<IoOp> r;
+ if (offset >= obj_size) {
+ done = true;
+ barrier = true;
+ remove = true;
+ return IoOp::generate_barrier();
+ }
+ if (offset + length > obj_size) {
+ r = IoOp::generate_read(offset, obj_size - offset);
+ } else {
+ r = IoOp::generate_read(offset, length);
+ }
+ offset += length;
+ return r;
+}
+
+
+
+ceph::io_exerciser::Seq1::Seq1(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed)
+{
+ select_random_object_size();
+ count = 3 * obj_size;
+}
+
+std::string ceph::io_exerciser::Seq1::get_name() const
+{
+ return "Random offset, random length read/write I/O with queue depth 1 (seqseed "
+ + std::to_string(get_seed()) + ")";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next()
+{
+ barrier = true;
+ if (count-- == 0) {
+ done = true;
+ remove = true;
+ return IoOp::generate_barrier();
+ }
+
+ uint64_t offset = rng(obj_size - 1);
+ uint64_t length = 1 + rng(obj_size - 1 - offset);
+ return (rng(2) != 0) ? IoOp::generate_write(offset, length) :
+ IoOp::generate_read(offset, length);
+}
+
+
+
+ceph::io_exerciser::Seq2::Seq2(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset(0), length(0) {}
+
+std::string ceph::io_exerciser::Seq2::get_name() const
+{
+ return "Permutations of offset and length read I/O";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next()
+{
+ length++;
+ if (length > obj_size - offset) {
+ length = 1;
+ offset++;
+ if (offset >= obj_size) {
+ offset = 0;
+ length = 0;
+ return increment_object_size();
+ }
+ }
+ return IoOp::generate_read(offset, length);
+}
+
+
+
+ceph::io_exerciser::Seq3::Seq3(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset1(0), offset2(0)
+{
+ set_min_object_size(2);
+}
+
+std::string ceph::io_exerciser::Seq3::get_name() const
+{
+ return "Permutations of offset 2-region 1-block read I/O";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next()
+{
+ offset2++;
+ if (offset2 >= obj_size - offset1) {
+ offset2 = 1;
+ offset1++;
+ if (offset1 + 1 >= obj_size) {
+ offset1 = 0;
+ offset2 = 0;
+ return increment_object_size();
+ }
+ }
+ return IoOp::generate_read2(offset1, 1, offset1 + offset2, 1);
+}
+
+
+
+ceph::io_exerciser::Seq4::Seq4(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset1(0), offset2(1)
+{
+ set_min_object_size(3);
+}
+
+std::string ceph::io_exerciser::Seq4::get_name() const
+{
+ return "Permutations of offset 3-region 1-block read I/O";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next()
+{
+ offset2++;
+ if (offset2 >= obj_size - offset1) {
+ offset2 = 2;
+ offset1++;
+ if (offset1 + 2 >= obj_size) {
+ offset1 = 0;
+ offset2 = 1;
+ return increment_object_size();
+ }
+ }
+ return IoOp::generate_read3(offset1, 1,
+ offset1 + offset2, 1,
+ (offset1 * 2 + offset2)/2, 1);
+}
+
+
+
+ceph::io_exerciser::Seq5::Seq5(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset(0), length(1),
+ doneread(false), donebarrier(false) {}
+
+std::string ceph::io_exerciser::Seq5::get_name() const
+{
+ return "Permutation of length sequential writes";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next()
+{
+ if (offset >= obj_size) {
+ if (!doneread) {
+ if (!donebarrier) {
+ donebarrier = true;
+ return IoOp::generate_barrier();
+ }
+ doneread = true;
+ barrier = true;
+ return IoOp::generate_read(0, obj_size);
+ }
+ doneread = false;
+ donebarrier = false;
+ offset = 0;
+ length++;
+ if (length > obj_size) {
+ length = 1;
+ return increment_object_size();
+ }
+ }
+ uint64_t io_len = (offset + length > obj_size) ? (obj_size - offset) : length;
+ std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len);
+ offset += io_len;
+ return r;
+}
+
+
+
+ceph::io_exerciser::Seq6::Seq6(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset(0), length(1),
+ doneread(false), donebarrier(false) {}
+
+std::string ceph::io_exerciser::Seq6::get_name() const
+{
+ return "Permutation of length sequential writes, different alignment";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next()
+{
+ if (offset >= obj_size) {
+ if (!doneread) {
+ if (!donebarrier) {
+ donebarrier = true;
+ return IoOp::generate_barrier();
+ }
+ doneread = true;
+ barrier = true;
+ return IoOp::generate_read(0, obj_size);
+ }
+ doneread = false;
+ donebarrier = false;
+ offset = 0;
+ length++;
+ if (length > obj_size) {
+ length = 1;
+ return increment_object_size();
+ }
+ }
+ uint64_t io_len = (offset == 0) ? (obj_size % length) : length;
+ if (io_len == 0) {
+ io_len = length;
+ }
+ std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len);
+ offset += io_len;
+ return r;
+}
+
+
+
+ceph::io_exerciser::Seq7::Seq7(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed)
+{
+ set_min_object_size(2);
+ offset = obj_size;
+}
+
+std::string ceph::io_exerciser::Seq7::get_name() const
+{
+ return "Permutations of offset 2-region 1-block writes";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next()
+{
+ if (!doneread) {
+ if (!donebarrier) {
+ donebarrier = true;
+ return IoOp::generate_barrier();
+ }
+ doneread = true;
+ barrier = true;
+ return IoOp::generate_read(0, obj_size);
+ }
+ if (offset == 0) {
+ doneread = false;
+ donebarrier = false;
+ offset = obj_size+1;
+ return increment_object_size();
+ }
+ offset--;
+ if (offset == obj_size/2) {
+ return _next();
+ }
+ doneread = false;
+ donebarrier = false;
+ return IoOp::generate_write2(offset, 1, obj_size/2, 1);
+}
+
+
+
+ceph::io_exerciser::Seq8::Seq8(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset1(0), offset2(1)
+{
+ set_min_object_size(3);
+}
+
+std::string ceph::io_exerciser::Seq8::get_name() const
+{
+ return "Permutations of offset 3-region 1-block write I/O";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next()
+{
+ if (!doneread) {
+ if (!donebarrier) {
+ donebarrier = true;
+ return IoOp::generate_barrier();
+ }
+ doneread = true;
+ barrier = true;
+ return IoOp::generate_read(0, obj_size);
+ }
+ offset2++;
+ if (offset2 >= obj_size - offset1) {
+ offset2 = 2;
+ offset1++;
+ if (offset1 + 2 >= obj_size) {
+ offset1 = 0;
+ offset2 = 1;
+ return increment_object_size();
+ }
+ }
+ doneread = false;
+ donebarrier = false;
+ return IoOp::generate_write3(offset1, 1,
+ offset1 + offset2, 1,
+ (offset1 * 2 + offset2)/2, 1);
+}
+
+
+
+ceph::io_exerciser::Seq9::Seq9(std::pair<int,int> obj_size_range, int seed) :
+ IoSequence(obj_size_range, seed), offset(0), length(0)
+{
+
+}
+
+std::string ceph::io_exerciser::Seq9::get_name() const
+{
+ return "Permutations of offset and length write I/O";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next()
+{
+ if (!doneread) {
+ if (!donebarrier) {
+ donebarrier = true;
+ return IoOp::generate_barrier();
+ }
+ doneread = true;
+ barrier = true;
+ return IoOp::generate_read(0, obj_size);
+ }
+ length++;
+ if (length > obj_size - offset) {
+ length = 1;
+ offset++;
+ if (offset >= obj_size) {
+ offset = 0;
+ length = 0;
+ return increment_object_size();
+ }
+ }
+ doneread = false;
+ donebarrier = false;
+ return IoOp::generate_write(offset, length);
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/IoSequence.h b/src/common/io_exerciser/IoSequence.h
new file mode 100644
index 00000000000..114ff76303f
--- /dev/null
+++ b/src/common/io_exerciser/IoSequence.h
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "IoOp.h"
+
+#include "include/random.h"
+
+/* Overview
+ *
+ * enum Sequence
+ * Enumeration of the different sequences
+ *
+ * class IoSequence
+ * Virtual class. IoSequences generate a stream of IoOPs.
+ *    Sequences typically exhaustively test permutations of
+ * offset and length to allow validation of code such as
+ * Erasure Coding. An IoSequence does not determine
+ *    whether I/Os are issued sequentially or in parallel;
+ * it must generate barrier I/Os where operations must
+ * be serialized.
+ *
+ * class Seq*
+ * Implementations of IoSequence. Each class generates
+ * a different sequence of I/O.
+ *
+ * generate_sequence
+ * Create an IoSequence
+ */
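+
+/* Example (illustrative only; the object size range and seed are arbitrary):
+ *
+ *   auto seq = IoSequence::generate_sequence(Sequence::SEQUENCE_SEQ0,
+ *                                            {1, 32}, 456);
+ *   auto op = seq->next();
+ *   while (!op->done()) {
+ *     // apply op to a Model here, honouring any barrier ops
+ *     op = seq->next();
+ *   }
+ */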
+
+namespace ceph {
+ namespace io_exerciser {
+
+ enum class Sequence {
+ SEQUENCE_SEQ0,
+ SEQUENCE_SEQ1,
+ SEQUENCE_SEQ2,
+ SEQUENCE_SEQ3,
+ SEQUENCE_SEQ4,
+ SEQUENCE_SEQ5,
+ SEQUENCE_SEQ6,
+ SEQUENCE_SEQ7,
+ SEQUENCE_SEQ8,
+ SEQUENCE_SEQ9,
+ //
+ SEQUENCE_END,
+ SEQUENCE_BEGIN = SEQUENCE_SEQ0
+ };
+
+ inline Sequence operator++( Sequence& s )
+ {
+ return s = (Sequence)(((int)(s) + 1));
+ }
+
+ std::ostream& operator<<(std::ostream& os, const Sequence& seq);
+
+ /* I/O Sequences */
+
+ class IoSequence {
+ public:
+ virtual ~IoSequence() = default;
+
+ virtual std::string get_name() const = 0;
+ int get_step() const;
+ int get_seed() const;
+
+ std::unique_ptr<IoOp> next();
+
+ static std::unique_ptr<IoSequence>
+ generate_sequence(Sequence s, std::pair<int,int> obj_size_range, int seed );
+
+ protected:
+ uint64_t min_obj_size;
+ uint64_t max_obj_size;
+ bool create;
+ bool barrier;
+ bool done;
+ bool remove;
+ uint64_t obj_size;
+ int step;
+ int seed;
+ ceph::util::random_number_generator<int> rng =
+ ceph::util::random_number_generator<int>();
+
+ IoSequence(std::pair<int,int> obj_size_range, int seed);
+
+ virtual std::unique_ptr<IoOp> _next() = 0;
+
+ void set_min_object_size(uint64_t size);
+ void set_max_object_size(uint64_t size);
+ void select_random_object_size();
+ std::unique_ptr<IoOp> increment_object_size();
+
+ };
+
+ class Seq0: public IoSequence {
+ public:
+ Seq0(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ };
+
+ class Seq1: public IoSequence {
+ public:
+ Seq1(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next();
+
+ private:
+ int count;
+ };
+
+ class Seq2: public IoSequence {
+ public:
+ Seq2(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ };
+
+ class Seq3: public IoSequence {
+ public:
+ Seq3(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+ };
+
+ class Seq4: public IoSequence {
+ public:
+ Seq4(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+ };
+
+ class Seq5: public IoSequence {
+ public:
+ Seq5(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread;
+ bool donebarrier;
+ };
+
+ class Seq6: public IoSequence {
+ public:
+ Seq6(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread;
+ bool donebarrier;
+ };
+
+ class Seq7: public IoSequence {
+ public:
+ Seq7(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ bool doneread = true;
+ bool donebarrier = false;
+ };
+
+ class Seq8: public IoSequence {
+ public:
+ Seq8(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+ bool doneread = true;
+ bool donebarrier = false;
+ };
+
+ class Seq9: public IoSequence {
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread = true;
+ bool donebarrier = false;
+
+ public:
+ Seq9(std::pair<int,int> obj_size_range, int seed);
+
+ std::string get_name() const override;
+
+ std::unique_ptr<IoOp> _next() override;
+ };
+ }
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/Model.cc b/src/common/io_exerciser/Model.cc
new file mode 100644
index 00000000000..50812ecbb15
--- /dev/null
+++ b/src/common/io_exerciser/Model.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "Model.h"
+
+using Model = ceph::io_exerciser::Model;
+
+Model::Model(const std::string& oid, uint64_t block_size) :
+num_io(0),
+oid(oid),
+block_size(block_size)
+{
+
+}
+
+const uint64_t Model::get_block_size() const
+{
+ return block_size;
+}
+
+const std::string Model::get_oid() const
+{
+ return oid;
+}
+
+int Model::get_num_io() const
+{
+ return num_io;
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/Model.h b/src/common/io_exerciser/Model.h
new file mode 100644
index 00000000000..58d107409a6
--- /dev/null
+++ b/src/common/io_exerciser/Model.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "IoOp.h"
+
+#include <boost/asio/io_context.hpp>
+
+#include "librados/librados_asio.h"
+
+#include "include/interval_set.h"
+#include "global/global_init.h"
+#include "global/global_context.h"
+#include "common/Thread.h"
+
+/* Overview
+ *
+ * class Model
+ * Virtual class. Models apply IoOps generated by an
+ *    IoSequence; they can choose how many I/Os to execute in
+ *    parallel and scale up the size of I/Os by the block size.
+ *
+ */
+
+namespace ceph {
+ namespace io_exerciser {
+
+ class Model
+ {
+ protected:
+ int num_io{0};
+ std::string oid;
+ uint64_t block_size;
+
+ public:
+ Model(const std::string& oid, uint64_t block_size);
+ virtual ~Model() = default;
+
+ virtual bool readyForIoOp(IoOp& op) = 0;
+ virtual void applyIoOp(IoOp& op) = 0;
+
+ const std::string get_oid() const;
+ const uint64_t get_block_size() const;
+ int get_num_io() const;
+ };
+
+ /* Simple RADOS I/O generator */
+
+
+ }
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/ObjectModel.cc b/src/common/io_exerciser/ObjectModel.cc
new file mode 100644
index 00000000000..589f6434282
--- /dev/null
+++ b/src/common/io_exerciser/ObjectModel.cc
@@ -0,0 +1,174 @@
+#include "ObjectModel.h"
+
+#include <algorithm>
+#include <execution>
+#include <iterator>
+
+using ObjectModel = ceph::io_exerciser::ObjectModel;
+
+ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) :
+ Model(oid, block_size), created(false)
+{
+ rng.seed(seed);
+}
+
+int ObjectModel::get_seed(uint64_t offset) const
+{
+ ceph_assert(offset < contents.size());
+ return contents[offset];
+}
+
+std::vector<int> ObjectModel::get_seed_offsets(int seed) const
+{
+ std::vector<int> offsets;
+ for (size_t i = 0; i < contents.size(); i++)
+ {
+ if (contents[i] == seed)
+ {
+ offsets.push_back(i);
+ }
+ }
+
+ return offsets;
+}
+
+std::string ObjectModel::to_string(int mask) const
+{
+ if (!created) {
+ return "Object does not exist";
+ }
+ std::string result = "{";
+ for (uint64_t i = 0; i < contents.size(); i++) {
+ if (i != 0) {
+ result += ",";
+ }
+ result += std::to_string(contents[i] & mask);
+ }
+ result += "}";
+ return result;
+}
+
+bool ObjectModel::readyForIoOp(IoOp& op)
+{
+ return true;
+}
+
+void ObjectModel::applyIoOp(IoOp& op)
+{
+ auto generate_random = [&rng = rng]() {
+ return rng();
+ };
+
+ switch (op.op) {
+ case OpType::BARRIER:
+ reads.clear();
+ writes.clear();
+ break;
+
+ case OpType::CREATE:
+ ceph_assert(!created);
+ ceph_assert(reads.empty());
+ ceph_assert(writes.empty());
+ created = true;
+ contents.resize(op.length1);
+ std::generate(std::execution::seq, contents.begin(), contents.end(),
+ generate_random);
+ break;
+
+ case OpType::REMOVE:
+ ceph_assert(created);
+ ceph_assert(reads.empty());
+ ceph_assert(writes.empty());
+ created = false;
+ contents.resize(0);
+ break;
+
+ case OpType::READ3:
+ ceph_assert(created);
+ ceph_assert(op.offset3 + op.length3 <= contents.size());
+ // Not allowed: read overlapping with parallel write
+ ceph_assert(!writes.intersects(op.offset3, op.length3));
+ reads.union_insert(op.offset3, op.length3);
+ [[fallthrough]];
+
+ case OpType::READ2:
+ ceph_assert(created);
+ ceph_assert(op.offset2 + op.length2 <= contents.size());
+ // Not allowed: read overlapping with parallel write
+ ceph_assert(!writes.intersects(op.offset2, op.length2));
+ reads.union_insert(op.offset2, op.length2);
+ [[fallthrough]];
+
+ case OpType::READ:
+ ceph_assert(created);
+ ceph_assert(op.offset1 + op.length1 <= contents.size());
+ // Not allowed: read overlapping with parallel write
+ ceph_assert(!writes.intersects(op.offset1, op.length1));
+ reads.union_insert(op.offset1, op.length1);
+ num_io++;
+ break;
+
+ case OpType::WRITE3:
+ ceph_assert(created);
+ // Not allowed: write overlapping with parallel read or write
+ ceph_assert(!reads.intersects(op.offset3, op.length3));
+ ceph_assert(!writes.intersects(op.offset3, op.length3));
+ writes.union_insert(op.offset3, op.length3);
+ ceph_assert(op.offset3 + op.length3 <= contents.size());
+ std::generate(std::execution::seq,
+ std::next(contents.begin(), op.offset3),
+ std::next(contents.begin(), op.offset3 + op.length3),
+ generate_random);
+ [[fallthrough]];
+
+ case OpType::WRITE2:
+ ceph_assert(created);
+ // Not allowed: write overlapping with parallel read or write
+ ceph_assert(!reads.intersects(op.offset2, op.length2));
+ ceph_assert(!writes.intersects(op.offset2, op.length2));
+ writes.union_insert(op.offset2, op.length2);
+ ceph_assert(op.offset2 + op.length2 <= contents.size());
+ std::generate(std::execution::seq,
+ std::next(contents.begin(), op.offset2),
+ std::next(contents.begin(), op.offset2 + op.length2),
+ generate_random);
+ [[fallthrough]];
+
+ case OpType::WRITE:
+ ceph_assert(created);
+ // Not allowed: write overlapping with parallel read or write
+ ceph_assert(!reads.intersects(op.offset1, op.length1));
+ ceph_assert(!writes.intersects(op.offset1, op.length1));
+ writes.union_insert(op.offset1, op.length1);
+ ceph_assert(op.offset1 + op.length1 <= contents.size());
+ std::generate(std::execution::seq,
+ std::next(contents.begin(), op.offset1),
+ std::next(contents.begin(), op.offset1 + op.length1),
+ generate_random);
+ num_io++;
+ break;
+ default:
+ break;
+ }
+}
+
+void ObjectModel::encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(created, bl);
+ if (created) {
+ encode(contents, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void ObjectModel::decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ DECODE_OLDEST(1);
+ decode(created, bl);
+ if (created) {
+ decode(contents, bl);
+ } else {
+ contents.resize(0);
+ }
+ DECODE_FINISH(bl);
+}
diff --git a/src/common/io_exerciser/ObjectModel.h b/src/common/io_exerciser/ObjectModel.h
new file mode 100644
index 00000000000..93c70f41429
--- /dev/null
+++ b/src/common/io_exerciser/ObjectModel.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "Model.h"
+
+/* Overview
+ *
+ * class ObjectModel
+ * An IoExerciser. Tracks the data stored in an object, applies
+ *    IoOps to update the model. Checks that I/Os that are
+ *    permitted to run in parallel do not break the rules. Provides
+ *    an interface to query the state of the object. State can be
+ *    encoded and decoded.
+ *
+ */
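+
+/* Example (sketch only; the oid, block size, seed and op values are arbitrary):
+ *
+ *   ObjectModel om("test_object", 2048, 123);
+ *   auto create = IoOp::generate_create(10);   // 10 block object
+ *   om.applyIoOp(*create);
+ *   auto write = IoOp::generate_write(0, 2);   // write blocks 0 and 1
+ *   om.applyIoOp(*write);
+ *   // om.to_string() now shows the seed recorded for each block
+ */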
+
+namespace ceph {
+ namespace io_exerciser {
+ /* Model of an object to track its data contents */
+
+ class ObjectModel : public Model {
+ private:
+ bool created;
+ std::vector<int> contents;
+ ceph::util::random_number_generator<int> rng =
+ ceph::util::random_number_generator<int>();
+
+ // Track read and write I/Os that can be submitted in
+ // parallel to detect violations:
+ //
+ // * Read may not overlap with a parallel write
+ // * Write may not overlap with a parallel read or write
+ // * Create / remove may not be in parallel with read or write
+ //
+ // Fix broken test cases by adding barrier ops to restrict
+ // I/O exercisers from issuing conflicting ops in parallel
+ interval_set<uint64_t> reads;
+ interval_set<uint64_t> writes;
+ public:
+ ObjectModel(const std::string& oid, uint64_t block_size, int seed);
+
+ int get_seed(uint64_t offset) const;
+ std::vector<int> get_seed_offsets(int seed) const;
+
+ std::string to_string(int mask = -1) const;
+
+ bool readyForIoOp(IoOp& op);
+ void applyIoOp(IoOp& op);
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ };
+ }
+} \ No newline at end of file
diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc
new file mode 100644
index 00000000000..a28a1e2f488
--- /dev/null
+++ b/src/common/io_exerciser/RadosIo.cc
@@ -0,0 +1,288 @@
+#include "RadosIo.h"
+
+#include "DataGenerator.h"
+
+using RadosIo = ceph::io_exerciser::RadosIo;
+
+RadosIo::RadosIo(librados::Rados& rados,
+ boost::asio::io_context& asio,
+ const std::string& pool,
+ const std::string& oid,
+ uint64_t block_size,
+ int seed,
+ int threads,
+ ceph::mutex& lock,
+ ceph::condition_variable& cond) :
+ Model(oid, block_size),
+ rados(rados),
+ asio(asio),
+ om(std::make_unique<ObjectModel>(oid, block_size, seed)),
+ db(data_generation::DataGenerator::create_generator(
+ data_generation::GenerationType::HeaderedSeededRandom, *om)),
+ pool(pool),
+ threads(threads),
+ lock(lock),
+ cond(cond),
+ outstanding_io(0)
+{
+ int rc;
+ rc = rados.ioctx_create(pool.c_str(), io);
+ ceph_assert(rc == 0);
+ allow_ec_overwrites(true);
+}
+
+RadosIo::~RadosIo()
+{
+}
+
+void RadosIo::start_io()
+{
+ std::lock_guard l(lock);
+ outstanding_io++;
+}
+
+void RadosIo::finish_io()
+{
+ std::lock_guard l(lock);
+ ceph_assert(outstanding_io > 0);
+ outstanding_io--;
+ cond.notify_all();
+}
+
+void RadosIo::wait_for_io(int count)
+{
+ std::unique_lock l(lock);
+ while (outstanding_io > count) {
+ cond.wait(l);
+ }
+}
+
+void RadosIo::allow_ec_overwrites(bool allow)
+{
+ int rc;
+ bufferlist inbl, outbl;
+ std::string cmdstr =
+ "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \
+ \"var\": \"allow_ec_overwrites\", \"val\": \"" +
+ (allow ? "true" : "false") + "\"}";
+ rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr);
+ ceph_assert(rc == 0);
+}
+
+RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3 ) :
+ offset1(offset1), length1(length1),
+ offset2(offset2), length2(length2),
+ offset3(offset3), length3(length3)
+{
+
+}
+
+bool RadosIo::readyForIoOp(IoOp &op)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held
+ if (!om->readyForIoOp(op)) {
+ return false;
+ }
+ switch (op.op) {
+ case OpType::Done:
+ case OpType::BARRIER:
+ return outstanding_io == 0;
+ default:
+ return outstanding_io < threads;
+ }
+}
+
+void RadosIo::applyIoOp(IoOp &op)
+{
+ std::shared_ptr<AsyncOpInfo> op_info;
+
+ om->applyIoOp(op);
+
+  // If there are already `threads` concurrent I/Os in flight then wait for
+ // at least one I/O to complete
+ wait_for_io(threads-1);
+
+ switch (op.op) {
+ case OpType::Done:
+ [[ fallthrough ]];
+ case OpType::BARRIER:
+ // Wait for all outstanding I/O to complete
+ wait_for_io(0);
+ break;
+
+ case OpType::CREATE:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(0, op.length1);
+ op_info->bl1 = db->generate_data(0, op.length1);
+ op_info->wop.write_full(op_info->bl1);
+ auto create_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->wop, 0, nullptr, create_cb);
+ }
+ break;
+
+ case OpType::REMOVE:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>();
+ op_info->wop.remove();
+ auto remove_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->wop, 0, nullptr, remove_cb);
+ }
+ break;
+
+ case OpType::READ:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1);
+ op_info->rop.read(op.offset1 * block_size,
+ op.length1 * block_size,
+ &op_info->bl1, nullptr);
+ auto read_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
+ bufferlist bl) {
+ ceph_assert(ec == boost::system::errc::success);
+ db->validate(op_info->bl1, op_info->offset1, op_info->length1);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->rop, 0, nullptr, read_cb);
+ num_io++;
+ }
+ break;
+
+ case OpType::READ2:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1,
+ op.length1,
+ op.offset2,
+ op.length2);
+
+ op_info->rop.read(op.offset1 * block_size,
+ op.length1 * block_size,
+ &op_info->bl1, nullptr);
+ op_info->rop.read(op.offset2 * block_size,
+ op.length2 * block_size,
+ &op_info->bl2, nullptr);
+ auto read2_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
+ bufferlist bl) {
+ ceph_assert(ec == boost::system::errc::success);
+ db->validate(op_info->bl1, op_info->offset1, op_info->length1);
+ db->validate(op_info->bl2, op_info->offset2, op_info->length2);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->rop, 0, nullptr, read2_cb);
+ num_io++;
+ }
+ break;
+
+ case OpType::READ3:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
+ op.offset2, op.length2,
+ op.offset3, op.length3);
+ op_info->rop.read(op.offset1 * block_size,
+ op.length1 * block_size,
+ &op_info->bl1, nullptr);
+ op_info->rop.read(op.offset2 * block_size,
+ op.length2 * block_size,
+ &op_info->bl2, nullptr);
+ op_info->rop.read(op.offset3 * block_size,
+ op.length3 * block_size,
+ &op_info->bl3, nullptr);
+ auto read3_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
+ bufferlist bl) {
+ ceph_assert(ec == boost::system::errc::success);
+ db->validate(op_info->bl1, op_info->offset1, op_info->length1);
+ db->validate(op_info->bl2, op_info->offset2, op_info->length2);
+ db->validate(op_info->bl3, op_info->offset3, op_info->length3);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->rop, 0, nullptr, read3_cb);
+ num_io++;
+ }
+ break;
+
+ case OpType::WRITE:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1);
+ op_info->bl1 = db->generate_data(op.offset1, op.length1);
+
+ op_info->wop.write(op.offset1 * block_size, op_info->bl1);
+ auto write_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->wop, 0, nullptr, write_cb);
+ num_io++;
+ }
+ break;
+
+ case OpType::WRITE2:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
+ op.offset2, op.length2);
+ op_info->bl1 = db->generate_data(op.offset1, op.length1);
+ op_info->bl2 = db->generate_data(op.offset2, op.length2);
+ op_info->wop.write(op.offset1 * block_size, op_info->bl1);
+ op_info->wop.write(op.offset2 * block_size, op_info->bl2);
+ auto write2_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->wop, 0, nullptr, write2_cb);
+ num_io++;
+ }
+ break;
+
+ case OpType::WRITE3:
+ {
+ start_io();
+ op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
+ op.offset2, op.length2,
+ op.offset3, op.length3);
+ op_info->bl1 = db->generate_data(op.offset1, op.length1);
+ op_info->bl2 = db->generate_data(op.offset2, op.length2);
+ op_info->bl3 = db->generate_data(op.offset3, op.length3);
+ op_info->wop.write(op.offset1 * block_size, op_info->bl1);
+ op_info->wop.write(op.offset2 * block_size, op_info->bl2);
+ op_info->wop.write(op.offset3 * block_size, op_info->bl3);
+ auto write3_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid,
+ &op_info->wop, 0, nullptr, write3_cb);
+ num_io++;
+ }
+ break;
+
+ default:
+ break;
+ }
+}
diff --git a/src/common/io_exerciser/RadosIo.h b/src/common/io_exerciser/RadosIo.h
new file mode 100644
index 00000000000..179c5bba3ae
--- /dev/null
+++ b/src/common/io_exerciser/RadosIo.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "ObjectModel.h"
+
+/* Overview
+ *
+ * class RadosIo
+ * An IoExerciser. A simple RADOS client that generates I/Os
+ * from IoOps. Uses an ObjectModel to track the data stored
+ * in the object. Uses a DataGenerator to create and validate
+ * data buffers. When there are no barrier I/Os, this may
+ * issue multiple async I/Os in parallel.
+ *
+ */
+
+namespace ceph {
+ namespace io_exerciser {
+ namespace data_generation {
+ class DataGenerator;
+ }
+
+ class RadosIo: public Model {
+ protected:
+ librados::Rados& rados;
+ boost::asio::io_context& asio;
+ std::unique_ptr<ObjectModel> om;
+ std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db;
+ std::string pool;
+ int threads;
+ ceph::mutex& lock;
+ ceph::condition_variable& cond;
+ librados::IoCtx io;
+ int outstanding_io;
+
+ void start_io();
+ void finish_io();
+ void wait_for_io(int count);
+
+ public:
+ RadosIo(librados::Rados& rados,
+ boost::asio::io_context& asio,
+ const std::string& pool,
+ const std::string& oid,
+ uint64_t block_size,
+ int seed,
+ int threads,
+ ceph::mutex& lock,
+ ceph::condition_variable& cond);
+
+ ~RadosIo();
+
+ void allow_ec_overwrites(bool allow);
+
+ class AsyncOpInfo {
+ public:
+ librados::ObjectReadOperation rop;
+ librados::ObjectWriteOperation wop;
+ ceph::buffer::list bl1;
+ ceph::buffer::list bl2;
+ ceph::buffer::list bl3;
+ uint64_t offset1;
+ uint64_t length1;
+ uint64_t offset2;
+ uint64_t length2;
+ uint64_t offset3;
+ uint64_t length3;
+
+ AsyncOpInfo(uint64_t offset1 = 0, uint64_t length1 = 0,
+ uint64_t offset2 = 0, uint64_t length2 = 0,
+ uint64_t offset3 = 0, uint64_t length3 = 0 );
+ ~AsyncOpInfo() = default;
+ };
+
+ // Must be called with lock held
+ bool readyForIoOp(IoOp& op);
+
+ void applyIoOp(IoOp& op);
+ };
+ }
+}
\ No newline at end of file
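The Overview comment above describes RadosIo as issuing multiple async I/Os in parallel when no barrier ops are pending. Below is a minimal standalone sketch of the outstanding-I/O bookkeeping that start_io()/finish_io()/wait_for_io() and the lock/cond/outstanding_io members imply; it is illustrative only (the librados, ObjectModel and DataGenerator pieces are omitted) and is not the Ceph implementation. Note that RadosIo itself takes the mutex and condition variable by reference in its constructor, so the caller owns the synchronization primitives.

// Standalone sketch (not RadosIo): the counter/condvar pattern suggested
// by start_io()/finish_io()/wait_for_io().
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

struct IoTracker {
  std::mutex lock;
  std::condition_variable cond;
  int outstanding_io = 0;

  void start_io() {
    std::lock_guard l(lock);
    ++outstanding_io;
  }
  void finish_io() {
    std::lock_guard l(lock);
    --outstanding_io;
    cond.notify_all();
  }
  // Block until no more than `count` I/Os remain in flight.
  void wait_for_io(int count) {
    std::unique_lock l(lock);
    cond.wait(l, [&] { return outstanding_io <= count; });
  }
};

int main() {
  IoTracker t;
  std::vector<std::thread> completions;
  for (int i = 0; i < 4; ++i) {
    t.start_io();
    completions.emplace_back([&t] { t.finish_io(); });  // stands in for an async callback
  }
  t.wait_for_io(0);  // drain all in-flight I/O, as a barrier op would
  for (auto& c : completions) c.join();
}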
diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp
index 4d843be75dc..95353425de9 100644
--- a/src/common/map_cacher.hpp
+++ b/src/common/map_cacher.hpp
@@ -16,6 +16,7 @@
#define MAPCACHER_H
#include "include/Context.h"
+#include "include/expected.hpp"
#include "common/sharedptr_registry.hpp"
namespace MapCacher {
@@ -130,6 +131,50 @@ public:
return -EINVAL;
} ///< @return error value, 0 on success, -ENOENT if no more entries
+ /// Fetch first key/value std::pair after specified key
+ struct PosAndData {
+ K last_key;
+ V data;
+ };
+ using MaybePosAndData = tl::expected<PosAndData, int>;
+
+ MaybePosAndData get_1st_after_key(
+ K key ///< [in] key after which to get next
+ )
+ {
+ ceph_assert(driver);
+ while (true) {
+ std::pair<K, boost::optional<V>> cached;
+ bool got_cached = in_progress.get_next(key, &cached);
+
+ ///\todo a driver->get_next() that returns an expected<K, V> would be nice
+ bool got_store{false};
+ std::pair<K, V> store;
+ int r = driver->get_next(key, &store);
+ if (r < 0 && r != -ENOENT) {
+ return tl::unexpected(r);
+ } else if (r == 0) {
+ got_store = true;
+ }
+
+ if (!got_cached && !got_store) {
+ return tl::unexpected(-ENOENT);
+ } else if (got_cached && (!got_store || store.first >= cached.first)) {
+ if (cached.second) {
+ return PosAndData{cached.first, *cached.second};
+ } else {
+ key = cached.first;
+ continue; // value was cached as removed, rescan from that key
+ }
+ } else {
+ return PosAndData{store.first, store.second};
+ }
+ }
+ ceph_abort(); // not reachable
+ return tl::unexpected(-EINVAL);
+ }
+
+
/// Adds operation setting keys to Transaction
void set_keys(
const std::map<K, V> &keys, ///< [in] keys/values to std::set
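get_1st_after_key() returns tl::expected<PosAndData, int>: either the first cached-or-stored entry strictly after the given key, or a negative errno (-ENOENT once the keyspace is exhausted), mirroring the 0/-ENOENT contract documented for get_next() above. The sketch below shows the intended calling pattern as a self-contained toy; it uses C++23 std::expected as a stand-in for tl::expected and a plain std::map as a stand-in for the driver, so the names and types here are illustrative only.

// Toy sketch of the get_1st_after_key() calling pattern (not MapCacher itself).
#include <cerrno>
#include <expected>
#include <iostream>
#include <map>
#include <string>

struct PosAndData { std::string last_key; int data; };
using MaybePosAndData = std::expected<PosAndData, int>;

std::map<std::string, int> store = {{"a", 1}, {"b", 2}, {"c", 3}};

// First entry strictly after 'key', or -ENOENT when the keyspace is exhausted.
MaybePosAndData get_1st_after_key(const std::string& key) {
  auto it = store.upper_bound(key);
  if (it == store.end())
    return std::unexpected(-ENOENT);
  return PosAndData{it->first, it->second};
}

int main() {
  std::string pos;  // "" sorts before every real key, so the scan starts at "a"
  while (true) {
    auto next = get_1st_after_key(pos);
    if (!next)
      break;        // -ENOENT (or another error) terminates the scan
    std::cout << next->last_key << " -> " << next->data << "\n";
    pos = next->last_key;  // feed the returned position back in
  }
}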
diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h
index c1a4ff2a435..d56d0ebee99 100644
--- a/src/common/mutex_debug.h
+++ b/src/common/mutex_debug.h
@@ -169,20 +169,16 @@ public:
}
bool try_lock(bool no_lockdep = false) {
- bool locked = try_lock_impl();
- if (locked) {
- if (enable_lockdep(no_lockdep))
- _locked();
- _post_lock();
- }
- return locked;
+ ceph_assert(recursive || !is_locked_by_me());
+ return _try_lock(no_lockdep);
}
void lock(bool no_lockdep = false) {
+ ceph_assert(recursive || !is_locked_by_me());
if (enable_lockdep(no_lockdep))
_will_lock(recursive);
- if (try_lock(no_lockdep))
+ if (_try_lock(no_lockdep))
return;
lock_impl();
@@ -198,6 +194,16 @@ public:
unlock_impl();
}
+private:
+ bool _try_lock(bool no_lockdep) {
+ bool locked = try_lock_impl();
+ if (locked) {
+ if (enable_lockdep(no_lockdep))
+ _locked();
+ _post_lock();
+ }
+ return locked;
+ }
};
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
index 09d37dfcd82..85fe62d2343 100644
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -448,6 +448,19 @@ options:
services:
- rgw
with_legacy: true
+- name: rgw_restore_debug_interval
+ type: int
+ level: dev
+ desc: The number of seconds that simulate one "day", used to debug RGW Cloud Restore.
+ Do *not* modify for a production cluster.
+ long_desc: For debugging RGW Cloud Restore, the number of seconds that are equivalent to
+ one simulated "day". Values less than 1 are ignored and do not change restore behavior.
+ For example, if during debugging one wanted every 10 minutes to be equivalent to one day,
+ this would be set to 600, the number of seconds in 10 minutes.
+ default: -1
+ services:
+ - rgw
+ with_legacy: true
- name: rgw_mp_lock_max_time
type: int
level: advanced
@@ -2078,14 +2091,6 @@ options:
services:
- rgw
with_legacy: true
-- name: rgw_data_log_obj_prefix
- type: str
- level: dev
- default: data_log
- fmt_desc: The object name prefix for the data log.
- services:
- - rgw
- with_legacy: true
- name: rgw_data_sync_poll_interval
type: int
level: dev
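The rgw_restore_debug_interval option added above compresses time for testing: when set to a value of 1 or more, that many real seconds count as one restore "day", while smaller values leave the normal 86400-second day in place. The snippet below is a standalone illustration of that scaling only; it is not the RGW implementation, and simulated_days() is a hypothetical helper.

// Illustrative only: how a debug interval compresses restore "days".
#include <cstdint>
#include <iostream>

int64_t simulated_days(int64_t elapsed_secs, int64_t debug_interval) {
  // Values < 1 are ignored, so the normal 24-hour day applies.
  const int64_t secs_per_day = (debug_interval >= 1) ? debug_interval : 86400;
  return elapsed_secs / secs_per_day;
}

int main() {
  std::cout << simulated_days(3600, 600) << "\n";  // 6 "days" pass in one real hour
  std::cout << simulated_days(3600, -1)  << "\n";  // 0 days with the default of -1
}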
diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc
index b03a3cab70c..4b4d191e09c 100644
--- a/src/common/scrub_types.cc
+++ b/src/common/scrub_types.cc
@@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const
ENCODE_FINISH(bl);
}
+bufferlist inconsistent_obj_wrapper::encode() const
+{
+ bufferlist bl;
+ encode(bl);
+ return bl;
+}
+
void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp)
{
DECODE_START(2, bp);
@@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const
ENCODE_FINISH(bl);
}
+bufferlist inconsistent_snapset_wrapper::encode() const
+{
+ bufferlist bl;
+ encode(bl);
+ return bl;
+}
+
void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp)
{
DECODE_START(2, bp);
diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h
index dd206f56f60..d86fc12b6c8 100644
--- a/src/common/scrub_types.h
+++ b/src/common/scrub_types.h
@@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t {
const pg_shard_t &primary);
void set_version(uint64_t ver) { version = ver; }
void encode(ceph::buffer::list& bl) const;
+ ceph::buffer::list encode() const;
void decode(ceph::buffer::list::const_iterator& bp);
};
@@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t {
void set_size_mismatch();
void encode(ceph::buffer::list& bl) const;
+ ceph::buffer::list encode() const;
void decode(ceph::buffer::list::const_iterator& bp);
};
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
index fe09cc54510..0dca695ba3a 100644
--- a/src/crimson/os/futurized_store.h
+++ b/src/crimson/os/futurized_store.h
@@ -75,14 +75,15 @@ public:
CollectionRef c,
const ghobject_t& oid) = 0;
- using omap_values_t = std::map<std::string, ceph::bufferlist, std::less<>>;
+ using omap_values_t = attrs_t;
using omap_keys_t = std::set<std::string>;
virtual read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
const omap_keys_t& keys) = 0;
- virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ using omap_values_paged_t = std::tuple<bool, omap_values_t>;
+ virtual read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
const std::optional<std::string> &start ///< [in] start, empty for begin
@@ -147,7 +148,8 @@ public:
return seastar::now();
}
- virtual read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ using fiemap_ret_t = std::map<uint64_t, uint64_t>;
+ virtual read_errorator::future<fiemap_ret_t> fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index cf8d3c0891d..5dcb7514ee1 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -990,8 +990,12 @@ void Cache::mark_transaction_conflicted(
}
efforts.mutate_delta_bytes += delta_stat.bytes;
- for (auto &i: t.pre_alloc_list) {
- epm.mark_space_free(i->get_paddr(), i->get_length());
+ if (t.get_pending_ool()) {
+ t.get_pending_ool()->is_conflicted = true;
+ } else {
+ for (auto &i: t.pre_alloc_list) {
+ epm.mark_space_free(i->get_paddr(), i->get_length());
+ }
}
auto& ool_stats = t.get_ool_write_stats();
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
index cdad6dfb1b0..76c18bde667 100644
--- a/src/crimson/os/seastore/cached_extent.cc
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -158,12 +158,14 @@ parent_tracker_t::~parent_tracker_t() {
std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
{
- out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length()
+ out << "LBAMapping(" << rhs.get_key()
+ << "~0x" << std::hex << rhs.get_length() << std::dec
<< "->" << rhs.get_val();
if (rhs.is_indirect()) {
- out << " indirect(" << rhs.get_intermediate_base() << "~"
- << rhs.get_intermediate_key() << "~"
- << rhs.get_intermediate_length() << ")";
+ out << ",indirect(" << rhs.get_intermediate_base()
+ << "~0x" << std::hex << rhs.get_intermediate_length()
+ << "@0x" << rhs.get_intermediate_offset() << std::dec
+ << ")";
}
out << ")";
return out;
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 6c5c6c6fcc2..6025725aa33 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -350,7 +350,7 @@ public:
<< ", modify_time=" << sea_time_point_printer_t{modify_time}
<< ", paddr=" << get_paddr()
<< ", prior_paddr=" << prior_poffset_str
- << ", length=" << get_length()
+ << std::hex << ", length=0x" << get_length() << std::dec
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
<< ", refcount=" << use_count()
diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc
index 34ac199eed8..0458fbfed74 100644
--- a/src/crimson/os/seastore/extent_placement_manager.cc
+++ b/src/crimson/os/seastore/extent_placement_manager.cc
@@ -987,7 +987,19 @@ RandomBlockOolWriter::alloc_write_ool_extents(
return alloc_write_iertr::now();
}
return seastar::with_gate(write_guard, [this, &t, &extents] {
- return do_write(t, extents);
+ seastar::lw_shared_ptr<rbm_pending_ool_t> ptr =
+ seastar::make_lw_shared<rbm_pending_ool_t>();
+ ptr->pending_extents = t.get_pre_alloc_list();
+ assert(!t.is_conflicted());
+ t.set_pending_ool(ptr);
+ return do_write(t, extents
+ ).finally([this, ptr=ptr] {
+ if (ptr->is_conflicted) {
+ for (auto &e : ptr->pending_extents) {
+ rb_cleaner->mark_space_free(e->get_paddr(), e->get_length());
+ }
+ }
+ });
});
}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
index 5d6fa3cb1b1..ef10ff9623b 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -173,16 +173,22 @@ public:
if (!parent_modified()) {
return;
}
+ LOG_PREFIX(BtreeLBAMapping::maybe_fix_pos);
auto &p = static_cast<LBALeafNode&>(*parent);
p.maybe_fix_mapping_pos(*this);
+ SUBDEBUGT(seastore_lba, "fixed pin {}",
+ ctx.trans, static_cast<LBAMapping&>(*this));
}
LBAMappingRef refresh_with_pending_parent() final {
+ LOG_PREFIX(BtreeLBAMapping::refresh_with_pending_parent);
assert(is_parent_valid() && !is_parent_viewable());
auto &p = static_cast<LBALeafNode&>(*parent);
auto &viewable_p = static_cast<LBALeafNode&>(
*p.find_pending_version(ctx.trans, get_key()));
- return viewable_p.get_mapping(ctx, get_key());
+ auto new_pin = viewable_p.get_mapping(ctx, get_key());
+ SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin));
+ return new_pin;
}
protected:
std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
index 960ea6ba411..397a014a7c3 100644
--- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -925,7 +925,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
std::ostringstream sos;
sos << "Node" << NODE_TYPE << FIELD_TYPE
<< "@" << extent.get_laddr()
- << "+" << std::hex << extent.get_length() << std::dec
+ << "+0x" << std::hex << extent.get_length() << std::dec
<< "Lv" << (unsigned)level()
<< (is_level_tail() ? "$" : "");
name = sos.str();
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 15774332373..d90edbb20db 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -17,6 +17,7 @@
#include "common/safe_io.h"
#include "include/stringify.h"
#include "os/Transaction.h"
+#include "osd/osd_types_fmt.h"
#include "crimson/common/buffer_io.h"
@@ -30,8 +31,6 @@
#include "crimson/os/seastore/onode_manager.h"
#include "crimson/os/seastore/object_data_handler.h"
-
-using std::string;
using crimson::common::local_conf;
template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
@@ -42,8 +41,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
auto format(op_type_t op, FormatContext& ctx) const {
std::string_view name = "unknown";
switch (op) {
- case op_type_t::TRANSACTION:
- name = "transaction";
+ case op_type_t::DO_TRANSACTION:
+ name = "do_transaction";
break;
case op_type_t::READ:
name = "read";
@@ -63,8 +62,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
case op_type_t::OMAP_GET_VALUES:
name = "omap_get_values";
break;
- case op_type_t::OMAP_LIST:
- name = "omap_list";
+ case op_type_t::OMAP_GET_VALUES2:
+ name = "omap_get_values2";
break;
case op_type_t::MAX:
name = "unknown";
@@ -143,14 +142,14 @@ void SeaStore::Shard::register_metrics()
namespace sm = seastar::metrics;
using op_type_t = crimson::os::seastore::op_type_t;
std::pair<op_type_t, sm::label_instance> labels_by_op_type[] = {
- {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")},
- {op_type_t::READ, sm::label_instance("latency", "READ")},
- {op_type_t::WRITE, sm::label_instance("latency", "WRITE")},
- {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")},
- {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")},
- {op_type_t::STAT, sm::label_instance("latency", "STAT")},
- {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")},
- {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")},
+ {op_type_t::DO_TRANSACTION, sm::label_instance("latency", "DO_TRANSACTION")},
+ {op_type_t::READ, sm::label_instance("latency", "READ")},
+ {op_type_t::WRITE, sm::label_instance("latency", "WRITE")},
+ {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")},
+ {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")},
+ {op_type_t::STAT, sm::label_instance("latency", "STAT")},
+ {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")},
+ {op_type_t::OMAP_GET_VALUES2, sm::label_instance("latency", "OMAP_GET_VALUES2")},
};
for (auto& [op_type, label] : labels_by_op_type) {
@@ -194,6 +193,9 @@ void SeaStore::Shard::register_metrics()
seastar::future<> SeaStore::start()
{
+ LOG_PREFIX(SeaStore::start);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
#ifndef NDEBUG
bool is_test = true;
@@ -214,19 +216,30 @@ seastar::future<> SeaStore::start()
}).then([this, is_test] {
ceph_assert(device);
return shard_stores.start(root, device.get(), is_test);
+ }).then([FNAME] {
+ INFO("done");
});
}
seastar::future<> SeaStore::test_start(DeviceRef device_obj)
{
+ LOG_PREFIX(SeaStore::test_start);
+ INFO("...");
+
ceph_assert(device_obj);
ceph_assert(root == "");
device = std::move(device_obj);
- return shard_stores.start_single(root, device.get(), true);
+ return shard_stores.start_single(root, device.get(), true
+ ).then([FNAME] {
+ INFO("done");
+ });
}
seastar::future<> SeaStore::stop()
{
+ LOG_PREFIX(SeaStore::stop);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return seastar::do_for_each(secondaries, [](auto& sec_dev) {
return sec_dev->stop();
@@ -239,17 +252,28 @@ seastar::future<> SeaStore::stop()
}
}).then([this] {
return shard_stores.stop();
+ }).then([FNAME] {
+ INFO("done");
});
}
SeaStore::mount_ertr::future<> SeaStore::test_mount()
{
+ LOG_PREFIX(SeaStore::test_mount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return shard_stores.local().mount_managers();
+ return shard_stores.local().mount_managers(
+ ).then([FNAME] {
+ INFO("done");
+ });
}
SeaStore::mount_ertr::future<> SeaStore::mount()
{
+ LOG_PREFIX(SeaStore::mount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return device->mount(
).safe_then([this] {
@@ -278,11 +302,13 @@ SeaStore::mount_ertr::future<> SeaStore::mount()
return set_secondaries();
});
});
- }).safe_then([this] {
- return shard_stores.invoke_on_all([](auto &local_store) {
- return local_store.mount_managers();
- });
});
+ }).safe_then([this] {
+ return shard_stores.invoke_on_all([](auto &local_store) {
+ return local_store.mount_managers();
+ });
+ }).safe_then([FNAME] {
+ INFO("done");
}).handle_error(
crimson::ct_error::assert_all{
"Invalid error in SeaStore::mount"
@@ -302,9 +328,14 @@ seastar::future<> SeaStore::Shard::mount_managers()
seastar::future<> SeaStore::umount()
{
+ LOG_PREFIX(SeaStore::umount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.invoke_on_all([](auto &local_store) {
return local_store.umount();
+ }).then([FNAME] {
+ INFO("done");
});
}
@@ -332,7 +363,7 @@ seastar::future<> SeaStore::Shard::umount()
onode_manager.reset();
}).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::umount"
+ "Invalid error in SeaStoreS::umount"
}
);
}
@@ -345,15 +376,15 @@ seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid)
auto [ret, fsid] = tuple;
std::string str_fsid = stringify(new_osd_fsid);
if (ret == -1) {
- return write_meta("fsid", stringify(new_osd_fsid));
+ return write_meta("fsid", stringify(new_osd_fsid));
} else if (ret == 0 && fsid != str_fsid) {
- ERROR("on-disk fsid {} != provided {}",
- fsid, stringify(new_osd_fsid));
- throw std::runtime_error("store fsid error");
- } else {
+ ERROR("on-disk fsid {} != provided {}",
+ fsid, stringify(new_osd_fsid));
+ throw std::runtime_error("store fsid error");
+ } else {
return seastar::now();
- }
- });
+ }
+ });
}
seastar::future<>
@@ -379,6 +410,8 @@ SeaStore::Shard::mkfs_managers()
"mkfs_seastore",
[this](auto& t)
{
+ LOG_PREFIX(SeaStoreS::mkfs_managers);
+ DEBUGT("...", t);
return onode_manager->mkfs(t
).si_then([this, &t] {
return collection_manager->mkfs(t);
@@ -412,15 +445,22 @@ seastar::future<> SeaStore::set_secondaries()
SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid)
{
+ LOG_PREFIX(SeaStore::test_mkfs);
+ INFO("uuid={} ...", new_osd_fsid);
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ return read_meta("mkfs_done"
+ ).then([this, new_osd_fsid, FNAME](auto tuple) {
auto [done, value] = tuple;
if (done == 0) {
+ ERROR("failed");
return seastar::now();
}
return shard_stores.local().mkfs_managers(
).then([this, new_osd_fsid] {
return prepare_meta(new_osd_fsid);
+ }).then([FNAME] {
+ INFO("done");
});
});
}
@@ -448,27 +488,29 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid)
SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
{
+ LOG_PREFIX(SeaStore::mkfs);
+ INFO("uuid={}, root={} ...", new_osd_fsid, root);
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ return read_meta("mkfs_done"
+ ).then([this, new_osd_fsid, FNAME](auto tuple) {
auto [done, value] = tuple;
if (done == 0) {
+ ERROR("failed");
return seastar::now();
} else {
return seastar::do_with(
secondary_device_set_t(),
- [this, new_osd_fsid](auto& sds) {
+ [this, new_osd_fsid, FNAME](auto& sds) {
auto fut = seastar::now();
- LOG_PREFIX(SeaStore::mkfs);
- DEBUG("root: {}", root);
if (!root.empty()) {
fut = seastar::open_directory(root
- ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable {
+ ).then([this, &sds, new_osd_fsid, FNAME](seastar::file rdir) mutable {
std::unique_ptr<seastar::file> root_f =
std::make_unique<seastar::file>(std::move(rdir));
auto sub = root_f->list_directory(
- [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<>
+ [this, &sds, new_osd_fsid, FNAME](auto de) mutable -> seastar::future<>
{
- LOG_PREFIX(SeaStore::mkfs);
DEBUG("found file: {}", de.name);
if (de.name.find("block.") == 0
&& de.name.length() > 6 /* 6 for "block." */) {
@@ -533,6 +575,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
return prepare_meta(new_osd_fsid);
}).safe_then([this] {
return umount();
+ }).safe_then([FNAME] {
+ INFO("done");
}).handle_error(
crimson::ct_error::assert_all{
"Invalid error in SeaStore::mkfs"
@@ -542,18 +586,22 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
});
}
-using coll_core_t = FuturizedStore::coll_core_t;
+using coll_core_t = SeaStore::coll_core_t;
seastar::future<std::vector<coll_core_t>>
SeaStore::list_collections()
{
+ LOG_PREFIX(SeaStore::list_collections);
+ DEBUG("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.map([](auto &local_store) {
return local_store.list_collections();
- }).then([](std::vector<std::vector<coll_core_t>> results) {
+ }).then([FNAME](std::vector<std::vector<coll_core_t>> results) {
std::vector<coll_core_t> collections;
for (auto& colls : results) {
collections.insert(collections.end(), colls.begin(), colls.end());
}
+ DEBUG("got {} collections", collections.size());
return seastar::make_ready_future<std::vector<coll_core_t>>(
std::move(collections));
});
@@ -561,14 +609,18 @@ SeaStore::list_collections()
store_statfs_t SeaStore::Shard::stat() const
{
- return transaction_manager->store_stat();
+ LOG_PREFIX(SeaStoreS::stat);
+ auto ss = transaction_manager->store_stat();
+ DEBUG("stat={}", ss);
+ return ss;
}
seastar::future<store_statfs_t> SeaStore::stat() const
{
- ceph_assert(seastar::this_shard_id() == primary_core);
LOG_PREFIX(SeaStore::stat);
- DEBUG("");
+ DEBUG("...");
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.map_reduce0(
[](const SeaStore::Shard &local_store) {
return local_store.stat();
@@ -578,19 +630,30 @@ seastar::future<store_statfs_t> SeaStore::stat() const
ss.add(ret);
return std::move(ss);
}
- ).then([](store_statfs_t ss) {
+ ).then([FNAME](store_statfs_t ss) {
+ DEBUG("done, stat={}", ss);
return seastar::make_ready_future<store_statfs_t>(std::move(ss));
});
}
seastar::future<store_statfs_t> SeaStore::pool_statfs(int64_t pool_id) const
{
- //TODO
- return SeaStore::stat();
+ LOG_PREFIX(SeaStore::pool_statfs);
+ DEBUG("pool_id={} ...", pool_id);
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ //TODO
+ return SeaStore::stat(
+ ).then([FNAME, pool_id](store_statfs_t ss) {
+ DEBUG("done, pool_id={}, ret={}", pool_id, ss);
+ return seastar::make_ready_future<store_statfs_t>(std::move(ss));
+ });
}
seastar::future<> SeaStore::report_stats()
{
+ LOG_PREFIX(SeaStore::report_stats);
+ DEBUG("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
shard_device_stats.resize(seastar::smp::count);
shard_io_stats.resize(seastar::smp::count);
@@ -609,8 +672,7 @@ seastar::future<> SeaStore::report_stats()
local_store.get_io_stats(report_detail, seconds);
shard_cache_stats[seastar::this_shard_id()] =
local_store.get_cache_stats(report_detail, seconds);
- }).then([this] {
- LOG_PREFIX(SeaStore);
+ }).then([this, FNAME] {
auto now = seastar::lowres_clock::now();
if (last_tp == seastar::lowres_clock::time_point::min()) {
last_tp = now;
@@ -857,24 +919,26 @@ SeaStore::Shard::list_objects(CollectionRef ch,
"list_objects",
[this, ch, start, end, &limit, &ret](auto &t)
{
+ LOG_PREFIX(SeaStoreS::list_objects);
+ DEBUGT("cid={} start={} end={} limit={} ...",
+ t, ch->get_cid(), start, end, limit);
return get_coll_bits(
ch, t
- ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) {
+ ).si_then([FNAME, this, ch, &t, start, end, &limit, &ret](auto bits) {
if (!bits) {
+ DEBUGT("no bits, return none", t);
return list_iertr::make_ready_future<
OnodeManager::list_onodes_bare_ret
>(std::make_tuple(
std::vector<ghobject_t>(),
ghobject_t::get_max()));
} else {
- LOG_PREFIX(SeaStore::list_objects);
- DEBUGT("start {}, end {}, limit {}, bits {}",
- t, start, end, limit, *bits);
+ DEBUGT("bits={} ...", t, *bits);
auto filter = SeaStore::get_objs_range(ch, *bits);
using list_iertr = OnodeManager::list_onodes_iertr;
using repeat_ret = list_iertr::future<seastar::stop_iteration>;
return trans_intr::repeat(
- [this, &t, &ret, &limit, end,
+ [this, FNAME, &t, &ret, &limit, end,
filter, ranges = get_ranges(ch, start, end, filter)
]() mutable -> repeat_ret {
if (limit == 0 || ranges.empty()) {
@@ -886,11 +950,10 @@ SeaStore::Shard::list_objects(CollectionRef ch,
auto pstart = ite->first;
auto pend = ite->second;
ranges.pop_front();
- LOG_PREFIX(SeaStore::list_objects);
- DEBUGT("pstart {}, pend {}, limit {}", t, pstart, pend, limit);
+ DEBUGT("pstart {}, pend {}, limit {} ...", t, pstart, pend, limit);
return onode_manager->list_onodes(
t, pstart, pend, limit
- ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end]
+ ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end, FNAME]
(auto &&_ret) mutable {
auto &next_objects = std::get<0>(_ret);
auto &ret_objects = std::get<0>(ret);
@@ -901,7 +964,6 @@ SeaStore::Shard::list_objects(CollectionRef ch,
std::get<1>(ret) = std::get<1>(_ret);
assert(limit >= next_objects.size());
limit -= next_objects.size();
- LOG_PREFIX(SeaStore::list_objects);
DEBUGT("got {} objects, left limit {}",
t, next_objects.size(), limit);
assert(limit == 0 ||
@@ -914,10 +976,13 @@ SeaStore::Shard::list_objects(CollectionRef ch,
seastar::stop_iteration
>(seastar::stop_iteration::no);
});
- }).si_then([&ret] {
- return list_iertr::make_ready_future<
- OnodeManager::list_onodes_bare_ret>(std::move(ret));
- });
+ }
+ ).si_then([&ret, FNAME] {
+ DEBUG("got {} objects, next={}",
+ std::get<0>(ret).size(), std::get<1>(ret));
+ return list_iertr::make_ready_future<
+ OnodeManager::list_onodes_bare_ret>(std::move(ret));
+ });
}
});
}).safe_then([&ret](auto&& _ret) {
@@ -927,7 +992,7 @@ SeaStore::Shard::list_objects(CollectionRef ch,
return std::move(ret);
}).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::list_objects"
+ "Invalid error in SeaStoreS::list_objects"
}
);
}).finally([this] {
@@ -939,23 +1004,26 @@ SeaStore::Shard::list_objects(CollectionRef ch,
seastar::future<CollectionRef>
SeaStore::Shard::create_new_collection(const coll_t& cid)
{
- LOG_PREFIX(SeaStore::create_new_collection);
- DEBUG("{}", cid);
+ LOG_PREFIX(SeaStoreS::create_new_collection);
+ DEBUG("cid={}", cid);
return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
}
seastar::future<CollectionRef>
SeaStore::Shard::open_collection(const coll_t& cid)
{
- LOG_PREFIX(SeaStore::open_collection);
- DEBUG("{}", cid);
- return list_collections().then([cid, this] (auto colls_cores) {
+ LOG_PREFIX(SeaStoreS::open_collection);
+ DEBUG("cid={} ...", cid);
+ return list_collections(
+ ).then([cid, this, FNAME] (auto colls_cores) {
if (auto found = std::find(colls_cores.begin(),
colls_cores.end(),
std::make_pair(cid, seastar::this_shard_id()));
found != colls_cores.end()) {
+ DEBUG("cid={} exists", cid);
return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
} else {
+ DEBUG("cid={} not exists", cid);
return seastar::make_ready_future<CollectionRef>();
}
});
@@ -965,6 +1033,8 @@ seastar::future<>
SeaStore::Shard::set_collection_opts(CollectionRef c,
const pool_opts_t& opts)
{
+ LOG_PREFIX(SeaStoreS::set_collection_opts);
+ DEBUG("cid={}, opts={} not implemented", c->get_cid(), opts);
//TODO
return seastar::now();
}
@@ -986,6 +1056,8 @@ SeaStore::Shard::list_collections()
"list_collections",
[this, &ret](auto& t)
{
+ LOG_PREFIX(SeaStoreS::list_collections);
+ DEBUGT("...", t);
return transaction_manager->read_collection_root(t
).si_then([this, &t](auto coll_root) {
return collection_manager->list(coll_root, t);
@@ -1004,7 +1076,7 @@ SeaStore::Shard::list_collections()
}
).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::list_collections"
+ "Invalid error in SeaStoreS::list_collections"
}
).finally([this] {
assert(shard_stats.pending_read_num);
@@ -1012,6 +1084,42 @@ SeaStore::Shard::list_collections()
});
}
+SeaStore::base_iertr::future<ceph::bufferlist>
+SeaStore::Shard::_read(
+ Transaction& t,
+ Onode& onode,
+ uint64_t offset,
+ std::size_t len,
+ uint32_t op_flags)
+{
+ LOG_PREFIX(SeaStoreS::_read);
+ size_t size = onode.get_layout().size;
+ if (offset >= size) {
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none",
+ t, offset, len, size, op_flags);
+ return seastar::make_ready_future<ceph::bufferlist>();
+ }
+
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...",
+ t, offset, len, size, op_flags);
+ size_t corrected_len = (len == 0) ?
+ size - offset :
+ std::min(size - offset, len);
+
+ return ObjectDataHandler(max_object_size).read(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ t,
+ onode,
+ },
+ offset,
+ corrected_len
+ ).si_then([FNAME, &t](auto bl) {
+ DEBUGT("got bl length=0x{:x}", t, bl.length());
+ return bl;
+ });
+}
+
SeaStore::Shard::read_errorator::future<ceph::bufferlist>
SeaStore::Shard::read(
CollectionRef ch,
@@ -1020,9 +1128,6 @@ SeaStore::Shard::read(
size_t len,
uint32_t op_flags)
{
- LOG_PREFIX(SeaStore::read);
- DEBUG("oid {} offset {} len {}", oid, offset, len);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1030,29 +1135,11 @@ SeaStore::Shard::read(
ch,
oid,
Transaction::src_t::READ,
- "read_obj",
+ "read",
op_type_t::READ,
- [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret {
- size_t size = onode.get_layout().size;
-
- if (offset >= size) {
- return seastar::make_ready_future<ceph::bufferlist>();
- }
-
- size_t corrected_len = (len == 0) ?
- size - offset :
- std::min(size - offset, len);
-
- return ObjectDataHandler(max_object_size).read(
- ObjectDataHandler::context_t{
- *transaction_manager,
- t,
- onode,
- },
- offset,
- corrected_len);
- }
- ).finally([this] {
+ [this, offset, len, op_flags](auto &t, auto &onode) {
+ return _read(t, onode, offset, len, op_flags);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
@@ -1063,9 +1150,7 @@ SeaStore::Shard::exists(
CollectionRef c,
const ghobject_t& oid)
{
- LOG_PREFIX(SeaStore::exists);
- DEBUG("oid {}", oid);
-
+ LOG_PREFIX(SeaStoreS::exists);
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1073,12 +1158,14 @@ SeaStore::Shard::exists(
c,
oid,
Transaction::src_t::READ,
- "oid_exists",
+ "exists",
op_type_t::READ,
- [](auto&, auto&) {
+ [FNAME](auto& t, auto&) {
+ DEBUGT("exists", t);
return seastar::make_ready_future<bool>(true);
}).handle_error(
- crimson::ct_error::enoent::handle([] {
+ crimson::ct_error::enoent::handle([FNAME] {
+ DEBUG("not exists");
return seastar::make_ready_future<bool>(false);
}),
crimson::ct_error::assert_all{"unexpected error"}
@@ -1095,66 +1182,78 @@ SeaStore::Shard::readv(
interval_set<uint64_t>& m,
uint32_t op_flags)
{
+ LOG_PREFIX(SeaStoreS::readv);
+ DEBUG("cid={} oid={} op_flags=0x{:x} {} intervals",
+ ch->get_cid(), _oid, op_flags, m.num_intervals());
+
return seastar::do_with(
_oid,
ceph::bufferlist{},
- [=, this, &m](auto &oid, auto &ret) {
+ [ch, op_flags, this, FNAME, &m](auto &oid, auto &ret) {
return crimson::do_for_each(
m,
- [=, this, &oid, &ret](auto &p) {
+ [ch, op_flags, this, &oid, &ret](auto &p) {
return read(
ch, oid, p.first, p.second, op_flags
).safe_then([&ret](auto bl) {
ret.claim_append(bl);
});
- }).safe_then([&ret] {
+ }).safe_then([&ret, FNAME] {
+ DEBUG("got bl length=0x{:x}", ret.length());
return read_errorator::make_ready_future<ceph::bufferlist>
(std::move(ret));
});
});
- return read_errorator::make_ready_future<ceph::bufferlist>();
}
using crimson::os::seastore::omap_manager::BtreeOMapManager;
+SeaStore::Shard::_omap_get_value_ret
+SeaStore::Shard::_get_attr(
+ Transaction& t,
+ Onode& onode,
+ std::string_view name) const
+{
+ LOG_PREFIX(SeaStoreS::_get_attr);
+ auto& layout = onode.get_layout();
+ if (name == OI_ATTR && layout.oi_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ if (name == SS_ATTR && layout.ss_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ DEBUGT("name={} ...", t, name);
+ return _omap_get_value(
+ t,
+ layout.xattr_root.get(
+ onode.get_metadata_hint(device->get_block_size())),
+ name);
+}
+
SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
std::string_view name) const
{
- auto c = static_cast<SeastoreCollection*>(ch.get());
- LOG_PREFIX(SeaStore::get_attr);
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
return repeat_with_onode<ceph::bufferlist>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
"get_attr",
op_type_t::GET_ATTR,
- [=, this](auto &t, auto& onode) -> _omap_get_value_ret {
- auto& layout = onode.get_layout();
- if (name == OI_ATTR && layout.oi_size) {
- ceph::bufferlist bl;
- bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
- return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
- }
- if (name == SS_ATTR && layout.ss_size) {
- ceph::bufferlist bl;
- bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
- return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
- }
- return _omap_get_value(
- t,
- layout.xattr_root.get(
- onode.get_metadata_hint(device->get_block_size())),
- name);
- }
- ).handle_error(
+ [this, name](auto &t, auto& onode) {
+ return _get_attr(t, onode, name);
+ }).handle_error(
crimson::ct_error::input_output_error::assert_failure{
"EIO when getting attrs"},
crimson::ct_error::pass_further_all{}
@@ -1164,48 +1263,53 @@ SeaStore::Shard::get_attr(
});
}
+SeaStore::base_iertr::future<SeaStore::Shard::attrs_t>
+SeaStore::Shard::_get_attrs(
+ Transaction& t,
+ Onode& onode)
+{
+ LOG_PREFIX(SeaStoreS::_get_attrs);
+ DEBUGT("...", t);
+ auto& layout = onode.get_layout();
+ return omap_list(onode, layout.xattr_root, t, std::nullopt,
+ OMapManager::omap_list_config_t()
+ .with_inclusive(false, false)
+ .without_max()
+ ).si_then([&layout, &t, FNAME](auto p) {
+ auto& attrs = std::get<1>(p);
+ DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}",
+ t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size);
+ ceph::bufferlist bl;
+ if (layout.oi_size) {
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ attrs.emplace(OI_ATTR, std::move(bl));
+ }
+ if (layout.ss_size) {
+ bl.clear();
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ attrs.emplace(SS_ATTR, std::move(bl));
+ }
+ return seastar::make_ready_future<attrs_t>(std::move(attrs));
+ });
+}
+
SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
SeaStore::Shard::get_attrs(
CollectionRef ch,
const ghobject_t& oid)
{
- LOG_PREFIX(SeaStore::get_attrs);
- auto c = static_cast<SeastoreCollection*>(ch.get());
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
return repeat_with_onode<attrs_t>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
- "get_addrs",
+ "get_attrs",
op_type_t::GET_ATTRS,
- [=, this](auto &t, auto& onode) {
- auto& layout = onode.get_layout();
- return omap_list(onode, layout.xattr_root, t, std::nullopt,
- OMapManager::omap_list_config_t()
- .with_inclusive(false, false)
- .without_max()
- ).si_then([&layout, &t, FNAME](auto p) {
- auto& attrs = std::get<1>(p);
- ceph::bufferlist bl;
- if (layout.oi_size) {
- bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
- attrs.emplace(OI_ATTR, std::move(bl));
- DEBUGT("set oi from onode layout", t);
- }
- if (layout.ss_size) {
- bl.clear();
- bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
- attrs.emplace(SS_ATTR, std::move(bl));
- DEBUGT("set ss from onode layout", t);
- }
- return seastar::make_ready_future<omap_values_t>(std::move(attrs));
- });
- }
- ).handle_error(
+ [this](auto &t, auto& onode) {
+ return _get_attrs(t, onode);
+ }).handle_error(
crimson::ct_error::input_output_error::assert_failure{
"EIO when getting attrs"},
crimson::ct_error::pass_further_all{}
@@ -1215,6 +1319,23 @@ SeaStore::Shard::get_attrs(
});
}
+seastar::future<struct stat> SeaStore::Shard::_stat(
+ Transaction& t,
+ Onode& onode,
+ const ghobject_t& oid)
+{
+ LOG_PREFIX(SeaStoreS::_stat);
+ struct stat st;
+ auto &olayout = onode.get_layout();
+ st.st_size = olayout.size;
+ st.st_blksize = device->get_block_size();
+ st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
+ st.st_nlink = 1;
+ DEBUGT("oid={}, size={}, blksize={}",
+ t, oid, st.st_size, st.st_blksize);
+ return seastar::make_ready_future<struct stat>(st);
+}
+
seastar::future<struct stat> SeaStore::Shard::stat(
CollectionRef c,
const ghobject_t& oid)
@@ -1222,26 +1343,17 @@ seastar::future<struct stat> SeaStore::Shard::stat(
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- LOG_PREFIX(SeaStore::stat);
return repeat_with_onode<struct stat>(
c,
oid,
Transaction::src_t::READ,
"stat",
op_type_t::STAT,
- [=, this](auto &t, auto &onode) {
- struct stat st;
- auto &olayout = onode.get_layout();
- st.st_size = olayout.size;
- st.st_blksize = device->get_block_size();
- st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
- st.st_nlink = 1;
- DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size);
- return seastar::make_ready_future<struct stat>(st);
- }
- ).handle_error(
+ [this, oid](auto &t, auto &onode) {
+ return _stat(t, onode, oid);
+ }).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::stat"
+ "Invalid error in SeaStoreS::stat"
}
).finally([this] {
assert(shard_stats.pending_read_num);
@@ -1257,6 +1369,22 @@ SeaStore::Shard::omap_get_header(
return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
}
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
+SeaStore::Shard::do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const omap_keys_t& keys)
+{
+ LOG_PREFIX(SeaStoreS::do_omap_get_values);
+ DEBUGT("{} keys ...", t, keys.size());
+ omap_root_t omap_root = onode.get_layout().omap_root.get(
+ onode.get_metadata_hint(device->get_block_size()));
+ return _omap_get_values(
+ t,
+ std::move(omap_root),
+ keys);
+}
+
SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
@@ -1266,22 +1394,15 @@ SeaStore::Shard::omap_get_values(
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- auto c = static_cast<SeastoreCollection*>(ch.get());
return repeat_with_onode<omap_values_t>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
[this, keys](auto &t, auto &onode) {
- omap_root_t omap_root = onode.get_layout().omap_root.get(
- onode.get_metadata_hint(device->get_block_size()));
- return _omap_get_values(
- t,
- std::move(omap_root),
- keys);
- }
- ).finally([this] {
+ return do_omap_get_values(t, onode, keys);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
@@ -1298,58 +1419,62 @@ SeaStore::Shard::_omap_get_value(
std::move(root),
std::string(key),
[&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret {
- if (root.is_null()) {
+ LOG_PREFIX(SeaStoreS::_omap_get_value);
+ if (root.is_null()) {
+ DEBUGT("key={} is absent because of null root", t, key);
+ return crimson::ct_error::enodata::make();
+ }
+ return manager.omap_get_value(root, t, key
+ ).si_then([&key, &t, FNAME](auto opt) -> _omap_get_value_ret {
+ if (!opt) {
+ DEBUGT("key={} is absent", t, key);
return crimson::ct_error::enodata::make();
}
- return manager.omap_get_value(root, t, key
- ).si_then([](auto opt) -> _omap_get_value_ret {
- if (!opt) {
- return crimson::ct_error::enodata::make();
- }
- return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt));
- });
- }
- );
+ DEBUGT("key={}, value length=0x{:x}", t, key, opt->length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt));
+ });
+ });
}
-SeaStore::Shard::_omap_get_values_ret
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::_omap_get_values(
Transaction &t,
omap_root_t &&omap_root,
const omap_keys_t &keys) const
{
+ LOG_PREFIX(SeaStoreS::_omap_get_values);
if (omap_root.is_null()) {
+ DEBUGT("{} keys are absent because of null root", t, keys.size());
return seastar::make_ready_future<omap_values_t>();
}
return seastar::do_with(
BtreeOMapManager(*transaction_manager),
std::move(omap_root),
omap_values_t(),
- [&](auto &manager, auto &root, auto &ret) {
- return trans_intr::do_for_each(
- keys.begin(),
- keys.end(),
- [&](auto &key) {
- return manager.omap_get_value(
- root,
- t,
- key
- ).si_then([&ret, &key](auto &&p) {
- if (p) {
- bufferlist bl;
- bl.append(*p);
- ret.emplace(
- std::move(key),
- std::move(bl));
- }
- return seastar::now();
- });
+ [&t, &keys, FNAME](auto &manager, auto &root, auto &ret) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&t, &manager, &root, &ret](auto &key) {
+ return manager.omap_get_value(
+ root,
+ t,
+ key
+ ).si_then([&ret, &key](auto &&p) {
+ if (p) {
+ bufferlist bl;
+ bl.append(*p);
+ ret.emplace(
+ std::move(key),
+ std::move(bl));
}
- ).si_then([&ret] {
- return std::move(ret);
+ return seastar::now();
});
- }
- );
+ }).si_then([&t, &ret, &keys, FNAME] {
+ DEBUGT("{} keys got {} values", t, keys.size(), ret.size());
+ return std::move(ret);
+ });
+ });
}
SeaStore::Shard::omap_list_ret
@@ -1377,51 +1502,74 @@ SeaStore::Shard::omap_list(
});
}
-SeaStore::Shard::omap_get_values_ret_t
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_paged_t>
+SeaStore::Shard::do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const std::optional<std::string>& start)
+{
+ LOG_PREFIX(SeaStoreS::do_omap_get_values);
+ DEBUGT("start={} ...", t, start.has_value() ? *start : "");
+ return omap_list(
+ onode,
+ onode.get_layout().omap_root,
+ t,
+ start,
+ OMapManager::omap_list_config_t()
+ .with_inclusive(false, false)
+ .without_max()
+ ).si_then([FNAME, &t](omap_values_paged_t ret) {
+ DEBUGT("got {} values, complete={}",
+ t, std::get<1>(ret).size(), std::get<0>(ret));
+ return ret;
+ });
+}
+
+SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<std::string> &start)
{
- auto c = static_cast<SeastoreCollection*>(ch.get());
- LOG_PREFIX(SeaStore::omap_get_values);
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- using ret_bare_t = std::tuple<bool, SeaStore::Shard::omap_values_t>;
- return repeat_with_onode<ret_bare_t>(
- c,
+ return repeat_with_onode<omap_values_paged_t>(
+ ch,
oid,
Transaction::src_t::READ,
- "omap_list",
- op_type_t::OMAP_LIST,
+ "omap_get_values2",
+ op_type_t::OMAP_GET_VALUES2,
[this, start](auto &t, auto &onode) {
- return omap_list(
- onode,
- onode.get_layout().omap_root,
- t,
- start,
- OMapManager::omap_list_config_t()
- .with_inclusive(false, false)
- .without_max());
- }
- ).finally([this] {
+ return do_omap_get_values(t, onode, start);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
}
-SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap(
+SeaStore::base_iertr::future<SeaStore::Shard::fiemap_ret_t>
+SeaStore::Shard::_fiemap(
Transaction &t,
Onode &onode,
uint64_t off,
uint64_t len) const
{
+ LOG_PREFIX(SeaStoreS::_fiemap);
+ size_t size = onode.get_layout().size;
+ if (off >= size) {
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none",
+ t, off, len, size);
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
+ }
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...",
+ t, off, len, size);
+ size_t adjust_len = (len == 0) ?
+ size - off:
+ std::min(size - off, len);
return seastar::do_with(
ObjectDataHandler(max_object_size),
- [=, this, &t, &onode] (auto &objhandler) {
+ [this, off, adjust_len, &t, &onode](auto &objhandler) {
return objhandler.fiemap(
ObjectDataHandler::context_t{
*transaction_manager,
@@ -1429,39 +1577,31 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap(
onode,
},
off,
- len);
+ adjust_len);
+ }).si_then([FNAME, &t](auto ret) {
+ DEBUGT("got {} intervals", t, ret.size());
+ return ret;
});
}
-SeaStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>>
+SeaStore::Shard::read_errorator::future<SeaStore::Shard::fiemap_ret_t>
SeaStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
uint64_t len)
{
- LOG_PREFIX(SeaStore::fiemap);
- DEBUG("oid: {}, off: {}, len: {} ", oid, off, len);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- return repeat_with_onode<std::map<uint64_t, uint64_t>>(
+ return repeat_with_onode<fiemap_ret_t>(
ch,
oid,
Transaction::src_t::READ,
- "fiemap_read",
+ "fiemap",
op_type_t::READ,
- [=, this](auto &t, auto &onode) -> _fiemap_ret {
- size_t size = onode.get_layout().size;
- if (off >= size) {
- INFOT("fiemap offset is over onode size!", t);
- return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
- }
- size_t adjust_len = (len == 0) ?
- size - off:
- std::min(size - off, len);
- return _fiemap(t, onode, off, adjust_len);
+ [this, off, len](auto &t, auto &onode) {
+ return _fiemap(t, onode, off, len);
}).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
@@ -1469,7 +1609,7 @@ SeaStore::Shard::fiemap(
}
void SeaStore::Shard::on_error(ceph::os::Transaction &t) {
- LOG_PREFIX(SeaStore::on_error);
+ LOG_PREFIX(SeaStoreS::on_error);
ERROR(" transaction dump:\n");
JSONFormatter f(true);
f.open_object_section("transaction");
@@ -1490,17 +1630,22 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
++(shard_stats.starting_io_num);
// repeat_with_internal_context ensures ordering via collection lock
+ auto num_bytes = _t.get_num_bytes();
return repeat_with_internal_context(
_ch,
std::move(_t),
Transaction::src_t::MUTATE,
"do_transaction",
- op_type_t::TRANSACTION,
- [this](auto &ctx) {
- return with_trans_intr(*ctx.transaction, [&, this](auto &t) {
- LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks);
- SUBDEBUGT(seastore_t, "start with {} objects",
- t, ctx.iter.objects.size());
+ op_type_t::DO_TRANSACTION,
+ [this, num_bytes](auto &ctx) {
+ LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks);
+ return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) {
+ DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...",
+ t, ctx.ch->get_cid(),
+ ctx.ext_transaction.get_num_ops(),
+ num_bytes,
+ ctx.iter.colls.size(),
+ ctx.iter.objects.size());
#ifndef NDEBUG
TRACET(" transaction dump:\n", t);
JSONFormatter f(true);
@@ -1534,6 +1679,8 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
}).si_then([this, &ctx] {
return transaction_manager->submit_transaction(*ctx.transaction);
});
+ }).safe_then([FNAME, &ctx] {
+ DEBUGT("done", *ctx.transaction);
});
}
).finally([this] {
@@ -1573,27 +1720,31 @@ SeaStore::Shard::_do_transaction_step(
std::vector<OnodeRef> &d_onodes,
ceph::os::Transaction::iterator &i)
{
- LOG_PREFIX(SeaStore::Shard::_do_transaction_step);
+ LOG_PREFIX(SeaStoreS::_do_transaction_step);
auto op = i.decode_op();
- SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op);
using ceph::os::Transaction;
- if (op->op == Transaction::OP_NOP)
+ if (op->op == Transaction::OP_NOP) {
+ DEBUGT("op NOP", *ctx.transaction);
return tm_iertr::now();
+ }
switch (op->op) {
case Transaction::OP_RMCOLL:
{
coll_t cid = i.get_cid(op->cid);
+ DEBUGT("op RMCOLL, cid={} ...", *ctx.transaction, cid);
return _remove_collection(ctx, cid);
}
case Transaction::OP_MKCOLL:
{
coll_t cid = i.get_cid(op->cid);
+ DEBUGT("op MKCOLL, cid={} ...", *ctx.transaction, cid);
return _create_collection(ctx, cid, op->split_bits);
}
case Transaction::OP_COLL_HINT:
{
+ DEBUGT("op COLL_HINT", *ctx.transaction);
ceph::bufferlist hint;
i.decode_bl(hint);
return tm_iertr::now();
@@ -1611,14 +1762,18 @@ SeaStore::Shard::_do_transaction_step(
create = true;
}
if (!onodes[op->oid]) {
+ const ghobject_t& oid = i.get_oid(op->oid);
if (!create) {
- fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op {}, get oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, oid);
+ fut = onode_manager->get_onode(*ctx.transaction, oid);
} else {
- fut = onode_manager->get_or_create_onode(
- *ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op {}, get_or_create oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, oid);
+ fut = onode_manager->get_or_create_onode(*ctx.transaction, oid);
}
}
- return fut.si_then([&, op](auto get_onode) {
+ return fut.si_then([&, op, this, FNAME](auto get_onode) {
OnodeRef &o = onodes[op->oid];
if (!o) {
assert(get_onode);
@@ -1628,11 +1783,13 @@ SeaStore::Shard::_do_transaction_step(
if ((op->op == Transaction::OP_CLONE
|| op->op == Transaction::OP_COLL_MOVE_RENAME)
&& !d_onodes[op->dest_oid]) {
+ const ghobject_t& dest_oid = i.get_oid(op->dest_oid);
+ DEBUGT("op {}, get_or_create dest oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, dest_oid);
//TODO: use when_all_succeed after making onode tree
// support parallel extents loading
- return onode_manager->get_or_create_onode(
- *ctx.transaction, i.get_oid(op->dest_oid)
- ).si_then([&, op](auto dest_onode) {
+ return onode_manager->get_or_create_onode(*ctx.transaction, dest_oid
+ ).si_then([&onodes, &d_onodes, op](auto dest_onode) {
assert(dest_onode);
auto &d_o = onodes[op->dest_oid];
assert(!d_o);
@@ -1644,13 +1801,13 @@ SeaStore::Shard::_do_transaction_step(
} else {
return OnodeManager::get_or_create_onode_iertr::now();
}
- }).si_then([&, op, this]() -> tm_ret {
- LOG_PREFIX(SeaStore::_do_transaction_step);
+ }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret {
+ const ghobject_t& oid = i.get_oid(op->oid);
try {
switch (op->op) {
case Transaction::OP_REMOVE:
{
- TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op REMOVE, oid={} ...", *ctx.transaction, oid);
return _remove(ctx, onodes[op->oid]
).si_then([&onodes, &d_onodes, op] {
onodes[op->oid].reset();
@@ -1660,6 +1817,7 @@ SeaStore::Shard::_do_transaction_step(
case Transaction::OP_CREATE:
case Transaction::OP_TOUCH:
{
+ DEBUGT("op CREATE/TOUCH, oid={} ...", *ctx.transaction, oid);
return _touch(ctx, onodes[op->oid]);
}
case Transaction::OP_WRITE:
@@ -1669,6 +1827,8 @@ SeaStore::Shard::_do_transaction_step(
uint32_t fadvise_flags = i.get_fadvise_flags();
ceph::bufferlist bl;
i.decode_bl(bl);
+ DEBUGT("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...",
+ *ctx.transaction, oid, off, len, fadvise_flags);
return _write(
ctx, onodes[op->oid], off, len, std::move(bl),
fadvise_flags);
@@ -1676,6 +1836,7 @@ SeaStore::Shard::_do_transaction_step(
case Transaction::OP_TRUNCATE:
{
uint64_t off = op->off;
+ DEBUGT("op TRUNCATE, oid={}, 0x{:x} ...", *ctx.transaction, oid, off);
return _truncate(ctx, onodes[op->oid], off);
}
case Transaction::OP_SETATTR:
@@ -1684,80 +1845,96 @@ SeaStore::Shard::_do_transaction_step(
std::map<std::string, bufferlist> to_set;
ceph::bufferlist& bl = to_set[name];
i.decode_bl(bl);
+ DEBUGT("op SETATTR, oid={}, attr name={}, value length=0x{:x} ...",
+ *ctx.transaction, oid, name, bl.length());
return _setattrs(ctx, onodes[op->oid], std::move(to_set));
}
case Transaction::OP_SETATTRS:
{
std::map<std::string, bufferlist> to_set;
i.decode_attrset(to_set);
+ DEBUGT("op SETATTRS, oid={}, attrs size={} ...",
+ *ctx.transaction, oid, to_set.size());
return _setattrs(ctx, onodes[op->oid], std::move(to_set));
}
case Transaction::OP_RMATTR:
{
std::string name = i.decode_string();
+ DEBUGT("op RMATTR, oid={}, attr name={} ...",
+ *ctx.transaction, oid, name);
return _rmattr(ctx, onodes[op->oid], name);
}
case Transaction::OP_RMATTRS:
{
+ DEBUGT("op RMATTRS, oid={} ...", *ctx.transaction, oid);
return _rmattrs(ctx, onodes[op->oid]);
}
case Transaction::OP_OMAP_SETKEYS:
{
std::map<std::string, ceph::bufferlist> aset;
i.decode_attrset(aset);
+ DEBUGT("op OMAP_SETKEYS, oid={}, omap size={} ...",
+ *ctx.transaction, oid, aset.size());
return _omap_set_values(ctx, onodes[op->oid], std::move(aset));
}
case Transaction::OP_OMAP_SETHEADER:
{
ceph::bufferlist bl;
i.decode_bl(bl);
+ DEBUGT("op OMAP_SETHEADER, oid={}, length=0x{:x} ...",
+ *ctx.transaction, oid, bl.length());
return _omap_set_header(ctx, onodes[op->oid], std::move(bl));
}
case Transaction::OP_OMAP_RMKEYS:
{
omap_keys_t keys;
i.decode_keyset(keys);
+ DEBUGT("op OMAP_RMKEYS, oid={}, omap size={} ...",
+ *ctx.transaction, oid, keys.size());
return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys));
}
case Transaction::OP_OMAP_RMKEYRANGE:
{
- string first, last;
+ std::string first, last;
first = i.decode_string();
last = i.decode_string();
+ DEBUGT("op OMAP_RMKEYRANGE, oid={}, first={}, last={} ...",
+ *ctx.transaction, oid, first, last);
return _omap_rmkeyrange(
ctx, onodes[op->oid],
std::move(first), std::move(last));
}
case Transaction::OP_OMAP_CLEAR:
{
+ DEBUGT("op OMAP_CLEAR, oid={} ...", *ctx.transaction, oid);
return _omap_clear(ctx, onodes[op->oid]);
}
case Transaction::OP_ZERO:
{
objaddr_t off = op->off;
extent_len_t len = op->len;
+ DEBUGT("op ZERO, oid={}, 0x{:x}~0x{:x} ...",
+ *ctx.transaction, oid, off, len);
return _zero(ctx, onodes[op->oid], off, len);
}
case Transaction::OP_SETALLOCHINT:
{
+ DEBUGT("op SETALLOCHINT, oid={}, not implemented",
+ *ctx.transaction, oid);
// TODO
return tm_iertr::now();
}
case Transaction::OP_CLONE:
{
- TRACET("cloning {} to {}",
- *ctx.transaction,
- i.get_oid(op->oid),
- i.get_oid(op->dest_oid));
+ DEBUGT("op CLONE, oid={}, dest oid={} ...",
+ *ctx.transaction, oid, i.get_oid(op->dest_oid));
return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]);
}
case Transaction::OP_COLL_MOVE_RENAME:
{
+ DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...",
+ *ctx.transaction, oid, i.get_oid(op->dest_oid));
ceph_assert(op->cid == op->dest_cid);
- TRACET("renaming {} to {}",
- *ctx.transaction,
- i.get_oid(op->oid),
- i.get_oid(op->dest_oid));
return _rename(
ctx, onodes[op->oid], d_onodes[op->dest_oid]
).si_then([&onodes, &d_onodes, op] {
@@ -1793,7 +1970,7 @@ SeaStore::Shard::_do_transaction_step(
return seastar::now();
}),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::do_transaction_step"
+ "Invalid error in SeaStoreS::do_transaction_step"
}
);
}
@@ -1829,7 +2006,7 @@ SeaStore::Shard::_rename(
).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_rename"}
+ "Invalid error in SeaStoreS::_rename"}
);
}
@@ -1850,7 +2027,7 @@ SeaStore::Shard::_remove_omaps(
).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_remove"
+ "Invalid error in SeaStoreS::_remove_omaps"
}
);
});
@@ -1863,8 +2040,6 @@ SeaStore::Shard::_remove(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_remove);
- DEBUGT("onode={}", *ctx.transaction, *onode);
return _remove_omaps(
ctx,
onode,
@@ -1892,7 +2067,7 @@ SeaStore::Shard::_remove(
}).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all(
- "Invalid error in SeaStore::_remove"
+ "Invalid error in SeaStoreS::_remove"
)
);
}
@@ -1902,8 +2077,6 @@ SeaStore::Shard::_touch(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_touch);
- DEBUGT("onode={}", *ctx.transaction, *onode);
return tm_iertr::now();
}
@@ -1915,8 +2088,6 @@ SeaStore::Shard::_write(
ceph::bufferlist &&_bl,
uint32_t fadvise_flags)
{
- LOG_PREFIX(SeaStore::_write);
- DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
const auto &object_size = onode->get_layout().size;
if (offset + len > object_size) {
onode->update_onode_size(
@@ -2007,8 +2178,6 @@ SeaStore::Shard::_clone(
OnodeRef &onode,
OnodeRef &d_onode)
{
- LOG_PREFIX(SeaStore::_clone);
- DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode);
return seastar::do_with(
ObjectDataHandler(max_object_size),
[this, &ctx, &onode, &d_onode](auto &objHandler) {
@@ -2034,9 +2203,10 @@ SeaStore::Shard::_zero(
objaddr_t offset,
extent_len_t len)
{
- LOG_PREFIX(SeaStore::_zero);
- DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
if (offset + len >= max_object_size) {
+ LOG_PREFIX(SeaStoreS::_zero);
+ ERRORT("0x{:x}~0x{:x} >= 0x{:x}",
+ *ctx.transaction, offset, len, max_object_size);
return crimson::ct_error::input_output_error::make();
}
const auto &object_size = onode->get_layout().size;
@@ -2092,8 +2262,6 @@ SeaStore::Shard::_omap_set_values(
OnodeRef &onode,
std::map<std::string, ceph::bufferlist> &&aset)
{
- LOG_PREFIX(SeaStore::_omap_set_values);
- DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size());
return _omap_set_kvs(
onode,
onode->get_layout().omap_root,
@@ -2112,8 +2280,6 @@ SeaStore::Shard::_omap_set_header(
OnodeRef &onode,
ceph::bufferlist &&header)
{
- LOG_PREFIX(SeaStore::_omap_set_header);
- DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length());
std::map<std::string, bufferlist> to_set;
to_set[OMAP_HEADER_XATTR_KEY] = header;
return _setattrs(ctx, onode,std::move(to_set));
@@ -2124,10 +2290,8 @@ SeaStore::Shard::_omap_clear(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_omap_clear);
- DEBUGT("{} {} keys", *ctx.transaction, *onode);
- return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY))
- .si_then([this, &ctx, &onode]() -> tm_ret {
+ return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)
+ ).si_then([this, &ctx, &onode]() -> tm_ret {
if (auto omap_root = onode->get_layout().omap_root.get(
onode->get_metadata_hint(device->get_block_size()));
omap_root.is_null()) {
@@ -2142,8 +2306,8 @@ SeaStore::Shard::_omap_clear(
auto &omap_root) {
return omap_manager.omap_clear(
omap_root,
- *ctx.transaction)
- .si_then([&] {
+ *ctx.transaction
+ ).si_then([&] {
if (omap_root.must_update()) {
onode->update_omap_root(*ctx.transaction, omap_root);
}
@@ -2159,8 +2323,6 @@ SeaStore::Shard::_omap_rmkeys(
OnodeRef &onode,
omap_keys_t &&keys)
{
- LOG_PREFIX(SeaStore::_omap_rmkeys);
- DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size());
auto omap_root = onode->get_layout().omap_root.get(
onode->get_metadata_hint(device->get_block_size()));
if (omap_root.is_null()) {
@@ -2201,10 +2363,9 @@ SeaStore::Shard::_omap_rmkeyrange(
std::string first,
std::string last)
{
- LOG_PREFIX(SeaStore::_omap_rmkeyrange);
- DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last);
if (first > last) {
- ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last);
+ LOG_PREFIX(SeaStoreS::_omap_rmkeyrange);
+ ERRORT("range error, first:{} > last:{}", *ctx.transaction, first, last);
ceph_abort();
}
auto omap_root = onode->get_layout().omap_root.get(
@@ -2247,8 +2408,6 @@ SeaStore::Shard::_truncate(
OnodeRef &onode,
uint64_t size)
{
- LOG_PREFIX(SeaStore::_truncate);
- DEBUGT("onode={} size={}", *ctx.transaction, *onode, size);
onode->update_onode_size(*ctx.transaction, size);
return seastar::do_with(
ObjectDataHandler(max_object_size),
@@ -2269,9 +2428,7 @@ SeaStore::Shard::_setattrs(
OnodeRef &onode,
std::map<std::string, bufferlist>&& aset)
{
- LOG_PREFIX(SeaStore::_setattrs);
- DEBUGT("onode={}", *ctx.transaction, *onode);
-
+ LOG_PREFIX(SeaStoreS::_setattrs);
auto fut = tm_iertr::now();
auto& layout = onode->get_layout();
if (auto it = aset.find(OI_ATTR); it != aset.end()) {
@@ -2333,8 +2490,6 @@ SeaStore::Shard::_rmattr(
OnodeRef &onode,
std::string name)
{
- LOG_PREFIX(SeaStore::_rmattr);
- DEBUGT("onode={}", *ctx.transaction, *onode);
auto& layout = onode->get_layout();
if ((name == OI_ATTR) && (layout.oi_size > 0)) {
onode->clear_object_info(*ctx.transaction);
@@ -2356,7 +2511,7 @@ SeaStore::Shard::_xattr_rmattr(
OnodeRef &onode,
std::string &&name)
{
- LOG_PREFIX(SeaStore::_xattr_rmattr);
+ LOG_PREFIX(SeaStoreS::_xattr_rmattr);
DEBUGT("onode={}", *ctx.transaction, *onode);
auto xattr_root = onode->get_layout().xattr_root.get(
onode->get_metadata_hint(device->get_block_size()));
@@ -2384,8 +2539,6 @@ SeaStore::Shard::_rmattrs(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_rmattrs);
- DEBUGT("onode={}", *ctx.transaction, *onode);
onode->clear_object_info(*ctx.transaction);
onode->clear_snapset(*ctx.transaction);
return _xattr_clear(ctx, onode);
@@ -2396,7 +2549,7 @@ SeaStore::Shard::_xattr_clear(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_xattr_clear);
+ LOG_PREFIX(SeaStoreS::_xattr_clear);
DEBUGT("onode={}", *ctx.transaction, *onode);
auto xattr_root = onode->get_layout().xattr_root.get(
onode->get_metadata_hint(device->get_block_size()));
@@ -2446,7 +2599,7 @@ SeaStore::Shard::_create_collection(
}).handle_error_interruptible(
tm_iertr::pass_further{},
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_create_collection"
+ "Invalid error in SeaStoreS::_create_collection"
}
);
}
@@ -2478,7 +2631,7 @@ SeaStore::Shard::_remove_collection(
}).handle_error_interruptible(
tm_iertr::pass_further{},
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_create_collection"
+        "Invalid error in SeaStoreS::_remove_collection"
}
);
}
@@ -2489,40 +2642,53 @@ SeaStore::Shard::_get_collection(const coll_t& cid)
return new SeastoreCollection{cid};
}
+seastar::future<> SeaStore::write_meta(
+ const std::string& key,
+ const std::string& value) {
+ LOG_PREFIX(SeaStore::write_meta);
+ DEBUG("key={} value={} ...", key, value);
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return seastar::do_with(key, value,
+ [this, FNAME](auto& key, auto& value) {
+ return shard_stores.local().write_meta(key, value
+ ).then([this, &key, &value] {
+ return mdstore->write_meta(key, value);
+ }).safe_then([FNAME, &key, &value] {
+ DEBUG("key={} value={} done", key, value);
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ );
+ });
+}
+
seastar::future<> SeaStore::Shard::write_meta(
const std::string& key,
const std::string& value)
{
- LOG_PREFIX(SeaStore::write_meta);
- DEBUG("key: {}; value: {}", key, value);
-
++(shard_stats.io_num);
++(shard_stats.pending_io_num);
// For TM::submit_transaction()
++(shard_stats.processing_inlock_io_num);
- return seastar::do_with(
- key, value,
- [this, FNAME](auto& key, auto& value) {
- return repeat_eagain([this, FNAME, &key, &value] {
- ++(shard_stats.repeat_io_num);
-
- return transaction_manager->with_transaction_intr(
- Transaction::src_t::MUTATE,
- "write_meta",
- [this, FNAME, &key, &value](auto& t)
- {
- DEBUGT("Have transaction, key: {}; value: {}", t, key, value);
- return transaction_manager->update_root_meta(
- t, key, value
- ).si_then([this, &t] {
- return transaction_manager->submit_transaction(t);
- });
- });
- });
- }
- ).handle_error(
- crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ return repeat_eagain([this, &key, &value] {
+ ++(shard_stats.repeat_io_num);
+
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "write_meta",
+ [this, &key, &value](auto& t)
+ {
+ LOG_PREFIX(SeaStoreS::write_meta);
+ DEBUGT("key={} value={} ...", t, key, value);
+ return transaction_manager->update_root_meta(
+ t, key, value
+ ).si_then([this, &t] {
+ return transaction_manager->submit_transaction(t);
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStoreS::write_meta"}
).finally([this] {
assert(shard_stats.pending_io_num);
--(shard_stats.pending_io_num);
@@ -2535,13 +2701,17 @@ seastar::future<> SeaStore::Shard::write_meta(
seastar::future<std::tuple<int, std::string>>
SeaStore::read_meta(const std::string& key)
{
- ceph_assert(seastar::this_shard_id() == primary_core);
LOG_PREFIX(SeaStore::read_meta);
- DEBUG("key: {}", key);
- return mdstore->read_meta(key).safe_then([](auto v) {
+ DEBUG("key={} ...", key);
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return mdstore->read_meta(key
+ ).safe_then([key, FNAME](auto v) {
if (v) {
+ DEBUG("key={}, value={}", key, *v);
return std::make_tuple(0, std::move(*v));
} else {
+ ERROR("key={} failed", key);
return std::make_tuple(-1, std::string(""));
}
}).handle_error(
@@ -2598,7 +2768,7 @@ shard_stats_t SeaStore::Shard::get_io_stats(
ret.minus(last_shard_stats);
if (report_detail && seconds != 0) {
- LOG_PREFIX(SeaStore::get_io_stats);
+ LOG_PREFIX(SeaStoreS::get_io_stats);
auto calc_conflicts = [](uint64_t ios, uint64_t repeats) {
return (double)(repeats-ios)/ios;
};
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index fb495a422f6..185072744f2 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -35,14 +35,14 @@ using OnodeRef = boost::intrusive_ptr<Onode>;
class TransactionManager;
enum class op_type_t : uint8_t {
- TRANSACTION = 0,
+ DO_TRANSACTION = 0,
READ,
WRITE,
GET_ATTR,
GET_ATTRS,
STAT,
OMAP_GET_VALUES,
- OMAP_LIST,
+ OMAP_GET_VALUES2,
MAX
};
@@ -71,20 +71,19 @@ struct col_obj_ranges_t {
class SeaStore final : public FuturizedStore {
public:
+ using base_ertr = TransactionManager::base_ertr;
+ using base_iertr = TransactionManager::base_iertr;
+
class MDStore {
public:
- using base_iertr = crimson::errorator<
- crimson::ct_error::input_output_error
- >;
-
- using write_meta_ertr = base_iertr;
+ using write_meta_ertr = base_ertr;
using write_meta_ret = write_meta_ertr::future<>;
virtual write_meta_ret write_meta(
const std::string &key,
const std::string &val
) = 0;
- using read_meta_ertr = base_iertr;
+ using read_meta_ertr = base_ertr;
using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>;
virtual read_meta_ret read_meta(const std::string &key) = 0;
@@ -136,10 +135,7 @@ public:
const omap_keys_t& keys) final;
/// Retrieves paged set of values > start (if present)
- using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>;
- using omap_get_values_ret_t = read_errorator::future<
- omap_get_values_ret_bare_t>;
- omap_get_values_ret_t omap_get_values(
+ read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
const std::optional<std::string> &start ///< [in] start, empty for begin
@@ -170,7 +166,7 @@ public:
* stages and locks as do_transaction. */
seastar::future<> flush(CollectionRef ch) final;
- read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ read_errorator::future<fiemap_ret_t> fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
@@ -190,7 +186,6 @@ public:
secondaries.emplace_back(&sec_dev);
}
- using coll_core_t = FuturizedStore::coll_core_t;
seastar::future<std::vector<coll_core_t>> list_collections();
seastar::future<> write_meta(const std::string& key,
@@ -305,18 +300,21 @@ public:
auto begin_time = std::chrono::steady_clock::now();
return seastar::do_with(
oid, Ret{}, std::forward<F>(f),
- [this, src, op_type, begin_time, tname
+ [this, ch, src, op_type, begin_time, tname
](auto &oid, auto &ret, auto &f)
{
- return repeat_eagain([&, this, src, tname] {
+ return repeat_eagain([&, this, ch, src, tname] {
assert(src == Transaction::src_t::READ);
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
src,
tname,
- [&, this](auto& t)
+ [&, this, ch, tname](auto& t)
{
+ LOG_PREFIX(SeaStoreS::repeat_with_onode);
+ SUBDEBUGT(seastore, "{} cid={} oid={} ...",
+ t, tname, ch->get_cid(), oid);
return onode_manager->get_onode(t, oid
).si_then([&](auto onode) {
return seastar::do_with(std::move(onode), [&](auto& onode) {
@@ -334,14 +332,16 @@ public:
});
}
- using _fiemap_ret = ObjectDataHandler::fiemap_ret;
- _fiemap_ret _fiemap(
- Transaction &t,
- Onode &onode,
- uint64_t off,
- uint64_t len) const;
+ using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
+ using omap_list_ret = OMapManager::omap_list_ret;
+ omap_list_ret omap_list(
+ Onode& onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ const std::optional<std::string>& start,
+ OMapManager::omap_list_config_t config) const;
- using _omap_get_value_iertr = OMapManager::base_iertr::extend<
+ using _omap_get_value_iertr = base_iertr::extend<
crimson::ct_error::enodata
>;
using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>;
@@ -350,25 +350,51 @@ public:
omap_root_t &&root,
std::string_view key) const;
- using _omap_get_values_iertr = OMapManager::base_iertr;
- using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>;
- _omap_get_values_ret _omap_get_values(
+ base_iertr::future<omap_values_t> _omap_get_values(
Transaction &t,
omap_root_t &&root,
const omap_keys_t &keys) const;
friend class SeaStoreOmapIterator;
- using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
- using omap_list_ret = OMapManager::omap_list_ret;
- omap_list_ret omap_list(
- Onode &onode,
- const omap_root_le_t& omap_root,
+ base_iertr::future<ceph::bufferlist> _read(
Transaction& t,
- const std::optional<std::string>& start,
- OMapManager::omap_list_config_t config) const;
+ Onode& onode,
+ uint64_t offset,
+ std::size_t len,
+ uint32_t op_flags);
+
+ _omap_get_value_ret _get_attr(
+ Transaction& t,
+ Onode& onode,
+ std::string_view name) const;
+
+ base_iertr::future<attrs_t> _get_attrs(
+ Transaction& t,
+ Onode& onode);
+
+ seastar::future<struct stat> _stat(
+ Transaction& t,
+ Onode& onode,
+ const ghobject_t& oid);
+
+ base_iertr::future<omap_values_t> do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const omap_keys_t& keys);
- using tm_iertr = TransactionManager::base_iertr;
+ base_iertr::future<omap_values_paged_t> do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const std::optional<std::string>& start);
+
+ base_iertr::future<fiemap_ret_t> _fiemap(
+ Transaction &t,
+ Onode &onode,
+ uint64_t off,
+ uint64_t len) const;
+
+ using tm_iertr = base_iertr;
using tm_ret = tm_iertr::future<>;
tm_ret _do_transaction_step(
internal_context_t &ctx,
@@ -535,17 +561,7 @@ public:
return shard_stores.local().get_fsid();
}
- seastar::future<> write_meta(
- const std::string& key,
- const std::string& value) final {
- ceph_assert(seastar::this_shard_id() == primary_core);
- return shard_stores.local().write_meta(
- key, value).then([this, key, value] {
- return mdstore->write_meta(key, value);
- }).handle_error(
- crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
- );
- }
+ seastar::future<> write_meta(const std::string& key, const std::string& value) final;
seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index e1430b30019..f379dd0117c 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -54,7 +54,9 @@ std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id)
} else if (_id == DEVICE_ID_ROOT) {
return out << "Dev(ROOT)";
} else {
- return out << "Dev(" << (unsigned)_id << ")";
+ return out << "Dev(0x"
+ << std::hex << (unsigned)_id << std::dec
+ << ")";
}
}
@@ -64,7 +66,7 @@ std::ostream &operator<<(std::ostream &out, const segment_id_t &segment)
return out << "Seg[NULL]";
} else {
return out << "Seg[" << device_id_printer_t{segment.device_id()}
- << "," << segment.device_segment_id()
+ << ",0x" << std::hex << segment.device_segment_id() << std::dec
<< "]";
}
}
@@ -93,12 +95,12 @@ std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq)
}
std::ostream &operator<<(std::ostream &out, const laddr_t &laddr) {
- return out << 'L' << std::hex << laddr.value << std::dec;
+ return out << "L0x" << std::hex << laddr.value << std::dec;
}
std::ostream &operator<<(std::ostream &out, const laddr_offset_t &laddr_offset) {
return out << laddr_offset.get_aligned_laddr()
- << "+" << std::hex << laddr_offset.get_offset() << std::dec;
+ << "+0x" << std::hex << laddr_offset.get_offset() << std::dec;
}
std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr)
@@ -123,18 +125,18 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs)
} else if (has_device_off(id)) {
auto &s = rhs.as_res_paddr();
out << device_id_printer_t{id}
- << ","
- << s.get_device_off();
+ << ",0x"
+ << std::hex << s.get_device_off() << std::dec;
} else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) {
auto &s = rhs.as_seg_paddr();
out << s.get_segment_id()
- << ","
- << s.get_segment_off();
+ << ",0x"
+ << std::hex << s.get_segment_off() << std::dec;
} else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
auto &s = rhs.as_blk_paddr();
out << device_id_printer_t{s.get_device_id()}
- << ","
- << s.get_device_off();
+ << ",0x"
+ << std::hex << s.get_device_off() << std::dec;
} else {
out << "INVALID!";
}
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 52515937a9e..5d8ad00ba22 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -80,6 +80,11 @@ struct rewrite_stats_t {
}
};
+struct rbm_pending_ool_t {
+ bool is_conflicted = false;
+ std::list<CachedExtentRef> pending_extents;
+};
+
/**
* Transaction
*
@@ -554,6 +559,18 @@ public:
return static_cast<T&>(*view);
}
+ void set_pending_ool(seastar::lw_shared_ptr<rbm_pending_ool_t> ptr) {
+ pending_ool = ptr;
+ }
+
+ seastar::lw_shared_ptr<rbm_pending_ool_t> get_pending_ool() {
+ return pending_ool;
+ }
+
+ const auto& get_pre_alloc_list() {
+ return pre_alloc_list;
+ }
+
private:
friend class Cache;
friend Ref make_test_transaction();
@@ -650,6 +667,8 @@ private:
const src_t src;
transaction_id_t trans_id = TRANS_ID_NULL;
+
+ seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool;
};
using TransactionRef = Transaction::Ref;
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index a76b7fbe0c9..f4e3b0858f2 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -48,7 +48,7 @@ TransactionManager::TransactionManager(
TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
{
LOG_PREFIX(TransactionManager::mkfs);
- INFO("enter");
+ INFO("...");
return epm->mount(
).safe_then([this] {
return journal->open_for_mkfs();
@@ -94,14 +94,15 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
}).safe_then([this] {
return close();
}).safe_then([FNAME] {
- INFO("completed");
+ INFO("done");
});
}
-TransactionManager::mount_ertr::future<> TransactionManager::mount()
+TransactionManager::mount_ertr::future<>
+TransactionManager::mount()
{
LOG_PREFIX(TransactionManager::mount);
- INFO("enter");
+ INFO("...");
cache->init();
return epm->mount(
).safe_then([this] {
@@ -168,16 +169,17 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount()
return epm->open_for_write();
}).safe_then([FNAME, this] {
epm->start_background();
- INFO("completed");
+ INFO("done");
}).handle_error(
mount_ertr::pass_further{},
crimson::ct_error::assert_all{"unhandled error"}
);
}
-TransactionManager::close_ertr::future<> TransactionManager::close() {
+TransactionManager::close_ertr::future<>
+TransactionManager::close() {
LOG_PREFIX(TransactionManager::close);
- INFO("enter");
+ INFO("...");
return epm->stop_background(
).then([this] {
return cache->close();
@@ -187,7 +189,7 @@ TransactionManager::close_ertr::future<> TransactionManager::close() {
}).safe_then([this] {
return epm->close();
}).safe_then([FNAME] {
- INFO("completed");
+ INFO("done");
return seastar::now();
});
}
@@ -229,28 +231,26 @@ TransactionManager::ref_ret TransactionManager::remove(
LogicalCachedExtentRef &ref)
{
LOG_PREFIX(TransactionManager::remove);
- TRACET("{}", t, *ref);
+ DEBUGT("{} ...", t, *ref);
return lba_manager->decref_extent(t, ref->get_laddr()
).si_then([this, FNAME, &t, ref](auto result) {
- DEBUGT("extent refcount is decremented to {} -- {}",
- t, result.refcount, *ref);
if (result.refcount == 0) {
cache->retire_extent(t, ref);
}
+ DEBUGT("removed {}~0x{:x} refcount={} -- {}",
+ t, result.addr, result.length, result.refcount, *ref);
return result.refcount;
});
}
-TransactionManager::ref_ret TransactionManager::_dec_ref(
+TransactionManager::ref_ret TransactionManager::remove(
Transaction &t,
laddr_t offset)
{
- LOG_PREFIX(TransactionManager::_dec_ref);
- TRACET("{}", t, offset);
+ LOG_PREFIX(TransactionManager::remove);
+ DEBUGT("{} ...", t, offset);
return lba_manager->decref_extent(t, offset
).si_then([this, FNAME, offset, &t](auto result) -> ref_ret {
- DEBUGT("extent refcount is decremented to {} -- {}~{}, {}",
- t, result.refcount, offset, result.length, result.addr);
auto fut = ref_iertr::now();
if (result.refcount == 0) {
if (result.addr.is_paddr() &&
@@ -259,8 +259,9 @@ TransactionManager::ref_ret TransactionManager::_dec_ref(
t, result.addr.get_paddr(), result.length);
}
}
-
- return fut.si_then([result=std::move(result)] {
+ return fut.si_then([result=std::move(result), offset, &t, FNAME] {
+ DEBUGT("removed {}~0x{:x} refcount={} -- offset={}",
+ t, result.addr, result.length, result.refcount, offset);
return result.refcount;
});
});
@@ -271,19 +272,21 @@ TransactionManager::refs_ret TransactionManager::remove(
std::vector<laddr_t> offsets)
{
LOG_PREFIX(TransactionManager::remove);
- DEBUG("{} offsets", offsets.size());
+ DEBUGT("{} offsets ...", t, offsets.size());
return seastar::do_with(std::move(offsets), std::vector<unsigned>(),
- [this, &t] (auto &&offsets, auto &refcnt) {
- return trans_intr::do_for_each(offsets.begin(), offsets.end(),
- [this, &t, &refcnt] (auto &laddr) {
- return this->remove(t, laddr).si_then([&refcnt] (auto ref) {
- refcnt.push_back(ref);
- return ref_iertr::now();
- });
- }).si_then([&refcnt] {
- return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnt));
+ [this, &t, FNAME](auto &&offsets, auto &refcnts) {
+ return trans_intr::do_for_each(offsets.begin(), offsets.end(),
+ [this, &t, &refcnts](auto &laddr) {
+ return this->remove(t, laddr
+ ).si_then([&refcnts](auto ref) {
+ refcnts.push_back(ref);
+ return ref_iertr::now();
});
+ }).si_then([&refcnts, &t, FNAME] {
+ DEBUGT("removed {} offsets", t, refcnts.size());
+ return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnts));
});
+ });
}
TransactionManager::submit_transaction_iertr::future<>
@@ -340,6 +343,7 @@ TransactionManager::update_lba_mappings(
return;
}
if (extent->is_logical()) {
+ assert(is_logical_type(extent->get_type()));
// for rewritten extents, last_committed_crc should have been set
// because the crc of the original extent may be reused.
// also see rewrite_logical_extent()
@@ -359,6 +363,7 @@ TransactionManager::update_lba_mappings(
#endif
lextents.emplace_back(extent->template cast<LogicalCachedExtent>());
} else {
+ assert(is_physical_type(extent->get_type()));
pextents.emplace_back(extent);
}
};
@@ -515,7 +520,6 @@ TransactionManager::rewrite_logical_extent(
ERRORT("extent has been invalidated -- {}", t, *extent);
ceph_abort();
}
- TRACET("rewriting extent -- {}", t, *extent);
auto lextent = extent->cast<LogicalCachedExtent>();
cache->retire_extent(t, extent);
@@ -529,7 +533,7 @@ TransactionManager::rewrite_logical_extent(
lextent->get_rewrite_generation())->cast<LogicalCachedExtent>();
nlextent->rewrite(t, *lextent, 0);
- DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent);
#ifndef NDEBUG
if (get_checksum_needed(lextent->get_paddr())) {
@@ -566,16 +570,16 @@ TransactionManager::rewrite_logical_extent(
0,
lextent->get_length(),
extent_ref_count_t(0),
- [this, lextent, &t](auto &extents, auto &off, auto &left, auto &refcount) {
+ [this, FNAME, lextent, &t]
+ (auto &extents, auto &off, auto &left, auto &refcount) {
return trans_intr::do_for_each(
extents,
- [lextent, this, &t, &off, &left, &refcount](auto &nextent) {
- LOG_PREFIX(TransactionManager::rewrite_logical_extent);
+ [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) {
bool first_extent = (off == 0);
ceph_assert(left >= nextent->get_length());
auto nlextent = nextent->template cast<LogicalCachedExtent>();
nlextent->rewrite(t, *lextent, off);
- DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent);
/* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
* extents since we're going to do it again once we either do the ool write
@@ -629,10 +633,18 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
{
auto updated = cache->update_extent_from_transaction(t, extent);
if (!updated) {
- DEBUGT("extent is already retired, skipping -- {}", t, *extent);
+ DEBUGT("target={} {} already retired, skipping -- {}", t,
+ rewrite_gen_printer_t{target_generation},
+ sea_time_point_printer_t{modify_time},
+ *extent);
return rewrite_extent_iertr::now();
}
+
extent = updated;
+ DEBUGT("target={} {} -- {} ...", t,
+ rewrite_gen_printer_t{target_generation},
+ sea_time_point_printer_t{modify_time},
+ *extent);
ceph_assert(!extent->is_pending_io());
}
@@ -650,9 +662,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
// FIXME: is_dirty() is true for mutation pending extents
// which shouldn't do inplace rewrite because a pending transaction
// may fail.
- DEBUGT("delta overwriting extent -- {}", t, *extent);
t.add_inplace_rewrite_extent(extent);
extent->set_inplace_rewrite_generation();
+ DEBUGT("rewritten as inplace rewrite -- {}", t, *extent);
return rewrite_extent_iertr::now();
}
extent->set_target_rewrite_generation(INIT_GENERATION);
@@ -665,23 +677,25 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
t.get_rewrite_stats().account_n_dirty();
}
- if (is_backref_node(extent->get_type())) {
- DEBUGT("rewriting backref extent -- {}", t, *extent);
- return backref_manager->rewrite_extent(t, extent);
- }
-
if (is_root_type(extent->get_type())) {
- DEBUGT("rewriting root extent -- {}", t, *extent);
cache->duplicate_for_write(t, extent);
+ DEBUGT("rewritten root {}", t, *extent);
return rewrite_extent_iertr::now();
}
+ auto fut = rewrite_extent_iertr::now();
if (extent->is_logical()) {
- return rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>());
+ assert(is_logical_type(extent->get_type()));
+ fut = rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>());
+ } else if (is_backref_node(extent->get_type())) {
+ fut = backref_manager->rewrite_extent(t, extent);
} else {
- DEBUGT("rewriting physical extent -- {}", t, *extent);
- return lba_manager->rewrite_extent(t, extent);
+ assert(is_lba_node(extent->get_type()));
+ fut = lba_manager->rewrite_extent(t, extent);
}
+ return fut.si_then([FNAME, &t] {
+ DEBUGT("rewritten", t);
+ });
}
TransactionManager::get_extents_if_live_ret
@@ -693,7 +707,7 @@ TransactionManager::get_extents_if_live(
extent_len_t len)
{
LOG_PREFIX(TransactionManager::get_extents_if_live);
- TRACET("{} {}~{} {}", t, type, laddr, len, paddr);
+ DEBUGT("{} {}~0x{:x} {} ...", t, type, laddr, len, paddr);
// This only works with segments to check if alive,
// as parallel transactions may split the extent at the same time.
@@ -703,7 +717,7 @@ TransactionManager::get_extents_if_live(
).si_then([=, this, &t](auto extent)
-> get_extents_if_live_ret {
if (extent && extent->get_length() == len) {
- DEBUGT("{} {}~{} {} is live in cache -- {}",
+ DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}",
t, type, laddr, len, paddr, *extent);
std::list<CachedExtentRef> res;
res.emplace_back(std::move(extent));
@@ -757,7 +771,9 @@ TransactionManager::get_extents_if_live(
list.emplace_back(std::move(ret));
return seastar::now();
});
- }).si_then([&list] {
+ }).si_then([&list, &t, FNAME, type, laddr, len, paddr] {
+ DEBUGT("{} {}~0x{:x} {} is alive as {} extents",
+ t, type, laddr, len, paddr, list.size());
return get_extents_if_live_ret(
interruptible::ready_future_marker{},
std::move(list));
@@ -778,11 +794,11 @@ TransactionManager::get_extents_if_live(
).si_then([=, &t](auto ret) {
std::list<CachedExtentRef> res;
if (ret) {
- DEBUGT("{} {}~{} {} is live as physical extent -- {}",
+ DEBUGT("{} {}~0x{:x} {} is absent and alive as physical extent -- {}",
t, type, laddr, len, paddr, *ret);
res.emplace_back(std::move(ret));
} else {
- DEBUGT("{} {}~{} {} is not live as physical extent",
+ DEBUGT("{} {}~0x{:x} {} is not alive as physical extent",
t, type, laddr, len, paddr);
}
return get_extents_if_live_ret(
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index 828b8a25592..c7a94a9ef11 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -106,8 +106,12 @@ public:
Transaction &t,
laddr_t offset) {
LOG_PREFIX(TransactionManager::get_pin);
- SUBTRACET(seastore_tm, "{}", t, offset);
- return lba_manager->get_mapping(t, offset);
+ SUBDEBUGT(seastore_tm, "{} ...", t, offset);
+ return lba_manager->get_mapping(t, offset
+ ).si_then([FNAME, &t](LBAMappingRef pin) {
+ SUBDEBUGT(seastore_tm, "got {}", t, *pin);
+ return pin;
+ });
}
/**
@@ -122,9 +126,13 @@ public:
laddr_t offset,
extent_len_t length) {
LOG_PREFIX(TransactionManager::get_pins);
- SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, offset, length);
return lba_manager->get_mappings(
- t, offset, length);
+ t, offset, length
+ ).si_then([FNAME, &t](lba_pin_list_t pins) {
+ SUBDEBUGT(seastore_tm, "got {} pins", t, pins.size());
+ return pins;
+ });
}
/**
@@ -142,15 +150,15 @@ public:
laddr_t offset,
extent_len_t length) {
LOG_PREFIX(TransactionManager::read_extent);
- SUBTRACET(seastore_tm, "{}~{}", t, offset, length);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} {} ...",
+ t, offset, length, T::TYPE);
return get_pin(
t, offset
).si_then([this, FNAME, &t, offset, length] (auto pin)
-> read_extent_ret<T> {
if (length != pin->get_length() || !pin->get_val().is_real()) {
- SUBERRORT(seastore_tm,
- "offset {} len {} got wrong pin {}",
- t, offset, length, *pin);
+ SUBERRORT(seastore_tm, "{}~0x{:x} {} got wrong {}",
+ t, offset, length, T::TYPE, *pin);
ceph_assert(0 == "Should be impossible");
}
return this->read_pin<T>(t, std::move(pin));
@@ -167,15 +175,15 @@ public:
Transaction &t,
laddr_t offset) {
LOG_PREFIX(TransactionManager::read_extent);
- SUBTRACET(seastore_tm, "{}", t, offset);
+ SUBDEBUGT(seastore_tm, "{} {} ...",
+ t, offset, T::TYPE);
return get_pin(
t, offset
).si_then([this, FNAME, &t, offset] (auto pin)
-> read_extent_ret<T> {
if (!pin->get_val().is_real()) {
- SUBERRORT(seastore_tm,
- "offset {} got wrong pin {}",
- t, offset, *pin);
+ SUBERRORT(seastore_tm, "{} {} got wrong {}",
+ t, offset, T::TYPE, *pin);
ceph_assert(0 == "Should be impossible");
}
return this->read_pin<T>(t, std::move(pin));
@@ -187,6 +195,8 @@ public:
Transaction &t,
LBAMappingRef pin)
{
+ LOG_PREFIX(TransactionManager::read_pin);
+ SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin);
auto fut = base_iertr::make_ready_future<LBAMappingRef>();
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
@@ -212,52 +222,12 @@ public:
} else {
return this->pin_to_extent<T>(t, std::move(std::get<0>(ret)));
}
+ }).si_then([FNAME, &t](TCachedExtentRef<T> ext) {
+ SUBDEBUGT(seastore_tm, "got {}", t, *ext);
+ return ext;
});
}
- template <typename T>
- std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
- get_extent_if_linked(
- Transaction &t,
- LBAMappingRef pin)
- {
- ceph_assert(pin->is_parent_viewable());
- // checking the lba child must be atomic with creating
- // and linking the absent child
- auto v = pin->get_logical_extent(t);
- if (v.has_child()) {
- return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
-#ifndef NDEBUG
- auto lextent = extent->template cast<LogicalCachedExtent>();
- auto pin_laddr = pin->get_key();
- if (pin->is_indirect()) {
- pin_laddr = pin->get_intermediate_base();
- }
- assert(lextent->get_laddr() == pin_laddr);
-#endif
- return extent->template cast<T>();
- });
- } else {
- return pin;
- }
- }
-
- base_iertr::future<LogicalCachedExtentRef> read_pin_by_type(
- Transaction &t,
- LBAMappingRef pin,
- extent_types_t type)
- {
- ceph_assert(!pin->parent_modified());
- auto v = pin->get_logical_extent(t);
- // checking the lba child must be atomic with creating
- // and linking the absent child
- if (v.has_child()) {
- return std::move(v.get_child_fut());
- } else {
- return pin_to_extent_by_type(t, std::move(pin), type);
- }
- }
-
/// Obtain mutable copy of extent
LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
LOG_PREFIX(TransactionManager::get_mutable_extent);
@@ -265,24 +235,15 @@ public:
t,
ref)->cast<LogicalCachedExtent>();
if (!ret->has_laddr()) {
- SUBDEBUGT(seastore_tm,
- "duplicating extent for write -- {} -> {}",
- t,
- *ref,
- *ret);
+ SUBDEBUGT(seastore_tm, "duplicate from {}", t, *ref);
ret->set_laddr(ref->get_laddr());
} else {
- SUBTRACET(seastore_tm,
- "extent is already duplicated -- {}",
- t,
- *ref);
assert(ref->is_mutable());
assert(&*ref == &*ret);
}
return ret;
}
-
using ref_iertr = LBAManager::ref_iertr;
using ref_ret = ref_iertr::future<extent_ref_count_t>;
@@ -302,26 +263,15 @@ public:
* remove
*
* Remove the extent and the corresponding lba mapping,
- * users must make sure that lba mapping's refcount is 1
+ * users must make sure that lba mapping's refcount > 1
*/
ref_ret remove(
Transaction &t,
LogicalCachedExtentRef &ref);
- /**
- * remove
- *
- * 1. Remove the indirect mapping(s), and if refcount drops to 0,
- * also remove the direct mapping and retire the extent.
- *
- * 2. Remove the direct mapping(s) and retire the extent if
- * refcount drops to 0.
- */
ref_ret remove(
Transaction &t,
- laddr_t offset) {
- return _dec_ref(t, offset);
- }
+ laddr_t offset);
/// remove refcount for list of offset
using refs_ret = ref_iertr::future<std::vector<unsigned>>;
@@ -346,23 +296,23 @@ public:
extent_len_t len,
placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_non_data_extent);
- SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
- t, T::TYPE, len, placement_hint, laddr_hint);
+ SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...",
+ t, T::TYPE, laddr_hint, len, placement_hint);
auto ext = cache->alloc_new_non_data_extent<T>(
t,
len,
placement_hint,
INIT_GENERATION);
if (!ext) {
+ SUBERRORT(seastore_tm, "insufficient space!", t);
return crimson::ct_error::enospc::make();
}
return lba_manager->alloc_extent(
t,
laddr_hint,
*ext
- ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable {
- LOG_PREFIX(TransactionManager::alloc_non_data_extent);
- SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint);
+ ).si_then([ext=std::move(ext), &t, FNAME](auto &&) mutable {
+ SUBDEBUGT(seastore_tm, "allocated {}", t, *ext);
return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
});
@@ -385,14 +335,15 @@ public:
extent_len_t len,
placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_data_extents);
- SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
- t, T::TYPE, len, placement_hint, laddr_hint);
+ SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...",
+ t, T::TYPE, laddr_hint, len, placement_hint);
auto exts = cache->alloc_new_data_extents<T>(
t,
len,
placement_hint,
INIT_GENERATION);
if (exts.empty()) {
+ SUBERRORT(seastore_tm, "insufficient space!", t);
return crimson::ct_error::enospc::make();
}
return lba_manager->alloc_extents(
@@ -403,7 +354,7 @@ public:
EXTENT_DEFAULT_REF_COUNT
).si_then([exts=std::move(exts), &t, FNAME](auto &&) mutable {
for (auto &ext : exts) {
- SUBDEBUGT(seastore_tm, "new extent: {}", t, *ext);
+ SUBDEBUGT(seastore_tm, "allocated {}", t, *ext);
}
return alloc_extent_iertr::make_ready_future<
std::vector<TCachedExtentRef<T>>>(std::move(exts));
@@ -411,15 +362,21 @@ public:
}
template <typename T>
- read_extent_ret<T> get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) {
+ read_extent_ret<T> get_mutable_extent_by_laddr(
+ Transaction &t,
+ laddr_t laddr,
+ extent_len_t len) {
+ LOG_PREFIX(TransactionManager::get_mutable_extent_by_laddr);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, laddr, len);
return get_pin(t, laddr
).si_then([this, &t, len](auto pin) {
ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved());
ceph_assert(!pin->is_clone());
ceph_assert(pin->get_length() == len);
return this->read_pin<T>(t, std::move(pin));
- }).si_then([this, &t](auto extent) {
+ }).si_then([this, &t, FNAME](auto extent) {
auto ext = get_mutable_extent(t, extent)->template cast<T>();
+ SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext);
return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
});
@@ -476,10 +433,8 @@ public:
extent_len_t original_len = pin->get_length();
paddr_t original_paddr = pin->get_val();
LOG_PREFIX(TransactionManager::remap_pin);
- SUBDEBUGT(seastore_tm,
- "original laddr: {}, original paddr: {}, original length: {},"
- " remap to {} extents",
- t, original_laddr, original_paddr, original_len, remaps.size());
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} {} into {} remaps ... {}",
+ t, original_laddr, original_len, original_paddr, remaps.size(), *pin);
// The according extent might be stable or pending.
auto fut = base_iertr::now();
if (!pin->is_indirect()) {
@@ -536,14 +491,13 @@ public:
auto remap_len = remap.len;
auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr();
auto remap_paddr = original_paddr.add_offset(remap_offset);
+ SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...",
+ t, remap_laddr, remap_len, remap_paddr);
ceph_assert(remap_len < original_len);
ceph_assert(remap_offset + remap_len <= original_len);
ceph_assert(remap_len != 0);
ceph_assert(remap_offset % cache->get_block_size() == 0);
ceph_assert(remap_len % cache->get_block_size() == 0);
- SUBDEBUGT(seastore_tm,
- "remap laddr: {}, remap paddr: {}, remap length: {}", t,
- remap_laddr, remap_paddr, remap_len);
auto extent = cache->alloc_remapped_extent<T>(
t,
remap_laddr,
@@ -555,13 +509,15 @@ public:
}
});
}
- return fut.si_then([this, &t, &pin, &remaps, &extents] {
+ return fut.si_then([this, &t, &pin, &remaps, &extents, FNAME] {
return lba_manager->remap_mappings(
t,
std::move(pin),
std::vector<remap_entry>(remaps.begin(), remaps.end()),
std::move(extents)
- ).si_then([](auto ret) {
+ ).si_then([FNAME, &t](auto ret) {
+ SUBDEBUGT(seastore_tm, "remapped {} pins",
+ t, ret.remapped_mappings.size());
return Cache::retire_extent_iertr::make_ready_future<
std::vector<LBAMappingRef>>(std::move(ret.remapped_mappings));
});
@@ -581,11 +537,15 @@ public:
laddr_t hint,
extent_len_t len) {
LOG_PREFIX(TransactionManager::reserve_region);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint);
+ SUBDEBUGT(seastore_tm, "hint {}~0x{:x} ...", t, hint, len);
return lba_manager->reserve_region(
t,
hint,
- len);
+ len
+ ).si_then([FNAME, &t](auto pin) {
+ SUBDEBUGT(seastore_tm, "reserved {}", t, *pin);
+ return pin;
+ });
}
/*
@@ -612,15 +572,17 @@ public:
: mapping.get_key();
LOG_PREFIX(TransactionManager::clone_pin);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}",
- t, mapping.get_length(), hint, intermediate_key);
+ SUBDEBUGT(seastore_tm, "{} clone to hint {} ...", t, mapping, hint);
return lba_manager->clone_mapping(
t,
hint,
mapping.get_length(),
intermediate_key,
intermediate_base
- );
+ ).si_then([FNAME, &t](auto pin) {
+ SUBDEBUGT(seastore_tm, "cloned as {}", t, *pin);
+ return pin;
+ });
}
/* alloc_extents
@@ -635,10 +597,10 @@ public:
extent_len_t len,
int num) {
LOG_PREFIX(TransactionManager::alloc_extents);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}",
- t, len, hint, num);
+ SUBDEBUGT(seastore_tm, "hint {}~({} * 0x{:x}) ...",
+ t, hint, num, len);
return seastar::do_with(std::vector<TCachedExtentRef<T>>(),
- [this, &t, hint, len, num] (auto &extents) {
+ [this, &t, hint, len, num, FNAME](auto &extents) {
return trans_intr::do_for_each(
boost::make_counting_iterator(0),
boost::make_counting_iterator(num),
@@ -647,7 +609,8 @@ public:
[&extents](auto &&node) {
extents.push_back(node);
});
- }).si_then([&extents] {
+ }).si_then([&extents, &t, FNAME] {
+ SUBDEBUGT(seastore_tm, "allocated {} extents", t, extents.size());
return alloc_extents_iertr::make_ready_future
<std::vector<TCachedExtentRef<T>>>(std::move(extents));
});
@@ -753,7 +716,7 @@ public:
const std::string& key,
const std::string& value) {
LOG_PREFIX(TransactionManager::update_root_meta);
- SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value);
+ SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
return cache->get_root(
t
).si_then([this, &t, &key, &value](RootBlockRef root) {
@@ -808,7 +771,7 @@ public:
return cache->get_root(t).si_then([&t](auto croot) {
LOG_PREFIX(TransactionManager::read_collection_root);
auto ret = croot->get_root().collection_root.get();
- SUBTRACET(seastore_tm, "{}~{}",
+ SUBTRACET(seastore_tm, "{}~0x{:x}",
t, ret.get_location(), ret.get_size());
return ret;
});
@@ -821,7 +784,7 @@ public:
*/
void write_collection_root(Transaction &t, coll_root_t cmroot) {
LOG_PREFIX(TransactionManager::write_collection_root);
- SUBDEBUGT(seastore_tm, "{}~{}",
+ SUBDEBUGT(seastore_tm, "{}~0x{:x}",
t, cmroot.get_location(), cmroot.get_size());
auto croot = cache->get_root_fast(t);
croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>();
@@ -853,6 +816,49 @@ private:
shard_stats_t& shard_stats;
+ template <typename T>
+ std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
+ get_extent_if_linked(
+ Transaction &t,
+ LBAMappingRef pin)
+ {
+ ceph_assert(pin->is_parent_viewable());
+ // checking the lba child must be atomic with creating
+ // and linking the absent child
+ auto v = pin->get_logical_extent(t);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
+#ifndef NDEBUG
+ auto lextent = extent->template cast<LogicalCachedExtent>();
+ auto pin_laddr = pin->get_key();
+ if (pin->is_indirect()) {
+ pin_laddr = pin->get_intermediate_base();
+ }
+ assert(lextent->get_laddr() == pin_laddr);
+#endif
+ return extent->template cast<T>();
+ });
+ } else {
+ return pin;
+ }
+ }
+
+ base_iertr::future<LogicalCachedExtentRef> read_pin_by_type(
+ Transaction &t,
+ LBAMappingRef pin,
+ extent_types_t type)
+ {
+ ceph_assert(!pin->parent_modified());
+ auto v = pin->get_logical_extent(t);
+ // checking the lba child must be atomic with creating
+ // and linking the absent child
+ if (v.has_child()) {
+ return std::move(v.get_child_fut());
+ } else {
+ return pin_to_extent_by_type(t, std::move(pin), type);
+ }
+ }
+
rewrite_extent_ret rewrite_logical_extent(
Transaction& t,
LogicalCachedExtentRef extent);
@@ -862,11 +868,6 @@ private:
ExtentPlacementManager::dispatch_result_t dispatch_result,
std::optional<journal_seq_t> seq_to_trim = std::nullopt);
- /// Remove refcount for offset
- ref_ret _dec_ref(
- Transaction &t,
- laddr_t offset);
-
using update_lba_mappings_ret = LBAManager::update_mappings_ret;
update_lba_mappings_ret update_lba_mappings(
Transaction &t,
@@ -886,7 +887,7 @@ private:
Transaction &t,
LBAMappingRef pin) {
LOG_PREFIX(TransactionManager::pin_to_extent);
- SUBTRACET(seastore_tm, "getting extent {}", t, *pin);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin);
static_assert(is_logical_type(T::TYPE));
using ret = pin_to_extent_ret<T>;
auto &pref = *pin;
@@ -950,7 +951,8 @@ private:
extent_types_t type)
{
LOG_PREFIX(TransactionManager::pin_to_extent_by_type);
- SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {} type {} ...",
+ t, *pin, type);
assert(is_logical_type(type));
auto &pref = *pin;
return cache->get_absent_extent_by_type(
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index 683dc6ea649..522a93a1ddc 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -52,6 +52,12 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
return peering_state.is_backfilling();
}
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) override {
+ return peering_state.prepare_backfill_for_missing(soid, v, peers);
+ }
PeeringFacade(PeeringState& peering_state)
: peering_state(peering_state) {
}
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 70c43f49faf..018e58b68f8 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -225,7 +225,7 @@ bool BackfillState::Enqueuing::should_rescan_primary(
const BackfillInterval& backfill_info) const
{
return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
- !backfill_info.extends_to_end();
+ !backfill_info.extends_to_end() && backfill_info.empty();
}
void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
@@ -266,6 +266,7 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
logger().debug("{}: check={}", __func__, check);
const auto& primary_bi = backfill_state().backfill_info;
result_t result { {}, primary_bi.begin };
+ std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
for (const auto& bt : peering_state().get_backfill_targets()) {
const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
@@ -273,9 +274,13 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
// Find all check peers that have the wrong version
if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
check == primary_bi.begin && check == peer_bi.begin) {
- if(peer_bi.objects.begin()->second != obj_v &&
- backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
- backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ if (peer_bi.objects.begin()->second != obj_v) {
+ std::ignore = backfill_state().progress_tracker->enqueue_push(
+ primary_bi.begin);
+ auto &[v, peers] = backfills[primary_bi.begin];
+ assert(v == obj_v || v == eversion_t());
+ v = obj_v;
+ peers.push_back(bt);
} else {
// it's fine, keep it! OR already recovering
}
@@ -284,12 +289,22 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
// Only include peers that we've caught up to their backfill line
// otherwise, they only appear to be missing this object
// because their peer_bi.begin > backfill_info.begin.
- if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) &&
- backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
- backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt)) {
+ std::ignore = backfill_state().progress_tracker->enqueue_push(
+ primary_bi.begin);
+ auto &[v, peers] = backfills[primary_bi.begin];
+ assert(v == obj_v || v == eversion_t());
+ v = obj_v;
+ peers.push_back(bt);
}
}
}
+ for (auto &backfill : backfills) {
+ auto &soid = backfill.first;
+ auto &obj_v = backfill.second.first;
+ auto &peers = backfill.second.second;
+ backfill_listener().enqueue_push(soid, obj_v, peers);
+ }
return result;
}
@@ -327,16 +342,29 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
}
trim_backfill_infos();
- while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+    // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ logger().debug("{}: reached end for current local chunk", __func__);
+ post_event(RequestPrimaryScanning{});
+ return;
+ }
+
+ do {
if (!backfill_listener().budget_available()) {
post_event(RequestWaiting{});
return;
} else if (should_rescan_replicas(backfill_state().peer_backfill_info,
- primary_bi)) {
+ primary_bi)) {
// Count simultaneous scans as a single op and let those complete
post_event(RequestReplicasScanning{});
return;
}
+
+ if (all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ break;
+ }
// Get object within set of peers to operate on and the set of targets
// for which that object applies.
if (const hobject_t check = \
@@ -355,30 +383,23 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- primary_bi.pop_front();
+ if (!primary_bi.empty()) {
+ primary_bi.pop_front();
+ }
}
backfill_listener().maybe_flush();
- }
+ } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info));
- if (should_rescan_primary(backfill_state().peer_backfill_info,
- primary_bi)) {
- // need to grab one another chunk of the object namespace and restart
- // the queueing.
- logger().debug("{}: reached end for current local chunk",
- __func__);
- post_event(RequestPrimaryScanning{});
- } else {
- if (backfill_state().progress_tracker->tracked_objects_completed()
- && Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- backfill_state().last_backfill_started = hobject_t::get_max();
- backfill_listener().update_peers_last_backfill(hobject_t::get_max());
- }
- logger().debug("{}: reached end for both local and all peers "
- "but still has in-flight operations", __func__);
- post_event(RequestWaiting{});
+ if (backfill_state().progress_tracker->tracked_objects_completed()
+ && Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
}
+ logger().debug("{}: reached end for both local and all peers "
+ "but still has in-flight operations", __func__);
+ post_event(RequestWaiting{});
}
// -- PrimaryScanning
@@ -403,7 +424,7 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt)
{
logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -480,7 +501,7 @@ BackfillState::ReplicasScanning::react(ObjectPushed evt)
{
logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -496,16 +517,8 @@ BackfillState::Waiting::react(ObjectPushed evt)
{
logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
- if (!Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- return transit<Enqueuing>();
- } else {
- // we still have something to wait on
- logger().debug("Waiting::react() on ObjectPushed; still waiting");
- return discard_event();
- }
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
+  return transit<Enqueuing>();
}
// -- Done
@@ -559,7 +572,8 @@ void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj)
void BackfillState::ProgressTracker::complete_to(
const hobject_t& obj,
- const pg_stat_t& stats)
+ const pg_stat_t& stats,
+ bool may_push_to_max)
{
logger().debug("{}: obj={}",
__func__, obj);
@@ -570,6 +584,7 @@ void BackfillState::ProgressTracker::complete_to(
} else {
ceph_abort_msg("completing untracked object shall not happen");
}
+ auto new_last_backfill = peering_state().earliest_backfill();
for (auto it = std::begin(registry);
it != std::end(registry) &&
it->second.stage != op_stage_t::enqueued_push;
@@ -579,15 +594,18 @@ void BackfillState::ProgressTracker::complete_to(
peering_state().update_complete_backfill_object_stats(
soid,
*item.stats);
+ assert(soid > new_last_backfill);
+ new_last_backfill = soid;
}
- if (Enqueuing::all_enqueued(peering_state(),
+ if (may_push_to_max &&
+ Enqueuing::all_enqueued(peering_state(),
backfill_state().backfill_info,
backfill_state().peer_backfill_info) &&
tracked_objects_completed()) {
backfill_state().last_backfill_started = hobject_t::get_max();
backfill_listener().update_peers_last_backfill(hobject_t::get_max());
} else {
- backfill_listener().update_peers_last_backfill(obj);
+ backfill_listener().update_peers_last_backfill(new_last_backfill);
}
}
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
index 6c36db81813..ddc0cbf7355 100644
--- a/src/crimson/osd/backfill_state.h
+++ b/src/crimson/osd/backfill_state.h
@@ -336,7 +336,8 @@ struct BackfillState::BackfillListener {
virtual void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) = 0;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) = 0;
virtual void enqueue_drop(
const pg_shard_t& target,
@@ -375,6 +376,10 @@ struct BackfillState::PeeringFacade {
virtual void update_complete_backfill_object_stats(const hobject_t &hoid,
const pg_stat_t &stats) = 0;
virtual bool is_backfilling() const = 0;
+ virtual void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) = 0;
virtual ~PeeringFacade() {}
};
@@ -421,7 +426,7 @@ public:
bool enqueue_push(const hobject_t&);
void enqueue_drop(const hobject_t&);
- void complete_to(const hobject_t&, const pg_stat_t&);
+ void complete_to(const hobject_t&, const pg_stat_t&, bool may_push_to_max);
};
} // namespace crimson::osd
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
index 8d2d10fbd7c..34ad97ceb06 100644
--- a/src/crimson/osd/osd.cc
+++ b/src/crimson/osd/osd.cc
@@ -23,6 +23,7 @@
#include "messages/MOSDOp.h"
#include "messages/MOSDPeeringOp.h"
#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MOSDRepOpReply.h"
@@ -863,6 +864,8 @@ OSD::do_ms_dispatch(
[[fallthrough]];
case MSG_OSD_PG_LOG:
return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m));
+ case MSG_OSD_PG_REMOVE:
+ return handle_pg_remove(conn, boost::static_pointer_cast<MOSDPGRemove>(m));
case MSG_OSD_REPOP:
return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m));
case MSG_OSD_REPOPREPLY:
@@ -1555,6 +1558,27 @@ seastar::future<> OSD::handle_peering_op(
std::move(*evt)).second;
}
+seastar::future<> OSD::handle_pg_remove(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGRemove> m)
+{
+ LOG_PREFIX(OSD::handle_pg_remove);
+ const int from = m->get_source().num();
+ std::vector<seastar::future<>> futs;
+ for (auto &pg : m->pg_list) {
+ DEBUG("{} from {}", pg, from);
+ futs.emplace_back(
+ pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t{from, pg.shard},
+ pg,
+ m->get_epoch(),
+ m->get_epoch(),
+ PeeringState::DeleteStart()).second);
+ }
+ return seastar::when_all_succeed(std::move(futs));
+}
+
seastar::future<> OSD::check_osdmap_features()
{
LOG_PREFIX(OSD::check_osdmap_features);
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
index de39d808274..d7d54d5d2c3 100644
--- a/src/crimson/osd/osd.h
+++ b/src/crimson/osd/osd.h
@@ -208,6 +208,8 @@ private:
Ref<MOSDRepOpReply> m);
seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn,
Ref<MOSDPeeringOp> m);
+ seastar::future<> handle_pg_remove(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGRemove> m);
seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn,
Ref<MOSDFastDispatchOp> m);
seastar::future<> handle_scrub_command(crimson::net::ConnectionRef conn,
diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc
index a56d58d2066..68638d3a7b1 100644
--- a/src/crimson/osd/osd_operations/client_request_common.cc
+++ b/src/crimson/osd/osd_operations/client_request_common.cc
@@ -71,30 +71,4 @@ CommonClientRequest::do_recover_missing(
}
}
-bool CommonClientRequest::should_abort_request(
- const Operation& op,
- std::exception_ptr eptr)
-{
- if (*eptr.__cxa_exception_type() ==
- typeid(::crimson::common::actingset_changed)) {
- try {
- std::rethrow_exception(eptr);
- } catch(::crimson::common::actingset_changed& e) {
- if (e.is_primary()) {
- logger().debug("{} {} operation restart, acting set changed", __func__, op);
- return false;
- } else {
- logger().debug("{} {} operation abort, up primary changed", __func__, op);
- return true;
- }
- }
- } else {
- assert(*eptr.__cxa_exception_type() ==
- typeid(crimson::common::system_shutdown_exception));
- crimson::get_logger(ceph_subsys_osd).debug(
- "{} {} operation skipped, system shutdown", __func__, op);
- return true;
- }
-}
-
} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h
index 951bf653799..4c3cf42777b 100644
--- a/src/crimson/osd/osd_operations/client_request_common.h
+++ b/src/crimson/osd/osd_operations/client_request_common.h
@@ -16,9 +16,6 @@ struct CommonClientRequest {
Ref<PG> pg,
const hobject_t& soid,
const osd_reqid_t& reqid);
-
- static bool should_abort_request(
- const crimson::Operation& op, std::exception_ptr eptr);
};
} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
index 2968a6f4385..a19bb0826f0 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.cc
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -54,9 +54,9 @@ seastar::future<> InternalClientRequest::start()
{
track_event<StartEvent>();
return crimson::common::handle_system_shutdown([this] {
- return seastar::repeat([this] {
LOG_PREFIX(InternalClientRequest::start);
DEBUGI("{}: in repeat", *this);
+
return interruptor::with_interruption([this]() mutable {
return enter_stage<interruptor>(
client_pp().wait_for_active
@@ -121,17 +121,12 @@ seastar::future<> InternalClientRequest::start()
PG::load_obc_ertr::all_same_way([] {
return seastar::now();
})
- ).then_interruptible([] {
- return seastar::stop_iteration::yes;
- });
- }, [this](std::exception_ptr eptr) {
- if (should_abort_request(*this, std::move(eptr))) {
- return seastar::stop_iteration::yes;
- } else {
- return seastar::stop_iteration::no;
- }
- }, pg, start_epoch);
- }).then([this] {
+ );
+ }, [](std::exception_ptr eptr) {
+ return seastar::now();
+ }, pg, start_epoch
+
+ ).then([this] {
track_event<CompletionEvent>();
}).handle_exception_type([](std::system_error &error) {
logger().debug("error {}, message: {}", error.code(), error.what());
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index d210773ca30..97d48c1fa45 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -481,6 +481,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
auto [objs_to_rm, next] = fut.get();
if (objs_to_rm.empty()) {
logger().info("all objs removed, removing coll for {}", pgid);
+ t.remove(coll_ref->get_cid(), pgid.make_snapmapper_oid());
t.remove(coll_ref->get_cid(), pgmeta_oid);
t.remove_collection(coll_ref->get_cid());
(void) shard_services.get_store().do_transaction(
@@ -490,7 +491,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
return {next, false};
} else {
for (auto &obj : objs_to_rm) {
- if (obj == pgmeta_oid) {
+ if (obj == pgmeta_oid || obj.is_internal_pg_local()) {
continue;
}
logger().trace("pg {}, removing obj {}", pgid, obj);
@@ -517,7 +518,8 @@ Context *PG::on_clean()
{
recovery_handler->on_pg_clean();
scrubber.on_primary_active_clean();
- return nullptr;
+ recovery_finisher = new C_PG_FinishRecovery(*this);
+ return recovery_finisher;
}
seastar::future<> PG::clear_temp_objects()
@@ -1885,4 +1887,19 @@ void PG::cancel_pglog_based_recovery_op() {
pglog_based_recovery_op->cancel();
reset_pglog_based_recovery_op();
}
+
+void PG::C_PG_FinishRecovery::finish(int r) {
+ LOG_PREFIX(PG::C_PG_FinishRecovery::finish);
+ auto &peering_state = pg.get_peering_state();
+ if (peering_state.is_deleting() || !peering_state.is_clean()) {
+ DEBUGDPP("raced with delete or repair", pg);
+ return;
+ }
+ if (this == pg.recovery_finisher) {
+ peering_state.purge_strays();
+ pg.recovery_finisher = nullptr;
+ } else {
+    DEBUGDPP("stale recovery finisher", pg);
+ }
+}
}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index 93279a18c56..d8bbc56abcc 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -375,7 +375,7 @@ public:
}
void check_blocklisted_watchers() final;
void clear_primary_state() final {
- // Not needed yet
+ recovery_finisher = nullptr;
}
void queue_check_readable(epoch_t last_peering_reset,
@@ -394,7 +394,7 @@ public:
void on_replica_activate() final;
void on_activate_complete() final;
void on_new_interval() final {
- // Not needed yet
+ recovery_finisher = nullptr;
}
Context *on_clean() final;
void on_activate_committed() final {
@@ -712,9 +712,17 @@ public:
}
seastar::future<> stop();
private:
+ class C_PG_FinishRecovery : public Context {
+ public:
+ explicit C_PG_FinishRecovery(PG &pg) : pg(pg) {}
+ void finish(int r) override;
+ private:
+ PG& pg;
+ };
std::unique_ptr<PGBackend> backend;
std::unique_ptr<RecoveryBackend> recovery_backend;
std::unique_ptr<PGRecovery> recovery_handler;
+ C_PG_FinishRecovery *recovery_finisher;
PeeringState peering_state;
eversion_t projected_last_update;
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 4f874d526b3..ec3af0d2b00 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -528,10 +528,12 @@ void PGRecovery::request_primary_scan(
void PGRecovery::enqueue_push(
const hobject_t& obj,
- const eversion_t& v)
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers)
{
- logger().info("{}: obj={} v={}",
- __func__, obj, v);
+ logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers);
+ auto &peering_state = pg->get_peering_state();
+ peering_state.prepare_backfill_for_missing(obj, v, peers);
auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj);
if (!added)
return;
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index 6cd29c3dc52..705b3176b97 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -110,7 +110,8 @@ private:
const hobject_t& begin) final;
void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) final;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) final;
void enqueue_drop(
const pg_shard_t& target,
const hobject_t& obj,
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index 5f7c4a62447..a053d9d5044 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -767,20 +767,26 @@ seastar::future<> ShardServices::dispatch_context_transaction(
LOG_PREFIX(OSDSingletonState::dispatch_context_transaction);
if (ctx.transaction.empty()) {
DEBUG("empty transaction");
- return seastar::now();
+ co_await get_store().flush(col);
+ Context* on_commit(
+ ceph::os::Transaction::collect_all_contexts(ctx.transaction));
+ if (on_commit) {
+ on_commit->complete(0);
+ }
+ co_return;
}
DEBUG("do_transaction ...");
- auto ret = get_store().do_transaction(
+ co_await get_store().do_transaction(
col,
ctx.transaction.claim_and_reset());
- return ret;
+ co_return;
}
seastar::future<> ShardServices::dispatch_context_messages(
BufferedRecoveryMessages &&ctx)
{
- LOG_PREFIX(OSDSingletonState::dispatch_context_transaction);
+ LOG_PREFIX(OSDSingletonState::dispatch_context_messages);
auto ret = seastar::parallel_for_each(std::move(ctx.message_map),
[FNAME, this](auto& osd_messages) {
auto& [peer, messages] = osd_messages;
diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt
index 40da7e495c3..af8f7e185c8 100644
--- a/src/crypto/isa-l/CMakeLists.txt
+++ b/src/crypto/isa-l/CMakeLists.txt
@@ -1,36 +1,17 @@
-set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
-set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}")
+# build isa-l_crypto from its makefile and expose as target ISAL::Crypto
+include(BuildISALCrypto)
+build_isal_crypto()
set(isal_crypto_plugin_srcs
isal_crypto_accel.cc
- isal_crypto_plugin.cc
- ${isal_dir}/aes/cbc_pre.c
- ${isal_dir}/aes/cbc_multibinary.asm
- ${isal_dir}/aes/keyexp_128.asm
- ${isal_dir}/aes/keyexp_192.asm
- ${isal_dir}/aes/keyexp_256.asm
- ${isal_dir}/aes/keyexp_multibinary.asm
- ${isal_dir}/aes/cbc_dec_128_x4_sse.asm
- ${isal_dir}/aes/cbc_dec_128_x8_avx.asm
- ${isal_dir}/aes/cbc_dec_192_x4_sse.asm
- ${isal_dir}/aes/cbc_dec_192_x8_avx.asm
- ${isal_dir}/aes/cbc_dec_256_x4_sse.asm
- ${isal_dir}/aes/cbc_dec_256_x8_avx.asm
- ${isal_dir}/aes/cbc_enc_128_x4_sb.asm
- ${isal_dir}/aes/cbc_enc_128_x8_sb.asm
- ${isal_dir}/aes/cbc_enc_192_x4_sb.asm
- ${isal_dir}/aes/cbc_enc_192_x8_sb.asm
- ${isal_dir}/aes/cbc_enc_256_x4_sb.asm
- ${isal_dir}/aes/cbc_enc_256_x8_sb.asm)
+ isal_crypto_plugin.cc)
if(HAVE_NASM_X64)
add_dependencies(crypto_plugins ceph_crypto_isal)
endif(HAVE_NASM_X64)
add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs})
-target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include)
-
-target_link_libraries(ceph_crypto_isal PRIVATE Boost::context)
+target_link_libraries(ceph_crypto_isal PRIVATE ISAL::Crypto Boost::context)
set_target_properties(ceph_crypto_isal PROPERTIES
VERSION 1.0.0
diff --git a/src/doc/rgw/cloud-restore.md b/src/doc/rgw/cloud-restore.md
new file mode 100644
index 00000000000..d54b18dfa50
--- /dev/null
+++ b/src/doc/rgw/cloud-restore.md
@@ -0,0 +1,127 @@
+# cloud-restore
+
+## Introduction
+
+The [`cloud-transition`](https://docs.ceph.com/en/latest/radosgw/cloud-transition) feature enables data transition to a remote cloud service as part of Lifecycle Configuration via Storage Classes. However, the transition is unidirectional; data cannot be transitioned back from the remote zone.
+
+The `cloud-restore` feature enables restoration of those transitioned objects from the remote cloud S3 endpoints back into RGW.
+
+The objects can be restored either by using the S3 `restore-object` CLI or via `read-through`. The restored copies can be either temporary or permanent.
+
+## S3 restore-object CLI
+
+The goal here is to implement the minimal functionality of the [`S3RestoreObject`](https://docs.aws.amazon.com/cli/latest/reference/s3api/restore-object.html) API so that users can restore cloud-transitioned objects.
+
+```sh
+aws s3api restore-object \
+ --bucket <value> \
+ --key <value> ( can be object name or * for Bulk restore) \
+ [--version-id <value>] \
+ --restore-request (structure) {
+ // for temporary restore
+ { "Days": integer, }
+ // if Days not provided, it will be considered as permanent copy
+ }
+```
+
+This CLI may be extended in the future to include custom parameters (like target-bucket, storage-class, etc.) specific to RGW.
+
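+For example, a temporary restore of a single transitioned object for seven days could be requested as follows (the bucket and key names here are placeholders):
+
+```sh
+aws s3api restore-object \
+    --bucket mybucket \
+    --key myobject \
+    --restore-request '{"Days": 7}'
+```
+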
+## read-through
+
+As per the cloud-transition feature functionality, cloud-transitioned objects cannot be read; a `GET` on such objects fails with an ‘InvalidObjectState’ error.
+
+With this restore feature, however, transitioned objects can be restored and read. Two new tier-config options, `allow_read_through` and `read_through_restore_days`, are added for this purpose. Only when `allow_read_through` is enabled will a `GET` on a transitioned object restore it from the S3 endpoint.
+
+Note: The object copy restored via `readthrough` is temporary and is retained only for the duration of `read_through_restore_days`.
+
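+A hypothetical example of enabling read-through restore on an existing cloud-tier storage class (here named `CLOUDTIER` and assumed to be configured as described in the cloud-transition documentation) might look like:
+
+```sh
+radosgw-admin zonegroup placement modify \
+    --rgw-zonegroup default \
+    --placement-id default-placement \
+    --storage-class CLOUDTIER \
+    --tier-config=allow_read_through=true,read_through_restore_days=3
+```
+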
+## Design
+
+* Similar to the cloud-transition feature, this feature currently works **only with an s3-compatible cloud endpoint**.
+* This feature works only for **cloud-transitioned objects**. In order to validate this, the `retain_head_object` option should be set to true so that the object’s `HEAD` object can be verified before restoring the object.
+
+* **Request flow:**
+ * Once the `HEAD` object is verified, its cloudtier storage class config details are fetched.
+Note: In case the cloudtier storage-class has been deleted or updated, the object may not be restored.
+  * RestoreStatus for the `HEAD` object is marked `RestoreAlreadyInProgress`.
+  * Object restore is done asynchronously by issuing either an S3 `GET` or an S3 `RESTORE` request to the remote endpoint.
+  * Once the object is restored, RestoreStatus is updated to `CloudRestored` and RestoreType is set to either `Temporary` or `Permanent`.
+  * In case the operation fails, RestoreStatus is marked as `RestoreFailed`.
+
+* **New attrs:** The following new attrs are added:
+ * `user.rgw.restore-status`: <Restore operation Status>
+ * `user.rgw.restore-type`: <Type of Restore>
+ * `user.rgw.restored-at`: <Restoration Time>
+  * `user.rgw.restore-expiry-date`: <Expiration time in case of temporary copies>
+ * `user.rgw.cloudtier_storage_class`: <CloudTier storage class used in case of temporarily restored copies>
+
+```cpp
+ enum RGWRestoreStatus : uint8_t {
+ None = 0,
+ RestoreAlreadyInProgress = 1,
+ CloudRestored = 2,
+ RestoreFailed = 3
+ };
+ enum class RGWRestoreType : uint8_t {
+ None = 0,
+ Temporary = 1,
+ Permanent = 2
+ };
+```
+
+* **Response:**
+* The `S3 restore-object CLI` returns SUCCESS with either a 200 OK or a 202 Accepted status code.
+  * If the object has not previously been restored, RGW returns 202 Accepted in the response.
+  * If the object has previously been restored, RGW returns 200 OK in the response.
+  * Special errors:
+    * Code: RestoreAlreadyInProgress (Cause: object restore is already in progress.)
+    * Code: ObjectNotFound (the object is not found at the cloud endpoint)
+    * Code: I/O error (any other I/O error during restore)
+* A `GET` request continues to return an ‘InvalidObjectState’ error until the object is successfully restored.
+ * S3 head-object can be used to verify if the restore is still in progress.
+ * Once the object is restored, GET will return the object data.
+
+* **StorageClass**: By default, objects are restored to the `STANDARD` storage class. However, as per [AWS S3 Restore](https://docs.aws.amazon.com/cli/latest/reference/s3api/restore-object.html), the storage-class remains the same for restored objects. Hence, for temporary copies, the `x-amz-storage-class` returned contains the original cloudtier storage-class.
+ * Note: A new tier-config option may be added to select the storage-class to restore the objects to.
+
+* **mtime**: If the restored object is temporary, the object is still marked `RGWObj::CloudTiered` and its mtime is not changed, i.e., it remains set to the transition time. But if the object is a permanent copy, it is marked `RGWObj::Main` and its mtime is updated to the restore time (now()).
+
+* **Lifecycle**:
+  * `Temporary` copies are not subject to any further transition to the cloud. However (as is the case with cloud-transitioned objects), they can be deleted via regular LC expiration rules or via an external S3 delete request.
+  * `Permanent` copies are treated like regular objects and are subject to any applicable LC rules.
+
+* **Replication**: The restored objects (both temporary and permanent) are also replicated like regular objects and will be deleted across the zones post expiration.
+
+* **VersionedObjects**: In the case of versioning, any cloud-transitioned object would have been non-current. Post restore, the same non-current object is updated with the downloaded data and its HEAD object is updated accordingly, as is the case with regular objects.
+
+* **Temporary Object Expiry**: This is done via the Object Expirer:
+  * When the object is restored as temporary, `user.rgw.expiry-date` is set accordingly and the `delete_at` attr is also updated with the same value.
+  * This object is then added to the list used by the `ObjectExpirer`.
+  * The `LC` worker thread scans through that list and, post expiry, resets the objects back to the cloud-transitioned state, i.e.:
+    * HEAD object with size=0
+    * new attrs removed
+    * `delete_at` reset
+  * Note: A new RGW option `rgw_restore_debug_interval` is added, which, when set, is used as the `Days` value (similar to `rgw_lc_debug_interval`).
+
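+For testing, and assuming the option lands under this name, the debug interval could be set cluster-wide with something like:
+
+```sh
+ceph config set client.rgw rgw_restore_debug_interval 60
+```
+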
+* **FAILED Restore**: In case the restore operation fails:
+  * The HEAD object is updated accordingly, i.e., the storage-class is reset to the original cloud-tier storage class.
+  * All the newly added attrs are removed, except for `user.rgw.restore-status`, which is updated to `RestoreFailed`.
+
+* **Check Restore Progress**: Users can issue an S3 `head-object` request to check whether the restore of any object is complete or still in progress.
+
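+A minimal sketch of such a progress check (again with placeholder bucket and key names):
+
+```sh
+aws s3api head-object --bucket mybucket --key myobject
+```
+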
+* **RGW down/restarts** - Since the restore operation is asynchronous, we need to keep track of the objects being restored. If RGW goes down or restarts, this data is used to retrigger in-progress restore requests or to perform appropriate cleanup for failed requests.
+
+* **Compression** - If the placement target to which the objects are being restored has compression enabled, the data will be compressed accordingly (bug2294512).
+
+* **Encryption** - If the restored object is encrypted, the old sse-related xattrs/keys from the HEAD stub will be copied back into the object metadata (bug2294512).
+
+* **Delete cloud object post restore** - Once the object is successfully restored, the object at the remote endpoint is still retained. However, we could choose to delete it for permanently restored copies by adding a new tier-config option.
+
+## Future work
+
+* **Bulk Restore**: In the case of BulkRestore, some of the objects may not be restored. The user needs to manually cross-check which objects have been restored and which are still in progress.
+
+* **Admin CLIs**: Admin debug commands will be provided to start, check the status of, and cancel restore operations.
+
+* **Admin Ops**
+
+* **Restore Notifications**
diff --git a/src/erasure-code/isa/CMakeLists.txt b/src/erasure-code/isa/CMakeLists.txt
index 2ca398ffcb1..6162075cbc8 100644
--- a/src/erasure-code/isa/CMakeLists.txt
+++ b/src/erasure-code/isa/CMakeLists.txt
@@ -1,113 +1,18 @@
-# ISA
-set(isal_src_dir ${CMAKE_SOURCE_DIR}/src/isa-l)
-include_directories(${isal_src_dir}/include)
+# build isa-l from its makefile and expose as target ISAL::ISAL
+include(BuildISAL)
+build_isal()
-if(HAVE_NASM_X64_AVX2)
- set(CMAKE_ASM_FLAGS "-i ${isal_src_dir}/include/ ${CMAKE_ASM_FLAGS}")
- set(isa_srcs
- ${isal_src_dir}/erasure_code/ec_base.c
- ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_vect_dot_prod_sse.asm
- ${isal_src_dir}/erasure_code/gf_2vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/gf_3vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/gf_4vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/gf_5vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/gf_6vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/gf_vect_mad_avx2.asm
- ${isal_src_dir}/erasure_code/ec_highlevel_func.c
- ${isal_src_dir}/erasure_code/gf_2vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/gf_3vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/gf_4vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/gf_5vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/gf_6vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/gf_vect_mad_avx.asm
- ${isal_src_dir}/erasure_code/ec_multibinary.asm
- ${isal_src_dir}/erasure_code/gf_2vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_3vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_4vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_5vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_6vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_vect_mad_sse.asm
- ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx2.asm
- ${isal_src_dir}/erasure_code/gf_vect_mul_avx.asm
- ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx.asm
- ${isal_src_dir}/erasure_code/gf_vect_mul_sse.asm
- ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx512.asm
- ${isal_src_dir}/erasure_code/gf_2vect_mad_avx512.asm
- ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx512.asm
- ${isal_src_dir}/erasure_code/gf_3vect_mad_avx512.asm
- ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx512.asm
- ${isal_src_dir}/erasure_code/gf_4vect_mad_avx512.asm
- ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx512.asm
- ${isal_src_dir}/erasure_code/gf_vect_mad_avx512.asm
- ${isal_src_dir}/raid/raid_base.c
- ${isal_src_dir}/raid/raid_multibinary.asm
- ${isal_src_dir}/raid/xor_check_sse.asm
- ${isal_src_dir}/raid/xor_gen_sse.asm
- ${isal_src_dir}/raid/xor_gen_avx.asm
- ${isal_src_dir}/raid/xor_gen_avx512.asm
- ${isal_src_dir}/raid/pq_check_sse.asm
- ${isal_src_dir}/raid/pq_gen_sse.asm
- ${isal_src_dir}/raid/pq_gen_avx.asm
- ${isal_src_dir}/raid/pq_gen_avx2.asm
- ErasureCodeIsa.cc
- ErasureCodeIsaTableCache.cc
- ErasureCodePluginIsa.cc
- )
-elseif(HAVE_ARMV8_SIMD)
- set(isa_srcs
- ${isal_src_dir}/erasure_code/ec_base.c
- ${isal_src_dir}/erasure_code/aarch64/ec_aarch64_highlevel_func.c
- ${isal_src_dir}/erasure_code/aarch64/ec_aarch64_dispatcher.c
- ${isal_src_dir}/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_2vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_3vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_4vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_5vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_6vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_vect_dot_prod_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_vect_mad_neon.S
- ${isal_src_dir}/erasure_code/aarch64/gf_vect_mul_neon.S
- ${isal_src_dir}/erasure_code/aarch64/ec_multibinary_arm.S
- ${isal_src_dir}/raid/raid_base.c
- ${isal_src_dir}/raid/aarch64/raid_aarch64_dispatcher.c
- ${isal_src_dir}/raid/aarch64/raid_multibinary_arm.S
- ${isal_src_dir}/raid/aarch64/xor_check_neon.S
- ${isal_src_dir}/raid/aarch64/xor_gen_neon.S
- ${isal_src_dir}/raid/aarch64/pq_check_neon.S
- ${isal_src_dir}/raid/aarch64/pq_gen_neon.S
- ErasureCodeIsa.cc
- ErasureCodeIsaTableCache.cc
- ErasureCodePluginIsa.cc
- )
- set_source_files_properties(
- ${isal_src_dir}/erasure_code/aarch64/ec_multibinary_arm.S
- ${isal_src_dir}/raid/aarch64/raid_multibinary_arm.S
- PROPERTIES COMPILE_FLAGS "-D__ASSEMBLY__"
- )
-endif()
+# ISA
+set(isa_srcs
+ ErasureCodeIsa.cc
+ ErasureCodeIsaTableCache.cc
+ ErasureCodePluginIsa.cc
+)
add_library(ec_isa SHARED
${isa_srcs}
$<TARGET_OBJECTS:erasure_code_objs>)
-target_link_libraries(ec_isa ${EXTRALIBS})
+target_link_libraries(ec_isa ISAL::ISAL ${EXTRALIBS})
set_target_properties(ec_isa PROPERTIES
INSTALL_RPATH "")
install(TARGETS ec_isa DESTINATION ${erasure_plugin_dir})
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index 57ee5ee7167..79defaec376 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -13,6 +13,7 @@
*/
#include <filesystem>
+#include <memory>
#include "common/async/context_pool.h"
#include "common/ceph_argparse.h"
#include "common/code_environment.h"
@@ -268,10 +269,14 @@ global_init(const std::map<std::string,std::string> *defaults,
if (g_conf()->setgroup.length() > 0) {
gid = atoi(g_conf()->setgroup.c_str());
if (!gid) {
- char buf[4096];
+ // There's no actual well-defined max that I could find in
+ // library documentation. If we're allocating on the heap,
+ // 64KiB seems at least reasonable.
+ static constexpr std::size_t size = 64 * 1024;
+ auto buf = std::make_unique_for_overwrite<char[]>(size);
struct group gr;
struct group *g = 0;
- getgrnam_r(g_conf()->setgroup.c_str(), &gr, buf, sizeof(buf), &g);
+ getgrnam_r(g_conf()->setgroup.c_str(), &gr, buf.get(), size, &g);
if (!g) {
cerr << "unable to look up group '" << g_conf()->setgroup << "'"
<< ": " << cpp_strerror(errno) << std::endl;
diff --git a/src/librados/librados_asio.h b/src/librados/librados_asio.h
index 19a8c8fc01d..0aedc376575 100644
--- a/src/librados/librados_asio.h
+++ b/src/librados/librados_asio.h
@@ -16,6 +16,7 @@
#include "include/rados/librados.hpp"
#include "common/async/completion.h"
+#include "librados/AioCompletionImpl.h"
/// Defines asynchronous librados operations that satisfy all of the
/// "Requirements on asynchronous operations" imposed by the C++ Networking TS
@@ -53,20 +54,20 @@ using unique_aio_completion_ptr =
/// argument to the handler.
template <typename Result>
struct Invoker {
- using Signature = void(boost::system::error_code, Result);
+ using Signature = void(boost::system::error_code, version_t, Result);
Result result;
template <typename Completion>
- void dispatch(Completion&& completion, boost::system::error_code ec) {
- ceph::async::dispatch(std::move(completion), ec, std::move(result));
+ void dispatch(Completion&& completion, boost::system::error_code ec, version_t ver) {
+ ceph::async::dispatch(std::move(completion), ec, ver, std::move(result));
}
};
// specialization for Result=void
template <>
struct Invoker<void> {
- using Signature = void(boost::system::error_code);
+ using Signature = void(boost::system::error_code, version_t);
template <typename Completion>
- void dispatch(Completion&& completion, boost::system::error_code ec) {
- ceph::async::dispatch(std::move(completion), ec);
+ void dispatch(Completion&& completion, boost::system::error_code ec, version_t ver) {
+ ceph::async::dispatch(std::move(completion), ec, ver);
}
};
@@ -82,12 +83,15 @@ struct AsyncOp : Invoker<Result> {
auto p = std::unique_ptr<Completion>{static_cast<Completion*>(arg)};
// move result out of Completion memory being freed
auto op = std::move(p->user_data);
- const int ret = op.aio_completion->get_return_value();
+ // access AioCompletionImpl directly to avoid locking
+ const librados::AioCompletionImpl* pc = op.aio_completion->pc;
+ const int ret = pc->rval;
+ const version_t ver = pc->objver;
boost::system::error_code ec;
if (ret < 0) {
ec.assign(-ret, librados::detail::err_category());
}
- op.dispatch(std::move(p), ec);
+ op.dispatch(std::move(p), ec, ver);
}
template <typename Executor1, typename CompletionHandler>
@@ -103,7 +107,7 @@ struct AsyncOp : Invoker<Result> {
/// Calls IoCtx::aio_read() and arranges for the AioCompletion to call a
-/// given handler with signature (boost::system::error_code, bufferlist).
+/// given handler with signature (error_code, version_t, bufferlist).
template <typename ExecutionContext, typename CompletionToken>
auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
size_t len, uint64_t off, CompletionToken&& token)
@@ -119,7 +123,7 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
int ret = io.aio_read(oid, op.aio_completion.get(), &op.result, len, off);
if (ret < 0) {
auto ec = boost::system::error_code{-ret, librados::detail::err_category()};
- ceph::async::post(std::move(p), ec, bufferlist{});
+ ceph::async::post(std::move(p), ec, 0, bufferlist{});
} else {
p.release(); // release ownership until completion
}
@@ -127,24 +131,24 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
}
/// Calls IoCtx::aio_write() and arranges for the AioCompletion to call a
-/// given handler with signature (boost::system::error_code).
+/// given handler with signature (error_code, version_t).
template <typename ExecutionContext, typename CompletionToken>
auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
- bufferlist &bl, size_t len, uint64_t off,
+ const bufferlist &bl, size_t len, uint64_t off,
CompletionToken&& token)
{
using Op = detail::AsyncOp<void>;
using Signature = typename Op::Signature;
return boost::asio::async_initiate<CompletionToken, Signature>(
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
- bufferlist &bl, size_t len, uint64_t off) {
+ const bufferlist &bl, size_t len, uint64_t off) {
auto p = Op::create(ex, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_write(oid, op.aio_completion.get(), bl, len, off);
if (ret < 0) {
auto ec = boost::system::error_code{-ret, librados::detail::err_category()};
- ceph::async::post(std::move(p), ec);
+ ceph::async::post(std::move(p), ec, 0);
} else {
p.release(); // release ownership until completion
}
@@ -152,7 +156,7 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
}
/// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a
-/// given handler with signature (boost::system::error_code, bufferlist).
+/// given handler with signature (error_code, version_t, bufferlist).
template <typename ExecutionContext, typename CompletionToken>
auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
ObjectReadOperation *read_op, int flags,
@@ -170,7 +174,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
flags, &op.result);
if (ret < 0) {
auto ec = boost::system::error_code{-ret, librados::detail::err_category()};
- ceph::async::post(std::move(p), ec, bufferlist{});
+ ceph::async::post(std::move(p), ec, 0, bufferlist{});
} else {
p.release(); // release ownership until completion
}
@@ -178,7 +182,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
}
/// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a
-/// given handler with signature (boost::system::error_code).
+/// given handler with signature (error_code, version_t).
template <typename ExecutionContext, typename CompletionToken>
auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
ObjectWriteOperation *write_op, int flags,
@@ -196,7 +200,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
int ret = io.aio_operate(oid, op.aio_completion.get(), write_op, flags, trace_ctx);
if (ret < 0) {
auto ec = boost::system::error_code{-ret, librados::detail::err_category()};
- ceph::async::post(std::move(p), ec);
+ ceph::async::post(std::move(p), ec, 0);
} else {
p.release(); // release ownership until completion
}
@@ -204,7 +208,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
}
/// Calls IoCtx::aio_notify() and arranges for the AioCompletion to call a
-/// given handler with signature (boost::system::error_code, bufferlist).
+/// given handler with signature (error_code, version_t, bufferlist).
template <typename ExecutionContext, typename CompletionToken>
auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
bufferlist& bl, uint64_t timeout_ms, CompletionToken &&token)
@@ -221,7 +225,7 @@ auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
bl, timeout_ms, &op.result);
if (ret < 0) {
auto ec = boost::system::error_code{-ret, librados::detail::err_category()};
- ceph::async::post(std::move(p), ec, bufferlist{});
+ ceph::async::post(std::move(p), ec, 0, bufferlist{});
} else {
p.release(); // release ownership until completion
}
diff --git a/src/librbd/crypto/LoadRequest.cc b/src/librbd/crypto/LoadRequest.cc
index 5bc57d693c5..66beed59130 100644
--- a/src/librbd/crypto/LoadRequest.cc
+++ b/src/librbd/crypto/LoadRequest.cc
@@ -31,7 +31,7 @@ LoadRequest<I>::LoadRequest(
Context* on_finish) : m_image_ctx(image_ctx),
m_on_finish(on_finish),
m_format_idx(0),
- m_is_current_format_cloned(false),
+ m_is_current_format_assumed(false),
m_formats(std::move(formats)) {
}
@@ -108,7 +108,7 @@ void LoadRequest<I>::handle_load(int r) {
ldout(m_image_ctx->cct, 20) << "r=" << r << dendl;
if (r < 0) {
- if (m_is_current_format_cloned &&
+ if (m_is_current_format_assumed &&
m_detected_format_name == UNKNOWN_FORMAT) {
// encryption format was not detected, assume plaintext
ldout(m_image_ctx->cct, 5) << "assuming plaintext for image "
@@ -125,19 +125,29 @@ void LoadRequest<I>::handle_load(int r) {
}
ldout(m_image_ctx->cct, 5) << "loaded format " << m_detected_format_name
- << (m_is_current_format_cloned ? " (cloned)" : "")
+ << (m_is_current_format_assumed ? " (assumed)" : "")
<< " for image " << m_current_image_ctx->name
<< dendl;
m_format_idx++;
+ if (!m_current_image_ctx->migration_info.empty()) {
+ // prepend the format to use for the migration source image
+ // it's done implicitly here because this image is moved to the
+ // trash when migration is prepared
+ ceph_assert(m_current_image_ctx->parent != nullptr);
+ ldout(m_image_ctx->cct, 20) << "under migration, cloning format" << dendl;
+ m_formats.insert(m_formats.begin() + m_format_idx,
+ m_formats[m_format_idx - 1]->clone());
+ }
+
m_current_image_ctx = m_current_image_ctx->parent;
if (m_current_image_ctx != nullptr) {
// move on to loading parent
if (m_format_idx >= m_formats.size()) {
// try to load next ancestor using the same format
- ldout(m_image_ctx->cct, 20) << "cloning format" << dendl;
- m_is_current_format_cloned = true;
+ ldout(m_image_ctx->cct, 20) << "out of formats, cloning format" << dendl;
m_formats.push_back(m_formats[m_formats.size() - 1]->clone());
+ m_is_current_format_assumed = true;
}
load();
diff --git a/src/librbd/crypto/LoadRequest.h b/src/librbd/crypto/LoadRequest.h
index 84f595bb6c6..702748a2418 100644
--- a/src/librbd/crypto/LoadRequest.h
+++ b/src/librbd/crypto/LoadRequest.h
@@ -44,7 +44,7 @@ private:
Context* m_on_finish;
size_t m_format_idx;
- bool m_is_current_format_cloned;
+ bool m_is_current_format_assumed;
std::vector<EncryptionFormat> m_formats;
I* m_current_image_ctx;
std::string m_detected_format_name;
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 0e9b6996ad2..dfad411d323 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -4589,8 +4589,11 @@ void InodeStoreBase::dump(Formatter *f) const
for (const auto& [key, val] : *xattrs) {
f->open_object_section("xattr");
f->dump_string("key", key);
- std::string v(val.c_str(), val.length());
- f->dump_string("val", v);
+ if (val.length()) {
+ f->dump_string("val", std::string(val.c_str(), val.length()));
+ } else {
+ f->dump_string("val", "");
+ }
f->close_section();
}
}
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 8e267503ab2..c2f3544f97b 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -3107,7 +3107,7 @@ void MDSRankDispatcher::evict_clients(
dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
if (victims.empty()) {
- on_finish(-ESRCH, "no hosts match", outbl);
+ on_finish(0, "no hosts match", outbl);
return;
}
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 068258289af..c01ea9e7103 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -99,6 +99,8 @@ int NVMeofGwMap::cfg_add_gw(
return 0;
}
}
+ }
+ for (auto& itr: created_gws[group_key]) {
if (itr.second.availability == gw_availability_t::GW_DELETING) {
//Was found some GW in "Deleting" state. Just to inherit its ANA group
NvmeGwMonState & gw_created = created_gws[group_key][itr.first];
@@ -252,7 +254,7 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
}
}
-int NVMeofGwMap::process_gw_map_gw_no_subsystems(
+int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending)
{
int rc = 0;
@@ -422,7 +424,6 @@ void NVMeofGwMap::find_failback_gw(
auto& gws_states = created_gws[group_key];
auto& gw_state = created_gws[group_key][gw_id];
bool do_failback = false;
-
dout(10) << "Find failback GW for GW " << gw_id << dendl;
for (auto& gw_state_it: gws_states) {
auto& st = gw_state_it.second;
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 29710371742..267d85b10f9 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -54,7 +54,7 @@ public:
int process_gw_map_gw_down(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
bool &propose_pending);
- int process_gw_map_gw_no_subsystems(
+ int process_gw_map_gw_no_subsys_no_listeners(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
bool &propose_pending);
void update_active_timers(bool &propose_pending);
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 734e90defd9..d9e936e27df 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -367,6 +367,13 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
std::stringstream sstrm1;
sstrm1 << state.availability;
f->dump_string("Availability", sstrm1.str());
+ uint32_t num_listeners = 0;
+ if (state.availability == gw_availability_t::GW_AVAILABLE) {
+ for (auto &subs: state.subsystems) {
+ num_listeners += subs.listeners.size();
+ }
+ f->dump_unsigned("num-listeners", num_listeners);
+ }
sstrm1.str("");
for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
sstrm1 << " " << state_itr.first + 1 << ": "
@@ -476,7 +483,7 @@ void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id,
if (avail == gw_availability_t::GW_UNAVAILABLE) {
pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending);
} else {
- pending_map.process_gw_map_gw_no_subsystems(gw_id, group_key, propose_pending);
+ pending_map.process_gw_map_gw_no_subsys_no_listeners(gw_id, group_key, propose_pending);
}
}
@@ -600,7 +607,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
if (sub.size() == 0) {
avail = gw_availability_t::GW_CREATED;
- }
+ } else {
+ bool listener_found = false;
+ for (auto &subs: sub) {
+ if (subs.listeners.size()) {
+ listener_found = true;
+ break;
+ }
+ }
+ if (!listener_found) {
+ avail = gw_availability_t::GW_CREATED;
+ }
+      } // for HA, no-subsystems and no-listeners are the same use case
if (pending_map.created_gws[group_key][gw_id].subsystems != sub) {
dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl;
pending_map.created_gws[group_key][gw_id].subsystems = sub;
diff --git a/src/mypy-constrains.txt b/src/mypy-constrains.txt
index 7810870804e..0a79b8ef4f1 100644
--- a/src/mypy-constrains.txt
+++ b/src/mypy-constrains.txt
@@ -2,7 +2,7 @@
# Unfortunately this means we have to manually update those
# packages regularly.
-mypy==1.1.1
+mypy==1.9
# global
types-python-dateutil==0.1.3
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 3dcd96830c4..5f4f1a4d48a 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -3760,15 +3760,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
{
auto t0 = mono_clock::now();
std::lock_guard hl(h->lock);
+ auto& fnode = h->file->fnode;
dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
- << " file " << h->file->fnode << dendl;
+ << " file " << fnode << dendl;
if (h->file->deleted) {
dout(10) << __func__ << " deleted, no-op" << dendl;
return 0;
}
// we never truncate internal log files
- ceph_assert(h->file->fnode.ino > 1);
+ ceph_assert(fnode.ino > 1);
// truncate off unflushed data?
if (h->pos < offset &&
@@ -3782,20 +3783,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
if (r < 0)
return r;
}
- if (offset == h->file->fnode.size) {
- return 0; // no-op!
- }
- if (offset > h->file->fnode.size) {
+ if (offset > fnode.size) {
ceph_abort_msg("truncate up not supported");
}
- ceph_assert(h->file->fnode.size >= offset);
+ ceph_assert(offset <= fnode.size);
_flush_bdev(h);
-
- std::lock_guard ll(log.lock);
- vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset);
- h->file->fnode.size = offset;
- h->file->is_dirty = true;
- log.t.op_file_update_inc(h->file->fnode);
+ {
+ std::lock_guard ll(log.lock);
+ std::lock_guard dl(dirty.lock);
+ bool changed_extents = false;
+ vselector->sub_usage(h->file->vselector_hint, fnode);
+ uint64_t x_off = 0;
+ auto p = fnode.seek(offset, &x_off);
+ uint64_t cut_off =
+ (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]);
+ uint64_t new_allocated;
+ if (0 == cut_off) {
+ // whole pextent to remove
+ changed_extents = true;
+ new_allocated = offset;
+ } else if (cut_off < p->length) {
+ dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off);
+ new_allocated = (offset - x_off) + cut_off;
+ p->length = cut_off;
+ changed_extents = true;
+ ++p;
+ } else {
+ ceph_assert(cut_off >= p->length);
+ new_allocated = (offset - x_off) + p->length;
+ // just leave it here
+ ++p;
+ }
+ while (p != fnode.extents.end()) {
+ dirty.pending_release[p->bdev].insert(p->offset, p->length);
+ p = fnode.extents.erase(p);
+ changed_extents = true;
+ }
+ if (changed_extents) {
+ fnode.size = offset;
+ fnode.allocated = new_allocated;
+ fnode.reset_delta();
+ log.t.op_file_update(fnode);
+ // sad, but is_dirty must be set to signal flushing of the log
+ h->file->is_dirty = true;
+ } else {
+ if (offset != fnode.size) {
+ fnode.size = offset;
+ //skipping log.t.op_file_update_inc, it will be done by flush()
+ h->file->is_dirty = true;
+ }
+ }
+ vselector->add_usage(h->file->vselector_hint, fnode);
+ }
logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0);
return 0;
}
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
index 68040af4282..7cbe0a1d121 100644
--- a/src/os/bluestore/BlueRocksEnv.cc
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile {
}
rocksdb::Status Close() override {
- fs->fsync(h);
- // mimic posix env, here. shrug.
- size_t block_size;
- size_t last_allocated_block;
- GetPreallocationStatus(&block_size, &last_allocated_block);
- if (last_allocated_block > 0) {
- int r = fs->truncate(h, h->pos);
- if (r < 0)
- return err_to_status(r);
+ int r = fs->truncate(h, h->pos);
+ if (r < 0) {
+ return err_to_status(r);
}
-
+ fs->fsync(h);
return rocksdb::Status::OK();
}
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 02bb04c4a0a..1fc87610502 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -327,15 +327,14 @@ void ECCommon::ReadPipeline::get_min_want_to_read_shards(
{
const auto [left_chunk_index, right_chunk_index] =
sinfo.offset_length_to_data_chunk_indices(offset, length);
- for(uint64_t i = left_chunk_index; i < right_chunk_index; i++) {
- auto raw_chunk = i % sinfo.get_data_chunk_count();
+ const auto distance =
+ std::min(right_chunk_index - left_chunk_index,
+ sinfo.get_data_chunk_count());
+ for(uint64_t i = 0; i < distance; i++) {
+ auto raw_chunk = (left_chunk_index + i) % sinfo.get_data_chunk_count();
auto chunk = chunk_mapping.size() > raw_chunk ?
chunk_mapping[raw_chunk] : static_cast<int>(raw_chunk);
- if (auto [_, inserted] = want_to_read->insert(chunk); !inserted) {
- // aready processed all chunks
- ceph_assert(want_to_read->size() == sinfo.get_data_chunk_count());
- break;
- }
+ want_to_read->insert(chunk);
}
}
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index fb3a415a542..ce46bb245ea 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -3930,11 +3930,6 @@ int OSD::init()
dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
- if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
- dout(2) << "compacting object store's DB" << dendl;
- store->compact();
- }
-
// prime osd stats
{
struct store_statfs_t stbuf;
@@ -4080,6 +4075,11 @@ int OSD::init()
if (is_stopping())
return 0;
+ if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
+ dout(2) << "compacting object store's DB" << dendl;
+ store->compact();
+ }
+
// start objecter *after* we have authenticated, so that we don't ignore
// the OSDMaps it requests.
service.final_init();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 76256df49b8..71b9b713385 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1137,46 +1137,10 @@ void PG::update_snap_map(
const vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction &t)
{
- for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) {
+ for (const auto& entry : log_entries) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- if (i->soid.snap < CEPH_MAXSNAP) {
- if (i->is_delete()) {
- int r = snap_mapper.remove_oid(
- i->soid,
- &_t);
- if (r)
- derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
- // On removal tolerate missing key corruption
- ceph_assert(r == 0 || r == -ENOENT);
- } else if (i->is_update()) {
- ceph_assert(i->snaps.length() > 0);
- vector<snapid_t> snaps;
- bufferlist snapbl = i->snaps;
- auto p = snapbl.cbegin();
- try {
- decode(snaps, p);
- } catch (...) {
- derr << __func__ << " decode snaps failure on " << *i << dendl;
- snaps.clear();
- }
- set<snapid_t> _snaps(snaps.begin(), snaps.end());
-
- if (i->is_clone() || i->is_promote()) {
- snap_mapper.add_oid(
- i->soid,
- _snaps,
- &_t);
- } else if (i->is_modify()) {
- int r = snap_mapper.update_snaps(
- i->soid,
- _snaps,
- 0,
- &_t);
- ceph_assert(r == 0);
- } else {
- ceph_assert(i->is_clean());
- }
- }
+ if (entry.soid.snap < CEPH_MAXSNAP) {
+ snap_mapper.update_snap_map(entry, &_t);
}
}
}
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 22222b7f7af..8d768ec4a66 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -3033,7 +3033,9 @@ void PeeringState::proc_primary_info(
ceph_assert(!is_primary());
update_history(oinfo.history);
- if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
+ bool has_scrub_error = (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors);
+ info.stats = oinfo.stats;
+ if (has_scrub_error) {
info.stats.stats.sum.num_scrub_errors = 0;
info.stats.stats.sum.num_shallow_scrub_errors = 0;
info.stats.stats.sum.num_deep_scrub_errors = 0;
diff --git a/src/osd/osd_types_fmt.h b/src/osd/osd_types_fmt.h
index 04f4d46ee51..100ce6e4646 100644
--- a/src/osd/osd_types_fmt.h
+++ b/src/osd/osd_types_fmt.h
@@ -392,4 +392,6 @@ inline std::ostream &operator<<(std::ostream &lhs, const object_stat_sum_t &sum)
#if FMT_VERSION >= 90000
template <bool TrackChanges> struct fmt::formatter<pg_missing_set<TrackChanges>> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<pool_opts_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<store_statfs_t> : fmt::ostream_formatter {};
#endif
diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc
index a00ab2caece..7f28ca2d642 100644
--- a/src/osd/scrubber/ScrubStore.cc
+++ b/src/osd/scrubber/ScrubStore.cc
@@ -1,11 +1,13 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "ScrubStore.h"
+#include "./ScrubStore.h"
#include "osd/osd_types.h"
#include "common/scrub_types.h"
#include "include/rados/rados_types.hpp"
+#include "pg_scrubber.h"
+
using std::ostringstream;
using std::string;
using std::vector;
@@ -13,21 +15,9 @@ using std::vector;
using ceph::bufferlist;
namespace {
-ghobject_t make_scrub_object(const spg_t& pgid)
-{
- ostringstream ss;
- ss << "scrub_" << pgid;
- return pgid.make_temp_ghobject(ss.str());
-}
-
string first_object_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0x00000000,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, "");
hoid.build_hash_cache();
return "SCRUB_OBJ_" + hoid.to_str();
}
@@ -47,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid)
string last_object_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0xffffffff,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, "");
hoid.build_hash_cache();
return "SCRUB_OBJ_" + hoid.to_str();
}
@@ -60,14 +45,9 @@ string last_object_key(int64_t pool)
string first_snap_key(int64_t pool)
{
// scrub object is per spg_t object, so we can misuse the hash (pg.seed) for
- // the representing the minimal and maximum keys. and this relies on how
+ // representing the minimal and maximum keys. and this relies on how
// hobject_t::to_str() works: hex(pool).hex(revhash).
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0x00000000,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, "");
hoid.build_hash_cache();
return "SCRUB_SS_" + hoid.to_str();
}
@@ -86,123 +66,447 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid)
string last_snap_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0xffffffff,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, "");
hoid.build_hash_cache();
return "SCRUB_SS_" + hoid.to_str();
}
+
+} // namespace
+
+#undef dout_context
+#define dout_context (m_scrubber.get_pg_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix_fn(_dout, this, __func__)
+
+template <class T>
+static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "")
+{
+ return t->gen_prefix(*_dout, fn);
}
namespace Scrub {
-Store*
-Store::create(ObjectStore* store,
- ObjectStore::Transaction* t,
- const spg_t& pgid,
- const coll_t& coll)
+Store::Store(
+ PgScrubber& scrubber,
+ ObjectStore& osd_store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll)
+ : m_scrubber{scrubber}
+ , object_store{osd_store}
+ , coll{coll}
{
- ceph_assert(store);
ceph_assert(t);
- ghobject_t oid = make_scrub_object(pgid);
- t->touch(coll, oid);
- return new Store{coll, oid, store};
+
+ // shallow errors DB object
+ const auto sh_err_obj =
+ pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid));
+ t->touch(coll, sh_err_obj);
+ shallow_db.emplace(
+ pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj});
+
+ // and the DB for deep errors
+ const auto dp_err_obj =
+ pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid));
+ t->touch(coll, dp_err_obj);
+ deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj});
+
+ dout(20) << fmt::format(
+ "created Scrub::Store for pg[{}], shallow: {}, deep: {}",
+ pgid, sh_err_obj, dp_err_obj)
+ << dendl;
}
-Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
- : coll(coll),
- hoid(oid),
- driver(store, coll, hoid),
- backend(&driver)
-{}
Store::~Store()
{
- ceph_assert(results.empty());
+ ceph_assert(!shallow_db || shallow_db->results.empty());
+ ceph_assert(!deep_db || deep_db->results.empty());
}
+
+std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const
+{
+ if (fn.starts_with("operator")) {
+ // it's a lambda, and __func__ is not available
+ return m_scrubber.gen_prefix(out) << "Store::";
+ } else {
+ return m_scrubber.gen_prefix(out) << "Store::" << fn << ": ";
+ }
+}
+
+
void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e)
{
add_object_error(pool, e);
}
+
void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
{
- bufferlist bl;
- e.encode(bl);
- results[to_object_key(pool, e.object)] = bl;
+ using librados::obj_err_t;
+ const auto key = to_object_key(pool, e.object);
+ dout(20) << fmt::format(
+ "{}: adding error for object {} ({}). Errors: {} ({}/{}) "
+ "unfiltered:{}",
+ (current_level == scrub_level_t::deep ? "deep" : "shallow"),
+ e.object, key, obj_err_t{e.errors},
+ obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS},
+ obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e)
+ << dendl;
+
+ if (current_level == scrub_level_t::deep) {
+ // not overriding the deep errors DB during shallow scrubs
+ deep_db->results[key] = e.encode();
+ }
+
+ // only shallow errors are stored in the shallow DB
+ auto e_copy = e;
+ e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS;
+ e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS;
+ shallow_db->results[key] = e_copy.encode();
}
+
void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e)
{
add_snap_error(pool, e);
}
+
void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
{
- bufferlist bl;
- e.encode(bl);
- results[to_snap_key(pool, e.object)] = bl;
+ // note: snap errors are only placed in the shallow store
+ shallow_db->results[to_snap_key(pool, e.object)] = e.encode();
}
-bool Store::empty() const
+
+bool Store::is_empty() const
{
- return results.empty();
+ return (!shallow_db || shallow_db->results.empty()) &&
+ (!deep_db || deep_db->results.empty());
}
+
void Store::flush(ObjectStore::Transaction* t)
{
if (t) {
- OSDriver::OSTransaction txn = driver.get_transaction(t);
- backend.set_keys(results, &txn);
+ auto txn = shallow_db->driver.get_transaction(t);
+ shallow_db->backend.set_keys(shallow_db->results, &txn);
+ txn = deep_db->driver.get_transaction(t);
+ deep_db->backend.set_keys(deep_db->results, &txn);
+ }
+
+ shallow_db->results.clear();
+ deep_db->results.clear();
+}
+
+
+void Store::clear_level_db(
+ ObjectStore::Transaction* t,
+ at_level_t& db,
+ std::string_view db_name)
+{
+ dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name)
+ << dendl;
+ // easiest way to guarantee that the object representing the DB exists
+ t->touch(coll, db.errors_hoid);
+
+ // remove all the keys in the DB
+ t->omap_clear(coll, db.errors_hoid);
+
+ // restart the 'in progress' part of the MapCacher
+ db.backend.reset();
+}
+
+
+void Store::reinit(
+ ObjectStore::Transaction* t,
+ scrub_level_t level)
+{
+ // Note: only one caller, and it creates the transaction passed to reinit().
+ // No need to assert on 't'
+ dout(20) << fmt::format(
+ "re-initializing the Scrub::Store (for {} scrub)",
+ (level == scrub_level_t::deep ? "deep" : "shallow"))
+ << dendl;
+
+ current_level = level;
+
+ // always clear the known shallow errors DB (as both shallow and deep scrubs
+ // would recreate it)
+ if (shallow_db) {
+ clear_level_db(t, *shallow_db, "shallow");
+ }
+ // only a deep scrub recreates the deep errors DB
+ if (level == scrub_level_t::deep && deep_db) {
+ clear_level_db(t, *deep_db, "deep");
}
- results.clear();
}
+
void Store::cleanup(ObjectStore::Transaction* t)
{
- t->remove(coll, hoid);
+ dout(20) << "discarding error DBs" << dendl;
+ ceph_assert(t);
+ if (shallow_db)
+ t->remove(coll, shallow_db->errors_hoid);
+ if (deep_db)
+ t->remove(coll, deep_db->errors_hoid);
}
-std::vector<bufferlist>
-Store::get_snap_errors(int64_t pool,
- const librados::object_id_t& start,
- uint64_t max_return) const
+
+std::vector<bufferlist> Store::get_snap_errors(
+ int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
{
- const string begin = (start.name.empty() ?
- first_snap_key(pool) : to_snap_key(pool, start));
+ vector<bufferlist> errors;
+ const string begin =
+ (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start));
const string end = last_snap_key(pool);
- return get_errors(begin, end, max_return);
+
+ // the snap errors are stored only in the shallow store
+ ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin);
+
+ while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) {
+ errors.push_back(latest_sh->data);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ }
+
+ return errors;
}
-std::vector<bufferlist>
-Store::get_object_errors(int64_t pool,
- const librados::object_id_t& start,
- uint64_t max_return) const
+
+std::vector<bufferlist> Store::get_object_errors(
+ int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
{
- const string begin = (start.name.empty() ?
- first_object_key(pool) : to_object_key(pool, start));
+ const string begin =
+ (start.name.empty() ? first_object_key(pool)
+ : to_object_key(pool, start));
const string end = last_object_key(pool);
+ dout(20) << fmt::format("fetching errors, from {} to {}", begin, end)
+ << dendl;
return get_errors(begin, end, max_return);
}
-std::vector<bufferlist>
-Store::get_errors(const string& begin,
- const string& end,
- uint64_t max_return) const
+
+inline void decode(
+ librados::inconsistent_obj_t& obj,
+ ceph::buffer::list::const_iterator& bp)
{
+ reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);
+}
+
+
+inconsistent_obj_wrapper decode_wrapper(
+ hobject_t obj,
+ ceph::buffer::list::const_iterator bp)
+{
+ inconsistent_obj_wrapper iow{obj};
+ iow.decode(bp);
+ return iow;
+}
+
+
+void Store::collect_specific_store(
+ MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+ Store::ExpCacherPosData& latest,
+ std::vector<bufferlist>& errors,
+ std::string_view end_key,
+ uint64_t max_return) const
+{
+ while (max_return-- && latest.has_value() &&
+ latest.value().last_key < end_key) {
+ errors.push_back(latest->data);
+ latest = backend.get_1st_after_key(latest->last_key);
+ }
+}
+
+
+/*
+ * Implementation notes:
+ * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f
+ * for why we encode the shard_info_t in the store.
+ * - to maintain known shard_info-s created during a deep scrub (but only when
+ * needed), we use our knowledge of the level of the last scrub performed
+ * (current_level), and the object user version as encoded in the error
+ * structure.
+ */
+bufferlist Store::merge_encoded_error_wrappers(
+ hobject_t obj,
+ ExpCacherPosData& latest_sh,
+ ExpCacherPosData& latest_dp) const
+{
+ // decode both error wrappers
+ auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin());
+ auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin());
+
+ // note: the '20' level is just until we're sure the merging works as
+ // expected
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ dout(20) << fmt::format(
+ "merging errors {}. Deep: {:#x}-({})", sh_wrap.object,
+ dp_wrap.errors, dp_wrap)
+ << dendl;
+ dout(20) << fmt::format(
+ "merging errors {}. Shallow: {:#x}-({})", sh_wrap.object,
+ sh_wrap.errors, sh_wrap)
+ << dendl;
+ // dev: list the attributes:
+ for (const auto& [shard, si] : sh_wrap.shards) {
+ for (const auto& [attr, bl] : si.attrs) {
+ dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr)
+ << dendl;
+ }
+ }
+ for (const auto& [shard, si] : dp_wrap.shards) {
+ for (const auto& [attr, bl] : si.attrs) {
+ dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr)
+ << dendl;
+ }
+ }
+ }
+
+ // Actual merging of the shard map entries is only performed if the
+ // latest version is from the shallow scrub.
+ // Otherwise, the deep scrub data - which (for the shards info) already
+ // contains all the data - is used as-is, and the shallow scrub data is ignored.
+ if (current_level == scrub_level_t::shallow) {
+ // is the object data related to the same object version?
+ if (sh_wrap.version == dp_wrap.version) {
+ // combine the error information
+ dp_wrap.errors |= sh_wrap.errors;
+ for (const auto& [shard, si] : sh_wrap.shards) {
+ if (dp_wrap.shards.contains(shard)) {
+ dout(20) << fmt::format(
+ "-----> {}-{} combining: sh-errors: {} dp-errors:{}",
+ sh_wrap.object, shard, si, dp_wrap.shards[shard])
+ << dendl;
+ const auto saved_er = dp_wrap.shards[shard].errors;
+ dp_wrap.shards[shard].selected_oi = si.selected_oi;
+ dp_wrap.shards[shard].primary = si.primary;
+ dp_wrap.shards[shard].errors |= saved_er;
+
+ // the attributes:
+ for (const auto& [attr, bl] : si.attrs) {
+ if (!dp_wrap.shards[shard].attrs.contains(attr)) {
+ dout(20) << fmt::format(
+ "-----> {}-{} copying shallow attr: attr: {}",
+ sh_wrap.object, shard, attr)
+ << dendl;
+ dp_wrap.shards[shard].attrs[attr] = bl;
+ }
+ // otherwise - we'll ignore the shallow attr buffer
+ }
+ } else {
+ // the deep scrub data for this shard is missing. We take the shallow
+ // scrub data.
+ dp_wrap.shards[shard] = si;
+ }
+ }
+ } else if (sh_wrap.version > dp_wrap.version) {
+ if (false && dp_wrap.version == 0) {
+ // there was a read error in the deep scrub. The deep version
+ // shows as '0'. That's severe enough for us to ignore the shallow.
+ dout(10) << fmt::format("{} ignoring deep after read failure",
+ sh_wrap.object)
+ << dendl;
+ } else {
+ // There is a new shallow version of the object results.
+ // The deep data is for an older version of that object.
+ // There are multiple possibilities here, but for now we ignore the
+ // deep data.
+ dp_wrap = sh_wrap;
+ }
+ }
+ }
+
+ return dp_wrap.encode();
+}
+
+
+// a better way to implement get_errors(): use two generators, one for each store,
+// and sort-merge the results - almost a merge-sort, but with equal
+// keys combined. 'todo' once 'ranges' are really working.
+
+std::vector<bufferlist> Store::get_errors(
+ const std::string& from_key,
+ const std::string& end_key,
+ uint64_t max_return) const
+{
+ // merge the input from the two sorted DBs into 'errors' (until
+ // enough errors are collected)
vector<bufferlist> errors;
- auto next = std::make_pair(begin, bufferlist{});
- while (max_return && !backend.get_next(next.first, &next)) {
- if (next.first >= end)
+ dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key)
+ << dendl;
+
+ ceph_assert(shallow_db);
+ ceph_assert(deep_db);
+ ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key);
+ ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key);
+
+ while (max_return) {
+ dout(20) << fmt::format(
+ "n:{} latest_sh: {}, latest_dp: {}", max_return,
+ (latest_sh ? latest_sh->last_key : "(none)"),
+ (latest_dp ? latest_dp->last_key : "(none)"))
+ << dendl;
+
+ // keys not smaller than end_key are not interesting
+ if (latest_sh.has_value() && latest_sh->last_key >= end_key) {
+ latest_sh = tl::unexpected(-EINVAL);
+ }
+ if (latest_dp.has_value() && latest_dp->last_key >= end_key) {
+ latest_dp = tl::unexpected(-EINVAL);
+ }
+
+ if (!latest_sh && !latest_dp) {
+ // both stores are exhausted
+ break;
+ }
+ if (!latest_sh.has_value()) {
+ // continue with the deep store
+ dout(10) << fmt::format("collecting from deep store") << dendl;
+ collect_specific_store(
+ deep_db->backend, latest_dp, errors, end_key, max_return);
break;
- errors.push_back(next.second);
+ }
+ if (!latest_dp.has_value()) {
+ // continue with the shallow store
+ dout(10) << fmt::format("collecting from shallow store") << dendl;
+ collect_specific_store(
+ shallow_db->backend, latest_sh, errors, end_key, max_return);
+ break;
+ }
+
+ // we have results from both stores. Select the one with a lower key.
+ // If the keys are equal, combine the errors.
+ if (latest_sh->last_key == latest_dp->last_key) {
+ auto bl = merge_encoded_error_wrappers(
+ shallow_db->errors_hoid.hobj, latest_sh, latest_dp);
+ errors.push_back(bl);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
+
+ } else if (latest_sh->last_key < latest_dp->last_key) {
+ dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key)
+ << dendl;
+ errors.push_back(latest_sh->data);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ } else {
+ dout(20) << fmt::format("deep store element ({})", latest_dp->last_key)
+ << dendl;
+ errors.push_back(latest_dp->data);
+ latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
+ }
max_return--;
}
+
+ dout(10) << fmt::format("{} errors reported", errors.size()) << dendl;
return errors;
}
-
-} // namespace Scrub
+} // namespace Scrub
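
A minimal sketch of the sort-merge that the new get_errors() performs over the two key-sorted error stores, expressed here in Python with plain dicts standing in for the OMAP-backed MapCacher backends and merge() standing in for merge_encoded_error_wrappers() (an illustration only, not part of the patch):

    def merge_sorted_stores(shallow, deep, begin, end, max_return, merge):
        """Walk two key-sorted mappings and return up to max_return values
        whose keys fall in [begin, end), combining entries with equal keys."""
        sh = iter(sorted(k for k in shallow if begin <= k < end))
        dp = iter(sorted(k for k in deep if begin <= k < end))
        k_sh, k_dp = next(sh, None), next(dp, None)
        out = []
        while max_return and (k_sh is not None or k_dp is not None):
            if k_dp is None or (k_sh is not None and k_sh < k_dp):
                out.append(shallow[k_sh])            # only the shallow store has this key
                k_sh = next(sh, None)
            elif k_sh is None or k_dp < k_sh:
                out.append(deep[k_dp])               # only the deep store has this key
                k_dp = next(dp, None)
            else:                                    # equal keys: combine the two entries
                out.append(merge(shallow[k_sh], deep[k_dp]))
                k_sh, k_dp = next(sh, None), next(dp, None)
            max_return -= 1
        return out

    # e.g. merge_sorted_stores({'a': 1, 'c': 3}, {'b': 2, 'c': 30},
    #                          'a', 'z', 10, lambda s, d: s + d) -> [1, 2, 33]
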
diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h
index 567badf608b..0955654d78e 100644
--- a/src/osd/scrubber/ScrubStore.h
+++ b/src/osd/scrubber/ScrubStore.h
@@ -1,10 +1,9 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_SCRUB_RESULT_H
-#define CEPH_SCRUB_RESULT_H
+#pragma once
#include "common/map_cacher.hpp"
+#include "osd/osd_types_fmt.h"
#include "osd/SnapMapper.h" // for OSDriver
namespace librados {
@@ -13,27 +12,71 @@ struct object_id_t;
struct inconsistent_obj_wrapper;
struct inconsistent_snapset_wrapper;
+class PgScrubber;
namespace Scrub {
+/**
+ * Storing errors detected during scrubbing.
+ *
+ * From both functional and internal perspectives, the store is a pair of key-value
+ * databases: one maps objects to shallow errors detected during their scrubbing,
+ * and the other stores deep errors.
+ * Note that the first store is updated by both shallow and deep scrubs, while the
+ * second is updated only by deep scrubs.
+ *
+ * The DBs can be consulted by the operator when listing the 'errors known
+ * at this point in time'. Whenever a scrub starts - the relevant entries in the
+ * DBs are removed. Specifically - the shallow errors DB is recreated each scrub,
+ * while the deep errors DB is recreated only when a deep scrub starts.
+ *
+ * When queried - the data from both DBs is merged for each named object, and
+ * returned to the operator.
+ *
+ * Implementation:
+ * Each of the two DBs is implemented as OMAP entries of a single, uniquely named
+ * object. Both DBs are cached using the general KV Cache mechanism.
+ */
+
class Store {
public:
~Store();
- static Store* create(ObjectStore* store,
- ObjectStore::Transaction* t,
- const spg_t& pgid,
- const coll_t& coll);
+
+ Store(
+ PgScrubber& scrubber,
+ ObjectStore& osd_store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll);
+
+
+ /// record detected errors, either shallow or deep
void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+
void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
// and a variant-friendly interface:
void add_error(int64_t pool, const inconsistent_obj_wrapper& e);
void add_error(int64_t pool, const inconsistent_snapset_wrapper& e);
- bool empty() const;
+ [[nodiscard]] bool is_empty() const;
void flush(ObjectStore::Transaction*);
+
+ /// remove both the shallow and the deep errors DBs. Called on interval change.
void cleanup(ObjectStore::Transaction*);
+ /**
+ * prepare the Store object for a new scrub session.
+ * This involves clearing one or both of the errors DBs, and resetting
+ * the cache.
+ *
+ * @param level: the scrub level to prepare for. Whenever a deep scrub
+ * is requested, both the shallow and deep errors DBs are cleared.
+ * If, on the other hand, a shallow scrub is requested, only the shallow
+ * errors DB is cleared.
+ */
+ void reinit(ObjectStore::Transaction* t, scrub_level_t level);
+
std::vector<ceph::buffer::list> get_snap_errors(
int64_t pool,
const librados::object_id_t& start,
@@ -44,20 +87,89 @@ class Store {
const librados::object_id_t& start,
uint64_t max_return) const;
+ std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const;
+
private:
- Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
- std::vector<ceph::buffer::list> get_errors(const std::string& start,
- const std::string& end,
- uint64_t max_return) const;
- private:
+ /**
+ * at_level_t
+ *
+ * The machinery for caching and storing errors at a specific scrub level.
+ */
+ struct at_level_t {
+ at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr)
+ : errors_hoid{err_obj}
+ , driver{std::move(drvr)}
+ , backend{&driver}
+ {}
+
+ /// the object in the PG store, where the errors are stored
+ ghobject_t errors_hoid;
+
+ /// abstracted key fetching
+ OSDriver driver;
+
+ /// a K,V cache for the errors that are detected during the scrub
+ /// session. The errors marked for a specific object are stored as
+ /// an OMap entry with the object's name as the key.
+ MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+
+ /// a temp object mapping seq-id to inconsistencies
+ std::map<std::string, ceph::buffer::list> results;
+ };
+
+ using CacherPosData =
+ MapCacher::MapCacher<std::string, ceph::buffer::list>::PosAndData;
+ using ExpCacherPosData = tl::expected<CacherPosData, int>;
+
+ /// access to the owning Scrubber object, for logging mostly
+ PgScrubber& m_scrubber;
+
+ /// the OSD's storage backend
+ ObjectStore& object_store;
+
+ /// the collection (i.e. - the PG store) in which the errors are stored
const coll_t coll;
- const ghobject_t hoid;
- // a temp object holding mappings from seq-id to inconsistencies found in
- // scrubbing
- OSDriver driver;
- mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
- std::map<std::string, ceph::buffer::list> results;
+
+ scrub_level_t current_level;
+
+ /**
+ * the machinery (backend details, cache, etc.) for storing both levels
+ * of errors (note: 'optional' to allow delayed creation w/o dynamic
+ * allocations; and 'mutable', as the caching mechanism is used in const
+ * methods)
+ */
+ mutable std::optional<at_level_t> shallow_db;
+ mutable std::optional<at_level_t> deep_db;
+
+ std::vector<ceph::buffer::list> get_errors(
+ const std::string& start,
+ const std::string& end,
+ uint64_t max_return) const;
+
+ void collect_specific_store(
+ MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+ ExpCacherPosData& latest,
+ std::vector<bufferlist>& errors,
+ std::string_view end_key,
+ uint64_t max_return) const;
+
+ /**
+ * Clear the DB of errors at a specific scrub level by performing an
+ * omap_clear() on the DB object, and resetting the MapCacher.
+ */
+ void clear_level_db(
+ ObjectStore::Transaction* t,
+ at_level_t& db,
+ std::string_view db_name);
+
+ /**
+ * merge the two error wrappers - fetched from both DBs for the same object.
+ * Specifically, the object errors are or'ed, and so are the per-shard
+ * entries.
+ */
+ bufferlist merge_encoded_error_wrappers(
+ hobject_t obj,
+ ExpCacherPosData& latest_sh,
+ ExpCacherPosData& latest_dp) const;
};
} // namespace Scrub
-
-#endif // CEPH_SCRUB_RESULT_H
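
A sketch of the clearing rule that reinit() documents above, modeled in Python (the dicts below are illustrative stand-ins for the two OMAP-backed error objects, not part of the patch): every scrub clears the shallow errors DB, while the deep errors DB is cleared only when a deep scrub starts.

    class TwoLevelErrorStore:
        """Illustrative stand-in for Scrub::Store's two error DBs."""
        def __init__(self):
            self.shallow = {}       # cleared on every scrub
            self.deep = {}          # cleared only when a deep scrub starts
            self.current_level = None

        def reinit(self, level):
            # mirrors Store::reinit(): shallow always cleared, deep only for deep scrubs
            self.shallow.clear()
            if level == 'deep':
                self.deep.clear()
            self.current_level = level

    # e.g. errors recorded by a previous deep scrub stay listable across a
    # later shallow scrub, but are dropped once a new deep scrub begins.
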
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 555d13ba72b..594ffb15e2b 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica,
m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
}
+// only called on interval change. Both DBs are to be removed.
void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
{
if (!m_store)
@@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
ceph_assert(!m_store);
}
+
+void PgScrubber::reinit_scrub_store()
+{
+ // On entry, 0 to 3 of the following objects(*) may exist:
+ // ((*)'objects' here: both code objects (the ScrubStore object) and
+ // actual Object Store objects).
+ // 1. The ScrubStore object itself.
+ // 2,3. The two special hobjects in the coll (the PG data) holding the last
+ // scrub's results.
+ //
+ // The Store object can be deleted and recreated, as a way to guarantee
+ // no junk is left. We won't do it here, but we will clear the at_level_t
+ // structures.
+ // The hobjects may be cleared: the shallow DB object is always cleared; the
+ // deep one - only if running a deep scrub.
+ ObjectStore::Transaction t;
+ if (m_store) {
+ dout(10) << __func__ << " reusing existing store" << dendl;
+ m_store->flush(&t);
+ } else {
+ dout(10) << __func__ << " creating new store" << dendl;
+ m_store = std::make_unique<Scrub::Store>(
+ *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll);
+ }
+
+ // regardless of whether the ScrubStore object was recreated or reused, we need to
+ // (possibly) clear the actual DB objects in the Object Store.
+ m_store->reinit(&t, m_active_target->level());
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+}
+
+
void PgScrubber::on_init()
{
// going upwards from 'inactive'
@@ -1217,14 +1250,8 @@ void PgScrubber::on_init()
m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow,
m_pg->get_actingset());
- // create a new store
- {
- ObjectStore::Transaction t;
- cleanup_store(&t);
- m_store.reset(
- Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
- m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
- }
+ // create or reuse the 'known errors' store
+ reinit_scrub_store();
m_start = m_pg->info.pgid.pgid.get_hobj_start();
m_active = true;
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index ff8c98d387e..1a5813bd923 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -771,6 +771,16 @@ class PgScrubber : public ScrubPgIF,
std::unique_ptr<Scrub::Store> m_store;
+ /**
+ * the ScrubStore sub-object caches and manages the database of known
+ * scrub errors. reinit_scrub_store() clears the database and re-initializes
+ * the ScrubStore object.
+ *
+ * in the next iteration - reinit_..() potentially deletes only the
+ * shallow errors part of the database.
+ */
+ void reinit_scrub_store();
+
int num_digest_updates_pending{0};
hobject_t m_start, m_end; ///< note: half-closed: [start,end)
diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h
index 4a574ed66d9..d15862c08ba 100644
--- a/src/osdc/Journaler.h
+++ b/src/osdc/Journaler.h
@@ -529,43 +529,35 @@ public:
// ===================
Header get_last_committed() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return last_committed;
}
Header get_last_written() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return last_written;
}
uint64_t get_layout_period() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return layout.get_period();
}
file_layout_t get_layout() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return layout;
}
bool is_active() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return state == STATE_ACTIVE;
}
bool is_stopping() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return state == STATE_STOPPING;
}
int get_error() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return error;
}
bool is_readonly() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return readonly;
}
@@ -573,32 +565,26 @@ public:
bool _is_readable();
bool try_read_entry(bufferlist& bl);
uint64_t get_write_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return write_pos;
}
uint64_t get_write_safe_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return safe_pos;
}
uint64_t get_read_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return read_pos;
}
uint64_t get_expire_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return expire_pos;
}
uint64_t get_trimmed_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return trimmed_pos;
}
size_t get_journal_envelope_size() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return journal_stream.get_envelope_size();
}
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 5216c489064..f8f0efc9d28 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -135,13 +135,13 @@ DEFAULT_IMAGE = 'quay.io/ceph/ceph'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0'
+DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
+DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
+DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
@@ -446,7 +446,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
Option(
'default_registry',
type='str',
- default='docker.io',
+ default='quay.io',
desc='Search-registry to which we should normalize unqualified image names. '
'This is not the default registry',
),
@@ -764,6 +764,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
+ self.rgw_service: RgwService = cast(RgwService, self.cephadm_services['rgw'])
self.scheduled_async_actions: List[Callable] = []
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index c6212c9efb8..611c27c3453 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -950,6 +950,10 @@ class CephadmServe:
)
continue
+ # set multisite config before deploying the rgw daemon
+ if service_type == 'rgw':
+ self.mgr.rgw_service.set_realm_zg_zone(cast(RGWSpec, spec))
+
# deploy new daemon
daemon_id = slot.name
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index eb9a1c838a6..9043577bc5a 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -984,10 +984,9 @@ class RgwService(CephService):
def allow_colo(self) -> bool:
return True
- def config(self, spec: RGWSpec) -> None: # type: ignore
+ def set_realm_zg_zone(self, spec: RGWSpec) -> None:
assert self.TYPE == spec.service_type
- # set rgw_realm rgw_zonegroup and rgw_zone, if present
if spec.rgw_realm:
ret, out, err = self.mgr.check_mon_command({
'prefix': 'config set',
@@ -1010,6 +1009,12 @@ class RgwService(CephService):
'value': spec.rgw_zone,
})
+ def config(self, spec: RGWSpec) -> None: # type: ignore
+ assert self.TYPE == spec.service_type
+
+ # set rgw_realm, rgw_zonegroup and rgw_zone, if present
+ self.set_realm_zg_zone(spec)
+
if spec.generate_cert and not spec.rgw_frontend_ssl_certificate:
# generate a self-signed cert for the rgw service
cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames)
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 8b15aace373..162815da24c 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -123,10 +123,9 @@ class NvmeofService(CephService):
gateways = json.loads(out)['gateways']
cmd_dicts = []
- spec = cast(NvmeofServiceSpec,
- self.mgr.spec_store.all_specs.get(daemon_descrs[0].service_name(), None))
-
for dd in daemon_descrs:
+ spec = cast(NvmeofServiceSpec,
+ self.mgr.spec_store.all_specs.get(dd.service_name(), None))
service_name = dd.service_name()
if dd.hostname is None:
err_msg = ('Trying to config_dashboard nvmeof but no hostname is defined')
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
index 260e7418e2d..f33bc6c8dfd 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
@@ -46,6 +46,12 @@ server {
# add_header Content-Security-Policy "default-src 'self'; script-src 'self'; object-src 'none'; base-uri 'none'; require-trusted-types-for 'script'; frame-ancestors 'self';";
{% endif %}
+{% if spec.enable_health_check_endpoint %}
+ location /health {
+ return 200 'OK';
+ add_header Content-Type text/plain;
+ }
+{% endif %}
{% if oauth2_proxy_url %}
location /oauth2/ {
proxy_pass {{ oauth2_proxy_url }};
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
index f2c32f87977..9148ddc4a14 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
@@ -1,5 +1,8 @@
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen {{ internal_port }} ssl;
listen [::]:{{ internal_port }} ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
@@ -9,6 +12,12 @@ server {
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305;
ssl_prefer_server_ciphers on;
+{% if spec.enable_health_check_endpoint %}
+ location /health {
+ return 200 'OK';
+ add_header Content-Type text/plain;
+ }
+{% endif %}
{% if dashboard_endpoints %}
location /internal/dashboard {
rewrite ^/internal/dashboard/(.*) /$1 break;
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 5a485f98be3..975c125225d 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -2040,7 +2040,7 @@ class TestCephadm(object):
), CephadmOrchestrator.apply_iscsi),
(CustomContainerSpec(
service_id='hello-world',
- image='docker.io/library/hello-world:latest',
+ image='quay.io/hello-world/hello-world:latest',
uid=65534,
gid=65534,
dirs=['foo/bar'],
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index a9b7da624a0..b874161f109 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -3446,6 +3446,9 @@ class TestMgmtGateway:
}"""),
"nginx_internal_server.conf": dedent("""
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen 29443 ssl;
listen [::]:29443 ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
@@ -3760,6 +3763,9 @@ class TestMgmtGateway:
}"""),
"nginx_internal_server.conf": dedent("""
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen 29443 ssl;
listen [::]:29443 ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py
index 78a2d73118f..42e590945cd 100644
--- a/src/pybind/mgr/cephadm/tests/test_spec.py
+++ b/src/pybind/mgr/cephadm/tests/test_spec.py
@@ -130,7 +130,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "d94d7969094d",
"container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0",
- "container_image_name": "docker.io/prom/alertmanager:latest",
+ "container_image_name": "quay.io/prometheus/alertmanager:latest",
"daemon_id": "ceph-001",
"daemon_type": "alertmanager",
"version": "0.20.0",
@@ -145,7 +145,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "c4b036202241",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001",
"daemon_type": "crash",
"version": "15.2.0",
@@ -160,7 +160,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "5b7b94b48f31",
"container_image_id": "87a51ecf0b1c9a7b187b21c1b071425dafea0d765a96d5bc371c791169b3d7f4",
- "container_image_name": "docker.io/ceph/ceph-grafana:latest",
+ "container_image_name": "quay.io/ceph/ceph-grafana:latest",
"daemon_id": "ceph-001",
"daemon_type": "grafana",
"version": "6.6.2",
@@ -175,7 +175,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "9ca007280456",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001.gkjwqp",
"daemon_type": "mgr",
"version": "15.2.0",
@@ -190,7 +190,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "3d1ba9a2b697",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001",
"daemon_type": "mon",
"version": "15.2.0",
@@ -205,7 +205,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "36d026c68ba1",
"container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87",
- "container_image_name": "docker.io/prom/node-exporter:latest",
+ "container_image_name": "quay.io/prometheus/node-exporter:latest",
"daemon_id": "ceph-001",
"daemon_type": "node-exporter",
"version": "0.18.1",
@@ -220,7 +220,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "faf76193cbfe",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "0",
"daemon_type": "osd",
"version": "15.2.0",
@@ -235,7 +235,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "f82505bae0f1",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "1",
"daemon_type": "osd",
"version": "15.2.0",
@@ -250,7 +250,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "2708d84cd484",
"container_image_id": "358a0d2395fe711bb8258e8fb4b2d7865c0a9a6463969bcd1452ee8869ea6653",
- "container_image_name": "docker.io/prom/prometheus:latest",
+ "container_image_name": "quay.io/prom/prometheus:latest",
"daemon_id": "ceph-001",
"daemon_type": "prometheus",
"version": "2.17.1",
@@ -569,7 +569,7 @@ def test_dd_octopus(dd_json):
CustomContainerSpec(
service_type='container',
service_id='hello-world',
- image='docker.io/library/hello-world:latest',
+ image='quay.io/hello-world/hello-world:latest',
),
DaemonDescription(
daemon_type='container',
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index d8ffab2da51..ed3d26807e5 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -29,17 +29,17 @@ CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
def normalize_image_digest(digest: str, default_registry: str) -> str:
"""
Normal case:
- >>> normalize_image_digest('ceph/ceph', 'docker.io')
- 'docker.io/ceph/ceph'
+ >>> normalize_image_digest('ceph/ceph', 'quay.io')
+ 'quay.io/ceph/ceph'
No change:
- >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io')
+ >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io')
'quay.ceph.io/ceph/ceph'
- >>> normalize_image_digest('docker.io/ubuntu', 'docker.io')
- 'docker.io/ubuntu'
+ >>> normalize_image_digest('quay.io/centos', 'quay.io')
+ 'quay.io/centos'
- >>> normalize_image_digest('localhost/ceph', 'docker.io')
+ >>> normalize_image_digest('localhost/ceph', 'quay.io')
'localhost/ceph'
"""
known_shortnames = [
diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py
index ec9c9897081..f199867943d 100644
--- a/src/pybind/mgr/dashboard/controllers/nvmeof.py
+++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py
@@ -63,7 +63,10 @@ else:
@EndpointDoc(
"Get information from a specific NVMeoF subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_model(model.Subsystem, first="subsystems")
@handle_nvmeof_error
@@ -78,6 +81,7 @@ else:
"nqn": Param(str, "NVMeoF subsystem NQN"),
"max_namespaces": Param(int, "Maximum number of namespaces", True, 1024),
"enable_ha": Param(bool, "Enable high availability"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -95,6 +99,7 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"force": Param(bool, "Force delete", "false"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -111,12 +116,15 @@ else:
class NVMeoFListener(RESTController):
@EndpointDoc(
"List all NVMeoF listeners",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Listener, pick="listeners")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_listeners(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_listeners(
NVMeoFClient.pb2.list_listeners_req(subsystem=nqn)
)
@@ -128,6 +136,7 @@ else:
"traddr": Param(str, "NVMeoF transport address"),
"trsvcid": Param(int, "NVMeoF transport service port", True, 4420),
"adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -138,9 +147,10 @@ else:
host_name: str,
traddr: str,
trsvcid: int = 4420,
- adrfam: int = 0, # IPv4
+ adrfam: int = 0, # IPv4
+ gw_group: Optional[str] = None
):
- return NVMeoFClient().stub.create_listener(
+ return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.create_listener(
NVMeoFClient.pb2.create_listener_req(
nqn=nqn,
host_name=host_name,
@@ -158,6 +168,7 @@ else:
"traddr": Param(str, "NVMeoF transport address"),
"trsvcid": Param(int, "NVMeoF transport service port", True, 4420),
"adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -170,8 +181,9 @@ else:
trsvcid: int = 4420,
adrfam: int = 0, # IPv4
force: bool = False,
+ gw_group: Optional[str] = None
):
- return NVMeoFClient().stub.delete_listener(
+ return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.delete_listener(
NVMeoFClient.pb2.delete_listener_req(
nqn=nqn,
host_name=host_name,
@@ -187,12 +199,15 @@ else:
class NVMeoFNamespace(RESTController):
@EndpointDoc(
"List all NVMeoF namespaces in a subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Namespace, pick="namespaces")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_namespaces(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_namespaces(
NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn)
)
@@ -201,12 +216,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.Namespace, first="namespaces")
@handle_nvmeof_error
- def get(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.list_namespaces(
+ def get(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_namespaces(
NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn, nsid=int(nsid))
)
@@ -217,12 +233,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.NamespaceIOStats)
@handle_nvmeof_error
- def io_stats(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.namespace_get_io_stats(
+ def io_stats(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_get_io_stats(
NVMeoFClient.pb2.namespace_get_io_stats_req(
subsystem_nqn=nqn, nsid=int(nsid))
)
@@ -237,6 +254,7 @@ else:
"size": Param(int, "RBD image size"),
"block_size": Param(int, "NVMeoF namespace block size"),
"load_balancing_group": Param(int, "Load balancing group"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.NamespaceCreation)
@@ -250,8 +268,9 @@ else:
size: Optional[int] = 1024,
block_size: int = 512,
load_balancing_group: Optional[int] = None,
+ gw_group: Optional[str] = None,
):
- return NVMeoFClient().stub.namespace_add(
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_add(
NVMeoFClient.pb2.namespace_add_req(
subsystem_nqn=nqn,
rbd_image_name=rbd_image_name,
@@ -274,6 +293,7 @@ else:
"rw_mbytes_per_second": Param(int, "Read/Write MB/s"),
"r_mbytes_per_second": Param(int, "Read MB/s"),
"w_mbytes_per_second": Param(int, "Write MB/s"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -288,12 +308,13 @@ else:
rw_mbytes_per_second: Optional[int] = None,
r_mbytes_per_second: Optional[int] = None,
w_mbytes_per_second: Optional[int] = None,
+ gw_group: Optional[str] = None
):
if rbd_image_size:
mib = 1024 * 1024
new_size_mib = int((rbd_image_size + mib - 1) / mib)
- response = NVMeoFClient().stub.namespace_resize(
+ response = NVMeoFClient(gw_group=gw_group).stub.namespace_resize(
NVMeoFClient.pb2.namespace_resize_req(
subsystem_nqn=nqn, nsid=int(nsid), new_size=new_size_mib
)
@@ -336,12 +357,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def delete(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.namespace_delete(
+ def delete(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_delete(
NVMeoFClient.pb2.namespace_delete_req(subsystem_nqn=nqn, nsid=int(nsid))
)
@@ -351,7 +373,10 @@ else:
class NVMeoFHost(RESTController):
@EndpointDoc(
"List all allowed hosts for an NVMeoF subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(
model.Host,
@@ -362,8 +387,8 @@ else:
else o,
)
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_hosts(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_hosts(
NVMeoFClient.pb2.list_hosts_req(subsystem=nqn)
)
@@ -372,12 +397,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to allow any host.'),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def create(self, nqn: str, host_nqn: str):
- return NVMeoFClient().stub.add_host(
+ def create(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.add_host(
NVMeoFClient.pb2.add_host_req(subsystem_nqn=nqn, host_nqn=host_nqn)
)
@@ -386,12 +412,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to disallow any host.'),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def delete(self, nqn: str, host_nqn: str):
- return NVMeoFClient().stub.remove_host(
+ def delete(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.remove_host(
NVMeoFClient.pb2.remove_host_req(subsystem_nqn=nqn, host_nqn=host_nqn)
)
@@ -400,12 +427,15 @@ else:
class NVMeoFConnection(RESTController):
@EndpointDoc(
"List all NVMeoF Subsystem Connections",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Connection, pick="connections")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_connections(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_connections(
NVMeoFClient.pb2.list_connections_req(subsystem=nqn)
)
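
The controller changes above all follow one pattern: each endpoint gains an optional gw_group argument (defaulting to None so existing callers are unaffected), which is forwarded to the NVMeoFClient constructor to select the gateway group. A stripped-down Python sketch of that pattern (the client class here is a stand-in stub, not the dashboard's real NVMeoFClient):

    from typing import Optional

    class GatewayClient:
        """Stand-in for a per-gateway-group client."""
        def __init__(self, gw_group: Optional[str] = None):
            self.gw_group = gw_group or 'default'   # None keeps the old behaviour

    def list_hosts(nqn: str, gw_group: Optional[str] = None):
        client = GatewayClient(gw_group=gw_group)
        return f'hosts of {nqn} via gateway group {client.gw_group}'

    # old callers: list_hosts('nqn.2016-06.io.spdk:cnode1')
    # new callers: list_hosts('nqn.2016-06.io.spdk:cnode1', gw_group='group-a')
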
diff --git a/src/pybind/mgr/dashboard/controllers/osd.py b/src/pybind/mgr/dashboard/controllers/osd.py
index c9d14177200..07d8db7755b 100644
--- a/src/pybind/mgr/dashboard/controllers/osd.py
+++ b/src/pybind/mgr/dashboard/controllers/osd.py
@@ -5,12 +5,14 @@ import logging
import time
from typing import Any, Dict, List, Optional, Union
+import cherrypy
from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError # type: ignore
from mgr_util import get_most_recent_rate
from .. import mgr
from ..exceptions import DashboardException
from ..security import Scope
+from ..services._paginate import ListPaginator
from ..services.ceph_service import CephService, SendCommandError
from ..services.exception import handle_orchestrator_error, handle_send_command_error
from ..services.orchestrator import OrchClient, OrchFeature
@@ -121,8 +123,30 @@ def osd_task(name, metadata, wait_for=2.0):
@APIRouter('/osd', Scope.OSD)
@APIDoc('OSD management API', 'OSD')
class Osd(RESTController):
- def list(self):
- osds = self.get_osd_map()
+ @RESTController.MethodMap(version=APIVersion(1, 1))
+ def list(self, offset: int = 0, limit: int = 10,
+ search: str = '', sort: str = ''):
+ all_osds = self.get_osd_map()
+
+ paginator = ListPaginator(int(offset), int(limit), sort, search,
+ input_list=all_osds.values(),
+ searchable_params=['id'],
+ sortable_params=['id'],
+ default_sort='+id')
+
+ cherrypy.response.headers['X-Total-Count'] = paginator.get_count()
+
+ paginated_osds_list = list(paginator.list())
+ # creating a dictionary to have faster lookups
+ paginated_osds_by_id = {osd['id']: osd for osd in paginated_osds_list}
+ try:
+ osds = {
+ key: paginated_osds_by_id[int(key)]
+ for key in all_osds.keys()
+ if int(key) in paginated_osds_by_id
+ }
+ except ValueError as e:
+ raise DashboardException(e, component='osd', http_status_code=400)
# Extending by osd stats information
for stat in mgr.get('osd_stats')['osd_stats']:
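
The paginated list() above delegates the heavy lifting to the imported ListPaginator; roughly, a paginator of this kind filters by a search term, sorts by a '+field'/'-field' spec, then slices by offset/limit, reporting the pre-slice count for the X-Total-Count header. A rough Python sketch of that behaviour (parameter names and semantics are assumptions for illustration, not the dashboard's actual API):

    def paginate(items, offset=0, limit=10, sort='+id', search=''):
        """Filter, sort and slice a list of dicts; returns (page, total)."""
        if search:
            items = [i for i in items if search in str(i.get('id', ''))]
        field = sort.lstrip('+-') or 'id'
        items = sorted(items, key=lambda i: i.get(field), reverse=sort.startswith('-'))
        total = len(items)               # what X-Total-Count would report
        if int(limit) < 0:               # negative limit: return everything
            return items, total
        return items[int(offset):int(offset) + int(limit)], total

    # e.g. paginate([{'id': 3}, {'id': 1}, {'id': 2}], offset=0, limit=2)
    #      -> ([{'id': 1}, {'id': 2}], 3)
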
diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py
index 8667d469060..2e6e466f97b 100755
--- a/src/pybind/mgr/dashboard/controllers/rgw.py
+++ b/src/pybind/mgr/dashboard/controllers/rgw.py
@@ -162,9 +162,9 @@ class RgwMultisiteController(RESTController):
@ReadPermission
@allow_empty_body
# pylint: disable=W0102,W0613
- def get_sync_status(self):
+ def get_sync_status(self, daemon_name=None):
multisite_instance = RgwMultisite()
- result = multisite_instance.get_multisite_sync_status()
+ result = multisite_instance.get_multisite_sync_status(daemon_name)
return result
@Endpoint(path='/sync-policy')
@@ -176,6 +176,15 @@ class RgwMultisiteController(RESTController):
if all_policy:
sync_policy_list = []
buckets = json.loads(RgwBucket().list(stats=False))
+ zonegroups_info = RgwMultisite().get_all_zonegroups_info()
+ default_zonegroup = ''
+ if 'zonegroups' in zonegroups_info and 'default_zonegroup' in zonegroups_info:
+ default_zonegroup = next(
+ (zonegroup['name'] for zonegroup in zonegroups_info['zonegroups']
+ if 'id' in zonegroup and 'name' in zonegroup
+ and zonegroup['id'] == zonegroups_info['default_zonegroup']),
+ ''
+ )
for bucket in buckets:
sync_policy = multisite_instance.get_sync_policy(bucket, zonegroup_name)
for policy in sync_policy['groups']:
@@ -183,6 +192,7 @@ class RgwMultisiteController(RESTController):
sync_policy_list.append(policy)
other_sync_policy = multisite_instance.get_sync_policy(bucket_name, zonegroup_name)
for policy in other_sync_policy['groups']:
+ policy['zonegroup'] = default_zonegroup
sync_policy_list.append(policy)
return sync_policy_list
return multisite_instance.get_sync_policy(bucket_name, zonegroup_name)
@@ -244,11 +254,13 @@ class RgwMultisiteController(RESTController):
source_zones: Dict[str, Any],
destination_zones: Dict[str, Any],
source_bucket: str = '',
- destination_bucket: str = '', bucket_name: str = ''):
+ destination_bucket: str = '', bucket_name: str = '',
+ user: str = '', mode: str = ''):
multisite_instance = RgwMultisite()
return multisite_instance.create_sync_pipe(group_id, pipe_id, source_zones,
destination_zones, source_bucket,
- destination_bucket, bucket_name, True)
+ destination_bucket, bucket_name, True,
+ user, mode)
@Endpoint(method='DELETE', path='/sync-pipe')
@EndpointDoc("Remove the sync pipe")
@@ -256,12 +268,10 @@ class RgwMultisiteController(RESTController):
def remove_sync_pipe(self, group_id: str, pipe_id: str,
source_zones: Optional[List[str]] = None,
destination_zones: Optional[List[str]] = None,
- destination_bucket: str = '',
bucket_name: str = ''):
multisite_instance = RgwMultisite()
return multisite_instance.remove_sync_pipe(group_id, pipe_id, source_zones,
- destination_zones, destination_bucket,
- bucket_name, True)
+ destination_zones, bucket_name, True)
@APIRouter('/rgw/daemon', Scope.RGW)
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts
index c966da9b9c2..0ddb8e2f611 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts
@@ -52,7 +52,7 @@ export class NvmeofGatewayComponent {
prop: 'id'
},
{
- name: $localize`Host name`,
+ name: $localize`Hostname`,
prop: 'hostname'
},
{
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
index f88442e1bd6..974727ad062 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
@@ -76,7 +76,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges {
name: this.actionLabels.DELETE,
permission: 'delete',
icon: Icons.destroy,
- click: () => this.deleteSubsystemModal()
+ click: () => this.deleteListenerModal()
}
];
}
@@ -101,7 +101,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges {
});
}
- deleteSubsystemModal() {
+ deleteListenerModal() {
const listener = this.selection.first();
this.modalService.show(CriticalConfirmationModalComponent, {
itemDescription: 'Listener',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts
index 61e28274048..269e427be50 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts
@@ -79,25 +79,8 @@ export class NvmeofSubsystemsComponent extends ListWithDetails implements OnInit
this.router.navigate([BASE_URL, { outlets: { modal: [URLVerbs.CREATE] } }], {
queryParams: { group: this.group }
}),
- canBePrimary: (selection: CdTableSelection) => !selection.hasSelection
- },
- {
- name: this.actionLabels.EDIT,
- permission: 'update',
- icon: Icons.edit,
- click: () =>
- this.router.navigate([
- BASE_URL,
- {
- outlets: {
- modal: [
- URLVerbs.EDIT,
- this.selection.first().nqn,
- this.selection.first().max_namespaces
- ]
- }
- }
- ])
+ canBePrimary: (selection: CdTableSelection) => !selection.hasSelection,
+ disable: () => !this.group
},
{
name: this.actionLabels.DELETE,
@@ -114,12 +97,16 @@ export class NvmeofSubsystemsComponent extends ListWithDetails implements OnInit
}
getSubsystems() {
- this.nvmeofService
- .listSubsystems(this.group)
- .subscribe((subsystems: NvmeofSubsystem[] | NvmeofSubsystem) => {
- if (Array.isArray(subsystems)) this.subsystems = subsystems;
- else this.subsystems = [subsystems];
- });
+ if (this.group) {
+ this.nvmeofService
+ .listSubsystems(this.group)
+ .subscribe((subsystems: NvmeofSubsystem[] | NvmeofSubsystem) => {
+ if (Array.isArray(subsystems)) this.subsystems = subsystems;
+ else this.subsystems = [subsystems];
+ });
+ } else {
+ this.subsystems = [];
+ }
}
deleteSubsystemModal() {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts
new file mode 100644
index 00000000000..c5b25191594
--- /dev/null
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts
@@ -0,0 +1,14 @@
+export const RBDActionHelpers = {
+ moveToTrash: $localize`Move an image to the trash. Images, even ones actively in-use by clones, can be moved to the trash and deleted at a later time.`,
+ delete: $localize`Delete an rbd image (including all data blocks). If the image has snapshots, this fails and nothing is deleted.`,
+ copy: $localize`Copy the content of a source image into the newly created destination image`,
+ flatten: $localize`If the image is a clone, copy all shared blocks from the parent snapshot and make the child independent of the parent, severing the link between parent snap and child. `,
+ enableMirroring: $localize`Mirroring needs to be enabled on the image to perform this action`,
+ clonedSnapshot: $localize`This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD`,
+ secondayImageDelete: $localize`The image cannot be deleted as it is secondary`,
+ primaryImageResync: $localize`Primary RBD images cannot be resynced`,
+ invalidNameDisable: $localize`This RBD image has an invalid name and can't be managed by ceph.`,
+ removingStatus: $localize`Action not possible for an RBD in status 'Removing'`,
+ journalTooltipText: $localize`Ensures reliable replication by logging changes before updating the image, but doubles write time, impacting performance. Not recommended for high-speed data processing tasks.`,
+ snapshotTooltipText: $localize`This mode replicates RBD images between clusters using snapshots, efficiently copying data changes but requiring complete delta syncing during failover. Ideal for less demanding tasks due to its less granular approach compared to journaling.`
+};
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
index 85c56cbf0d4..29a2008567e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
@@ -7,7 +7,12 @@
novalidate>
<div i18n="form title"
- class="form-header">{{ action | titlecase }} {{ resource | upperFirst }}</div>
+ class="form-header">{{ action | titlecase }} {{ resource | upperFirst }}
+ <cd-help-text>
+ <div *ngIf="action === 'Copy'">{{copyMessage}}
+ </div>
+ </cd-help-text>
+ </div>
<!-- Parent -->
<div class="form-item"
@@ -103,7 +108,7 @@
<cd-alert-panel *ngIf="showMirrorDisableMessage"
spacingClass="mt-2"
[showTitle]="false"
- type="info">Mirroring can not be disabled on <b>Pool</b> mirror mode.
+ type="info">Mirroring can not be disabled on <b>&nbsp;Pool&nbsp;</b> mirror mode.
You need to change the mirror mode to enable this option.
</cd-alert-panel>
<cd-alert-panel *ngIf="currentPoolMirrorMode === 'disabled'"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts
index d9c1c8925fc..7d694e2cab4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts
@@ -34,6 +34,7 @@ import { RbdFormEditRequestModel } from './rbd-form-edit-request.model';
import { RbdFormMode } from './rbd-form-mode.enum';
import { RbdFormResponseModel } from './rbd-form-response.model';
import { CdValidators } from '~/app/shared/forms/cd-validators';
+import { RBDActionHelpers } from '../rbd-contants';
class ExternalData {
rbd: RbdFormResponseModel;
@@ -69,34 +70,28 @@ export class RbdFormComponent extends CdForm implements OnInit {
pool: string;
peerConfigured = false;
-
advancedEnabled = false;
-
public rbdFormMode = RbdFormMode;
mode: RbdFormMode;
-
response: RbdFormResponseModel;
snapName: string;
-
defaultObjectSize = '4 MiB';
mirroringOptions = [
{
value: 'journal',
- text:
- 'Ensures reliable replication by logging changes before updating the image, but doubles write time, impacting performance. Not recommended for high-speed data processing tasks.'
+ text: RBDActionHelpers.journalTooltipText
},
{
value: 'snapshot',
- text:
- 'This mode replicates RBD images between clusters using snapshots, efficiently copying data changes but requiring complete delta syncing during failover. Ideal for less demanding tasks due to its less granular approach compared to journaling.'
+ text: RBDActionHelpers.snapshotTooltipText
}
];
poolMirrorMode: string;
mirroring = false;
currentPoolName = '';
currentPoolMirrorMode = '';
-
+ copyMessage: string = RBDActionHelpers.copy;
objectSizes: Array<string> = [
'4 KiB',
'8 KiB',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts
index d71027bde3d..c775333a407 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts
@@ -128,7 +128,7 @@ describe('RbdListComponent', () => {
]
});
expect(component.getDeleteDisableDesc(component.selection)).toBe(
- 'This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD.'
+ 'This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD'
);
});
@@ -268,11 +268,11 @@ describe('RbdListComponent', () => {
'Copy',
'Flatten',
'Resync',
- 'Delete',
- 'Move to Trash',
'Remove Scheduling',
'Promote',
- 'Demote'
+ 'Demote',
+ 'Move to Trash',
+ 'Delete'
],
primary: {
multiple: 'Create',
@@ -300,7 +300,7 @@ describe('RbdListComponent', () => {
}
},
'create,delete': {
- actions: ['Create', 'Copy', 'Delete', 'Move to Trash'],
+ actions: ['Create', 'Copy', 'Move to Trash', 'Delete'],
primary: {
multiple: 'Create',
executing: 'Create',
@@ -322,11 +322,11 @@ describe('RbdListComponent', () => {
'Edit',
'Flatten',
'Resync',
- 'Delete',
- 'Move to Trash',
'Remove Scheduling',
'Promote',
- 'Demote'
+ 'Demote',
+ 'Move to Trash',
+ 'Delete'
],
primary: {
multiple: '',
@@ -345,7 +345,7 @@ describe('RbdListComponent', () => {
}
},
delete: {
- actions: ['Delete', 'Move to Trash'],
+ actions: ['Move to Trash', 'Delete'],
primary: {
multiple: '',
executing: '',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts
index 1a4bb4e0cf8..52d9ff819e2 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts
@@ -23,7 +23,6 @@ import { DimlessBinaryPipe } from '~/app/shared/pipes/dimless-binary.pipe';
import { DimlessPipe } from '~/app/shared/pipes/dimless.pipe';
import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { CdTableServerSideService } from '~/app/shared/services/cd-table-server-side.service';
-// import { ModalService } from '~/app/shared/services/modal.service';
import { TaskListService } from '~/app/shared/services/task-list.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { URLBuilderService } from '~/app/shared/services/url-builder.service';
@@ -32,7 +31,7 @@ import { RbdParentModel } from '../rbd-form/rbd-parent.model';
import { RbdTrashMoveModalComponent } from '../rbd-trash-move-modal/rbd-trash-move-modal.component';
import { RBDImageFormat, RbdModel } from './rbd-model';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
-
+import { RBDActionHelpers } from '../rbd-contants';
const BASE_URL = 'block/rbd';
@Component({
@@ -83,7 +82,6 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
count = 0;
private tableContext: CdTableFetchDataContext = null;
errorMessage: string;
-
builders = {
'rbd/create': (metadata: object) =>
this.createRbdFromTask(metadata['pool_name'], metadata['namespace'], metadata['image_name']),
@@ -159,8 +157,20 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
icon: Icons.destroy,
click: () => this.deleteRbdModal(),
name: this.actionLabels.DELETE,
+ title: RBDActionHelpers.delete,
disable: (selection: CdTableSelection) => this.getDeleteDisableDesc(selection)
};
+ const moveAction: CdTableAction = {
+ permission: 'delete',
+ icon: Icons.trash,
+ title: RBDActionHelpers.moveToTrash,
+ click: () => this.trashRbdModal(),
+ name: this.actionLabels.TRASH,
+ disable: (selection: CdTableSelection) =>
+ this.getRemovingStatusDesc(selection) ||
+ this.getInvalidNameDisable(selection) ||
+ selection.first().image_format === RBDImageFormat.V1
+ };
const resyncAction: CdTableAction = {
permission: 'update',
icon: Icons.refresh,
@@ -177,7 +187,8 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
!!selection.first().cdExecuting,
icon: Icons.copy,
routerLink: () => `/block/rbd/copy/${getImageUri()}`,
- name: this.actionLabels.COPY
+ name: this.actionLabels.COPY,
+ title: RBDActionHelpers.copy
};
const flattenAction: CdTableAction = {
permission: 'update',
@@ -188,18 +199,10 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
!selection.first().parent,
icon: Icons.flatten,
click: () => this.flattenRbdModal(),
- name: this.actionLabels.FLATTEN
- };
- const moveAction: CdTableAction = {
- permission: 'delete',
- icon: Icons.trash,
- click: () => this.trashRbdModal(),
- name: this.actionLabels.TRASH,
- disable: (selection: CdTableSelection) =>
- this.getRemovingStatusDesc(selection) ||
- this.getInvalidNameDisable(selection) ||
- selection.first().image_format === RBDImageFormat.V1
+ name: this.actionLabels.FLATTEN,
+ title: RBDActionHelpers.flatten
};
+
const removeSchedulingAction: CdTableAction = {
permission: 'update',
icon: Icons.edit,
@@ -217,9 +220,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
name: this.actionLabels.PROMOTE,
visible: () => this.selection.first() != null && !this.selection.first().primary,
disable: () =>
- this.selection.first().mirror_mode === 'Disabled'
- ? 'Mirroring needs to be enabled on the image to perform this action'
- : ''
+ this.selection.first().mirror_mode === 'Disabled' ? RBDActionHelpers.enableMirroring : ''
};
const demoteAction: CdTableAction = {
permission: 'update',
@@ -228,9 +229,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
name: this.actionLabels.DEMOTE,
visible: () => this.selection.first() != null && this.selection.first().primary,
disable: () =>
- this.selection.first().mirror_mode === 'Disabled'
- ? 'Mirroring needs to be enabled on the image to perform this action'
- : ''
+ this.selection.first().mirror_mode === 'Disabled' ? RBDActionHelpers.enableMirroring : ''
};
this.tableActions = [
addAction,
@@ -238,11 +237,11 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
copyAction,
flattenAction,
resyncAction,
- deleteAction,
- moveAction,
removeSchedulingAction,
promoteAction,
- demoteAction
+ demoteAction,
+ moveAction,
+ deleteAction
];
}
@@ -624,17 +623,23 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
const first = selection.first();
if (first && this.hasClonedSnapshots(first)) {
- return $localize`This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD.`;
+ return RBDActionHelpers.clonedSnapshot;
}
-
- return this.getInvalidNameDisable(selection) || this.hasClonedSnapshots(selection.first());
+ if (first && first.primary === false) {
+ return RBDActionHelpers.secondayImageDelete;
+ }
+ return (
+ this.getInvalidNameDisable(selection) ||
+ this.hasClonedSnapshots(selection.first()) ||
+ first.primary === false
+ );
}
getResyncDisableDesc(selection: CdTableSelection): string | boolean {
const first = selection.first();
if (first && this.imageIsPrimary(first)) {
- return $localize`Primary RBD images cannot be resynced`;
+ return RBDActionHelpers.primaryImageResync;
}
return this.getInvalidNameDisable(selection);
@@ -647,7 +652,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
const first = selection.first();
if (first?.name?.match(/[@/]/)) {
- return $localize`This RBD image has an invalid name and can't be managed by ceph.`;
+ return RBDActionHelpers.invalidNameDisable;
}
return !selection.first() || !selection.hasSingleSelection;
@@ -656,7 +661,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit {
getRemovingStatusDesc(selection: CdTableSelection): string | boolean {
const first = selection.first();
if (first?.source === 'REMOVING') {
- return $localize`Action not possible for an RBD in status 'Removing'`;
+ return RBDActionHelpers.removingStatus;
}
return false;
}
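
The reordering above moves the destructive actions (Move to Trash, Delete) to the end of the menu and gives each action a default tooltip through the new `title` field, sourced from an `RBDActionHelpers` constants class in `rbd-contants.ts`, which is not part of this excerpt. A rough sketch of that pairing, with placeholder message text where the real strings are not visible in the diff:

```typescript
// Illustrative only: the real strings live in rbd-contants.ts (not shown here),
// so the values below are placeholders; enableMirroring mirrors the string the
// patch removes from the component.
export class RBDActionHelpers {
  static readonly delete = 'Delete the RBD image';
  static readonly moveToTrash = 'Move the RBD image to the trash';
  static readonly enableMirroring =
    'Mirroring needs to be enabled on the image to perform this action';
}

// An action can now carry a default tooltip via `title`; a string returned by
// its disable() callback still takes precedence (see table-actions.component.ts).
const moveAction = {
  permission: 'delete',
  name: 'Move to Trash',
  title: RBDActionHelpers.moveToTrash,
  disable: (selection: { first: () => { image_format?: number } }) =>
    selection.first()?.image_format === 1 ? 'v1 images cannot be moved to trash' : false
};
```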
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html
index a9961f72ff6..b05d07fb31b 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html
@@ -34,6 +34,7 @@
[columns]="columns"
columnMode="flex"
selectionType="single"
+ (fetchData)="refresh()"
[hasDetails]="true"
(setExpandedRow)="setExpandedRow($event)"
[maxLimit]="25"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
index 9f05ab668ab..78b4c9c1859 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
@@ -273,4 +273,9 @@ export class MultiClusterListComponent extends ListWithDetails implements OnInit
super.setExpandedRow(expandedRow);
this.router.navigate(['performance-details'], { relativeTo: this.route });
}
+
+ refresh() {
+ this.multiClusterService.refresh();
+ this.multiClusterService.refreshTokenStatus();
+ }
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
index 5f5f91dd0ed..a56877512f9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
@@ -6,13 +6,15 @@
i18n>OSDs List</a>
<ng-template ngbNavContent>
<cd-table [data]="osds"
- (fetchData)="getOsdList()"
+ (fetchData)="getOsdList($event)"
[columns]="columns"
selectionType="multiClick"
[hasDetails]="true"
(setExpandedRow)="setExpandedRow($event)"
(updateSelection)="updateSelection($event)"
- [updateSelectionOnRefresh]="'never'">
+ [updateSelectionOnRefresh]="'never'"
+ [serverSide]="true"
+ [count]="count">
<div class="table-actions">
<cd-table-actions [permission]="permissions.osd"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
index 77facfe3f85..85ea9240414 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
@@ -33,6 +33,8 @@ import {
import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component';
import { OsdListComponent } from './osd-list.component';
import { ResizeObserver as ResizeObserverPolyfill } from '@juggle/resize-observer';
+import { PaginateObservable } from '~/app/shared/api/paginate.model';
+import { Osd } from '~/app/shared/models/osd.model';
describe('OsdListComponent', () => {
let component: OsdListComponent;
@@ -141,38 +143,42 @@ describe('OsdListComponent', () => {
});
describe('getOsdList', () => {
- let osds: any[];
+ let osds: Osd[];
let flagsSpy: jasmine.Spy;
- const createOsd = (n: number) =>
- <Record<string, any>>{
- in: 'in',
- up: 'up',
- tree: {
- device_class: 'ssd'
- },
- stats_history: {
- op_out_bytes: [
- [n, n],
- [n * 2, n * 2]
- ],
- op_in_bytes: [
- [n * 3, n * 3],
- [n * 4, n * 4]
- ]
- },
- stats: {
- stat_bytes_used: n * n,
- stat_bytes: n * n * n
- },
- state: []
- };
+ const createOsd = (n: number): Osd => ({
+ id: n,
+ host: {
+ id: 0,
+ name: 'test_host'
+ },
+ in: 1,
+ up: 1,
+ tree: {
+ device_class: 'ssd'
+ },
+ stats_history: {
+ op_out_bytes: [
+ [n, n],
+ [n * 2, n * 2]
+ ],
+ op_in_bytes: [
+ [n * 3, n * 3],
+ [n * 4, n * 4]
+ ]
+ },
+ stats: {
+ stat_bytes_used: n * n,
+ stat_bytes: n * n * n
+ },
+ state: []
+ });
const expectAttributeOnEveryOsd = (attr: string) =>
expect(component.osds.every((osd) => Boolean(_.get(osd, attr)))).toBeTruthy();
beforeEach(() => {
- spyOn(osdService, 'getList').and.callFake(() => of(osds));
+ spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(osds)));
flagsSpy = spyOn(osdService, 'getFlags').and.callFake(() => of([]));
osds = [createOsd(1), createOsd(2), createOsd(3)];
component.getOsdList();
@@ -556,8 +562,9 @@ describe('OsdListComponent', () => {
beforeEach(() => {
component.permissions = fakeAuthStorageService.getPermissions();
- spyOn(osdService, 'getList').and.callFake(() => of(fakeOsds));
+ spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(fakeOsds)));
spyOn(osdService, 'getFlags').and.callFake(() => of([]));
+ component.getOsdList();
});
const testTableActions = async (
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
index 103b61e79f0..91cb0193f3c 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
@@ -39,6 +39,8 @@ import { OsdRecvSpeedModalComponent } from '../osd-recv-speed-modal/osd-recv-spe
import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component';
import { OsdScrubModalComponent } from '../osd-scrub-modal/osd-scrub-modal.component';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
+import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
+import { Osd } from '~/app/shared/models/osd.model';
const BASE_URL = 'osd';
@@ -71,6 +73,7 @@ export class OsdListComponent extends ListWithDetails implements OnInit {
clusterWideActions: CdTableAction[];
icons = Icons;
osdSettings = new OsdSettings();
+ count = 0;
selection = new CdTableSelection();
osds: any[] = [];
@@ -426,10 +429,13 @@ export class OsdListComponent extends ListWithDetails implements OnInit {
}
}
- getOsdList() {
- const observables = [this.osdService.getList(), this.osdService.getFlags()];
- observableForkJoin(observables).subscribe((resp: [any[], string[]]) => {
- this.osds = resp[0].map((osd) => {
+ getOsdList(context?: CdTableFetchDataContext) {
+ if (!context) context = new CdTableFetchDataContext();
+ const pagination_obs = this.osdService.getList(context.toParams());
+ const observables = [pagination_obs.observable, this.osdService.getFlags()];
+ observableForkJoin(observables).subscribe((resp: any) => {
+ this.osds = resp[0].map((osd: Osd) => {
+ this.count = pagination_obs.count;
osd.collectedStates = OsdListComponent.collectStates(osd);
osd.stats_history.out_bytes = osd.stats_history.op_out_bytes.map((i: string) => i[1]);
osd.stats_history.in_bytes = osd.stats_history.op_in_bytes.map((i: string) => i[1]);
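
With the table switched to server-side mode, the component forwards the table's fetch context to the service and reads the total row count off the paginated response. A minimal sketch of that flow, with the service and context shapes assumed rather than imported from the dashboard code base:

```typescript
import { Observable, forkJoin } from 'rxjs';

// Assumed shapes for illustration; the real types are OsdService,
// CdTableFetchDataContext and PaginateObservable.
interface Paginated<T> { observable: Observable<T>; count: number; }

function fetchOsds(
  getList: (params: Record<string, string>) => Paginated<any[]>,
  getFlags: () => Observable<string[]>,
  params: Record<string, string>,
  onData: (osds: any[], total: number, flags: string[]) => void
): void {
  const paginated = getList(params);
  forkJoin([paginated.observable, getFlags()]).subscribe(([osds, flags]) => {
    // count is only populated once the HTTP response has arrived, which is why
    // it is read inside the subscription, as in getOsdList() above.
    onData(osds, paginated.count, flags);
  });
}
```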
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
index d3ea8c018f6..367418c752e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
@@ -27,7 +27,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: '003c10beafc8c27b635bcdfed1ed832e4c1005be89bb1bb05ad4cc6c2b98e41b',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '3',
daemon_type: 'osd',
daemon_name: 'osd.3',
@@ -47,7 +47,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: 'baeec41a01374b3ed41016d542d19aef4a70d69c27274f271e26381a0cc58e7a',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '4',
daemon_type: 'osd',
daemon_name: 'osd.4',
@@ -63,7 +63,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: '8483de277e365bea4365cee9e1f26606be85c471e4da5d51f57e4b85a42c616e',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '5',
daemon_type: 'osd',
daemon_name: 'osd.5',
@@ -79,7 +79,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'mon0',
container_id: '6ca0574f47e300a6979eaf4e7c283a8c4325c2235ae60358482fc4cd58844a21',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: 'a',
daemon_name: 'mon.a',
daemon_type: 'mon',
@@ -99,7 +99,7 @@ describe('ServiceDaemonListComponent', () => {
service_name: 'osd',
status: {
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
size: 3,
running: 3,
last_refresh: '2020-02-25T04:33:26.465699'
@@ -111,7 +111,7 @@ describe('ServiceDaemonListComponent', () => {
service_name: 'crash',
status: {
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
size: 1,
running: 1,
last_refresh: '2020-02-25T04:33:26.465766'
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
index 1a73490175d..0da4913e9b8 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
@@ -106,7 +106,6 @@
[invalid]="nfsForm.controls.fsal.controls.user_id.invalid && (nfsForm.controls.fsal.controls.user_id.dirty)"
[invalidText]="userIdError"
[skeleton]="allRGWUsers === null"
- (valueChange)="pathChangeHandler()"
i18n>
<option *ngIf="allRGWUsers === null"
value="">Loading...</option>
@@ -223,8 +222,6 @@
name="path"
formControlName="path"
[ngbTypeahead]="pathDataSource"
- (selectItem)="pathChangeHandler()"
- (blur)="pathChangeHandler()"
[invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)">
</cds-text-label>
<ng-template #pathError>
@@ -259,8 +256,6 @@
name="path"
formControlName="path"
[ngbTypeahead]="bucketDataSource"
- (selectItem)="pathChangeHandler()"
- (blur)="pathChangeHandler()"
[invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)">
</cds-text-label>
<ng-template #bucketPathError>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
index 2317671b022..d502524256e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
@@ -434,7 +434,7 @@ export class NfsFormComponent extends CdForm implements OnInit {
fs_name: this.selectedFsName
}
});
- this.volumeChangeHandler();
+ this.getSubVolGrp(this.selectedFsName);
}
if (!_.isEmpty(this.selectedSubvolGroup)) {
this.nfsForm.patchValue({
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
index fbe3110b978..22da2851d5a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts
@@ -40,6 +40,7 @@ import { Router } from '@angular/router';
import { RgwMultisiteWizardComponent } from '../rgw-multisite-wizard/rgw-multisite-wizard.component';
import { RgwMultisiteSyncPolicyComponent } from '../rgw-multisite-sync-policy/rgw-multisite-sync-policy.component';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
+import { RgwMultisiteService } from '~/app/shared/api/rgw-multisite.service';
const BASE_URL = 'rgw/multisite/configuration';
@@ -121,7 +122,8 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit {
public rgwDaemonService: RgwDaemonService,
public mgrModuleService: MgrModuleService,
private notificationService: NotificationService,
- private cdsModalService: ModalCdsService
+ private cdsModalService: ModalCdsService,
+ private rgwMultisiteService: RgwMultisiteService
) {
this.permission = this.authStorageService.getPermissions().rgw;
}
@@ -412,22 +414,30 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit {
this.realmIds = [];
this.zoneIds = [];
this.evaluateMigrateAndReplicationActions();
+ this.rgwMultisiteService.restartGatewayMessage$.subscribe((value) => {
+ if (value !== null) {
+ this.restartGatewayMessage = value;
+ } else {
+ this.checkRestartGatewayMessage();
+ }
+ });
+ return allNodes;
+ }
+
+ checkRestartGatewayMessage() {
this.rgwDaemonService.list().subscribe((data: any) => {
- const hasEmptyRealmName = data.some(
- (item: { [x: string]: any }) =>
- item['realm_name'] === '' &&
- !data.some((i: { [x: string]: any }) => i['id'] === item['id'] && i['realm_name'] !== '')
- );
+ const realmName = data.map((item: { [x: string]: any }) => item['realm_name']);
if (
this.defaultRealmId !== '' &&
this.defaultZonegroupId !== '' &&
this.defaultZoneId !== '' &&
- hasEmptyRealmName
+ realmName.includes('')
) {
this.restartGatewayMessage = true;
+ } else {
+ this.restartGatewayMessage = false;
}
});
- return allNodes;
}
getDefaultsEntities(
@@ -546,20 +556,20 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit {
delete(node: TreeNode) {
if (node.data.type === 'realm') {
- this.modalRef = this.modalService.show(CriticalConfirmationModalComponent, {
+ const modalRef = this.cdsModalService.show(CriticalConfirmationModalComponent, {
itemDescription: $localize`${node.data.type} ${node.data.name}`,
itemNames: [`${node.data.name}`],
submitAction: () => {
this.rgwRealmService.delete(node.data.name).subscribe(
() => {
- this.modalRef.close();
this.notificationService.show(
NotificationType.success,
$localize`Realm: '${node.data.name}' deleted successfully`
);
+ this.cdsModalService.dismissAll();
},
() => {
- this.modalRef.componentInstance.stopLoadingSpinner();
+ this.cdsModalService.stopLoadingSpinner(modalRef.deletionForm);
}
);
}
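
The simplified check above replaces the nested `some()` scan with a flat list of realm names: the restart banner is shown only when default realm, zonegroup and zone are set and at least one RGW daemon still reports an empty realm. The same predicate as a standalone sketch (daemon shape assumed):

```typescript
// Hedged sketch of checkRestartGatewayMessage(); daemon objects are assumed to
// expose realm_name as returned by the RGW daemon listing.
interface RgwDaemonLike { realm_name: string; }

function needsGatewayRestart(
  daemons: RgwDaemonLike[],
  defaultRealmId: string,
  defaultZonegroupId: string,
  defaultZoneId: string
): boolean {
  const realmNames = daemons.map((d) => d.realm_name);
  return (
    defaultRealmId !== '' &&
    defaultZonegroupId !== '' &&
    defaultZoneId !== '' &&
    realmNames.includes('')
  );
}
```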
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
index e50666cdeaa..767305958d4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
@@ -64,6 +64,9 @@
i18n-placeholder
placeholder="Source Bucket Name..."
formControlName="source_bucket"/>
+ <cd-help-text>
+ <span i18n>{{ allBucketSelectedHelpText }}</span>
+ </cd-help-text>
</div>
</div>
<div class="form-group row">
@@ -78,6 +81,9 @@
i18n-placeholder
placeholder="Destination Bucket Name..."
formControlName="destination_bucket"/>
+ <cd-help-text>
+ <span i18n>{{ allBucketSelectedHelpText }}</span>
+ </cd-help-text>
</div>
</div>
</div>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
index 369658d7d42..1127db1c59a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
@@ -89,6 +89,47 @@ describe('RgwMultisiteSyncPipeModalComponent', () => {
component.submit();
expect(spy).toHaveBeenCalled();
expect(putDataSpy).toHaveBeenCalled();
- expect(putDataSpy).toHaveBeenCalledWith(component.pipeForm.getRawValue());
+ expect(putDataSpy).toHaveBeenCalledWith({
+ ...component.pipeForm.getRawValue(),
+ mode: '',
+ user: ''
+ });
+ });
+
+ it('should pass "user" and "mode" while creating/editing pipe', () => {
+ component.editing = true;
+ component.pipeForm.patchValue({
+ pipe_id: 'pipe1',
+ group_id: 's3-bucket-replication:enabled',
+ source_bucket: '',
+ source_zones: { added: ['zone1-zg1-realm1'], removed: [] },
+ destination_bucket: '',
+ destination_zones: { added: ['zone2-zg1-realm1'], removed: [] }
+ });
+ component.pipeSelectedRow = {
+ dest: { bucket: '*', zones: ['zone2-zg1-realm1'] },
+ id: 'pipi1',
+ params: {
+ dest: {},
+ mode: 'user',
+ priority: 0,
+ source: { filter: { tags: [] } },
+ user: 'dashboard'
+ },
+ source: { bucket: '*', zones: ['zone1-zg1-realm1'] }
+ };
+
+ component.sourceZones.data.selected = ['zone1-zg1-realm1'];
+ component.destZones.data.selected = ['zone2-zg1-realm1'];
+ const spy = jest.spyOn(component, 'submit');
+ const putDataSpy = jest.spyOn(multisiteServiceMock, 'createEditSyncPipe');
+ component.submit();
+ expect(spy).toHaveBeenCalled();
+ expect(putDataSpy).toHaveBeenCalled();
+ expect(putDataSpy).toHaveBeenCalledWith({
+ ...component.pipeForm.getRawValue(),
+ mode: 'user',
+ user: 'dashboard'
+ });
});
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
index 2f41dbd23c8..43742ef60b8 100755
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
@@ -18,6 +18,8 @@ import { ZoneData } from '../models/rgw-multisite-zone-selector';
import { SucceededActionLabelsI18n } from '~/app/shared/constants/app.constants';
const ALL_ZONES = $localize`All zones (*)`;
+const ALL_BUCKET_SELECTED_HELP_TEXT =
+ 'If no value is provided, all the buckets in the zone group will be selected.';
@Component({
selector: 'cd-rgw-multisite-sync-pipe-modal',
@@ -33,6 +35,7 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit {
sourceZones = new ZoneData(false, 'Filter Zones');
destZones = new ZoneData(false, 'Filter Zones');
icons = Icons;
+ allBucketSelectedHelpText = ALL_BUCKET_SELECTED_HELP_TEXT;
constructor(
public activeModal: NgbActiveModal,
@@ -187,7 +190,9 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit {
.createEditSyncPipe({
...this.pipeForm.getRawValue(),
source_zones: sourceZones,
- destination_zones: destZones
+ destination_zones: destZones,
+ user: this.editing ? this.pipeSelectedRow?.params?.user : '',
+ mode: this.editing ? this.pipeSelectedRow?.params?.mode : ''
})
.subscribe(
() => {
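
When editing a pipe, the modal now re-sends the `user` and `mode` stored on the selected pipe so the backend keeps them; on create they default to empty strings. A small sketch of that payload assembly, with the row shape assumed from the spec above:

```typescript
// Hedged sketch: pipeSelectedRow mirrors the structure used in the unit test,
// where params may carry the replicating user and the pipe mode.
interface PipeRowLike { params?: { user?: string; mode?: string } }

function buildSyncPipePayload(
  formValue: Record<string, unknown>,
  editing: boolean,
  selectedRow?: PipeRowLike
) {
  return {
    ...formValue,
    user: editing ? selectedRow?.params?.user ?? '' : '',
    mode: editing ? selectedRow?.params?.mode ?? '' : ''
  };
}
```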
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
index ee261db5042..03228856125 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
@@ -88,12 +88,22 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements
{
name: $localize`Zonegroup`,
prop: 'zonegroup',
- flexGrow: 1
+ flexGrow: 1,
+ cellTransformation: CellTemplate.map,
+ customTemplateConfig: {
+ undefined: '-',
+ '': '-'
+ }
},
{
name: $localize`Bucket`,
prop: 'bucket',
- flexGrow: 1
+ flexGrow: 1,
+ cellTransformation: CellTemplate.map,
+ customTemplateConfig: {
+ undefined: '-',
+ '': '-'
+ }
}
];
this.rgwDaemonService.list().subscribe();
@@ -137,7 +147,7 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements
groupName: policy['id'],
status: policy['status'],
bucket: policy['bucketName'],
- zonegroup: ''
+ zonegroup: policy['zonegroup']
});
});
this.syncPolicyData = [...this.syncPolicyData];
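
The new column configuration relies on the table's map cell template to render a dash for empty or missing values, now that the zonegroup comes from the policy payload instead of a hard-coded empty string. A minimal illustration of the mapping:

```typescript
// Illustrative column definition: with CellTemplate.map, any value listed in
// customTemplateConfig is substituted before rendering, so undefined or ''
// shows up as '-' in the Zonegroup and Bucket columns.
const bucketColumn = {
  name: 'Bucket',
  prop: 'bucket',
  flexGrow: 1,
  cellTransformation: 'map', // CellTemplate.map in the component
  customTemplateConfig: { undefined: '-', '': '-' }
};
```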
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts
index 3d4b06528c1..2fbe1163ef8 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts
@@ -236,6 +236,7 @@ export class RgwMultisiteWizardComponent extends BaseModal implements OnInit {
)
.subscribe((data: object[]) => {
this.setupCompleted = true;
+ this.rgwMultisiteService.setRestartGatewayMessage(false);
this.loading = false;
this.realms = data;
this.showSuccessNotification();
@@ -258,6 +259,7 @@ export class RgwMultisiteWizardComponent extends BaseModal implements OnInit {
.subscribe(
() => {
this.setupCompleted = true;
+ this.rgwMultisiteService.setRestartGatewayMessage(false);
this.loading = false;
this.showSuccessNotification();
},
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
index 8b5901769c3..00037a7235b 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
@@ -91,7 +91,9 @@ export class RgwOverviewDashboardComponent implements OnInit, OnDestroy {
this.totalPoolUsedBytes = data['total_pool_bytes_used'];
this.averageObjectSize = data['average_object_size'];
});
- this.getSyncStatus();
+ setTimeout(() => {
+ this.getSyncStatus();
+ });
});
this.BucketSub = this.rgwBucketService
.getTotalBucketsAndUsersLength()
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
index d1f9997791a..c81c9193a2e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
@@ -3,6 +3,7 @@ import { TestBed } from '@angular/core/testing';
import { configureTestBed } from '~/testing/unit-test-helper';
import { OsdService } from './osd.service';
+import { CdTableFetchDataContext } from '../models/cd-table-fetch-data-context';
describe('OsdService', () => {
let service: OsdService;
@@ -64,8 +65,9 @@ describe('OsdService', () => {
});
it('should call getList', () => {
- service.getList().subscribe();
- const req = httpTesting.expectOne('api/osd');
+ const context = new CdTableFetchDataContext(() => {});
+ service.getList(context.toParams()).observable.subscribe();
+ const req = httpTesting.expectOne('api/osd?offset=0&limit=10&search=&sort=%2Bname');
expect(req.request.method).toBe('GET');
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
index f2ed4d7cc9e..85a75073dea 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
@@ -1,4 +1,4 @@
-import { HttpClient } from '@angular/common/http';
+import { HttpClient, HttpParams } from '@angular/common/http';
import { Injectable } from '@angular/core';
import _ from 'lodash';
@@ -12,6 +12,9 @@ import { OsdSettings } from '../models/osd-settings';
import { SmartDataResponseV1 } from '../models/smart';
import { DeviceService } from '../services/device.service';
import { CdFormGroup } from '../forms/cd-form-group';
+import { PaginateObservable } from './paginate.model';
+import { PaginateParams } from '../classes/paginate-params.class';
+import { Osd } from '../models/osd.model';
@Injectable({
providedIn: 'root'
@@ -80,8 +83,10 @@ export class OsdService {
return this.http.post(this.path, request, { observe: 'response' });
}
- getList() {
- return this.http.get(`${this.path}`);
+ getList(params: HttpParams): PaginateObservable<Osd[]> {
+ return new PaginateObservable<Osd[]>(
+ this.http.get<Osd[]>(this.path, new PaginateParams(params, 1, 1))
+ );
}
getOsdSettings(): Observable<OsdSettings> {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
index 703792a7571..77ec4e43f7c 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
@@ -9,7 +9,7 @@ export class PaginateObservable<Type> {
this.observable = obs.pipe(
map((response: any) => {
this.count = Number(response.headers?.get('X-Total-Count'));
- return response['body'];
+ return response['body'] || response;
})
);
}
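
The fallback to `response['body'] || response` keeps `PaginateObservable` usable for callers that hand it a bare body instead of a full HttpResponse, as the OSD list unit tests now do with `of(osds)`. Both accepted shapes, illustrated with assumed fixtures:

```typescript
import { of } from 'rxjs';

// A full response carries headers (count is read from X-Total-Count) and a body;
// a bare body has no headers, so count becomes NaN and the payload is passed
// through unchanged by the `response['body'] || response` fallback.
const fullResponse = of({
  headers: new Map([['X-Total-Count', '3']]),
  body: [{ id: 0 }, { id: 1 }, { id: 2 }]
});
const bareBody = of([{ id: 0 }, { id: 1 }, { id: 2 }]);
```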
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
index d57cd523a4d..3dc886e172f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
@@ -2,6 +2,7 @@ import { HttpClient, HttpParams } from '@angular/common/http';
import { Injectable } from '@angular/core';
import { RgwRealm, RgwZone, RgwZonegroup } from '~/app/ceph/rgw/models/rgw-multisite';
import { RgwDaemonService } from './rgw-daemon.service';
+import { BehaviorSubject } from 'rxjs';
@Injectable({
providedIn: 'root'
@@ -10,6 +11,9 @@ export class RgwMultisiteService {
private uiUrl = 'ui-api/rgw/multisite';
private url = 'api/rgw/multisite';
+ private restartGatewayMessageSource = new BehaviorSubject<boolean>(null);
+ restartGatewayMessage$ = this.restartGatewayMessageSource.asObservable();
+
constructor(private http: HttpClient, public rgwDaemonService: RgwDaemonService) {}
migrate(realm: RgwRealm, zonegroup: RgwZonegroup, zone: RgwZone) {
@@ -28,7 +32,9 @@ export class RgwMultisiteService {
}
getSyncStatus() {
- return this.http.get(`${this.url}/sync_status`);
+ return this.rgwDaemonService.request((params: HttpParams) => {
+ return this.http.get(`${this.url}/sync_status`, { params: params });
+ });
}
status() {
@@ -123,8 +129,15 @@ export class RgwMultisiteService {
);
}
- createEditSyncPipe(payload: any) {
- return this.http.put(`${this.url}/sync-pipe`, payload);
+ createEditSyncPipe(payload: any, user?: string, mode?: string) {
+ let params = new HttpParams();
+ if (user) {
+ params = params.append('user', user);
+ }
+ if (mode) {
+ params = params.append('mode', mode);
+ }
+ return this.http.put(`${this.url}/sync-pipe`, payload, { params });
}
removeSyncPipe(pipe_id: string, group_id: string, bucket_name?: string) {
@@ -137,4 +150,8 @@ export class RgwMultisiteService {
{ params }
);
}
+
+ setRestartGatewayMessage(value: boolean): void {
+ this.restartGatewayMessageSource.next(value);
+ }
}
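
`createEditSyncPipe()` now accepts optional `user` and `mode` arguments and only appends them as query parameters when present, so existing callers remain unaffected. The conditional parameter handling on its own:

```typescript
import { HttpParams } from '@angular/common/http';

// Sketch of the query-parameter handling above: user and mode are appended only
// when provided; the payload itself is still sent in the PUT body.
function buildSyncPipeParams(user?: string, mode?: string): HttpParams {
  let params = new HttpParams();
  if (user) {
    params = params.append('user', user);
  }
  if (mode) {
    params = params.append('mode', mode);
  }
  return params;
}
```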
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts
new file mode 100644
index 00000000000..a1b079b426b
--- /dev/null
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts
@@ -0,0 +1,15 @@
+import { HttpParams } from '@angular/common/http';
+
+export class PaginateParams {
+ constructor(params: HttpParams, majorVersion = 1, minorVersion = 0) {
+ const options = {
+ params: params,
+ headers: {
+ Accept: `application/vnd.ceph.api.v${majorVersion}.${minorVersion}+json`
+ }
+ };
+
+ options['observe'] = 'response';
+ return options;
+ }
+}
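
`PaginateParams` packages the query parameters together with a versioned Accept header and `observe: 'response'`, which is what later lets `PaginateObservable` read the X-Total-Count header. A usage sketch matching the OSD list call:

```typescript
import { HttpParams } from '@angular/common/http';
import { PaginateParams } from '~/app/shared/classes/paginate-params.class';

// Usage sketch: the version arguments (1, 1) match the v1.1 Accept header the
// paginated OSD endpoint now expects.
const params = new HttpParams()
  .set('offset', '0')
  .set('limit', '10')
  .set('search', '')
  .set('sort', '+name');

const options = new PaginateParams(params, 1, 1);
// options = { params, headers: { Accept: 'application/vnd.ceph.api.v1.1+json' },
//             observe: 'response' } and is passed straight to HttpClient.get().
```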
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss
index f7be01cd929..653ea5993a2 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss
@@ -1,3 +1,3 @@
-::ng-deep legend .text-muted {
+.form-text {
font-size: small;
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts
index d8304127fab..51120f623f2 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts
@@ -147,7 +147,9 @@ export class TableActionsComponent implements OnChanges, OnInit {
useDisableDesc(action: CdTableAction) {
if (action.disable) {
const result = action.disable(this.selection);
- return _.isString(result) ? result : undefined;
+ return _.isString(result) ? result : action.title ? action.title : undefined;
+ } else if (action.title) {
+ return action.title;
}
return undefined;
}
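
The tooltip resolution above now falls back to the action's `title` when `disable()` does not return a descriptive string, and uses `title` directly for actions without a disable callback. The same logic as a standalone function:

```typescript
// Equivalent resolution order: disable() string > action.title > undefined.
interface TableActionLike {
  title?: string;
  disable?: (selection: unknown) => string | boolean;
}

function resolveTooltip(action: TableActionLike, selection: unknown): string | undefined {
  if (action.disable) {
    const result = action.disable(selection);
    return typeof result === 'string' ? result : action.title;
  }
  return action.title;
}
```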
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts
index e832665c5dc..f773422ac19 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts
@@ -19,6 +19,8 @@ export class CdTableAction {
// The font awesome icon that will be used
icon: string;
+  // Optional default tooltip shown for the action
+ title?: string;
/**
* You can define the condition to disable the action.
* By default all 'update' and 'delete' actions will only be enabled
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
index 0df2d2ebbe0..6ea415bfee9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
@@ -18,7 +18,7 @@ export class CdTableFetchDataContext {
search = '';
sort = '+name';
- constructor(error: () => void) {
+ constructor(error?: () => void) {
this.error = error;
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts
new file mode 100644
index 00000000000..f22987e439e
--- /dev/null
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts
@@ -0,0 +1,49 @@
+/* We still need to determine which values the UI actually requires and make
+   only those the mandatory parameters here. For now, based on what the unit
+   test file osd-list.component.spec.ts exercises, fields have been marked
+   optional or mandatory accordingly. This should be re-evaluated. */
+
+export interface Osd {
+ id: number;
+ host: Host;
+ stats_history: StatsHistory;
+ state: string[];
+ stats: Stats;
+ collectedStates?: string[];
+ in?: number;
+ out?: number;
+ up?: number;
+ down?: number;
+ destroyed?: number;
+ cdIsBinary?: boolean;
+ cdIndivFlags?: string[];
+ cdClusterFlags?: string[];
+ cdExecuting?: any;
+ tree?: Tree;
+ operational_status?: string;
+}
+
+interface Tree {
+ device_class: string;
+}
+
+interface Host {
+ id: number;
+ name: string;
+}
+
+interface StatsHistory {
+ op_out_bytes: any[];
+ op_in_bytes: any[];
+ out_bytes?: any[];
+ in_bytes?: any[];
+}
+
+interface Stats {
+ stat_bytes_used: number;
+ stat_bytes: number;
+ op_w?: number;
+ op_r?: number;
+ usage?: number;
+}
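
A minimal object satisfying the mandatory fields of the interface above, mirroring the fixture used in osd-list.component.spec.ts; the values are illustrative only:

```typescript
import { Osd } from '~/app/shared/models/osd.model';

// Illustrative fixture; stats_history entries are [timestamp, bytes] pairs as
// consumed by the OSD list component.
const exampleOsd: Osd = {
  id: 0,
  host: { id: 0, name: 'test_host' },
  state: [],
  stats: { stat_bytes_used: 1024, stat_bytes: 4096 },
  stats_history: {
    op_out_bytes: [[1, 1], [2, 2]],
    op_in_bytes: [[3, 3], [4, 4]]
  },
  in: 1,
  up: 1,
  tree: { device_class: 'ssd' }
};
```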
diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml
index e8ab663d0d5..aedee7e493d 100644
--- a/src/pybind/mgr/dashboard/openapi.yaml
+++ b/src/pybind/mgr/dashboard/openapi.yaml
@@ -8293,6 +8293,7 @@ paths:
description: Enable high availability
type: boolean
gw_group:
+ description: NVMeoF gateway group
type: string
max_namespaces:
default: 1024
@@ -8346,6 +8347,7 @@ paths:
schema:
type: boolean
- allowEmptyValue: true
+ description: NVMeoF gateway group
in: query
name: gw_group
schema:
@@ -8384,6 +8386,7 @@ paths:
schema:
type: string
- allowEmptyValue: true
+ description: NVMeoF gateway group
in: query
name: gw_group
schema:
@@ -8417,6 +8420,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8446,6 +8455,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8479,6 +8494,9 @@ paths:
application/json:
schema:
properties:
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
host_nqn:
description: NVMeoF host NQN. Use "*" to allow any host.
type: string
@@ -8525,6 +8543,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8559,6 +8583,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8596,6 +8626,9 @@ paths:
default: 0
description: NVMeoF address family (0 - IPv4, 1 - IPv6)
type: integer
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
host_name:
description: NVMeoF hostname
type: string
@@ -8673,6 +8706,12 @@ paths:
name: force
schema:
type: boolean
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8707,6 +8746,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8748,6 +8793,9 @@ paths:
default: true
description: Create RBD image
type: boolean
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
load_balancing_group:
description: Load balancing group
type: integer
@@ -8805,6 +8853,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8844,6 +8898,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8883,6 +8943,9 @@ paths:
application/json:
schema:
properties:
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
load_balancing_group:
description: Load balancing group
type: integer
@@ -8937,6 +9000,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8959,11 +9028,31 @@ paths:
- NVMe-oF Subsystem Namespace
/api/osd:
get:
- parameters: []
+ parameters:
+ - default: 0
+ in: query
+ name: offset
+ schema:
+ type: integer
+ - default: 10
+ in: query
+ name: limit
+ schema:
+ type: integer
+ - default: ''
+ in: query
+ name: search
+ schema:
+ type: string
+ - default: ''
+ in: query
+ name: sort
+ schema:
+ type: string
responses:
'200':
content:
- application/vnd.ceph.api.v1.0+json:
+ application/vnd.ceph.api.v1.1+json:
type: object
description: OK
'400':
@@ -11384,6 +11473,9 @@ paths:
type: string
group_id:
type: string
+ mode:
+ default: ''
+ type: string
pipe_id:
type: string
source_bucket:
@@ -11391,6 +11483,9 @@ paths:
type: string
source_zones:
type: string
+ user:
+ default: ''
+ type: string
required:
- group_id
- pipe_id
@@ -11447,11 +11542,6 @@ paths:
type: string
- default: ''
in: query
- name: destination_bucket
- schema:
- type: string
- - default: ''
- in: query
name: bucket_name
schema:
type: string
@@ -11677,7 +11767,12 @@ paths:
- RgwMultisite
/api/rgw/multisite/sync_status:
get:
- parameters: []
+ parameters:
+ - allowEmptyValue: true
+ in: query
+ name: daemon_name
+ schema:
+ type: string
responses:
'200':
content:
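
The OSD listing endpoint is now published as v1.1 and accepts offset, limit, search and sort query parameters. A hedged sketch of a raw client call against it (base URL and authentication omitted):

```typescript
// Sketch only: auth headers are omitted; the Accept header names the v1.1 media
// type the paginated endpoint is published under.
async function listOsds(baseUrl: string): Promise<unknown> {
  const response = await fetch(
    `${baseUrl}/api/osd?offset=0&limit=10&search=&sort=%2Bname`,
    { headers: { Accept: 'application/vnd.ceph.api.v1.1+json' } }
  );
  return response.json();
}
```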
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_client.py b/src/pybind/mgr/dashboard/services/nvmeof_client.py
index d6b126500b0..e0ea6d1e48b 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_client.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_client.py
@@ -22,7 +22,7 @@ else:
class NVMeoFClient(object):
pb2 = pb2
- def __init__(self, gw_group: Optional[str] = None):
+ def __init__(self, gw_group: Optional[str] = None, traddr: Optional[str] = None):
logger.info("Initiating nvmeof gateway connection...")
try:
if not gw_group:
@@ -36,6 +36,23 @@ else:
f'Unable to retrieve the gateway info: {e}'
)
+            # When creating a listener, the request needs to be directed to the
+            # gateway address where the listener is supposed to be added.
+ if traddr:
+ gateways_info = NvmeofGatewaysConfig.get_gateways_config()
+ matched_gateway = next(
+ (
+ gateway
+ for gateways in gateways_info['gateways'].values()
+ for gateway in gateways
+ if traddr in gateway['service_url']
+ ),
+ None
+ )
+ if matched_gateway:
+ self.gateway_addr = matched_gateway.get('service_url')
+ logger.debug("Gateway address set to: %s", self.gateway_addr)
+
root_ca_cert = NvmeofGatewaysConfig.get_root_ca_cert(service_name)
if root_ca_cert:
client_key = NvmeofGatewaysConfig.get_client_key(service_name)
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
index a5a9979af25..2426c599078 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
@@ -93,9 +93,9 @@ class NvmeofGatewaysConfig(object):
return None
if group:
- return cls._get_name_url_for_group(gateways, group)
+ return _get_name_url_for_group(gateways, group)
- return cls._get_default_service(gateways)
+ return _get_default_service(gateways)
except (KeyError, IndexError) as e:
raise DashboardException(
@@ -129,52 +129,66 @@ class NvmeofGatewaysConfig(object):
orch = OrchClient.instance()
if orch.available():
if key:
- return orch.cert_store.get_key(entity, service_name)
- return orch.cert_store.get_cert(entity, service_name)
+ return orch.cert_store.get_key(entity, service_name,
+ ignore_missing_exception=True)
+ return orch.cert_store.get_cert(entity, service_name,
+ ignore_missing_exception=True)
return None
except OrchestratorError:
# just return None if any orchestrator error is raised
# otherwise nvmeof api will raise this error and doesn't proceed.
return None
- @classmethod
- def _get_name_url_for_group(cls, gateways, group):
- try:
- orch = OrchClient.instance()
- for service_name, svc_config in gateways.items():
- # get the group name of the service and match it against the
- # group name provided
- group_name_from_svc = orch.services.get(service_name)[0].spec.group
- if group == group_name_from_svc:
- running_daemons = cls._get_running_daemons(orch, service_name)
- config = cls._get_running_daemon_svc_config(svc_config, running_daemons)
-
- if config:
- return service_name, config['service_url']
- return None
- except OrchestratorError:
- return cls._get_default_service(gateways)
+def _get_name_url_for_group(gateways, group):
+ try:
+ orch = OrchClient.instance()
+ for service_name, svc_config in gateways.items():
+ # get the group name of the service and match it against the
+ # group name provided
+ group_name_from_svc = orch.services.get(service_name)[0].spec.group
+ if group == group_name_from_svc:
+ running_daemons = _get_running_daemons(orch, service_name)
+ config = _get_running_daemon_svc_config(svc_config, running_daemons)
- @classmethod
- def _get_running_daemons(cls, orch, service_name):
- # get the running nvmeof daemons
- daemons = [d.to_dict()
- for d in orch.services.list_daemons(service_name=service_name)]
- return [d['daemon_name'] for d in daemons
- if d['status_desc'] == 'running']
+ if config:
+ return service_name, config['service_url']
+ return None
- @classmethod
- def _get_running_daemon_svc_config(cls, svc_config, running_daemons):
- try:
- return next(config for config in svc_config
- if config['daemon_name'] in running_daemons)
- except StopIteration:
- return None
+ except OrchestratorError:
+ return _get_default_service(gateways)
- @classmethod
- def _get_default_service(cls, gateways):
- if gateways:
- service_name = list(gateways.keys())[0]
- return service_name, gateways[service_name][0]['service_url']
+
+def _get_running_daemons(orch, service_name):
+ # get the running nvmeof daemons
+ daemons = [d.to_dict()
+ for d in orch.services.list_daemons(service_name=service_name)]
+ return [d['daemon_name'] for d in daemons
+ if d['status_desc'] == 'running']
+
+
+def _get_running_daemon_svc_config(svc_config, running_daemons):
+ try:
+ return next(config for config in svc_config
+ if config['daemon_name'] in running_daemons)
+ except StopIteration:
return None
+
+
+def _get_default_service(gateways):
+ if gateways:
+ gateway_keys = list(gateways.keys())
+        # If there is more than one gateway group, rather than choosing a random
+        # gateway from any of the groups, raise an exception to make it clear
+        # that the group name must be specified in the API request.
+ if len(gateway_keys) > 1:
+ raise DashboardException(
+ msg=(
+ "Multiple NVMe-oF gateway groups are configured. "
+ "Please specify the 'gw_group' parameter in the request."
+ ),
+ component="nvmeof"
+ )
+ service_name = gateway_keys[0]
+ return service_name, gateways[service_name][0]['service_url']
+ return None
diff --git a/src/pybind/mgr/dashboard/services/orchestrator.py b/src/pybind/mgr/dashboard/services/orchestrator.py
index 1f77b3c0ab5..38859167568 100644
--- a/src/pybind/mgr/dashboard/services/orchestrator.py
+++ b/src/pybind/mgr/dashboard/services/orchestrator.py
@@ -209,13 +209,17 @@ class CertStoreManager(ResourceManager):
@wait_api_result
def get_cert(self, entity: str, service_name: Optional[str] = None,
- hostname: Optional[str] = None) -> str:
- return self.api.cert_store_get_cert(entity, service_name, hostname)
+ hostname: Optional[str] = None,
+ ignore_missing_exception: bool = False) -> str:
+ return self.api.cert_store_get_cert(entity, service_name, hostname,
+ no_exception_when_missing=ignore_missing_exception)
@wait_api_result
def get_key(self, entity: str, service_name: Optional[str] = None,
- hostname: Optional[str] = None) -> str:
- return self.api.cert_store_get_key(entity, service_name, hostname)
+ hostname: Optional[str] = None,
+ ignore_missing_exception: bool = False) -> str:
+ return self.api.cert_store_get_key(entity, service_name, hostname,
+ no_exception_when_missing=ignore_missing_exception)
class OrchClient(object):
diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py
index 2441b73b361..f0352b490f9 100755
--- a/src/pybind/mgr/dashboard/services/rgw_client.py
+++ b/src/pybind/mgr/dashboard/services/rgw_client.py
@@ -288,21 +288,22 @@ class RgwClient(RestClient):
daemon_keys = RgwClient._daemons.keys()
if not daemon_name:
- if len(daemon_keys) > 1:
- try:
- multiiste = RgwMultisite()
- default_zonegroup = multiiste.get_all_zonegroups_info()['default_zonegroup']
-
- # Iterate through _daemons.values() to find the daemon with the
- # matching zonegroup_id
- for daemon in RgwClient._daemons.values():
- if daemon.zonegroup_id == default_zonegroup:
- daemon_name = daemon.name
- break
- except Exception: # pylint: disable=broad-except
- daemon_name = next(iter(daemon_keys))
- else:
- # Handle the case where there is only one or no key in _daemons
+ try:
+ if len(daemon_keys) > 1:
+ default_zonegroup = (
+ RgwMultisite()
+ .get_all_zonegroups_info()['default_zonegroup']
+ )
+ if default_zonegroup:
+ daemon_name = next(
+ (daemon.name
+ for daemon in RgwClient._daemons.values()
+ if daemon.zonegroup_id == default_zonegroup),
+ None
+ )
+ daemon_name = daemon_name or next(iter(daemon_keys))
+ except Exception as e: # pylint: disable=broad-except
+ logger.exception('Failed to determine default RGW daemon: %s', str(e))
daemon_name = next(iter(daemon_keys))
# Discard all cached instances if any rgw setting has changed
@@ -1298,7 +1299,7 @@ class RgwMultisiteAutomation:
user_found = False
start_time = time.time()
while not user_found:
- if time.time() - start_time > 120: # Timeout after 2 minutes
+ if time.time() - start_time > 300: # Timeout after 5 minutes
logger.error("Timeout reached while waiting for user %s to appear \
in the second cluster", username)
raise DashboardException(code='user_replication_timeout',
@@ -1981,8 +1982,16 @@ class RgwMultisite:
is_multisite_configured = False
return is_multisite_configured
- def get_multisite_sync_status(self):
+ def get_multisite_sync_status(self, daemon_name: str):
rgw_multisite_sync_status_cmd = ['sync', 'status']
+ daemons = _get_daemons()
+ try:
+ realm_name = daemons[daemon_name].realm_name
+ except (KeyError, AttributeError):
+ raise DashboardException('Unable to get realm name from daemon',
+ http_status_code=500, component='rgw')
+ if realm_name:
+ rgw_multisite_sync_status_cmd.extend(['--rgw-realm', realm_name])
try:
exit_code, out, _ = mgr.send_rgwadmin_command(rgw_multisite_sync_status_cmd, False)
if exit_code > 0:
@@ -2236,7 +2245,8 @@ class RgwMultisite:
source_bucket: str = '',
destination_bucket: str = '',
bucket_name: str = '',
- update_period=False):
+ update_period=False,
+ user: str = '', mode: str = ''):
if source_zones['added'] or destination_zones['added']:
rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'create',
@@ -2245,11 +2255,9 @@ class RgwMultisite:
if bucket_name:
rgw_sync_policy_cmd += ['--bucket', bucket_name]
- if source_bucket:
- rgw_sync_policy_cmd += ['--source-bucket', source_bucket]
+ rgw_sync_policy_cmd += ['--source-bucket', source_bucket]
- if destination_bucket:
- rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
+ rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
if source_zones['added']:
rgw_sync_policy_cmd += ['--source-zones', ','.join(source_zones['added'])]
@@ -2257,6 +2265,12 @@ class RgwMultisite:
if destination_zones['added']:
rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones['added'])]
+ if user:
+ rgw_sync_policy_cmd += ['--uid', user]
+
+ if mode:
+ rgw_sync_policy_cmd += ['--mode', mode]
+
logger.info("Creating sync pipe!")
try:
exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd)
@@ -2271,13 +2285,13 @@ class RgwMultisite:
if ((source_zones['removed'] and '*' not in source_zones['added'])
or (destination_zones['removed'] and '*' not in destination_zones['added'])):
self.remove_sync_pipe(group_id, pipe_id, source_zones['removed'],
- destination_zones['removed'], destination_bucket,
- bucket_name)
+ destination_zones['removed'],
+ bucket_name, True)
def remove_sync_pipe(self, group_id: str, pipe_id: str,
source_zones: Optional[List[str]] = None,
destination_zones: Optional[List[str]] = None,
- destination_bucket: str = '', bucket_name: str = '',
+ bucket_name: str = '',
update_period=False):
rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'remove',
'--group-id', group_id, '--pipe-id', pipe_id]
@@ -2291,9 +2305,6 @@ class RgwMultisite:
if destination_zones:
rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones)]
- if destination_bucket:
- rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
-
logger.info("Removing sync pipe! %s", rgw_sync_policy_cmd)
try:
exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd)
diff --git a/src/pybind/mgr/dashboard/tests/test_osd.py b/src/pybind/mgr/dashboard/tests/test_osd.py
index c3cd0dca88d..9b6dbd10de1 100644
--- a/src/pybind/mgr/dashboard/tests/test_osd.py
+++ b/src/pybind/mgr/dashboard/tests/test_osd.py
@@ -8,6 +8,7 @@ from ceph.deployment.drive_group import DeviceSelection, DriveGroupSpec # type:
from ceph.deployment.service_spec import PlacementSpec
from .. import mgr
+from ..controllers._version import APIVersion
from ..controllers.osd import Osd, OsdUi
from ..services.osd import OsdDeploymentOptions
from ..tests import ControllerTestCase
@@ -274,7 +275,7 @@ class OsdTest(ControllerTestCase):
osds_leftover = [0, 1, 2]
with self._mock_osd_list(osd_stat_ids=osds_actual, osdmap_tree_node_ids=osds_leftover,
osdmap_ids=osds_actual):
- self._get('/api/osd')
+ self._get('/api/osd', version=APIVersion(1, 1))
self.assertEqual(len(self.json_body()), 2, 'It should display two OSDs without failure')
self.assertStatus(200)
diff --git a/src/pybind/mgr/dashboard/tools.py b/src/pybind/mgr/dashboard/tools.py
index 51ed9c471aa..14de970cceb 100644
--- a/src/pybind/mgr/dashboard/tools.py
+++ b/src/pybind/mgr/dashboard/tools.py
@@ -9,9 +9,9 @@ import threading
import time
import urllib
from datetime import datetime, timedelta
-from distutils.util import strtobool
import cherrypy
+from ceph.utils import strtobool
from mgr_util import build_url
from . import mgr
diff --git a/src/pybind/mgr/mirroring/fs/snapshot_mirror.py b/src/pybind/mgr/mirroring/fs/snapshot_mirror.py
index 2bfb6482674..c348ce82de1 100644
--- a/src/pybind/mgr/mirroring/fs/snapshot_mirror.py
+++ b/src/pybind/mgr/mirroring/fs/snapshot_mirror.py
@@ -722,6 +722,20 @@ class FSSnapshotMirror:
except Exception as e:
return e.args[0], '', 'failed to remove directory'
+ def list_dirs(self, filesystem):
+ try:
+ with self.lock:
+ if not self.filesystem_exist(filesystem):
+ raise MirrorException(-errno.ENOENT, f'filesystem {filesystem} does not exist')
+ fspolicy = self.pool_policy.get(filesystem, None)
+ if not fspolicy:
+ raise MirrorException(-errno.EINVAL, f'filesystem {filesystem} is not mirrored')
+ return 0, json.dumps(list(fspolicy.policy.dir_states.keys()), indent=4, sort_keys=True), ''
+ except MirrorException as me:
+ return me.args[0], '', me.args[1]
+ except Exception as e:
+ return e.args[0], '', 'failed to list directories'
+
def status(self,filesystem, dir_path):
try:
with self.lock:
diff --git a/src/pybind/mgr/mirroring/module.py b/src/pybind/mgr/mirroring/module.py
index 4b4354ab2b9..67f0942147e 100644
--- a/src/pybind/mgr/mirroring/module.py
+++ b/src/pybind/mgr/mirroring/module.py
@@ -84,6 +84,12 @@ class Module(MgrModule):
"""Remove a snapshot mirrored directory"""
return self.fs_snapshot_mirror.remove_dir(fs_name, path)
+ @CLIWriteCommand('fs snapshot mirror ls')
+ def snapshot_mirror_ls(self,
+ fs_name: str):
+ """List the snapshot mirrored directories"""
+ return self.fs_snapshot_mirror.list_dirs(fs_name)
+
@CLIReadCommand('fs snapshot mirror dirmap')
def snapshot_mirror_dirmap(self,
fs_name: str,
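The new "fs snapshot mirror ls" command returns whatever list_dirs() produces: a pretty-printed JSON array of the directory paths tracked in the filesystem's dir_states map. A minimal sketch of the expected (retval, stdout, stderr) tuple, using hypothetical paths:

    import json

    # hypothetical dir_states mapping as kept by a per-filesystem mirror policy
    dir_states = {'/volumes/_nogroup/subvol1': {}, '/projects/build': {}}

    # list_dirs() returns the usual mgr command tuple: (retval, stdout, stderr)
    retval, out, err = 0, json.dumps(list(dir_states.keys()), indent=4, sort_keys=True), ''
    # out is a JSON array of the mirrored directory paths, e.g.
    # [
    #     "/volumes/_nogroup/subvol1",
    #     "/projects/build"
    # ]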
diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
index 726a7ac7937..3247b06a399 100644
--- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
+++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
@@ -102,7 +102,7 @@ placement:
host_pattern: '*'
status:
container_image_id: 74803e884bea289d2d2d3ebdf6d37cd560499e955595695b1390a89800f4e37a
- container_image_name: docker.io/ceph/daemon-base:latest-master-devel
+ container_image_name: quay.io/ceph/daemon-base:latest-main-devel
created: '2020-06-10T10:37:31.051288Z'
last_refresh: '2020-06-10T10:57:40.715637Z'
running: 1
diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py
index dea45f951f8..3e8544f43cf 100644
--- a/src/pybind/mgr/smb/enums.py
+++ b/src/pybind/mgr/smb/enums.py
@@ -21,7 +21,7 @@ class CephFSStorageProvider(_StrEnum):
def expand(self) -> 'CephFSStorageProvider':
"""Expand abbreviated/default values into the full/expanded form."""
- if self == self.SAMBA_VFS:
+ if self is self.SAMBA_VFS:
# mypy gets confused by enums
return self.__class__(self.SAMBA_VFS_NEW)
return self
@@ -89,9 +89,9 @@ class LoginAccess(_StrEnum):
def expand(self) -> 'LoginAccess':
"""Exapend abbreviated enum values into their full forms."""
# the extra LoginAccess(...) calls are to appease mypy
- if self == self.READ_ONLY_SHORT:
+ if self is self.READ_ONLY_SHORT:
return LoginAccess(self.READ_ONLY)
- if self == self.READ_WRITE_SHORT:
+ if self is self.READ_WRITE_SHORT:
return LoginAccess(self.READ_WRITE)
return self
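The equality-to-identity change above works because enum members are singletons, so 'is' comparisons are safe and sidestep the mypy confusion noted in the comment. A small sketch with illustrative names:

    import enum

    class Access(str, enum.Enum):
        READ_ONLY = 'read'
        READ_ONLY_SHORT = 'r'

    a = Access('r')
    assert a is Access.READ_ONLY_SHORT  # members are singletons, so identity comparison works
    assert a == 'r'                     # str-mixin members still compare equal to their value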
diff --git a/src/pybind/mgr/smb/handler.py b/src/pybind/mgr/smb/handler.py
index 670cb15a587..7b993d5b60d 100644
--- a/src/pybind/mgr/smb/handler.py
+++ b/src/pybind/mgr/smb/handler.py
@@ -834,6 +834,19 @@ def _check_cluster(cluster: ClusterRef, staging: _Staging) -> None:
)
+def _parse_earmark(earmark: str) -> dict:
+ parts = earmark.split('.')
+
+ # If it only has one part (e.g., 'smb'), return None for cluster_id
+ if len(parts) == 1:
+ return {'scope': parts[0], 'cluster_id': None}
+
+ return {
+ 'scope': parts[0],
+ 'cluster_id': parts[2] if len(parts) > 2 else None,
+ }
+
+
def _check_share(
share: ShareRef,
staging: _Staging,
@@ -878,19 +891,28 @@ def _check_share(
smb_earmark,
)
else:
+ parsed_earmark = _parse_earmark(earmark)
+
+ # Check if the top-level scope is not SMB
if not earmark_resolver.check_earmark(
earmark, EarmarkTopScope.SMB
):
raise ErrorResult(
share,
- msg=f"earmark has already been set by {earmark.split('.')[0]}",
+ msg=f"earmark has already been set by {parsed_earmark['scope']}",
)
- # Check if earmark is set by same cluster
- if earmark.split('.')[2] != share.cluster_id:
+
+ # Check if the earmark is set by a different cluster
+ if (
+ parsed_earmark['cluster_id']
+ and parsed_earmark['cluster_id'] != share.cluster_id
+ ):
raise ErrorResult(
share,
- msg=f"earmark has already been set by smb cluster {earmark.split('.')[2]}",
+ msg="earmark has already been set by smb cluster "
+ f"{parsed_earmark['cluster_id']}",
)
+
name_used_by = _share_name_in_use(staging, share)
if name_used_by:
raise ErrorResult(
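To illustrate the parsing helper introduced above (the 'scope.<subsystem>.<cluster_id>' layout is inferred from the surrounding checks; the values here are hypothetical):

    _parse_earmark('smb')                    # {'scope': 'smb', 'cluster_id': None}
    _parse_earmark('smb.cluster')            # {'scope': 'smb', 'cluster_id': None}
    _parse_earmark('smb.cluster.cluster1')   # {'scope': 'smb', 'cluster_id': 'cluster1'}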
diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py
index 7483eb7964b..77a08229cf0 100644
--- a/src/pybind/mgr/smb/module.py
+++ b/src/pybind/mgr/smb/module.py
@@ -171,6 +171,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
custom_dns: Optional[List[str]] = None,
placement: Optional[str] = None,
clustering: Optional[SMBClustering] = None,
+ public_addrs: Optional[List[str]] = None,
) -> results.Result:
"""Create an smb cluster"""
domain_settings = None
@@ -255,6 +256,18 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
)
)
+ c_public_addrs = []
+ if public_addrs:
+ for pa in public_addrs:
+ pa_arr = pa.split('%', 1)
+ address = pa_arr[0]
+ destination = pa_arr[1] if len(pa_arr) > 1 else None
+ c_public_addrs.append(
+ resources.ClusterPublicIPAssignment(
+ address=address, destination=destination
+ )
+ )
+
pspec = resources.WrappedPlacementSpec.wrap(
PlacementSpec.from_string(placement)
)
@@ -266,6 +279,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
custom_dns=custom_dns,
placement=pspec,
clustering=clustering,
+ public_addrs=c_public_addrs,
)
to_apply.append(cluster)
return self._handler.apply(to_apply, create_only=True).squash(cluster)
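The public_addrs handling above splits each entry on the first '%' into an address and an optional destination for the resulting ClusterPublicIPAssignment. A minimal sketch with hypothetical values:

    public_addrs = ['192.168.4.51/24%eth1', '192.168.4.52/24']
    for pa in public_addrs:
        pa_arr = pa.split('%', 1)
        address = pa_arr[0]                                   # '192.168.4.51/24'
        destination = pa_arr[1] if len(pa_arr) > 1 else None  # 'eth1' for the first entry, None for the second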
diff --git a/src/pybind/mgr/status/module.py b/src/pybind/mgr/status/module.py
index 85e65266a55..2b59132c1cb 100644
--- a/src/pybind/mgr/status/module.py
+++ b/src/pybind/mgr/status/module.py
@@ -161,7 +161,7 @@ class Module(MgrModule):
if output_format in ('json', 'json-pretty'):
json_output['mdsmap'].append({
- 'rank': rank,
+ 'rank': f"{daemon_info['rank']}-s",
'name': daemon_info['name'],
'state': 'standby-replay',
'events': events,
diff --git a/src/pybind/mgr/telemetry/tox.ini b/src/pybind/mgr/telemetry/tox.ini
index a887590eed8..b2210da54ea 100644
--- a/src/pybind/mgr/telemetry/tox.ini
+++ b/src/pybind/mgr/telemetry/tox.ini
@@ -1,7 +1,6 @@
[tox]
envlist =
py3
- mypy
skipsdist = true
[testenv]
diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini
index a8a2d39d01a..f39ececa93d 100644
--- a/src/pybind/mgr/tox.ini
+++ b/src/pybind/mgr/tox.ini
@@ -160,7 +160,8 @@ modules =
commands =
flake8 --config=tox.ini {posargs} \
{posargs:{[testenv:flake8]modules}}
- bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 13'
+ bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3'
+ bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26'
[testenv:jinjalint]
deps =
diff --git a/src/pybind/mgr/volumes/fs/operations/pin_util.py b/src/pybind/mgr/volumes/fs/operations/pin_util.py
index a12ab5b4d4b..631fdd8fcaa 100644
--- a/src/pybind/mgr/volumes/fs/operations/pin_util.py
+++ b/src/pybind/mgr/volumes/fs/operations/pin_util.py
@@ -3,7 +3,7 @@ import errno
import cephfs
from ..exception import VolumeException
-from distutils.util import strtobool
+from ceph.utils import strtobool
_pin_value = {
"export": lambda x: int(x),
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index d1ef9f5ac95..459ab7df1a0 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1804,6 +1804,7 @@ class MgmtGatewaySpec(ServiceSpec):
ssl_stapling_verify: Optional[str] = None,
ssl_protocols: Optional[List[str]] = None,
ssl_ciphers: Optional[List[str]] = None,
+ enable_health_check_endpoint: bool = False,
preview_only: bool = False,
unmanaged: bool = False,
extra_container_args: Optional[GeneralArgList] = None,
@@ -1849,6 +1850,7 @@ class MgmtGatewaySpec(ServiceSpec):
self.ssl_protocols = ssl_protocols
#: List of supported secure SSL ciphers. Changing this list may reduce system security.
self.ssl_ciphers = ssl_ciphers
+ self.enable_health_check_endpoint = enable_health_check_endpoint
def get_port_start(self) -> List[int]:
ports = []
diff --git a/src/python-common/ceph/utils.py b/src/python-common/ceph/utils.py
index e92a2d1de7d..0544e9f4173 100644
--- a/src/python-common/ceph/utils.py
+++ b/src/python-common/ceph/utils.py
@@ -167,3 +167,18 @@ def http_req(hostname: str = '',
log.error(e)
# handle error here if needed
raise
+
+
+_TRUE_VALS = {'y', 'yes', 't', 'true', 'on', '1'}
+_FALSE_VALS = {'n', 'no', 'f', 'false', 'off', '0'}
+
+
+def strtobool(value: str) -> bool:
+ """Convert a string to a boolean value.
+ Based on a similar function that was once available as distutils.util.strtobool.
+ """
+ if value.lower() in _TRUE_VALS:
+ return True
+ if value.lower() in _FALSE_VALS:
+ return False
+ raise ValueError(f'invalid truth value {value!r}')
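A minimal sketch of how the new helper behaves (unlike the removed distutils function, it returns a bool rather than 1/0):

    from ceph.utils import strtobool

    strtobool('Yes')    # True
    strtobool('0')      # False
    strtobool('maybe')  # raises ValueError: invalid truth value 'maybe'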
diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc
index cf6820a9111..a87d88c4b85 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.cc
+++ b/src/rgw/driver/daos/rgw_sal_daos.cc
@@ -1028,6 +1028,22 @@ int DaosObject::transition_to_cloud(
return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+int DaosObject::restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags)
+{
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
bool DaosObject::placement_rules_match(rgw_placement_rule& r1,
rgw_placement_rule& r2) {
/* XXX: support single default zone and zonegroup for now */
diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h
index 7cc20260227..e382fdb04ae 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.h
+++ b/src/rgw/driver/daos/rgw_sal_daos.h
@@ -649,6 +649,18 @@ class DaosObject : public StoreObject {
CephContext* cct, bool update_object,
const DoutPrefixProvider* dpp,
optional_yield y) override;
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) override;
virtual bool placement_rules_match(rgw_placement_rule& r1,
rgw_placement_rule& r2) override;
virtual int dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y,
diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc
index 0ce02bcff13..1345468210f 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.cc
+++ b/src/rgw/driver/posix/rgw_sal_posix.cc
@@ -3039,6 +3039,22 @@ int POSIXObject::transition_to_cloud(Bucket* bucket,
return -ERR_NOT_IMPLEMENTED;
}
+int POSIXObject::restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags)
+{
+ return -ERR_NOT_IMPLEMENTED;
+}
+
bool POSIXObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
{
return (r1 == r2);
diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h
index efe3bfd7a50..8ec72bbc1bc 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.h
+++ b/src/rgw/driver/posix/rgw_sal_posix.h
@@ -681,6 +681,18 @@ public:
bool update_object,
const DoutPrefixProvider* dpp,
optional_yield y) override;
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) override;
virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
virtual int swift_versioning_restore(const ACLOwner& owner, const rgw_user& remote_user, bool& restored,
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
index 792671579b7..d5437f548c1 100644
--- a/src/rgw/driver/rados/rgw_data_sync.cc
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -6052,12 +6052,13 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
} else {
tn->log(20, SSTR("logged prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode));
}
- }
+ } else {
retcode = -EAGAIN;
tn->log(10, SSTR("ERROR: requested sync of future generation "
<< *gen << " > " << current_gen
<< ", returning " << retcode << " for later retry"));
return set_cr_error(retcode);
+ }
} else if (*gen < current_gen) {
tn->log(10, SSTR("WARNING: requested sync of past generation "
<< *gen << " < " << current_gen
diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h
index 58042df2c62..6cfaee9dc82 100644
--- a/src/rgw/driver/rados/rgw_datalog.h
+++ b/src/rgw/driver/rados/rgw_datalog.h
@@ -241,10 +241,7 @@ class RGWDataChangesLog {
std::unique_ptr<DataLogBackends> bes;
const int num_shards;
- std::string get_prefix() {
- auto prefix = cct->_conf->rgw_data_log_obj_prefix;
- return prefix.empty() ? prefix : "data_log";
- }
+ std::string get_prefix() { return "data_log"; }
std::string metadata_log_oid() {
return get_prefix() + "generations_metadata";
}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc
index 64c55700eb2..e932c997621 100644
--- a/src/rgw/driver/rados/rgw_lc_tier.cc
+++ b/src/rgw/driver/rados/rgw_lc_tier.cc
@@ -14,6 +14,7 @@
#include "rgw_common.h"
#include "rgw_rest.h"
#include "svc_zone.h"
+#include "rgw_rados.h"
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
@@ -231,18 +232,38 @@ static void init_headers(map<string, bufferlist>& attrs,
}
}
-/* Read object or just head from remote endpoint. For now initializes only headers,
- * but can be extended to fetch etag, mtime etc if needed.
+struct generic_attr {
+ const char *http_header;
+ const char *rgw_attr;
+};
+
+/*
+ * mapping between http env fields and rgw object attrs
+ */
+static const struct generic_attr generic_attrs[] = {
+ { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE },
+ { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG },
+ { "HTTP_EXPIRES", RGW_ATTR_EXPIRES },
+ { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL },
+ { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP },
+ { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC },
+ { "HTTP_X_ROBOTS_TAG", RGW_ATTR_X_ROBOTS_TAG },
+ { "ETAG", RGW_ATTR_ETAG },
+};
+
+/* Read object or just head from remote endpoint.
*/
-static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
- std::map<std::string, std::string>& headers) {
+int rgw_cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
+ std::map<std::string, std::string>& headers,
+ real_time* pset_mtime, std::string& etag,
+ uint64_t& accounted_size, rgw::sal::Attrs& attrs,
+ void* cb) {
RGWRESTConn::get_obj_params req_params;
std::string target_obj_name;
int ret = 0;
rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
tier_ctx.target_storage_class);
- std::string etag;
RGWRESTStreamRWRequest *in_req;
rgw_bucket dest_bucket;
@@ -261,20 +282,57 @@ static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
req_params.rgwx_stat = true;
req_params.sync_manifest = true;
req_params.skip_decrypt = true;
+ req_params.cb = (RGWHTTPStreamRWRequest::ReceiveCB *)cb;
- ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req);
- if (ret < 0) {
- ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
- return ret;
+ ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): fetching object from cloud bucket:" << dest_bucket << ", object: " << target_obj_name << dendl;
+
+ static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20;
+ for (int tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) {
+ ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* fetch headers */
+ // accounted_size in complete_request() reads from RGWX_OBJECT_SIZE, which is set
+ // only for internal ops/sync, so read it from headers["CONTENT_LENGTH"] instead.
+ // The same applies to pattrs.
+ ret = tier_ctx.conn.complete_request(tier_ctx.dpp, in_req, &etag, pset_mtime, nullptr, nullptr, &headers, null_yield);
+ if (ret < 0) {
+ if (ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
+ ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): failed to fetch object from remote. retries=" << tries << dendl;
+ continue;
+ }
+ return ret;
+ }
+ break;
}
- /* fetch headers */
- ret = tier_ctx.conn.complete_request(tier_ctx.dpp, in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
- return ret;
+ static map<string, string> generic_attrs_map;
+ for (const auto& http2rgw : generic_attrs) {
+ generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr;
}
- return 0;
+
+ for (auto header: headers) {
+ const char* name = header.first.c_str();
+ const string& val = header.second;
+ bufferlist bl;
+ bl.append(val.c_str(), val.size());
+
+ const auto aiter = generic_attrs_map.find(name);
+ if (aiter != std::end(generic_attrs_map)) {
+ ldpp_dout(tier_ctx.dpp, 20) << __func__ << " Received attrs aiter->first = " << aiter->first << ", aiter->second = " << aiter->second << ret << dendl;
+ attrs[aiter->second] = bl;
+ }
+
+ if (header.first == "CONTENT_LENGTH") {
+ accounted_size = atoi(val.c_str());
+ }
+ }
+
+ ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): Sucessfully fetched object from cloud bucket:" << dest_bucket << ", object: " << target_obj_name << dendl;
+ return ret;
}
static bool is_already_tiered(const DoutPrefixProvider *dpp,
@@ -1184,9 +1242,12 @@ static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) {
static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) {
int ret;
std::map<std::string, std::string> headers;
+ std::string etag;
+ uint64_t accounted_size;
+ rgw::sal::Attrs attrs;
/* Fetch Head object */
- ret = cloud_tier_get_object(tier_ctx, true, headers);
+ ret = rgw_cloud_tier_get_object(tier_ctx, true, headers, nullptr, etag, accounted_size, attrs, nullptr);
if (ret < 0) {
ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl;
diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h
index 729c4c304cd..fd8013eb000 100644
--- a/src/rgw/driver/rados/rgw_lc_tier.h
+++ b/src/rgw/driver/rados/rgw_lc_tier.h
@@ -49,3 +49,9 @@ struct RGWLCCloudTierCtx {
/* Transition object to cloud endpoint */
int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
+
+int rgw_cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
+ std::map<std::string, std::string>& headers,
+ real_time* pset_mtime, std::string& etag,
+ uint64_t& accounted_size, rgw::sal::Attrs& attrs,
+ void* cb);
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc
index a5d788ea469..09a544df805 100644
--- a/src/rgw/driver/rados/rgw_object_expirer_core.cc
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc
@@ -219,13 +219,9 @@ int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objex
}
rgw_obj_key key = hint.obj_key;
- if (key.instance.empty()) {
- key.instance = "null";
- }
std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
- obj->set_atomic();
- ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr);
+ ret = static_cast<rgw::sal::RadosObject*>(obj.get())->handle_obj_expiry(dpp, null_yield);
return ret;
}
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index 99751be392f..d154082994e 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -37,6 +37,7 @@
#include "rgw_cr_rest.h"
#include "rgw_datalog.h"
#include "rgw_putobj_processor.h"
+#include "rgw_lc_tier.h"
#include "cls/rgw/cls_rgw_ops.h"
#include "cls/rgw/cls_rgw_client.h"
@@ -3212,6 +3213,30 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
}
+ /* For temporarily restored copies, the storage-class returned
+ * in GET/list-objects should correspond to the original
+ * cloudtier storage class. For GET this is handled in the REST
+ * response by checking RESTORE_TYPE in attrs, but the same
+ * cannot be done for the list-objects response, hence the storage
+ * class needs to be updated in the bucket-index entry itself.
+ */
+ auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE);
+ if (attr_iter != attrs.end()) {
+ rgw::sal::RGWRestoreType rt;
+ bufferlist bl = attr_iter->second;
+ auto iter = bl.cbegin();
+ decode(rt, iter);
+
+ if (rt == rgw::sal::RGWRestoreType::Temporary) {
+ // temporary restore; set storage-class to cloudtier storage class
+ auto c_iter = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS);
+
+ if (c_iter != attrs.end()) {
+ storage_class = rgw_bl_str(c_iter->second);
+ }
+ }
+ }
+
if (!op.size())
return 0;
@@ -3248,7 +3273,7 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
auto& ioctx = ref.ioctx;
tracepoint(rgw_rados, operate_enter, req_id.c_str());
- r = rgw_rados_operate(rctx.dpp, ref.ioctx, ref.obj.oid, &op, rctx.y, 0, &trace);
+ r = rgw_rados_operate(rctx.dpp, ref.ioctx, ref.obj.oid, &op, rctx.y, 0, &trace, &epoch);
tracepoint(rgw_rados, operate_exit, req_id.c_str());
if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
or -ENOENT if was removed, or -EEXIST if it did not exist
@@ -3260,7 +3285,6 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
goto done_cancel;
}
- epoch = ioctx.get_last_version();
poolid = ioctx.get_id();
r = target->complete_atomic_modification(rctx.dpp, rctx.y);
@@ -5127,6 +5151,199 @@ int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
return 0;
}
+int RGWRados::restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx,
+ RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_obj& dest_obj,
+ rgw_placement_rule& dest_placement,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider *dpp,
+ optional_yield y,
+ bool log_op){
+
+ //XXX: read below from attrs .. check transition_obj()
+ ACLOwner owner;
+ rgw::sal::Attrs attrs;
+ const req_context rctx{dpp, y, nullptr};
+ int ret = 0;
+ bufferlist t, t_tier;
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+ auto aio = rgw::make_throttle(cct->_conf->rgw_put_obj_min_window_size, y);
+ using namespace rgw::putobj;
+ jspan_context no_trace{false, false};
+ rgw::putobj::AtomicObjectProcessor processor(aio.get(), this, dest_bucket_info, nullptr,
+ owner, obj_ctx, dest_obj, olh_epoch, tag, dpp, y, no_trace);
+
+ void (*progress_cb)(off_t, void *) = NULL;
+ void *progress_data = NULL;
+ bool cb_processed = false;
+ RGWFetchObjFilter *filter = nullptr;
+ RGWFetchObjFilter_Default source_filter;
+ if (!filter) {
+ filter = &source_filter;
+ }
+ boost::optional<RGWPutObj_Compress> compressor;
+ CompressorRef plugin;
+ RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
+ [&](map<string, bufferlist> obj_attrs) {
+ // XXX: do we need filter() like in fetch_remote_obj() cb
+ dest_placement.inherit_from(dest_bucket_info.placement_rule);
+ /* For now we always restore to STANDARD storage-class.
+ * Later we will add support to take restore-target-storage-class
+ * for permanent restore
+ */
+ dest_placement.storage_class = RGW_STORAGE_CLASS_STANDARD;
+
+ processor.set_tail_placement(dest_placement);
+
+ ret = processor.prepare(rctx.y);
+ if (ret < 0) {
+ return ret;
+ }
+ cb_processed = true;
+ return 0;
+ });
+
+ uint64_t accounted_size = 0;
+ string etag;
+ real_time set_mtime;
+ std::map<std::string, std::string> headers;
+ ldpp_dout(dpp, 20) << "Fetching from cloud, object:" << dest_obj << dendl;
+ ret = rgw_cloud_tier_get_object(tier_ctx, false, headers,
+ &set_mtime, etag, accounted_size,
+ attrs, &cb);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "Fetching from cloud failed, object:" << dest_obj << dendl;
+ return ret;
+ }
+
+ if (!cb_processed) {
+ ldpp_dout(dpp, 20) << "Callback not processed, object:" << dest_obj << dendl;
+ return -EIO;
+ }
+
+ ret = cb.flush();
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (cb.get_data_len() != accounted_size) {
+ ret = -EIO;
+ ldpp_dout(dpp, -1) << "ERROR: object truncated during fetching, expected "
+ << accounted_size << " bytes but received " << cb.get_data_len() << dendl;
+ return ret;
+ }
+
+ {
+ bufferlist bl;
+ encode(rgw::sal::RGWRestoreStatus::CloudRestored, bl);
+ attrs[RGW_ATTR_RESTORE_STATUS] = std::move(bl);
+ }
+
+ ceph::real_time restore_time = real_clock::now();
+ {
+ char buf[32];
+ utime_t ut(restore_time);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+ bufferlist bl;
+ bl.append(buf, 32);
+ encode(restore_time, bl);
+ attrs[RGW_ATTR_RESTORE_TIME] = std::move(bl);
+ }
+
+ real_time delete_at = real_time();
+ if (days) { //temp copy; do not change mtime and set expiry date
+ int expiry_days = days.value();
+ constexpr int32_t secs_in_a_day = 24 * 60 * 60;
+ ceph::real_time expiration_date;
+
+ if (cct->_conf->rgw_restore_debug_interval > 0) {
+ expiration_date = restore_time + make_timespan(double(expiry_days)*cct->_conf->rgw_restore_debug_interval);
+ ldpp_dout(dpp, 20) << "Setting expiration time to rgw_restore_debug_interval: " << double(expiry_days)*cct->_conf->rgw_restore_debug_interval << ", days:" << expiry_days << dendl;
+ } else {
+ expiration_date = restore_time + make_timespan(double(expiry_days) * secs_in_a_day);
+ }
+ delete_at = expiration_date;
+
+ {
+ char buf[32];
+ utime_t ut(expiration_date);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+ bufferlist bl;
+ bl.append(buf, 32);
+ encode(expiration_date, bl);
+ attrs[RGW_ATTR_RESTORE_EXPIRY_DATE] = std::move(bl);
+ }
+ {
+ bufferlist bl;
+ bl.clear();
+ using ceph::encode;
+ encode(rgw::sal::RGWRestoreType::Temporary, bl);
+ attrs[RGW_ATTR_RESTORE_TYPE] = std::move(bl);
+ ldpp_dout(dpp, 20) << "Temporary restore, object:" << dest_obj << dendl;
+ }
+ {
+ string sc = tier_ctx.storage_class;
+ bufferlist bl;
+ bl.append(sc.c_str(), sc.size());
+ attrs[RGW_ATTR_CLOUDTIER_STORAGE_CLASS] = std::move(bl);
+ ldpp_dout(dpp, 20) << "Setting RGW_ATTR_CLOUDTIER_STORAGE_CLASS: " << tier_ctx.storage_class << dendl;
+ }
+ //set same old mtime as that of transition time
+ set_mtime = mtime;
+
+ // set tier-config only for temp restored objects, as
+ // permanent copies will be treated as regular objects
+ {
+ t.append("cloud-s3");
+ encode(tier_config, t_tier);
+ attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t;
+ attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier;
+ }
+
+ } else { // permanent restore
+ {
+ bufferlist bl;
+ bl.clear();
+ using ceph::encode;
+ encode(rgw::sal::RGWRestoreType::Permanent, bl);
+ attrs[RGW_ATTR_RESTORE_TYPE] = std::move(bl);
+ ldpp_dout(dpp, 20) << "Permanent restore, object:" << dest_obj << dendl;
+ }
+ //set mtime to now()
+ set_mtime = real_clock::now();
+ }
+
+ {
+ string sc = dest_placement.get_storage_class(); //"STANDARD";
+ bufferlist bl;
+ bl.append(sc.c_str(), sc.size());
+ attrs[RGW_ATTR_STORAGE_CLASS] = std::move(bl);
+ }
+
+ // XXX: handle COMPLETE_RETRY like in fetch_remote_obj
+ bool canceled = false;
+ rgw_zone_set zone_set{};
+ ret = processor.complete(accounted_size, etag, &mtime, set_mtime,
+ attrs, rgw::cksum::no_cksum, delete_at, nullptr, nullptr, nullptr,
+ (rgw_zone_set *)&zone_set, &canceled, rctx, log_op ? rgw::sal::FLAG_LOG_OP : 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // XXX: handle olh_epoch for versioned objects like in fetch_remote_obj
+ return ret;
+}
+
int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
{
constexpr uint NUM_ENTRIES = 1000u;
@@ -5876,7 +6093,8 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi
}
auto& ioctx = ref.ioctx;
- r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
+ version_t epoch = 0;
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y, 0, nullptr, &epoch);
/* raced with another operation, object state is indeterminate */
const bool need_invalidate = (r == -ECANCELED);
@@ -5888,7 +6106,7 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi
tombstone_entry entry{*state};
obj_tombstone_cache->add(obj, entry);
}
- r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs, y, log_op);
+ r = index_op.complete_del(dpp, poolid, epoch, state->mtime, params.remove_objs, y, log_op);
int ret = target->complete_atomic_modification(dpp, y);
if (ret < 0) {
@@ -6609,7 +6827,8 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu
struct timespec mtime_ts = real_clock::to_timespec(mtime);
op.mtime2(&mtime_ts);
auto& ioctx = ref.ioctx;
- r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
+ version_t epoch = 0;
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y, 0, nullptr, &epoch);
if (state) {
if (r >= 0) {
ACLOwner owner;
@@ -6640,11 +6859,29 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu
iter != state->attrset.end()) {
storage_class = rgw_bl_str(iter->second);
}
- uint64_t epoch = ioctx.get_last_version();
int64_t poolid = ioctx.get_id();
+
+ // Retain Object category as CloudTiered while restore is in
+ // progress or failed
+ RGWObjCategory category = RGWObjCategory::Main;
+ auto r_iter = attrs.find(RGW_ATTR_RESTORE_STATUS);
+ if (r_iter != attrs.end()) {
+ rgw::sal::RGWRestoreStatus st = rgw::sal::RGWRestoreStatus::None;
+ auto iter = r_iter->second.cbegin();
+
+ try {
+ using ceph::decode;
+ decode(st, iter);
+
+ if (st != rgw::sal::RGWRestoreStatus::CloudRestored) {
+ category = RGWObjCategory::CloudTiered;
+ }
+ } catch (buffer::error& err) {
+ }
+ }
r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
mtime, etag, content_type, storage_class, owner,
- RGWObjCategory::Main, nullptr, y, nullptr, false, log_op);
+ category, nullptr, y, nullptr, false, log_op);
} else {
int ret = index_op.cancel(dpp, nullptr, y, log_op);
if (ret < 0) {
@@ -6809,6 +7046,13 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *
RGWBucketInfo& bucket_info = source->get_bucket_info();
if (params.part_num) {
+ map<string, bufferlist> src_attrset;
+ for (auto& iter : astate->attrset) {
+ if (boost::algorithm::starts_with(iter.first, RGW_ATTR_CRYPT_PREFIX)) {
+ ldpp_dout(dpp, 4) << "get src crypt attr: " << iter.first << dendl;
+ src_attrset[iter.first] = iter.second;
+ }
+ }
int parts_count = 0;
// use the manifest to redirect to the requested part number
r = get_part_obj_state(dpp, y, store, bucket_info, &source->get_ctx(),
@@ -6831,6 +7075,13 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *
} else {
params.parts_count = parts_count;
}
+
+ for (auto& iter : src_attrset) {
+ ldpp_dout(dpp, 4) << "copy crypt attr: " << iter.first << dendl;
+ if (astate->attrset.find(iter.first) == astate->attrset.end()) {
+ astate->attrset[iter.first] = std::move(iter.second);
+ }
+ }
}
state.obj = astate->obj;
@@ -8805,12 +9056,7 @@ int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
}
bufferlist outbl;
- r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y);
-
- if (epoch) {
- *epoch = ref.ioctx.get_last_version();
- }
-
+ r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y, 0, nullptr, epoch);
if (r < 0)
return r;
@@ -9662,6 +9908,12 @@ int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
num_entries << " total entries" << dendl;
auto& ioctx = index_pool;
+
+ // XXX: check_disk_state() relies on ioctx.get_last_version() but that
+ // returns 0 because CLSRGWIssueBucketList doesn't make any synchronous calls
+ rgw_bucket_entry_ver index_ver;
+ index_ver.pool = ioctx.get_id();
+
std::map<int, rgw_cls_list_ret> shard_list_results;
cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
maybe_warn_about_blocking(dpp); // TODO: use AioTrottle
@@ -9786,12 +10038,10 @@ int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
/* there are uncommitted ops. We need to check the current
* state, and if the tags are old we need to do clean-up as
* well. */
- librados::IoCtx sub_ctx;
- sub_ctx.dup(ioctx);
ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
" calling check_disk_state bucket=" << bucket_info.bucket <<
" entry=" << dirent.key << dendl_bitx;
- r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
+ r = check_disk_state(dpp, bucket_info, index_ver, dirent, dirent,
updates[tracker.oid_name], y);
if (r < 0 && r != -ENOENT) {
ldpp_dout(dpp, 0) << __func__ <<
@@ -10013,6 +10263,9 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
}
}
+ rgw_bucket_entry_ver index_ver;
+ index_ver.pool = ioctx.get_id();
+
uint32_t count = 0u;
std::map<std::string, bufferlist> updates;
rgw_obj_index_key last_added_entry;
@@ -10027,7 +10280,7 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
num_entries,
list_versions, &result);
- r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y, 0, nullptr, &index_ver.epoch);
if (r < 0) {
ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
@@ -10044,12 +10297,10 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
force_check) {
/* there are uncommitted ops. We need to check the current state,
* and if the tags are old we need to do cleanup as well. */
- librados::IoCtx sub_ctx;
- sub_ctx.dup(ioctx);
ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
": calling check_disk_state bucket=" << bucket_info.bucket <<
" entry=" << dirent.key << dendl_bitx;
- r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
+ r = check_disk_state(dpp, bucket_info, index_ver, dirent, dirent, updates[oid], y);
if (r < 0 && r != -ENOENT) {
ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
": error in check_disk_state, r=" << r << dendl;
@@ -10281,8 +10532,8 @@ int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
}
int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
- librados::IoCtx io_ctx,
RGWBucketInfo& bucket_info,
+ const rgw_bucket_entry_ver& index_ver,
rgw_bucket_dir_entry& list_state,
rgw_bucket_dir_entry& object,
bufferlist& suggested_updates,
@@ -10310,8 +10561,6 @@ int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
}
- io_ctx.locator_set_key(list_state.locator);
-
RGWObjState *astate = NULL;
RGWObjManifest *manifest = nullptr;
RGWObjectCtx octx(this->driver);
@@ -10332,8 +10581,7 @@ int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
}
// encode a suggested removal of that key
- list_state.ver.epoch = io_ctx.get_last_version();
- list_state.ver.pool = io_ctx.get_id();
+ list_state.ver = index_ver;
ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
return -ENOENT;
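The temporary-restore expiry set in restore_obj_from_cloud above is plain arithmetic: the expiration date is the restore time plus the requested number of days, where a positive rgw_restore_debug_interval substitutes for the length of a day to speed up testing. A sketch with hypothetical values:

    restore_time = 1_700_000_000          # restore timestamp, epoch seconds
    expiry_days = 3
    rgw_restore_debug_interval = 0        # when > 0, used as the per-day interval instead of 86400

    SECS_IN_A_DAY = 24 * 60 * 60
    day_len = rgw_restore_debug_interval if rgw_restore_debug_interval > 0 else SECS_IN_A_DAY
    expiration_date = restore_time + expiry_days * day_len
    # expiration_date == 1_700_259_200, i.e. three days after the restore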
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
index f95b6654a93..9a2ba0af0e2 100644
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -43,6 +43,7 @@
#include "rgw_tools.h"
struct D3nDataCache;
+struct RGWLCCloudTierCtx;
class RGWWatcher;
class ACLOwner;
@@ -1240,6 +1241,18 @@ public:
const DoutPrefixProvider *dpp,
optional_yield y,
bool log_op = true);
+int restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx,
+ RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_obj& dest_obj,
+ rgw_placement_rule& dest_placement,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider *dpp,
+ optional_yield y,
+ bool log_op = true);
int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
@@ -1642,8 +1655,8 @@ public:
* will encode that info as a suggested update.)
*/
int check_disk_state(const DoutPrefixProvider *dpp,
- librados::IoCtx io_ctx,
RGWBucketInfo& bucket_info,
+ const rgw_bucket_entry_ver& index_ver,
rgw_bucket_dir_entry& list_state,
rgw_bucket_dir_entry& object,
bufferlist& suggested_updates,
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
index 91b3cc02648..bb416b0c2c3 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.cc
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -55,6 +55,7 @@
#include "rgw_rest_ratelimit.h"
#include "rgw_rest_realm.h"
#include "rgw_rest_user.h"
+#include "rgw_lc_tier.h"
#include "services/svc_sys_obj.h"
#include "services/svc_mdlog.h"
#include "services/svc_cls.h"
@@ -2491,6 +2492,107 @@ int RadosObject::transition(Bucket* bucket,
mtime, olh_epoch, dpp, y, flags & FLAG_LOG_OP);
}
+int RadosObject::restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags)
+{
+ /* init */
+ rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+ string id = "cloudid";
+ string endpoint = rtier->get_rt().t.s3.endpoint;
+ RGWAccessKey key = rtier->get_rt().t.s3.key;
+ string region = rtier->get_rt().t.s3.region;
+ HostStyle host_style = rtier->get_rt().t.s3.host_style;
+ string bucket_name = rtier->get_rt().t.s3.target_path;
+ const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup();
+ int ret = 0;
+ string src_storage_class = o.meta.storage_class; // or take src_placement also as input
+
+ // fetch mtime of the object
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
+ read_op->params.lastmod = &mtime;
+
+ ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "Restoring object(" << o.key << "): read_op failed ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (bucket_name.empty()) {
+ bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() +
+ "-cloud-bucket";
+ boost::algorithm::to_lower(bucket_name);
+ }
+ /* Create RGW REST connection */
+ S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style);
+
+ // save source cloudtier storage class
+ RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(),
+ this, conn, bucket_name,
+ rtier->get_rt().t.s3.target_storage_class);
+ tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings;
+ tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size;
+ tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold;
+ tier_ctx.storage_class = tier->get_storage_class();
+
+ ldpp_dout(dpp, 20) << "Restoring object(" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl;
+
+ if (days && days == 0) {
+ ldpp_dout(dpp, 0) << "Days = 0 not valid; Not restoring object (" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl;
+ return 0;
+ }
+
+ // Note: for non-versioned objects, the below should already have been set by the callers:
+ // o.current should be false and this(obj)->instance should hold the version-id.
+
+ // set restore_status as RESTORE_ALREADY_IN_PROGRESS
+ ret = set_cloud_restore_status(dpp, y, RGWRestoreStatus::RestoreAlreadyInProgress);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << " Setting cloud restore status to RESTORE_ALREADY_IN_PROGRESS for the object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* Restore object from the cloud endpoint.
+ * All restore related status and attrs are set as part of object download to
+ * avoid any races */
+ ret = store->getRados()->restore_obj_from_cloud(tier_ctx, *rados_ctx,
+ bucket->get_info(), get_obj(), placement_rule,
+ tier_config,
+ mtime, olh_epoch, days, dpp, y, flags & FLAG_LOG_OP);
+
+ if (ret < 0) { //failed to restore
+ ldpp_dout(dpp, 0) << "Restoring object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << ret << dendl;
+ auto reset_ret = set_cloud_restore_status(dpp, y, RGWRestoreStatus::RestoreFailed);
+
+ rgw_placement_rule target_placement;
+ target_placement.inherit_from(tier_ctx.bucket_info.placement_rule);
+ target_placement.storage_class = tier->get_storage_class();
+
+ /* Reset HEAD object as CloudTiered */
+ reset_ret = write_cloud_tier(dpp, y, tier_ctx.o.versioned_epoch,
+ tier, tier_ctx.is_multipart_upload,
+ target_placement, tier_ctx.obj);
+
+ if (reset_ret < 0) {
+ ldpp_dout(dpp, 0) << " Reset to cloud_tier of object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << reset_ret << dendl;
+ }
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "Sucessfully restored object(" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl;
+
+ return ret;
+}
+
int RadosObject::transition_to_cloud(Bucket* bucket,
rgw::sal::PlacementTier* tier,
rgw_bucket_dir_entry& o,
@@ -2568,6 +2670,118 @@ int RadosObject::transition_to_cloud(Bucket* bucket,
return ret;
}
+int RadosObject::set_cloud_restore_status(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ rgw::sal::RGWRestoreStatus restore_status)
+{
+ int ret = 0;
+ set_atomic();
+
+ bufferlist bl;
+ using ceph::encode;
+ encode(restore_status, bl);
+
+ ret = modify_obj_attrs(RGW_ATTR_RESTORE_STATUS, bl, y, dpp);
+
+ return ret;
+}
+
+/*
+ * If the object is restored temporarily and is expired, delete the data and
+ * reset the HEAD object as cloud-transitioned.
+ */
+int RadosObject::handle_obj_expiry(const DoutPrefixProvider* dpp, optional_yield y) {
+ int ret = 0;
+ real_time read_mtime;
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
+ read_op->params.lastmod = &read_mtime;
+ ldpp_dout(dpp, 20) << "Entering handle_obj_expiry Obj:" << get_key() << dendl;
+
+ ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "handle_obj_expiry Obj:" << get_key() <<
+ ", read_op failed ret=" << ret << dendl;
+ return ret;
+ }
+
+ set_atomic();
+ map<string, bufferlist> attrs = get_attrs();
+ RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Write obj_op(&op_target);
+ Object* obj = (Object*)this;
+
+ bufferlist bl;
+ auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE);
+ if (attr_iter != attrs.end()) {
+ using ceph::decode;
+ rgw::sal::RGWRestoreType restore_type;
+ decode(restore_type, attr_iter->second);
+ if (restore_type == rgw::sal::RGWRestoreType::Temporary) {
+ ldpp_dout(dpp, 10) << "Expiring temporary restored Obj:" << get_key() << dendl;
+
+ attr_iter = attrs.find(RGW_ATTR_MANIFEST);
+ if (attr_iter != attrs.end()) {
+ RGWObjManifest m;
+ try {
+ using ceph::decode;
+ decode(m, attr_iter->second);
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.category = RGWObjCategory::CloudTiered;
+ obj_op.meta.delete_at = real_time();
+ bufferlist blo;
+ obj_op.meta.data = &blo;
+ obj_op.meta.if_match = NULL;
+ obj_op.meta.if_nomatch = NULL;
+ obj_op.meta.user_data = NULL;
+ obj_op.meta.zones_trace = NULL;
+ obj_op.meta.set_mtime = read_mtime;
+
+ RGWObjManifest *pmanifest;
+ pmanifest = &m;
+
+ Object* head_obj = (Object*)this;
+ RGWObjTier tier_config;
+ m.get_tier_config(&tier_config);
+
+ rgw_placement_rule target_placement(pmanifest->get_head_placement_rule(), tier_config.name);
+
+ pmanifest->set_head(target_placement, head_obj->get_obj(), 0);
+ pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket);
+ pmanifest->set_obj_size(0);
+ obj_op.meta.manifest = pmanifest;
+
+ // erase restore attrs
+ attrs.erase(RGW_ATTR_RESTORE_STATUS);
+ attrs.erase(RGW_ATTR_RESTORE_TYPE);
+ attrs.erase(RGW_ATTR_RESTORE_TIME);
+ attrs.erase(RGW_ATTR_RESTORE_EXPIRY_DATE);
+ attrs.erase(RGW_ATTR_CLOUDTIER_STORAGE_CLASS);
+
+ bufferlist bl;
+ bl.append(tier_config.name);
+ attrs[RGW_ATTR_STORAGE_CLASS] = bl;
+
+ const req_context rctx{dpp, y, nullptr};
+ return obj_op.write_meta(0, 0, attrs, rctx, head_obj->get_trace());
+ } catch (const buffer::end_of_buffer&) {
+ // ignore empty manifest; it's not cloud-tiered
+ } catch (const std::exception& e) {
+ }
+ }
+ return 0;
+ }
+ }
+ // object is not restored/temporary; go for regular deletion
+ // ensure object is not overwritten and is really expired
+ if (is_expired()) {
+ ldpp_dout(dpp, 10) << "Deleting expired obj:" << get_key() << dendl;
+
+ ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr);
+ }
+
+ return ret;
+}
int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
optional_yield y,
uint64_t olh_epoch,
@@ -2592,7 +2806,6 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
obj_op.meta.if_nomatch = NULL;
obj_op.meta.user_data = NULL;
obj_op.meta.zones_trace = NULL;
- obj_op.meta.delete_at = real_time();
obj_op.meta.olh_epoch = olh_epoch;
RGWObjManifest *pmanifest;
@@ -2621,6 +2834,13 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
attrs.erase(RGW_ATTR_ID_TAG);
attrs.erase(RGW_ATTR_TAIL_TAG);
+ // erase restore attrs
+ attrs.erase(RGW_ATTR_RESTORE_STATUS);
+ attrs.erase(RGW_ATTR_RESTORE_TYPE);
+ attrs.erase(RGW_ATTR_RESTORE_TIME);
+ attrs.erase(RGW_ATTR_RESTORE_EXPIRY_DATE);
+ attrs.erase(RGW_ATTR_CLOUDTIER_STORAGE_CLASS);
+
const req_context rctx{dpp, y, nullptr};
return obj_op.write_meta(0, 0, attrs, rctx, head_obj->get_trace());
}
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
index 0372c5882aa..be681c9f975 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.h
+++ b/src/rgw/driver/rados/rgw_sal_rados.h
@@ -626,6 +626,18 @@ class RadosObject : public StoreObject {
bool update_object,
const DoutPrefixProvider* dpp,
optional_yield y) override;
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) override;
virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
@@ -664,6 +676,10 @@ class RadosObject : public StoreObject {
bool is_multipart_upload,
rgw_placement_rule& target_placement,
Object* head_obj);
+ int handle_obj_expiry(const DoutPrefixProvider* dpp, optional_yield y);
+ int set_cloud_restore_status(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRestoreStatus restore_status);
RGWObjManifest* get_manifest() { return manifest; }
RGWObjectCtx& get_ctx() { return *rados_ctx; }
diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc
index 0af353b866f..f5cd193d815 100644
--- a/src/rgw/driver/rados/rgw_tools.cc
+++ b/src/rgw/driver/rados/rgw_tools.cc
@@ -198,36 +198,52 @@ int rgw_delete_system_obj(const DoutPrefixProvider *dpp,
int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
librados::ObjectReadOperation *op, bufferlist* pbl,
- optional_yield y, int flags, const jspan_context* trace_info)
+ optional_yield y, int flags, const jspan_context* trace_info,
+ version_t* pver)
{
// given a yield_context, call async_operate() to yield the coroutine instead
// of blocking
if (y) {
auto& yield = y.get_yield_context();
boost::system::error_code ec;
- auto bl = librados::async_operate(
+ auto [ver, bl] = librados::async_operate(
yield, ioctx, oid, op, flags, trace_info, yield[ec]);
if (pbl) {
*pbl = std::move(bl);
}
+ if (pver) {
+ *pver = ver;
+ }
return -ec.value();
}
maybe_warn_about_blocking(dpp);
- return ioctx.operate(oid, op, nullptr, flags);
+ int r = ioctx.operate(oid, op, nullptr, flags);
+ if (pver) {
+ *pver = ioctx.get_last_version();
+ }
+ return r;
}
int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
librados::ObjectWriteOperation *op, optional_yield y,
- int flags, const jspan_context* trace_info)
+ int flags, const jspan_context* trace_info, version_t* pver)
{
if (y) {
auto& yield = y.get_yield_context();
boost::system::error_code ec;
- librados::async_operate(yield, ioctx, oid, op, flags, trace_info, yield[ec]);
+ version_t ver = librados::async_operate(yield, ioctx, oid, op, flags,
+ trace_info, yield[ec]);
+ if (pver) {
+ *pver = ver;
+ }
return -ec.value();
}
maybe_warn_about_blocking(dpp);
- return ioctx.operate(oid, op, flags, trace_info);
+ int r = ioctx.operate(oid, op, flags, trace_info);
+ if (pver) {
+ *pver = ioctx.get_last_version();
+ }
+ return r;
}
int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
@@ -237,8 +253,8 @@ int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, cons
if (y) {
auto& yield = y.get_yield_context();
boost::system::error_code ec;
- auto reply = librados::async_notify(yield, ioctx, oid,
- bl, timeout_ms, yield[ec]);
+ auto [ver, reply] = librados::async_notify(yield, ioctx, oid,
+ bl, timeout_ms, yield[ec]);
if (pbl) {
*pbl = std::move(reply);
}
diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h
index 257e513a9f7..016da256263 100644
--- a/src/rgw/driver/rados/rgw_tools.h
+++ b/src/rgw/driver/rados/rgw_tools.h
@@ -93,10 +93,12 @@ void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, c
/// perform the rados operation, using the yield context when given
int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
librados::ObjectReadOperation *op, bufferlist* pbl,
- optional_yield y, int flags = 0, const jspan_context* trace_info = nullptr);
+ optional_yield y, int flags = 0, const jspan_context* trace_info = nullptr,
+ version_t* pver = nullptr);
int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
librados::ObjectWriteOperation *op, optional_yield y,
- int flags = 0, const jspan_context* trace_info = nullptr);
+ int flags = 0, const jspan_context* trace_info = nullptr,
+ version_t* pver = nullptr);
int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
optional_yield y);
diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc
index f9de570aa54..7d5fe3bcb21 100644
--- a/src/rgw/driver/rados/rgw_zone.cc
+++ b/src/rgw/driver/rados/rgw_zone.cc
@@ -1355,6 +1355,20 @@ int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config)
retain_head_object = false;
}
}
+ if (config.exists("allow_read_through")) {
+ string s = config["allow_read_through"];
+ if (s == "true") {
+ allow_read_through = true;
+ } else {
+ allow_read_through = false;
+ }
+ }
+ if (config.exists("read_through_restore_days")) {
+ r = conf_to_uint64(config, "read_through_restore_days", &read_through_restore_days);
+ if (r < 0) {
+ read_through_restore_days = DEFAULT_READ_THROUGH_RESTORE_DAYS;
+ }
+ }
if (tier_type == "cloud-s3") {
r = t.s3.update_params(config);
@@ -1368,6 +1382,12 @@ int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config)
if (config.exists("retain_head_object")) {
retain_head_object = false;
}
+ if (config.exists("allow_read_through")) {
+ allow_read_through = false;
+ }
+ if (config.exists("read_through_restore_days")) {
+ read_through_restore_days = DEFAULT_READ_THROUGH_RESTORE_DAYS;
+ }
if (tier_type == "cloud-s3") {
t.s3.clear_params(config);
diff --git a/src/rgw/rgw_aio.cc b/src/rgw/rgw_aio.cc
index 7fba58ad63f..d2e56c57298 100644
--- a/src/rgw/rgw_aio.cc
+++ b/src/rgw/rgw_aio.cc
@@ -76,12 +76,12 @@ struct Handler {
librados::IoCtx ctx;
AioResult& r;
// write callback
- void operator()(boost::system::error_code ec) const {
+ void operator()(boost::system::error_code ec, version_t) const {
r.result = -ec.value();
throttle->put(r);
}
// read callback
- void operator()(boost::system::error_code ec, bufferlist bl) const {
+ void operator()(boost::system::error_code ec, version_t, bufferlist bl) const {
r.result = -ec.value();
r.data = std::move(bl);
throttle->put(r);
diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc
index 290b9bb46b3..ac1ed8b75d6 100644
--- a/src/rgw/rgw_auth.cc
+++ b/src/rgw/rgw_auth.cc
@@ -505,12 +505,12 @@ rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strat
ldpp_dout(dpp, 5) << "Failed the auth strategy, reason="
<< result.get_reason() << dendl;
// Special handling for expired pre-signed URL
- if (result.get_reason() == ERR_PRESIGNED_URL_EXPIRED) {
+ if (result.get_reason() == -ERR_PRESIGNED_URL_EXPIRED) {
result = result_t::deny(-EPERM);
set_req_state_err(s, -EPERM, "The pre-signed URL has expired");
}
// Special handling for disabled presigned URL
- if (result.get_reason() == ERR_PRESIGNED_URL_DISABLED) {
+ if (result.get_reason() == -ERR_PRESIGNED_URL_DISABLED) {
result = result_t::deny(-EPERM);
set_req_state_err(s, -EPERM, "Presigned URLs are disabled by admin");
}
diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
index 412f4bf759a..4fe1e39d0a8 100644
--- a/src/rgw/rgw_auth_s3.cc
+++ b/src/rgw/rgw_auth_s3.cc
@@ -191,6 +191,7 @@ static inline void get_v2_qs_map(const req_info& info,
* compute a request's signature
*/
bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp,
+ RGWOpType op_type,
const req_info& info,
utime_t* const header_time,
std::string& dest,
@@ -253,7 +254,8 @@ bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp,
request_uri = info.effective_uri;
}
- rgw_create_s3_canonical_header(dpp, info.method, content_md5, content_type,
+ auto method = rgw::auth::s3::get_canonical_method(dpp, op_type, info);
+ rgw_create_s3_canonical_header(dpp, method.c_str(), content_md5, content_type,
date.c_str(), meta_map, qs_map,
request_uri.c_str(), sub_resources, dest);
return true;
@@ -704,35 +706,6 @@ std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op)
return canonical_qs;
}
-std::string get_v4_canonical_method(const req_state* s)
-{
- /* If this is a OPTIONS request we need to compute the v4 signature for the
- * intended HTTP method and not the OPTIONS request itself. */
- if (s->op_type == RGW_OP_OPTIONS_CORS) {
- const char *cors_method = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
-
- if (cors_method) {
- /* Validate request method passed in access-control-request-method is valid. */
- auto cors_flags = get_cors_method_flags(cors_method);
- if (!cors_flags) {
- ldpp_dout(s, 1) << "invalid access-control-request-method header = "
- << cors_method << dendl;
- throw -EINVAL;
- }
-
- ldpp_dout(s, 10) << "canonical req method = " << cors_method
- << ", due to access-control-request-method header" << dendl;
- return cors_method;
- } else {
- ldpp_dout(s, 1) << "invalid http options req missing "
- << "access-control-request-method header" << dendl;
- throw -EINVAL;
- }
- }
-
- return s->info.method;
-}
-
boost::optional<std::string>
get_v4_canonical_headers(const req_info& info,
const std::string_view& signedheaders,
@@ -1740,4 +1713,32 @@ AWSv4ComplSingle::create(const req_state* const s,
return std::make_shared<AWSv4ComplSingle>(s);
}
+std::string get_canonical_method(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info)
+{
+ /* If this is a OPTIONS request we need to compute the v4 signature for the
+ * intended HTTP method and not the OPTIONS request itself. */
+ if (op_type == RGW_OP_OPTIONS_CORS) {
+ const char *cors_method = info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
+
+ if (cors_method) {
+ /* Validate request method passed in access-control-request-method is valid. */
+ auto cors_flags = get_cors_method_flags(cors_method);
+ if (!cors_flags) {
+ ldpp_dout(dpp, 1) << "invalid access-control-request-method header = "
+ << cors_method << dendl;
+ throw -EINVAL;
+ }
+
+ ldpp_dout(dpp, 10) << "canonical req method = " << cors_method
+ << ", due to access-control-request-method header" << dendl;
+ return cors_method;
+ } else {
+ ldpp_dout(dpp, 1) << "invalid http options req missing "
+ << "access-control-request-method header" << dendl;
+ throw -EINVAL;
+ }
+ }
+
+ return info.method;
+}
} // namespace rgw::auth::s3
diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h
index e1fe5163f02..2f7fd2d7598 100644
--- a/src/rgw/rgw_auth_s3.h
+++ b/src/rgw/rgw_auth_s3.h
@@ -500,16 +500,17 @@ void rgw_create_s3_canonical_header(
const std::map<std::string, std::string>& sub_resources,
std::string& dest_str);
bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp,
+ RGWOpType op_type,
const req_info& info,
utime_t *header_time, /* out */
std::string& dest, /* out */
bool qsr);
static inline std::tuple<bool, std::string, utime_t>
-rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, const req_info& info, const bool qsr) {
+rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info, const bool qsr) {
std::string dest;
utime_t header_time;
- const bool ok = rgw_create_s3_canonical_header(dpp, info, &header_time, dest, qsr);
+ const bool ok = rgw_create_s3_canonical_header(dpp, op_type, info, &header_time, dest, qsr);
return std::make_tuple(ok, dest, header_time);
}
@@ -704,8 +705,6 @@ std::string get_v4_canonical_qs(const req_info& info, bool using_qs);
std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op);
-std::string get_v4_canonical_method(const req_state* s);
-
boost::optional<std::string>
get_v4_canonical_headers(const req_info& info,
const std::string_view& signedheaders,
@@ -745,6 +744,8 @@ extern AWSEngine::VersionAbstractor::server_signature_t
get_v2_signature(CephContext*,
const std::string& secret_key,
const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign);
+
+std::string get_canonical_method(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info);
} /* namespace s3 */
} /* namespace auth */
} /* namespace rgw */
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index b9e969a06fa..a8f6a1107a9 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -118,6 +118,12 @@ using ceph::crypto::MD5;
#define RGW_ATTR_SOURCE_ZONE RGW_ATTR_PREFIX "source_zone"
#define RGW_ATTR_TAGS RGW_ATTR_PREFIX RGW_AMZ_PREFIX "tagging"
+#define RGW_ATTR_CLOUDTIER_STORAGE_CLASS RGW_ATTR_PREFIX "cloudtier_storage_class"
+#define RGW_ATTR_RESTORE_STATUS RGW_ATTR_PREFIX "restore-status"
+#define RGW_ATTR_RESTORE_TYPE RGW_ATTR_PREFIX "restore-type"
+#define RGW_ATTR_RESTORE_TIME RGW_ATTR_PREFIX "restored-at"
+#define RGW_ATTR_RESTORE_EXPIRY_DATE RGW_ATTR_PREFIX "restore-expiry-date"
+
#define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key"
#define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2"
diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc
index 0c80ad1b765..78807888dfd 100644
--- a/src/rgw/rgw_lc.cc
+++ b/src/rgw/rgw_lc.cc
@@ -1991,6 +1991,12 @@ int RGWLC::process(LCWorker* worker,
}
}
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->process_expire_objects(this, null_yield);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "RGWLC::process_expire_objects: failed, "
+ << " worker ix: " << worker->ix << dendl;
+ }
+
return 0;
}
diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h
index b4c6ad4a86b..cc6a7e51a1d 100644
--- a/src/rgw/rgw_lc.h
+++ b/src/rgw/rgw_lc.h
@@ -469,7 +469,7 @@ struct transition_action
int days;
boost::optional<ceph::real_time> date;
std::string storage_class;
- transition_action() : days(0) {}
+ transition_action() : days(-1) {}
void dump(Formatter *f) const {
if (!date) {
f->dump_int("days", days);
diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h
index 7b8d12599f4..2973a753fff 100644
--- a/src/rgw/rgw_lua_background.h
+++ b/src/rgw/rgw_lua_background.h
@@ -153,9 +153,8 @@ private:
void run();
-protected:
std::string rgw_script;
- virtual int read_script();
+ int read_script();
public:
Background(rgw::sal::Driver* _driver,
@@ -173,7 +172,7 @@ public:
std::unique_lock cond_lock(table_mutex);
rgw_map[key] = value;
}
-
+
// update the manager after
void set_manager(rgw::sal::LuaManager* _lua_manager);
void pause() override;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index b54805bdc7d..67829e6320a 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -941,37 +941,131 @@ void handle_replication_status_header(
}
/*
- * GET on CloudTiered objects is processed only when sent from the sync client.
- * In all other cases, fail with `ERR_INVALID_OBJECT_STATE`.
+ * A GET on a CloudTiered object is either synced to other zones (sync client)
+ * or, in all other cases, served by fetching the object from the remote cloud endpoint.
*/
-int handle_cloudtier_obj(rgw::sal::Attrs& attrs, bool sync_cloudtiered) {
+int handle_cloudtier_obj(req_state* s, const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ rgw::sal::Attrs& attrs, bool sync_cloudtiered, std::optional<uint64_t> days,
+ bool restore_op, optional_yield y)
+{
int op_ret = 0;
+ ldpp_dout(dpp, 20) << "reached handle cloud tier " << dendl;
auto attr_iter = attrs.find(RGW_ATTR_MANIFEST);
- if (attr_iter != attrs.end()) {
- RGWObjManifest m;
- try {
- decode(m, attr_iter->second);
- if (m.get_tier_type() == "cloud-s3") {
- if (!sync_cloudtiered) {
- /* XXX: Instead send presigned redirect or read-through */
+ if (attr_iter == attrs.end()) {
+ if (restore_op) {
+ op_ret = -ERR_INVALID_OBJECT_STATE;
+ s->err.message = "only cloud tier object can be restored";
+ return op_ret;
+ } else { //ignore for read-through
+ return 0;
+ }
+ }
+ RGWObjManifest m;
+ try {
+ decode(m, attr_iter->second);
+ if (m.get_tier_type() != "cloud-s3") {
+ ldpp_dout(dpp, 20) << "not a cloud tier object " << s->object->get_key().name << dendl;
+ if (restore_op) {
+ op_ret = -ERR_INVALID_OBJECT_STATE;
+ s->err.message = "only cloud tier object can be restored";
+ return op_ret;
+ } else { //ignore for read-through
+ return 0;
+ }
+ }
+ RGWObjTier tier_config;
+ m.get_tier_config(&tier_config);
+ if (sync_cloudtiered) {
+ bufferlist t, t_tier;
+ t.append("cloud-s3");
+ attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t;
+ encode(tier_config, t_tier);
+ attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier;
+ return op_ret;
+ }
+ attr_iter = attrs.find(RGW_ATTR_RESTORE_STATUS);
+ rgw::sal::RGWRestoreStatus restore_status = rgw::sal::RGWRestoreStatus::None;
+ if (attr_iter != attrs.end()) {
+ bufferlist bl = attr_iter->second;
+ auto iter = bl.cbegin();
+ decode(restore_status, iter);
+ }
+ if (attr_iter == attrs.end() || restore_status == rgw::sal::RGWRestoreStatus::RestoreFailed) {
+ // first time restore or previous restore failed
+ rgw::sal::Bucket* pbucket = NULL;
+ pbucket = s->bucket.get();
+
+ std::unique_ptr<rgw::sal::PlacementTier> tier;
+ rgw_placement_rule target_placement;
+ target_placement.inherit_from(pbucket->get_placement_rule());
+ attr_iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (attr_iter != attrs.end()) {
+ target_placement.storage_class = attr_iter->second.to_str();
+ }
+ op_ret = driver->get_zone()->get_zonegroup().get_placement_tier(target_placement, &tier);
+ ldpp_dout(dpp, 20) << "get_placement_tier() ret = " << op_ret <<
+ ", storage class: " << target_placement.storage_class << dendl;
+ if (op_ret < 0) {
+ s->err.message = "failed to restore object";
+ return op_ret;
+ }
+ rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier.get());
+ tier_config.tier_placement = rtier->get_rt();
+ if (!restore_op) {
+ if (tier_config.tier_placement.allow_read_through) {
+ days = tier_config.tier_placement.read_through_restore_days;
+ } else { //read-through is not enabled
op_ret = -ERR_INVALID_OBJECT_STATE;
- } else { // fetch object for sync and set cloud_tier attrs
- bufferlist t, t_tier;
- RGWObjTier tier_config;
- m.get_tier_config(&tier_config);
-
- t.append("cloud-s3");
- attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t;
- encode(tier_config, t_tier);
- attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier;
+ s->err.message = "Read through is not enabled for this config";
+ return op_ret;
}
}
- } catch (const buffer::end_of_buffer&) {
- // ignore empty manifest; it's not cloud-tiered
- } catch (const std::exception& e) {
+ // fill in the entry. XXX: Maybe we can avoid it by passing only necessary params
+ rgw_bucket_dir_entry ent;
+ ent.key.name = s->object->get_key().name;
+ ent.meta.accounted_size = ent.meta.size = s->obj_size;
+ ent.meta.etag = "";
+ ceph::real_time mtime = s->object->get_mtime();
+ uint64_t epoch = 0;
+ op_ret = get_system_versioning_params(s, &epoch, NULL);
+ ldpp_dout(dpp, 20) << "get_system_versioning_params() ret = " << op_ret << dendl;
+ if (op_ret < 0) {
+ ldpp_dout(dpp, 20) << "failed to get versioning params, op_ret = " << op_ret << dendl;
+ s->err.message = "failed to restore object";
+ return op_ret;
+ }
+ op_ret = s->object->restore_obj_from_cloud(pbucket, tier.get(), target_placement, ent, s->cct, tier_config,
+ mtime, epoch, days, dpp, y, s->bucket->get_info().flags);
+ if (op_ret < 0) {
+ ldpp_dout(dpp, 0) << "object " << ent.key.name << " fetch failed, ret = " << op_ret << dendl;
+ s->err.message = "failed to restore object";
+ return op_ret;
+ }
+ ldpp_dout(dpp, 20) << "object " << ent.key.name << " fetch succeeded" << dendl;
+ /* Even if the restore has completed, this first read-through request still returns
+ * an error because the object is actually downloaded asynchronously.
+ */
+ if (!restore_op) { //read-through
+ op_ret = -ERR_REQUEST_TIMEOUT;
+ ldpp_dout(dpp, 5) << "restore is still in progress, please check restore status and retry" << dendl;
+ s->err.message = "restore is still in progress";
+ }
+ return op_ret;
+ } else if ((!restore_op) && (restore_status == rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress)) {
+ op_ret = -ERR_REQUEST_TIMEOUT;
+ ldpp_dout(dpp, 5) << "restore is still in progress, please check restore status and retry" << dendl;
+ s->err.message = "restore is still in progress";
+ } else { // CloudRestored: return success
+ return 0;
}
+ } catch (const buffer::end_of_buffer&) {
+ //empty manifest; it's not cloud-tiered
+ if (restore_op) {
+ op_ret = -ERR_INVALID_OBJECT_STATE;
+ s->err.message = "only cloud tier object can be restored";
+ }
+ } catch (const std::exception& e) {
}
-
return op_ret;
}
@@ -2366,15 +2460,12 @@ void RGWGetObj::execute(optional_yield y)
} catch (const buffer::error&) {}
}
-
if (get_type() == RGW_OP_GET_OBJ && get_data) {
- op_ret = handle_cloudtier_obj(attrs, sync_cloudtiered);
+ std::optional<uint64_t> days;
+ op_ret = handle_cloudtier_obj(s, this, driver, attrs, sync_cloudtiered, days, false, y);
if (op_ret < 0) {
ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object
- <<". Failing with " << op_ret << dendl;
- if (op_ret == -ERR_INVALID_OBJECT_STATE) {
- s->err.message = "This object was transitioned to cloud-s3";
- }
+ <<". Failing with " << op_ret << dendl;
goto done_err;
}
}
@@ -5155,6 +5246,73 @@ void RGWPutMetadataObject::execute(optional_yield y)
op_ret = s->object->set_obj_attrs(this, &attrs, &rmattrs, s->yield, rgw::sal::FLAG_LOG_OP);
}
+int RGWRestoreObj::init_processing(optional_yield y)
+{
+ int op_ret = get_params(y);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+
+ return RGWOp::init_processing(y);
+}
+
+int RGWRestoreObj::verify_permission(optional_yield y)
+{
+ if (!verify_bucket_permission(this, s, ARN(s->object->get_obj()),
+ rgw::IAM::s3RestoreObject)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWRestoreObj::pre_exec()
+{
+ rgw_bucket_object_pre_exec(s);
+}
+
+void RGWRestoreObj::execute(optional_yield y)
+{
+ if (!s->bucket_exists) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+
+ s->object->set_atomic();
+ int op_ret = s->object->get_obj_attrs(y, this);
+ if (op_ret < 0) {
+ ldpp_dout(this, 1) << "failed to fetch get_obj_attrs op ret = " << op_ret << dendl;
+ return;
+ }
+ rgw::sal::Attrs attrs = s->object->get_attrs();
+ auto attr_iter = attrs.find(RGW_ATTR_MANIFEST);
+ if (attr_iter != attrs.end()) {
+ RGWObjManifest m;
+ decode(m, attr_iter->second);
+ RGWObjTier tier_config;
+ m.get_tier_config(&tier_config);
+ if (m.get_tier_type() == "cloud-s3") {
+ ldpp_dout(this, 20) << "execute: expiry days = " << expiry_days << dendl;
+ op_ret = handle_cloudtier_obj(s, this, driver, attrs, false, expiry_days, true, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object
+ <<". Failing with " << op_ret << dendl;
+ if (op_ret == -ERR_INVALID_OBJECT_STATE) {
+ s->err.message = "This object was transitioned to cloud-s3";
+ }
+ }
+ } else {
+ ldpp_dout(this, 20) << "not a cloud tier object, returning error" << dendl;
+ op_ret = -ERR_INVALID_OBJECT_STATE;
+ }
+ } else {
+ ldpp_dout(this, 20) << " manifest not found" << dendl;
+ }
+ ldpp_dout(this, 20) << "completed restore" << dendl;
+
+ return;
+}
+
int RGWDeleteObj::handle_slo_manifest(bufferlist& bl, optional_yield y)
{
RGWSLOInfo slo_info;
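The reworked handle_cloudtier_obj() above now has two callers: RGWGetObj passes restore_op=false (read-through, honoured only when the placement tier sets allow_read_through) and the new RGWRestoreObj passes restore_op=true along with the optional expiry days from the request. A minimal calling sketch, assuming only the signature shown in this hunk; the values are illustrative:

    // read-through GET: days is left unset and the function falls back to the
    // tier's read_through_restore_days when allow_read_through is enabled
    std::optional<uint64_t> days;
    int r = handle_cloudtier_obj(s, dpp, driver, attrs,
                                 /*sync_cloudtiered=*/false, days,
                                 /*restore_op=*/false, y);

    // explicit S3 RestoreObject: a temporary restore for 7 days
    std::optional<uint64_t> expiry_days = 7;
    r = handle_cloudtier_obj(s, dpp, driver, attrs,
                             /*sync_cloudtiered=*/false, expiry_days,
                             /*restore_op=*/true, y);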
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 47a4c3da609..df05500a437 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -1461,6 +1461,24 @@ public:
virtual bool need_object_expiration() { return false; }
};
+class RGWRestoreObj : public RGWOp {
+protected:
+ std::optional<uint64_t> expiry_days;
+public:
+ RGWRestoreObj() {}
+
+ int init_processing(optional_yield y) override;
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+ virtual int get_params(optional_yield y) {return 0;}
+
+ void send_response() override = 0;
+ const char* name() const override { return "restore_obj"; }
+ RGWOpType get_type() override { return RGW_OP_RESTORE_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
class RGWDeleteObj : public RGWOp {
protected:
bool delete_marker;
diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h
index 12291d64cb3..f0c3b072e47 100644
--- a/src/rgw/rgw_op_type.h
+++ b/src/rgw/rgw_op_type.h
@@ -25,6 +25,7 @@ enum RGWOpType {
RGW_OP_PUT_METADATA_BUCKET,
RGW_OP_PUT_METADATA_OBJECT,
RGW_OP_SET_TEMPURL,
+ RGW_OP_RESTORE_OBJ,
RGW_OP_DELETE_OBJ,
RGW_OP_COPY_OBJ,
RGW_OP_GET_ACLS,
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index fae60c50f4d..3abba0124a6 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -318,6 +318,12 @@ public:
~RGWPutMetadataObject_ObjStore() override {}
};
+class RGWRestoreObj_ObjStore : public RGWRestoreObj {
+public:
+ RGWRestoreObj_ObjStore() {}
+ ~RGWRestoreObj_ObjStore() override {}
+};
+
class RGWDeleteObj_ObjStore : public RGWDeleteObj {
public:
RGWDeleteObj_ObjStore() {}
diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc
index 45b5e3076f4..c16064a61c2 100644
--- a/src/rgw/rgw_rest_client.cc
+++ b/src/rgw/rgw_rest_client.cc
@@ -209,7 +209,7 @@ static int sign_request_v2(const DoutPrefixProvider *dpp, const RGWAccessKey& ke
}
string canonical_header;
- if (!rgw_create_s3_canonical_header(dpp, info, NULL, canonical_header, false)) {
+ if (!rgw_create_s3_canonical_header(dpp, RGW_OP_UNKNOWN, info, NULL, canonical_header, false)) {
ldpp_dout(dpp, 0) << "failed to create canonical s3 header" << dendl;
return -EINVAL;
}
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 4a50baf1cb2..a245fca9945 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -519,6 +519,22 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs,
}
}
} /* checksum_mode */
+ auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE);
+ if (attr_iter != attrs.end()) {
+ rgw::sal::RGWRestoreType rt;
+ bufferlist bl = attr_iter->second;
+ auto iter = bl.cbegin();
+ decode(rt, iter);
+
+ if (rt == rgw::sal::RGWRestoreType::Temporary) {
+ // temporary restore; set storage-class to cloudtier storage class
+ auto c_iter = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS);
+
+ if (c_iter != attrs.end()) {
+ attrs[RGW_ATTR_STORAGE_CLASS] = c_iter->second;
+ }
+ }
+ }
for (struct response_attr_param *p = resp_attr_params; p->param; p++) {
bool exists;
@@ -3435,6 +3451,106 @@ int RGWPostObj_ObjStore_S3::get_encrypt_filter(
return res;
}
+struct RestoreObjectRequest {
+ std::optional<uint64_t> days;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Days", days, obj);
+ }
+
+ void dump_xml(Formatter *f) const {
+ encode_xml("Days", days, f);
+ }
+};
+
+int RGWRestoreObj_ObjStore_S3::get_params(optional_yield y)
+{
+ std::string expected_bucket_owner;
+
+ if (s->info.env->get("x-amz-expected-bucket-owner") != nullptr) {
+ expected_bucket_owner = s->info.env->get("x-amz-expected-bucket-owner");
+ }
+
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+
+ RGWXMLDecoder::XMLParser parser;
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, max_size, false);
+
+ if (r < 0) {
+ return r;
+ }
+
+ if(!parser.init()) {
+ return -EINVAL;
+ }
+
+ if (!parser.parse(data.c_str(), data.length(), 1)) {
+ return -ERR_MALFORMED_XML;
+ }
+
+ RestoreObjectRequest request;
+
+ try {
+ RGWXMLDecoder::decode_xml("RestoreRequest", request, &parser);
+ }
+ catch (RGWXMLDecoder::err &err) {
+ ldpp_dout(this, 5) << "Malformed restore request: " << err << dendl;
+ return -EINVAL;
+ }
+
+ if (request.days) {
+ expiry_days = request.days.value();
+ ldpp_dout(this, 10) << "expiry_days=" << expiry_days << dendl;
+ } else {
+ expiry_days=nullopt;
+ ldpp_dout(this, 10) << "expiry_days=" << expiry_days << dendl;
+ }
+
+ return 0;
+}
+
+void RGWRestoreObj_ObjStore_S3::send_response()
+{
+ if (op_ret < 0)
+ {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+ dump_start(s);
+ return;
+ }
+
+ rgw::sal::Attrs attrs = s->object->get_attrs();
+ auto attr_iter = attrs.find(RGW_ATTR_RESTORE_STATUS);
+ rgw::sal::RGWRestoreStatus restore_status;
+ if (attr_iter != attrs.end()) {
+ bufferlist bl = attr_iter->second;
+ auto iter = bl.cbegin();
+ decode(restore_status, iter);
+ }
+ ldpp_dout(this, 10) << "restore_status=" << restore_status << dendl;
+
+ if (attr_iter == attrs.end() || restore_status != rgw::sal::RGWRestoreStatus::None) {
+ s->err.http_ret = 202; //Accepted
+ dump_header(s, "x-amz-restore", rgw_bl_str(restore_status));
+ } else if (restore_status != rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress) {
+ s->err.http_ret = 409; // Conflict
+ dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status));
+ } else if (restore_status != rgw::sal::RGWRestoreStatus::CloudRestored) {
+ s->err.http_ret = 200; // OK
+ dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status));
+ } else {
+ s->err.http_ret = 202; // Accepted
+ dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status));
+ }
+
+ dump_errno(s);
+ end_header(s, this);
+ dump_start(s);
+}
+
int RGWDeleteObj_ObjStore_S3::get_params(optional_yield y)
{
const char *if_unmod = s->info.env->get("HTTP_X_AMZ_DELETE_IF_UNMODIFIED_SINCE");
@@ -4894,6 +5010,9 @@ RGWOp *RGWHandler_REST_Obj_S3::op_post()
if (s->info.args.exists("uploads"))
return new RGWInitMultipart_ObjStore_S3;
+ if (s->info.args.exists("restore"))
+ return new RGWRestoreObj_ObjStore_S3;
+
if (is_select_op())
return rgw::s3select::create_s3select_op();
@@ -5845,7 +5964,7 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s,
auto canonical_qs = rgw::auth::s3::get_v4_canonical_qs(s->info, using_qs);
/* Craft canonical method. */
- auto canonical_method = rgw::auth::s3::get_v4_canonical_method(s);
+ auto canonical_method = rgw::auth::s3::get_canonical_method(s, s->op_type, s->info);
/* Craft canonical request. */
auto canonical_req_hash = \
@@ -5945,6 +6064,7 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s,
case RGW_OP_PUT_BUCKET_TAGGING:
case RGW_OP_PUT_BUCKET_REPLICATION:
case RGW_OP_PUT_LC:
+ case RGW_OP_RESTORE_OBJ:
case RGW_OP_SET_REQUEST_PAYMENT:
case RGW_OP_PUBSUB_NOTIF_CREATE:
case RGW_OP_PUBSUB_NOTIF_DELETE:
@@ -6109,7 +6229,7 @@ AWSGeneralAbstractor::get_auth_data_v2(const req_state* const s) const
/* Let's canonize the HTTP headers that are covered by the AWS auth v2. */
std::string string_to_sign;
utime_t header_time;
- if (! rgw_create_s3_canonical_header(s, s->info, &header_time, string_to_sign,
+ if (! rgw_create_s3_canonical_header(s, s->op_type, s->info, &header_time, string_to_sign,
qsr)) {
ldpp_dout(s, 10) << "failed to create the canonized auth header\n"
<< rgw::crypt_sanitize::auth{s,string_to_sign} << dendl;
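With the handler registered above, a restore is requested by POSTing to the object with the restore query string; get_params() only looks for an optional Days element inside RestoreRequest. A sketch of a body that should exercise this path (the XML shown and the missing-Days behaviour are inferred from the parsing code above, not from the patch description):

    // illustrative body for: POST /<bucket>/<object>?restore
    const std::string restore_request_body =
        "<RestoreRequest>"
        "  <Days>3</Days>"
        "</RestoreRequest>";
    // omitting <Days> leaves expiry_days unset (nullopt), which is then passed
    // through unchanged to handle_cloudtier_obj()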
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index d86123a2525..63909f57036 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -327,6 +327,16 @@ public:
rgw::sal::DataProcessor *cb) override;
};
+class RGWRestoreObj_ObjStore_S3 : public RGWRestoreObj_ObjStore {
+
+public:
+ RGWRestoreObj_ObjStore_S3() {}
+ ~RGWRestoreObj_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore {
public:
RGWDeleteObj_ObjStore_S3() {}
diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h
index f89aa6f4e66..769d7435442 100644
--- a/src/rgw/rgw_sal.h
+++ b/src/rgw/rgw_sal.h
@@ -153,6 +153,21 @@ enum AttrsMod {
static constexpr uint32_t FLAG_LOG_OP = 0x0001;
static constexpr uint32_t FLAG_PREVENT_VERSIONING = 0x0002;
+enum RGWRestoreStatus : uint8_t {
+ None = 0,
+ RestoreAlreadyInProgress = 1,
+ CloudRestored = 2,
+ RestoreFailed = 3
+};
+
+
+enum class RGWRestoreType : uint8_t {
+ None = 0,
+ Temporary = 1,
+ Permanent = 2
+};
+
+
// a simple streaming data processing abstraction
/**
* @brief A simple streaming data processing abstraction
@@ -1199,6 +1214,18 @@ class Object {
bool update_object,
const DoutPrefixProvider* dpp,
optional_yield y) = 0;
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) = 0;
/** Check to see if two placement rules match */
virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) = 0;
/** Dump driver-specific object layout info in JSON */
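The restore state introduced by this series is tracked in object attrs (RGW_ATTR_RESTORE_STATUS, RGW_ATTR_RESTORE_TYPE and friends from rgw_common.h above) using the RGWRestoreStatus and RGWRestoreType enums defined here. A minimal decode sketch that follows the same pattern the op and REST code above already use:

    rgw::sal::RGWRestoreStatus status = rgw::sal::RGWRestoreStatus::None;
    if (auto it = attrs.find(RGW_ATTR_RESTORE_STATUS); it != attrs.end()) {
      auto p = it->second.cbegin();
      decode(status, p);   // plain enum encode/decode, as in rgw_op.cc
    }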
diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc
index 272862cb7e1..733bfa39ee2 100644
--- a/src/rgw/rgw_sal_filter.cc
+++ b/src/rgw/rgw_sal_filter.cc
@@ -1117,6 +1117,23 @@ int FilterObject::transition_to_cloud(Bucket* bucket,
o, cloud_targets, cct, update_object, dpp, y);
}
+int FilterObject::restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags)
+{
+ return next->restore_obj_from_cloud(nextBucket(bucket), nextPlacementTier(tier),
+ placement_rule, o, cct, tier_config, mtime, olh_epoch, days, dpp, y, flags);
+}
+
bool FilterObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
{
return next->placement_rules_match(r1, r2);
diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h
index b12ea53a9bb..17b102f7619 100644
--- a/src/rgw/rgw_sal_filter.h
+++ b/src/rgw/rgw_sal_filter.h
@@ -789,6 +789,18 @@ public:
bool update_object,
const DoutPrefixProvider* dpp,
optional_yield y) override;
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) override;
virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y,
Formatter* f) override;
diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h
index d9b2f80e1b6..47d031fbfc6 100644
--- a/src/rgw/rgw_sal_store.h
+++ b/src/rgw/rgw_sal_store.h
@@ -352,6 +352,20 @@ class StoreObject : public Object {
* work with lifecycle */
return -1;
}
+ virtual int restore_obj_from_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_placement_rule& placement_rule,
+ rgw_bucket_dir_entry& o,
+ CephContext* cct,
+ RGWObjTier& tier_config,
+ real_time& mtime,
+ uint64_t olh_epoch,
+ std::optional<uint64_t> days,
+ const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags) override {
+ return -1;
+ }
jspan_context& get_trace() override { return trace_ctx; }
void set_trace (jspan_context&& _trace_ctx) override { trace_ctx = std::move(_trace_ctx); }
diff --git a/src/rgw/rgw_zone.cc b/src/rgw/rgw_zone.cc
index 8d8b44cd961..1acaf9b3d4f 100644
--- a/src/rgw/rgw_zone.cc
+++ b/src/rgw/rgw_zone.cc
@@ -860,6 +860,8 @@ void RGWZoneGroupPlacementTier::decode_json(JSONObj *obj)
JSONDecoder::decode_json("tier_type", tier_type, obj);
JSONDecoder::decode_json("storage_class", storage_class, obj);
JSONDecoder::decode_json("retain_head_object", retain_head_object, obj);
+ JSONDecoder::decode_json("allow_read_through", allow_read_through, obj);
+ JSONDecoder::decode_json("read_through_restore_days", read_through_restore_days, obj);
if (tier_type == "cloud-s3") {
JSONDecoder::decode_json("s3", t.s3, obj);
@@ -897,6 +899,8 @@ void RGWZoneGroupPlacementTier::dump(Formatter *f) const
encode_json("tier_type", tier_type, f);
encode_json("storage_class", storage_class, f);
encode_json("retain_head_object", retain_head_object, f);
+ encode_json("allow_read_through", allow_read_through, f);
+ encode_json("read_through_restore_days", read_through_restore_days, f);
if (tier_type == "cloud-s3") {
encode_json("s3", t.s3, f);
diff --git a/src/rgw/rgw_zone_types.h b/src/rgw/rgw_zone_types.h
index 13fce000c41..d44761d7f5a 100644
--- a/src/rgw/rgw_zone_types.h
+++ b/src/rgw/rgw_zone_types.h
@@ -543,9 +543,13 @@ struct RGWZoneGroupPlacementTierS3 {
WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3)
struct RGWZoneGroupPlacementTier {
+#define DEFAULT_READ_THROUGH_RESTORE_DAYS 1
+
std::string tier_type;
std::string storage_class;
bool retain_head_object = false;
+ bool allow_read_through = false;
+ uint64_t read_through_restore_days = 1;
struct _tier {
RGWZoneGroupPlacementTierS3 s3;
@@ -555,10 +559,12 @@ struct RGWZoneGroupPlacementTier {
int clear_params(const JSONFormattable& config);
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
encode(tier_type, bl);
encode(storage_class, bl);
encode(retain_head_object, bl);
+ encode(allow_read_through, bl);
+ encode(read_through_restore_days, bl);
if (tier_type == "cloud-s3") {
encode(t.s3, bl);
}
@@ -566,10 +572,14 @@ struct RGWZoneGroupPlacementTier {
}
void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
decode(tier_type, bl);
decode(storage_class, bl);
decode(retain_head_object, bl);
+ if (struct_v >= 2) {
+ decode(allow_read_through, bl);
+ decode(read_through_restore_days, bl);
+ }
if (tier_type == "cloud-s3") {
decode(t.s3, bl);
}
diff --git a/src/rgw/services/svc_sys_obj_core.cc b/src/rgw/services/svc_sys_obj_core.cc
index 397709c5d99..cdbbf353832 100644
--- a/src/rgw/services/svc_sys_obj_core.cc
+++ b/src/rgw/services/svc_sys_obj_core.cc
@@ -169,21 +169,21 @@ int RGWSI_SysObj_Core::read(const DoutPrefixProvider *dpp,
}
}
- rgw_rados_ref rados_obj;
- int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ rgw_rados_ref ref;
+ int r = get_rados_obj(dpp, zone_svc, obj, &ref);
if (r < 0) {
ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
return r;
}
- r = rados_obj.operate(dpp, &op, nullptr, y);
+
+ version_t op_ver = 0;
+ r = rgw_rados_operate(dpp, ref.ioctx, obj.oid, &op, nullptr, y, 0, nullptr, &op_ver);
if (r < 0) {
ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl;
return r;
}
ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl;
- uint64_t op_ver = rados_obj.ioctx.get_last_version();
-
if (read_state.last_ver > 0 &&
read_state.last_ver != op_ver) {
ldpp_dout(dpp, 5) << "raced with an object write, abort" << dendl;
diff --git a/src/test/cli-integration/rbd/gwcli_create.t b/src/test/cli-integration/rbd/gwcli_create.t
index b464681fba0..44c75082c94 100644
--- a/src/test/cli-integration/rbd/gwcli_create.t
+++ b/src/test/cli-integration/rbd/gwcli_create.t
@@ -1,43 +1,50 @@
-Podman find iSCSI container
-===========================
- $ ISCSI_CONTAINER=$(sudo podman ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}')
+Cephadm prefers podman to docker
+================================
+ $ CENGINE=docker
+ > if command -v podman >/dev/null; then
+ > CENGINE=podman
+ > fi
+
+Find iSCSI container
+====================
+ $ ISCSI_CONTAINER=$(sudo $CENGINE ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}')
Dismiss the "could not load preferences file .gwcli/prefs.bin" warning
======================================================================
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1
Create a datapool/block0 disk
=============================
- $ sudo podman exec $ISCSI_CONTAINER gwcli disks/ create pool=datapool image=block0 size=300M wwn=36001405da17b74481464e9fa968746d3
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli disks/ create pool=datapool image=block0 size=300M wwn=36001405da17b74481464e9fa968746d3
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}'
300M, Disks: 1]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- datapool' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- datapool' | awk -F'[' '{print $2}'
datapool (300M)]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- block0' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- block0' | awk -F'[' '{print $2}'
datapool/block0 (Unknown, 300M)]
Create the target IQN
=====================
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/ create target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/ create target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}'
DiscoveryAuth: None, Targets: 1]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' | awk -F'[' '{print $2}'
Auth: None, Gateways: 0]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
Disks: 0]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
Up: 0/0, Portals: 0]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- host-groups' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- host-groups' | awk -F'[' '{print $2}'
Groups : 0]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
Auth: ACL_ENABLED, Hosts: 0]
Create the first gateway
========================
$ HOST=$(python3 -c "import socket; print(socket.getfqdn())")
> IP=`hostname -i | awk '{print $1}'`
- > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
+ > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
Up: 1/1, Portals: 1]
Create the second gateway
@@ -45,34 +52,34 @@ Create the second gateway
$ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $3}'`
> if [ "$IP" != `hostname -i | awk '{print $1}'` ]; then
> HOST=$(python3 -c "import socket; print(socket.getfqdn('$IP'))")
- > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
+ > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
> fi
$ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $4}'`
> if [ "$IP" != `hostname -i | awk '{print $1}'` ]; then
> HOST=$(python3 -c "import socket; print(socket.getfqdn('$IP'))")
- > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
+ > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST
> fi
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}'
Up: 2/2, Portals: 2]
Attach the disk
===============
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ add disk=datapool/block0
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ add disk=datapool/block0
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
Disks: 1]
Create a host
=============
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts create client_iqn=iqn.1994-05.com.redhat:client
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts create client_iqn=iqn.1994-05.com.redhat:client
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
Auth: ACL_ENABLED, Hosts: 1]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}'
Auth: None, Disks: 0(0.00Y)]
Map the LUN
===========
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts/iqn.1994-05.com.redhat:client disk disk=datapool/block0
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts/iqn.1994-05.com.redhat:client disk disk=datapool/block0
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
Auth: ACL_ENABLED, Hosts: 1]
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}'
Auth: None, Disks: 1(300M)]
diff --git a/src/test/cli-integration/rbd/gwcli_delete.t b/src/test/cli-integration/rbd/gwcli_delete.t
index e973d87a39f..64f75acdd56 100644
--- a/src/test/cli-integration/rbd/gwcli_delete.t
+++ b/src/test/cli-integration/rbd/gwcli_delete.t
@@ -1,31 +1,38 @@
-Podman find iSCSI container
-===========================
- $ ISCSI_CONTAINER=$(sudo podman ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}')
+Cephadm prefers podman to docker
+================================
+ $ CENGINE=docker
+ > if command -v podman >/dev/null; then
+ > CENGINE=podman
+ > fi
+
+Find iSCSI container
+====================
+ $ ISCSI_CONTAINER=$(sudo $CENGINE ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}')
Dismiss the "could not load preferences file .gwcli/prefs.bin" warning
======================================================================
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1
Delete the host
===============
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts delete client_iqn=iqn.1994-05.com.redhat:client
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts delete client_iqn=iqn.1994-05.com.redhat:client
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}'
Auth: ACL_ENABLED, Hosts: 0]
Delete the iscsi-targets disk
=============================
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ delete disk=datapool/block0
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ delete disk=datapool/block0
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}'
Disks: 0]
Delete the target IQN
=====================
- $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/ delete target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/ delete target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}'
DiscoveryAuth: None, Targets: 0]
Delete the disks
================
- $ sudo podman exec $ISCSI_CONTAINER gwcli disks/ delete image_id=datapool/block0
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}'
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli disks/ delete image_id=datapool/block0
+ $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}'
0.00Y, Disks: 0]
diff --git a/src/test/cli-integration/rbd/iscsi_client.t b/src/test/cli-integration/rbd/iscsi_client.t
index f636d540d89..9a659e49eca 100644
--- a/src/test/cli-integration/rbd/iscsi_client.t
+++ b/src/test/cli-integration/rbd/iscsi_client.t
@@ -1,7 +1,7 @@
Login to the target
===================
$ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $3}'`
- > sudo iscsiadm -m discovery -t st -p $IP -l 2&> /dev/null
+ $ sudo iscsiadm -m discovery -t st -p $IP -l >/dev/null 2>&1
$ sleep 10
$ sudo ls /dev/disk/by-path/ |grep 'iscsi-iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' |wc -l
2
diff --git a/src/test/client/nonblocking.cc b/src/test/client/nonblocking.cc
index d4aecb10ffc..93bcfabd3fc 100644
--- a/src/test/client/nonblocking.cc
+++ b/src/test/client/nonblocking.cc
@@ -111,6 +111,8 @@ TEST_F(TestClient, LlreadvLlwritev) {
writefinish.reset(new C_SaferCond("test-nonblocking-writefinish"));
readfinish.reset(new C_SaferCond("test-nonblocking-readfinish"));
ssize_t nwritten_a = iov_out_a[0].iov_len + iov_out_a[1].iov_len;
+ // reset bufferlist
+ bl.clear();
rc = client->ll_preadv_pwritev(fh, iov_out_a, 2, 100, true, writefinish.get(), nullptr);
ASSERT_EQ(0, rc);
@@ -130,6 +132,8 @@ TEST_F(TestClient, LlreadvLlwritev) {
writefinish.reset(new C_SaferCond("test-nonblocking-writefinish"));
readfinish.reset(new C_SaferCond("test-nonblocking-readfinish"));
ssize_t nwritten_b = iov_out_b[0].iov_len + iov_out_b[1].iov_len;
+ // reset bufferlist
+ bl.clear();
rc = client->ll_preadv_pwritev(fh, iov_out_b, 2, 1000, true, writefinish.get(), nullptr, true, false);
ASSERT_EQ(0, rc);
diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc
index 977dfe738a9..cee4b427770 100644
--- a/src/test/common/test_mutex_debug.cc
+++ b/src/test/common/test_mutex_debug.cc
@@ -1,5 +1,5 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 &smarttab
+// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
@@ -57,21 +57,13 @@ TEST(MutexDebug, Lock) {
test_lock<ceph::mutex_debug>();
}
-TEST(MutexDebug, NotRecursive) {
+TEST(MutexDebugDeathTest, NotRecursive) {
ceph::mutex_debug m("foo");
- auto ttl = &test_try_lock<mutex_debug>;
-
- ASSERT_NO_THROW(m.lock());
- ASSERT_TRUE(m.is_locked());
- ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get());
-
- ASSERT_THROW(m.lock(), std::system_error);
+ // avoid assert during test cleanup where the mutex is locked and cannot be
+ // pthread_mutex_destroy'd
+ std::unique_lock locker{m};
ASSERT_TRUE(m.is_locked());
- ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get());
-
- ASSERT_NO_THROW(m.unlock());
- ASSERT_FALSE(m.is_locked());
- ASSERT_TRUE(std::async(std::launch::async, ttl, &m).get());
+ ASSERT_DEATH(m.lock(), "FAILED ceph_assert(recursive || !is_locked_by_me())");
}
TEST(MutexRecursiveDebug, Lock) {
diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc
index 6648719c61c..df743327aaa 100644
--- a/src/test/crimson/test_backfill.cc
+++ b/src/test/crimson/test_backfill.cc
@@ -128,7 +128,8 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) override;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) override;
void enqueue_drop(
const pg_shard_t& target,
@@ -243,6 +244,10 @@ struct BackfillFixture::PeeringFacade
void update_complete_backfill_object_stats(const hobject_t &hoid,
const pg_stat_t &stats) override {
}
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) override {}
bool is_backfilling() const override {
return true;
}
@@ -270,6 +275,9 @@ BackfillFixture::BackfillFixture(
this->backfill_targets),
std::make_unique<PGFacade>(this->backfill_source))
{
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
backfill_state.process_event(crimson::osd::BackfillState::Triggered{}.intrusive_from_this());
}
@@ -303,7 +311,8 @@ void BackfillFixture::request_primary_scan(
void BackfillFixture::enqueue_push(
const hobject_t& obj,
- const eversion_t& v)
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &)
{
for (auto& [ _, bt ] : backfill_targets) {
bt.store.push(obj, v);
diff --git a/src/test/librados/asio.cc b/src/test/librados/asio.cc
index 9f8844eb7bb..01ebb957150 100644
--- a/src/test/librados/asio.cc
+++ b/src/test/librados/asio.cc
@@ -28,8 +28,6 @@
#define dout_subsys ceph_subsys_rados
#define dout_context g_ceph_context
-using namespace std;
-
// test fixture for global setup/teardown
class AsioRados : public ::testing::Test {
static constexpr auto poolname = "ceph_test_rados_api_asio";
@@ -73,6 +71,9 @@ librados::Rados AsioRados::rados;
librados::IoCtx AsioRados::io;
librados::IoCtx AsioRados::snapio;
+using boost::system::error_code;
+using read_result = std::tuple<version_t, bufferlist>;
+
void rethrow(std::exception_ptr eptr) {
if (eptr) std::rethrow_exception(eptr);
}
@@ -81,14 +82,17 @@ TEST_F(AsioRados, AsyncReadCallback)
{
boost::asio::io_context service;
- auto success_cb = [&] (boost::system::error_code ec, bufferlist bl) {
+ auto success_cb = [&] (error_code ec, version_t ver, bufferlist bl) {
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
EXPECT_EQ("hello", bl.to_str());
};
librados::async_read(service, io, "exist", 256, 0, success_cb);
- auto failure_cb = [&] (boost::system::error_code ec, bufferlist bl) {
+ auto failure_cb = [&] (error_code ec, version_t ver, bufferlist bl) {
EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec);
+ EXPECT_EQ(0, ver);
+ EXPECT_EQ(0, bl.length());
};
librados::async_read(service, io, "noexist", 256, 0, failure_cb);
@@ -99,17 +103,17 @@ TEST_F(AsioRados, AsyncReadFuture)
{
boost::asio::io_context service;
- std::future<bufferlist> f1 = librados::async_read(service, io, "exist", 256,
- 0, boost::asio::use_future);
- std::future<bufferlist> f2 = librados::async_read(service, io, "noexist", 256,
- 0, boost::asio::use_future);
+ auto f1 = librados::async_read(service, io, "exist", 256,
+ 0, boost::asio::use_future);
+ auto f2 = librados::async_read(service, io, "noexist", 256,
+ 0, boost::asio::use_future);
service.run();
- EXPECT_NO_THROW({
- auto bl = f1.get();
- EXPECT_EQ("hello", bl.to_str());
- });
+ auto [ver, bl] = f1.get();
+ EXPECT_LT(0, ver);
+ EXPECT_EQ("hello", bl.to_str());
+
EXPECT_THROW(f2.get(), boost::system::system_error);
}
@@ -118,17 +122,22 @@ TEST_F(AsioRados, AsyncReadYield)
boost::asio::io_context service;
auto success_cr = [&] (boost::asio::yield_context yield) {
- boost::system::error_code ec;
- auto bl = librados::async_read(service, io, "exist", 256, 0, yield[ec]);
+ error_code ec;
+ auto [ver, bl] = librados::async_read(service, io, "exist", 256,
+ 0, yield[ec]);
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
EXPECT_EQ("hello", bl.to_str());
};
boost::asio::spawn(service, success_cr, rethrow);
auto failure_cr = [&] (boost::asio::yield_context yield) {
- boost::system::error_code ec;
- auto bl = librados::async_read(service, io, "noexist", 256, 0, yield[ec]);
+ error_code ec;
+ auto [ver, bl] = librados::async_read(service, io, "noexist", 256,
+ 0, yield[ec]);
EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec);
+ EXPECT_EQ(0, ver);
+ EXPECT_EQ(0, bl.length());
};
boost::asio::spawn(service, failure_cr, rethrow);
@@ -142,14 +151,16 @@ TEST_F(AsioRados, AsyncWriteCallback)
bufferlist bl;
bl.append("hello");
- auto success_cb = [&] (boost::system::error_code ec) {
+ auto success_cb = [&] (error_code ec, version_t ver) {
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
};
librados::async_write(service, io, "exist", bl, bl.length(), 0,
success_cb);
- auto failure_cb = [&] (boost::system::error_code ec) {
+ auto failure_cb = [&] (error_code ec, version_t ver) {
EXPECT_EQ(boost::system::errc::read_only_file_system, ec);
+ EXPECT_EQ(0, ver);
};
librados::async_write(service, snapio, "exist", bl, bl.length(), 0,
failure_cb);
@@ -171,7 +182,7 @@ TEST_F(AsioRados, AsyncWriteFuture)
service.run();
- EXPECT_NO_THROW(f1.get());
+ EXPECT_LT(0, f1.get());
EXPECT_THROW(f2.get(), boost::system::system_error);
}
@@ -183,19 +194,21 @@ TEST_F(AsioRados, AsyncWriteYield)
bl.append("hello");
auto success_cr = [&] (boost::asio::yield_context yield) {
- boost::system::error_code ec;
- librados::async_write(service, io, "exist", bl, bl.length(), 0,
- yield[ec]);
+ error_code ec;
+ auto ver = librados::async_write(service, io, "exist", bl,
+ bl.length(), 0, yield[ec]);
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
EXPECT_EQ("hello", bl.to_str());
};
boost::asio::spawn(service, success_cr, rethrow);
auto failure_cr = [&] (boost::asio::yield_context yield) {
- boost::system::error_code ec;
- librados::async_write(service, snapio, "exist", bl, bl.length(), 0,
- yield[ec]);
+ error_code ec;
+ auto ver = librados::async_write(service, snapio, "exist", bl,
+ bl.length(), 0, yield[ec]);
EXPECT_EQ(boost::system::errc::read_only_file_system, ec);
+ EXPECT_EQ(0, ver);
};
boost::asio::spawn(service, failure_cr, rethrow);
@@ -208,8 +221,9 @@ TEST_F(AsioRados, AsyncReadOperationCallback)
{
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
- auto success_cb = [&] (boost::system::error_code ec, bufferlist bl) {
+ auto success_cb = [&] (error_code ec, version_t ver, bufferlist bl) {
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
EXPECT_EQ("hello", bl.to_str());
};
librados::async_operate(service, io, "exist", &op, 0, nullptr, success_cb);
@@ -217,8 +231,10 @@ TEST_F(AsioRados, AsyncReadOperationCallback)
{
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
- auto failure_cb = [&] (boost::system::error_code ec, bufferlist bl) {
+ auto failure_cb = [&] (error_code ec, version_t ver, bufferlist bl) {
EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec);
+ EXPECT_EQ(0, ver);
+ EXPECT_EQ(0, bl.length());
};
librados::async_operate(service, io, "noexist", &op, 0, nullptr, failure_cb);
}
@@ -228,14 +244,14 @@ TEST_F(AsioRados, AsyncReadOperationCallback)
TEST_F(AsioRados, AsyncReadOperationFuture)
{
boost::asio::io_context service;
- std::future<bufferlist> f1;
+ std::future<read_result> f1;
{
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
f1 = librados::async_operate(service, io, "exist", &op, 0, nullptr,
boost::asio::use_future);
}
- std::future<bufferlist> f2;
+ std::future<read_result> f2;
{
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
@@ -244,10 +260,10 @@ TEST_F(AsioRados, AsyncReadOperationFuture)
}
service.run();
- EXPECT_NO_THROW({
- auto bl = f1.get();
- EXPECT_EQ("hello", bl.to_str());
- });
+ auto [ver, bl] = f1.get();
+ EXPECT_LT(0, ver);
+ EXPECT_EQ("hello", bl.to_str());
+
EXPECT_THROW(f2.get(), boost::system::system_error);
}
@@ -258,10 +274,11 @@ TEST_F(AsioRados, AsyncReadOperationYield)
auto success_cr = [&] (boost::asio::yield_context yield) {
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
- boost::system::error_code ec;
- auto bl = librados::async_operate(service, io, "exist", &op, 0, nullptr,
- yield[ec]);
+ error_code ec;
+ auto [ver, bl] = librados::async_operate(service, io, "exist", &op,
+ 0, nullptr, yield[ec]);
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
EXPECT_EQ("hello", bl.to_str());
};
boost::asio::spawn(service, success_cr, rethrow);
@@ -269,10 +286,12 @@ TEST_F(AsioRados, AsyncReadOperationYield)
auto failure_cr = [&] (boost::asio::yield_context yield) {
librados::ObjectReadOperation op;
op.read(0, 0, nullptr, nullptr);
- boost::system::error_code ec;
- auto bl = librados::async_operate(service, io, "noexist", &op, 0, nullptr,
- yield[ec]);
+ error_code ec;
+ auto [ver, bl] = librados::async_operate(service, io, "noexist", &op,
+ 0, nullptr, yield[ec]);
EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec);
+ EXPECT_EQ(0, ver);
+ EXPECT_EQ(0, bl.length());
};
boost::asio::spawn(service, failure_cr, rethrow);
@@ -289,16 +308,18 @@ TEST_F(AsioRados, AsyncWriteOperationCallback)
{
librados::ObjectWriteOperation op;
op.write_full(bl);
- auto success_cb = [&] (boost::system::error_code ec) {
+ auto success_cb = [&] (error_code ec, version_t ver) {
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
};
librados::async_operate(service, io, "exist", &op, 0, nullptr, success_cb);
}
{
librados::ObjectWriteOperation op;
op.write_full(bl);
- auto failure_cb = [&] (boost::system::error_code ec) {
+ auto failure_cb = [&] (error_code ec, version_t ver) {
EXPECT_EQ(boost::system::errc::read_only_file_system, ec);
+ EXPECT_EQ(0, ver);
};
librados::async_operate(service, snapio, "exist", &op, 0, nullptr, failure_cb);
}
@@ -312,14 +333,14 @@ TEST_F(AsioRados, AsyncWriteOperationFuture)
bufferlist bl;
bl.append("hello");
- std::future<void> f1;
+ std::future<version_t> f1;
{
librados::ObjectWriteOperation op;
op.write_full(bl);
f1 = librados::async_operate(service, io, "exist", &op, 0, nullptr,
boost::asio::use_future);
}
- std::future<void> f2;
+ std::future<version_t> f2;
{
librados::ObjectWriteOperation op;
op.write_full(bl);
@@ -328,7 +349,7 @@ TEST_F(AsioRados, AsyncWriteOperationFuture)
}
service.run();
- EXPECT_NO_THROW(f1.get());
+ EXPECT_LT(0, f1.get());
EXPECT_THROW(f2.get(), boost::system::system_error);
}
@@ -342,18 +363,22 @@ TEST_F(AsioRados, AsyncWriteOperationYield)
auto success_cr = [&] (boost::asio::yield_context yield) {
librados::ObjectWriteOperation op;
op.write_full(bl);
- boost::system::error_code ec;
- librados::async_operate(service, io, "exist", &op, 0, nullptr, yield[ec]);
+ error_code ec;
+ auto ver = librados::async_operate(service, io, "exist", &op,
+ 0, nullptr, yield[ec]);
EXPECT_FALSE(ec);
+ EXPECT_LT(0, ver);
};
boost::asio::spawn(service, success_cr, rethrow);
auto failure_cr = [&] (boost::asio::yield_context yield) {
librados::ObjectWriteOperation op;
op.write_full(bl);
- boost::system::error_code ec;
- librados::async_operate(service, snapio, "exist", &op, 0, nullptr, yield[ec]);
+ error_code ec;
+ auto ver = librados::async_operate(service, snapio, "exist", &op,
+ 0, nullptr, yield[ec]);
EXPECT_EQ(boost::system::errc::read_only_file_system, ec);
+ EXPECT_EQ(0, ver);
};
boost::asio::spawn(service, failure_cr, rethrow);
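The asio test updates above reflect that the librados asio completions now also deliver the object version: read completions are invoked as (error_code, version_t, bufferlist) and write completions as (error_code, version_t). A minimal callback sketch matching the signatures these tests exercise:

    librados::async_read(service, io, "exist", 256, 0,
        [] (boost::system::error_code ec, version_t ver, bufferlist bl) {
          // ver is the object version observed by the read; the failure cases
          // above show it is reported as 0 when the operation fails
        });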
diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt
index 31e82944bf5..f2d1471e22e 100644
--- a/src/test/osd/CMakeLists.txt
+++ b/src/test/osd/CMakeLists.txt
@@ -19,6 +19,14 @@ install(TARGETS
ceph_test_rados
DESTINATION ${CMAKE_INSTALL_BINDIR})
+add_executable(ceph_test_rados_io_sequence
+ ${CMAKE_CURRENT_SOURCE_DIR}/ceph_test_rados_io_sequence.cc)
+target_link_libraries(ceph_test_rados_io_sequence
+ librados global object_io_exerciser)
+install(TARGETS
+ ceph_test_rados_io_sequence
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
+
# test_stale_read
add_executable(ceph_test_osd_stale_read
ceph_test_osd_stale_read.cc
diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc
index d28d428fc06..f93ed7ff67a 100644
--- a/src/test/osd/TestECBackend.cc
+++ b/src/test/osd/TestECBackend.cc
@@ -230,3 +230,28 @@ TEST(ECCommon, get_min_want_to_read_shards)
ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3}));
}
}
+
+TEST(ECCommon, get_min_want_to_read_shards_bug67087)
+{
+ const uint64_t swidth = 4096;
+ const uint64_t ssize = 4;
+
+ ECUtil::stripe_info_t s(ssize, swidth);
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), 1024);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+
+ std::set<int> want_to_read;
+
+ // multiple calls with the same want_to_read can happen during
+ // multi-region reads.
+ {
+ ECCommon::ReadPipeline::get_min_want_to_read_shards(
+ 512, 512, s, chunk_mapping, &want_to_read);
+ ASSERT_EQ(want_to_read, std::set<int>{0});
+ ECCommon::ReadPipeline::get_min_want_to_read_shards(
+ 512+16*1024, 512, s, chunk_mapping, &want_to_read);
+ ASSERT_EQ(want_to_read, std::set<int>{0});
+ }
+}
diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc
new file mode 100644
index 00000000000..dfc0304d00b
--- /dev/null
+++ b/src/test/osd/ceph_test_rados_io_sequence.cc
@@ -0,0 +1,695 @@
+#include "ceph_test_rados_io_sequence.h"
+
+#include <iostream>
+#include <vector>
+
+#include <boost/asio/io_context.hpp>
+
+#include "include/random.h"
+
+#include "librados/librados_asio.h"
+#include "common/ceph_argparse.h"
+#include "include/interval_set.h"
+#include "global/global_init.h"
+#include "global/global_context.h"
+#include "common/Thread.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/split.h"
+
+#include "common/io_exerciser/DataGenerator.h"
+#include "common/io_exerciser/Model.h"
+#include "common/io_exerciser/ObjectModel.h"
+#include "common/io_exerciser/RadosIo.h"
+#include "common/io_exerciser/IoOp.h"
+#include "common/io_exerciser/IoSequence.h"
+
+#define dout_subsys ceph_subsys_rados
+#define dout_context g_ceph_context
+
+template <typename T, int N, const std::array<T, N>& Ts>
+ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>
+ ::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm,
+ const std::string& option_name,
+ bool set_forced,
+ bool select_first)
+ : rng(rng),
+ // choices(choices),
+ option_name(option_name) {
+ if (set_forced && vm.count(option_name)) {
+ force_value = vm[option_name].as<T>();
+ }
+ if (select_first) {
+ ceph_assert(choices.size() > 0);
+ first_value = choices[0];
+ }
+}
+
+template <typename T, int N, const std::array<T, N>& Ts>
+bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced()
+{
+ return force_value.has_value();
+}
+
+template <typename T, int N, const std::array<T, N>& Ts>
+const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose()
+{
+ if (force_value.has_value()) {
+ return *force_value;
+ } else if (first_value.has_value()) {
+ return *std::exchange(first_value, std::nullopt);
+ } else {
+ return choices[rng(N-1)];
+ }
+}
+
+
+
+ceph::io_sequence::tester::SelectObjectSize::SelectObjectSize(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "objectsize", true, true)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "blocksize", true, true)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "threads", true, true)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "sequence", false, false)
+{
+ if (vm.count(option_name)) {
+ ceph::io_exerciser::Sequence s =
+ static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>());
+ if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN ||
+ s >= ceph::io_exerciser::Sequence::SEQUENCE_END) {
+ dout(0) << "Sequence argument out of range" << dendl;
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ ceph::io_exerciser::Sequence e = s;
+ force_value = std::make_optional<std::pair<ceph::io_exerciser::Sequence,
+ ceph::io_exerciser::Sequence>>(
+ std::make_pair(s, ++e));
+ }
+}
+
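+// Returns the sequence forced with --sequence as the half-open range
+// [s, s+1), or the full [SEQUENCE_BEGIN, SEQUENCE_END) range otherwise.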
+const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence>
+ ceph::io_sequence::tester::SelectSeqRange::choose() {
+ if (force_value.has_value())
+ {
+ return *force_value;
+ } else {
+ return std::make_pair(ceph::io_exerciser::Sequence::SEQUENCE_BEGIN,
+ ceph::io_exerciser::Sequence::SEQUENCE_END);
+ }
+}
+
+
+
+ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "km", true, true)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectErasurePlugin::SelectErasurePlugin(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "plugin", true, false)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(
+  ceph::util::random_number_generator<int>& rng,
+  po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "stripe_unit", true, false)
+{
+}
+
+
+
+ceph::io_sequence::tester::SelectECPool::SelectECPool(
+ ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm,
+ librados::Rados& rados,
+ bool dry_run)
+ : ProgramOptionSelector(rng, vm, "pool", false, false),
+ rados(rados),
+ dry_run(dry_run),
+ skm(SelectErasureKM(rng, vm)),
+ spl(SelectErasurePlugin(rng, vm)),
+ scs(SelectErasureChunkSize(rng, vm))
+{
+ if (!skm.isForced()) {
+ if (vm.count("pool")) {
+ force_value = vm["pool"].as<std::string>();
+ }
+ }
+}
+
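+// Returns the pool named with --pool when no k,m was forced; otherwise
+// picks k/m, plugin and chunk size, derives a pool name such as
+// "ec_jerasure_cs4096_k2_m2" and, unless this is a dry run, creates it.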
+const std::string ceph::io_sequence::tester::SelectECPool::choose()
+{
+ std::pair<int,int> value;
+ if (!skm.isForced() && force_value.has_value()) {
+ return *force_value;
+ } else {
+ value = skm.choose();
+ }
+ int k = value.first;
+ int m = value.second;
+
+ const std::string plugin = std::string(spl.choose());
+ const uint64_t chunk_size = scs.choose();
+
+ std::string pool_name = "ec_" + plugin +
+ "_cs" + std::to_string(chunk_size) +
+ "_k" + std::to_string(k) +
+ "_m" + std::to_string(m);
+ if (!dry_run)
+ {
+ create_pool(rados, pool_name, plugin, chunk_size, k, m);
+ }
+ return pool_name;
+}
+
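+// Issues two mon commands: create an erasure-code profile
+// ("testprofile-<pool_name>") and then the erasure pool itself, with
+// pg_num/pgp_num of 8 and crush-failure-domain=osd.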
+void ceph::io_sequence::tester::SelectECPool::create_pool(
+ librados::Rados& rados,
+ const std::string& pool_name,
+ const std::string& plugin,
+ uint64_t chunk_size,
+ int k, int m)
+{
+ int rc;
+ bufferlist inbl, outbl;
+ std::string profile_create =
+ "{\"prefix\": \"osd erasure-code-profile set\", \
+ \"name\": \"testprofile-" + pool_name + "\", \
+ \"profile\": [ \"plugin=" + plugin + "\", \
+ \"k=" + std::to_string(k) + "\", \
+ \"m=" + std::to_string(m) + "\", \
+ \"stripe_unit=" + std::to_string(chunk_size) + "\", \
+ \"crush-failure-domain=osd\"]}";
+ rc = rados.mon_command(profile_create, inbl, &outbl, nullptr);
+ ceph_assert(rc == 0);
+ std::string cmdstr =
+ "{\"prefix\": \"osd pool create\", \
+ \"pool\": \"" + pool_name + "\", \
+ \"pool_type\": \"erasure\", \
+ \"pg_num\": 8, \
+ \"pgp_num\": 8, \
+ \"erasure_code_profile\": \"testprofile-" + pool_name + "\"}";
+ rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr);
+ ceph_assert(rc == 0);
+}
+
+
+
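+// A TestObject drives I/O to one object: in dry-run mode the sequence is
+// applied to an in-memory ObjectModel, otherwise to a RadosIo exerciser
+// bound to the pool chosen by SelectECPool.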
+ceph::io_sequence::tester::TestObject::TestObject( const std::string oid,
+ librados::Rados& rados,
+ boost::asio::io_context& asio,
+ SelectBlockSize& sbs,
+ SelectECPool& spo,
+ SelectObjectSize& sos,
+ SelectNumThreads& snt,
+ SelectSeqRange & ssr,
+ ceph::util::random_number_generator<int>& rng,
+ ceph::mutex& lock,
+ ceph::condition_variable& cond,
+ bool dryrun,
+ bool verbose,
+ bool has_seqseed,
+ int seqseed) :
+ rng(rng), verbose(verbose), has_seqseed(has_seqseed), seqseed(seqseed)
+{
+ if (dryrun) {
+ verbose = true;
+ exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid,
+ sbs.choose(),
+ rng());
+ } else {
+ const std::string pool = spo.choose();
+ int threads = snt.choose();
+ exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados,
+ asio,
+ pool,
+ oid,
+ sbs.choose(),
+ rng(),
+ threads,
+ lock,
+ cond);
+ dout(0) << "= " << oid << " pool=" << pool
+ << " threads=" << threads
+ << " blocksize=" << exerciser_model->get_block_size()
+ << " =" << dendl;
+ }
+ obj_size_range = sos.choose();
+ seq_range = ssr.choose();
+ curseq = seq_range.first;
+ seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
+ obj_size_range,
+ has_seqseed ?
+ seqseed :
+ rng());
+ op = seq->next();
+ done = false;
+ dout(0) << "== " << exerciser_model->get_oid() << " "
+ << curseq << " "
+ << seq->get_name()
+ << " ==" <<dendl;
+}
+
+bool ceph::io_sequence::tester::TestObject::readyForIo()
+{
+ return exerciser_model->readyForIoOp(*op);
+}
+
+bool ceph::io_sequence::tester::TestObject::next()
+{
+ if (!done) {
+ if (verbose) {
+ dout(0) << exerciser_model->get_oid()
+ << " Step " << seq->get_step() << ": "
+ << op->to_string(exerciser_model->get_block_size()) << dendl;
+ } else {
+ dout(5) << exerciser_model->get_oid()
+ << " Step " << seq->get_step() << ": "
+ << op->to_string(exerciser_model->get_block_size()) << dendl;
+ }
+ exerciser_model->applyIoOp(*op);
+ if (op->done()) {
+ ++curseq;
+ if (curseq == seq_range.second) {
+ done = true;
+ dout(0) << exerciser_model->get_oid()
+ << " Number of IOs = " << exerciser_model->get_num_io()
+ << dendl;
+ } else {
+ seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
+ obj_size_range,
+ has_seqseed ?
+ seqseed :
+ rng());
+ dout(0) << "== " << exerciser_model->get_oid() << " "
+ << curseq << " " << seq->get_name()
+ << " ==" <<dendl;
+ op = seq->next();
+ }
+ } else {
+ op = seq->next();
+ }
+ }
+ return done;
+}
+
+bool ceph::io_sequence::tester::TestObject::finished()
+{
+ return done;
+}
+
+int ceph::io_sequence::tester::TestObject::get_num_io()
+{
+ return exerciser_model->get_num_io();
+}
+
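+// boost::program_options custom validators: Size parses IEC-suffixed sizes
+// (e.g. "4K"), Pair parses "k,m" integer pairs, and PluginString restricts
+// the value to pluginChoices.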
+struct Size {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Size *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(size);
+}
+
+struct Pair {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Pair *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ auto part = ceph::split(s).begin();
+ std::string parse_error;
+ int first = strict_iecstrtoll(*part++, &parse_error);
+ int second = strict_iecstrtoll(*part, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(std::pair<int,int>{first,second});
+}
+
+struct PluginString {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ PluginString *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ const std::string_view* pluginIt = std::find(
+ ceph::io_sequence::tester::pluginChoices.begin(),
+ ceph::io_sequence::tester::pluginChoices.end(),
+ s
+ );
+ if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt)
+ {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+
+ v = boost::any(*pluginIt);
+}
+
+int parse_io_seq_options(
+ po::variables_map& vm,
+ const po::options_description& desc,
+ int argc,
+ char** argv)
+{
+ std::vector<std::string> unrecognized_options;
+ try {
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ po::notify(vm);
+ unrecognized_options = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+
+ if (!unrecognized_options.empty())
+ {
+ std::stringstream ss;
+ ss << "Unrecognised command options supplied: ";
+ while (unrecognized_options.size() > 1)
+ {
+ ss << unrecognized_options.back().c_str() << ", ";
+ unrecognized_options.pop_back();
+ }
+ ss << unrecognized_options.back();
+ dout(0) << ss.str() << dendl;
+ return 1;
+ }
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
+
+void run_test(const std::vector<
+ std::shared_ptr<ceph::io_sequence::tester::TestObject>
+ >& test_objects,
+ ceph::mutex& lock)
+{
+  // Main loop of the test: while not all test objects have finished,
+  // check whether any can start a new I/O. If all test objects are
+  // waiting for I/O to complete, wait on a cond that is signalled
+  // each time an I/O completes.
+
+ bool started_io = true;
+ bool need_wait = true;
+ while (started_io || need_wait) {
+ started_io = false;
+ need_wait = false;
+ for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) {
+ std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj;
+ if (!to->finished()) {
+ lock.lock();
+ bool ready = to->readyForIo();
+ lock.unlock();
+ if (ready)
+ {
+ to->next();
+ started_io = true;
+ } else {
+ need_wait = true;
+ }
+ }
+ }
+ if (!started_io && need_wait) {
+ std::unique_lock l(lock);
+      // Recheck with the lock held in case anything has changed
+ for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) {
+ std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj;
+ if (!to->finished()) {
+ need_wait = !to->readyForIo();
+ if (!need_wait)
+ {
+ break;
+ }
+ }
+ }
+ need_wait = true;
+ }
+ }
+
+ int total_io = 0;
+ for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) {
+ std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj;
+ total_io += to->get_num_io();
+ ceph_assert(to->finished());
+ }
+ dout(0) << "Total number of IOs = " << total_io << dendl;
+}
+
+namespace {
+ constexpr std::string_view usage[] = {
+ "Basic usage:",
+ "",
+ "ceph_test_rados_io_sequence",
+ "\t Test I/O to a single object using default settings. Good for",
+ "\t testing boundary conditions",
+ "",
+ "ceph_test_rados_io_sequence --parallel <n>",
+ "\t Run parallel test to multiple objects. First object is tested with",
+ "\t default settings, other objects are tested with random settings",
+ "",
+ "Advanced usage:",
+ "",
+ "ceph_test_rados_io_sequence --blocksize <b> --km <k,m> --plugin <p>",
+ " --objectsize <min,max> --threads <t>",
+ "ceph_test_rados_io_sequence --blocksize <b> --pool <p> --object <oid>",
+ " --objectsize <min,max> --threads <t>",
+ "\tCustomize the test, if a pool is specified then it defines the",
+ "\tReplica/EC configuration",
+ "",
+ "ceph_test_rados_io_sequence --listsequence",
+ "\t Display list of supported I/O sequences",
+ "",
+ "ceph_test_rados_io_sequence --dryrun --sequence <n>",
+ "\t Show I/O that will be generated for a sequence, validate",
+ "\t seqeunce has correct I/O barriers to restrict concurrency",
+ "",
+ "ceph_test_rados_io_sequence --seed <seed>",
+ "\t Repeat a previous test with the same random numbers (seed is",
+ "\t displayed at start of test), if threads = 1 then this will produce",
+ "\t the exact same sequence of I/O, if threads > 1 then I/Os are issued",
+ "\t in parallel so ordering might be slightly different",
+ "",
+ "ceph_test_rados_io_sequence --sequence <n> --seqseed <n>",
+ "\t Repeat a sequence from a previous test with the same random",
+ "\t numbers (seqseed is displayed at start of sequence)",
+ "",
+ "ceph_test_rados_io_sequence --pool <p> --object <oid> --interactive",
+ "\t Execute sequence of I/O commands from stdin. Offset and length",
+ "\t are specified with unit of blocksize. Supported commands:",
+ "\t\t create <len>",
+ "\t\t remove",
+ "\t\t read|write <off> <len>",
+ "\t\t read2|write2 <off> <len> <off> <len>",
+ "\t\t read3|write3 <off> <len> <off> <len> <off> <len>",
+ "\t\t done"
+ };
+}
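+
+// Illustrative invocations (options defined in main() below):
+//   ceph_test_rados_io_sequence --listsequence
+//   ceph_test_rados_io_sequence --objects 4 --km 2,2 --plugin jerasure
+// The second example runs four objects in parallel, each against an
+// auto-created EC 2+2 jerasure pool.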
+
+int main(int argc, char **argv)
+{
+ auto args = argv_to_vec(argc, argv);
+ env_to_vec(args);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(cct.get());
+
+ librados::Rados rados;
+ boost::asio::io_context asio;
+ std::thread thread;
+ std::optional<boost::asio::executor_work_guard<
+ boost::asio::io_context::executor_type>> guard;
+ ceph::mutex lock = ceph::make_mutex("RadosIo::lock");
+ ceph::condition_variable cond;
+
+ po::options_description desc("ceph_test_rados_io options");
+
+ desc.add_options()
+ ("help,h",
+ "show help message")
+ ("listsequence,l",
+ "show list of sequences")
+ ("dryrun,d",
+ "test sequence, do not issue any I/O")
+ ("verbose",
+ "more verbose output during test")
+ ("sequence,s", po::value<int>(),
+ "test specified sequence")
+ ("seed", po::value<int>(),
+ "seed for whole test")
+ ("seqseed", po::value<int>(),
+ "seed for sequence")
+ ("blocksize,b", po::value<Size>(),
+ "block size (default 2048)")
+ ("chunksize,c", po::value<Size>(),
+ "chunk size (default 4096)")
+ ("pool,p", po::value<std::string>(),
+ "pool name")
+ ("km", po::value<Pair>(),
+ "k,m EC pool profile (default 2,2)")
+ ("plugin", po::value<PluginString>(),
+ "EC plugin (isa or jerasure)")
+ ("objectsize", po::value<Pair>(),
+ "min,max object size in blocks (default 1,32)")
+ ("threads,t", po::value<int>(),
+ "number of threads of I/O per object (default 1)")
+ ("objects,o", po::value<int>()->default_value(1),
+ "number of objects to exercise in parallel");
+
+ po::variables_map vm;
+ int rc = parse_io_seq_options(vm, desc, argc, argv);
+ if (rc != 0)
+ {
+ return rc;
+ }
+
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ for (auto line : usage) {
+ std::cout << line << std::endl;
+ }
+ return 0;
+ }
+
+ // Seed
+ int seed = time(nullptr);
+ if (vm.count("seed")) {
+ seed = vm["seed"].as<int>();
+ }
+ dout(0) << "Test using seed " << seed << dendl;
+ auto rng = ceph::util::random_number_generator<int>(seed);
+
+ bool verbose = vm.count("verbose");
+ bool dryrun = vm.count("dryrun");
+ bool has_seqseed = vm.count("seqseed");
+ int seqseed = 0;
+ if (has_seqseed) {
+ seqseed = vm["seqseed"].as<int>();
+ }
+ int num_objects = vm["objects"].as<int>();
+
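+  // A real run needs a cluster connection and a thread to run the
+  // boost::asio context used for librados async completions; --dryrun
+  // skips both.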
+ if (!dryrun) {
+ rc = rados.init_with_context(g_ceph_context);
+ ceph_assert(rc == 0);
+ rc = rados.connect();
+ ceph_assert(rc == 0);
+
+ guard.emplace(boost::asio::make_work_guard(asio));
+ thread = make_named_thread("io_thread",[&asio] { asio.run(); });
+ }
+
+ // Select block size
+ std::unique_ptr<ceph::io_sequence::tester::SelectBlockSize> sbs
+ = std::make_unique<ceph::io_sequence::tester::SelectBlockSize>(rng, vm);
+
+ // Select pool
+ std::unique_ptr<ceph::io_sequence::tester::SelectECPool> spo
+ = std::make_unique<ceph::io_sequence::tester::SelectECPool>(rng, vm,
+ rados,
+ dryrun);
+
+ // Select object size range
+ std::unique_ptr<ceph::io_sequence::tester::SelectObjectSize> sos
+ = std::make_unique<ceph::io_sequence::tester::SelectObjectSize>(rng,
+ vm);
+
+ // Select number of threads
+ std::unique_ptr<ceph::io_sequence::tester::SelectNumThreads> snt =
+ std::make_unique<ceph::io_sequence::tester::SelectNumThreads>(rng, vm);
+
+ // Select range of sequences
+ std::unique_ptr<ceph::io_sequence::tester::SelectSeqRange> ssr;
+ try {
+ ssr = std::make_unique<ceph::io_sequence::tester::SelectSeqRange>(rng, vm);
+ } catch(const po::error& e) {
+ return 1;
+ }
+
+  // List sequences
+ if (vm.count("listsequence")) {
+ std::pair<int,int> obj_size_range = sos->choose();
+ for (ceph::io_exerciser::Sequence s
+ = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN;
+ s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) {
+ std::unique_ptr<ceph::io_exerciser::IoSequence> seq =
+ ceph::io_exerciser::IoSequence::generate_sequence(s,
+ obj_size_range,
+ has_seqseed ?
+ seqseed :
+ rng());
+ dout(0) << s << " " << seq->get_name() << dendl;
+ }
+ return 0;
+ }
+
+ // Create a test for each object
+ std::vector<std::shared_ptr<
+ ceph::io_sequence::tester::TestObject>> test_objects;
+
+ for (int obj = 0; obj < num_objects; obj++) {
+ test_objects.push_back(
+ std::make_shared<ceph::io_sequence::tester::TestObject>(
+ "test" + std::to_string(obj),
+ rados, asio,
+ *sbs, *spo, *sos, *snt, *ssr,
+ rng, lock, cond,
+ dryrun, verbose,
+ has_seqseed, seqseed
+ )
+ );
+ }
+ if (!dryrun) {
+ rados.wait_for_latest_osdmap();
+ }
+
+ run_test(test_objects, lock);
+
+ if (!dryrun) {
+ guard = std::nullopt;
+ asio.stop();
+ thread.join();
+ rados.shutdown();
+ }
+ return 0;
+}
diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h
new file mode 100644
index 00000000000..3a84b7bc824
--- /dev/null
+++ b/src/test/osd/ceph_test_rados_io_sequence.h
@@ -0,0 +1,293 @@
+#include <utility>
+
+#include "global/global_init.h"
+#include "global/global_context.h"
+
+#include "common/io_exerciser/IoOp.h"
+#include "common/io_exerciser/IoSequence.h"
+#include "common/io_exerciser/Model.h"
+
+#include "librados/librados_asio.h"
+
+#include <boost/program_options.hpp>
+
+/* Overview
+ *
+ * class ProgramOptionSelector
+ * Base class for selector objects below with common code for
+ * selecting options
+ *
+ * class SelectObjectSize
+ * Selects min and max object sizes for a test
+ *
+ * class SelectErasureKM
+ * Selects an EC k and m value for a test
+ *
+ * class SelectErasurePlugin
+ * Selects a plugin for a test
+ *
+ * class SelectECPool
+ * Selects an EC pool (plugin, k and m) for a test and also
+ * creates the pool.
+ *
+ * class SelectBlockSize
+ * Selects a block size for a test
+ *
+ * class SelectNumThreads
+ * Selects number of threads for a test
+ *
+ * class SelectSeqRange
+ * Selects a sequence or range of sequences for a test
+ *
+ * class TestObject
+ * Runs a test against an object, generating IoSequences
+ * and applying them to an IoExerciser
+ *
+ * main
+ * Run sequences of I/O with data integrity checking to
+ * one or more objects in parallel. Without arguments
+ * runs a default configuration against one object.
+ * Command arguments can select alternative
+ * configurations. When running against multiple
+ * objects with --objects <n>, a random configuration
+ * is selected for all but the first object.
+ */
+
+namespace po = boost::program_options;
+
+namespace ceph
+{
+ namespace io_sequence::tester
+ {
+ // Choices for min and max object size
+ inline constexpr size_t objectSizeSize = 10;
+ inline constexpr std::array<std::pair<int,int>,objectSizeSize>
+ objectSizeChoices = {{
+ {1,32}, // Default - best for boundary checking
+ {12,14},
+ {28,30},
+ {36,38},
+ {42,44},
+ {52,54},
+ {66,68},
+ {72,74},
+ {83,83},
+ {97,97}
+ }};
+
+ // Choices for block size
+ inline constexpr int blockSizeSize = 5;
+ inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {{
+ 2048, // Default - test boundaries for EC 4K chunk size
+ 512,
+ 3767,
+ 4096,
+ 32768
+ }};
+
+ // Choices for number of threads
+ inline constexpr int threadArraySize = 4;
+ inline constexpr std::array<int, threadArraySize> threadCountChoices = {{
+ 1, // Default
+ 2,
+ 4,
+ 8
+ }};
+
+ // Choices for EC k+m profile
+ inline constexpr int kmSize = 6;
+ inline constexpr std::array<std::pair<int,int>, kmSize> kmChoices = {{
+ {2,2}, // Default - reasonable coverage
+ {2,1},
+ {2,3},
+ {3,2},
+ {4,2},
+ {5,1}
+ }};
+
+ // Choices for EC chunk size
+ inline constexpr int chunkSizeSize = 3;
+ inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {{
+ 4*1024,
+ 64*1024,
+ 256*1024
+ }};
+
+ // Choices for plugin
+ inline constexpr int pluginListSize = 2;
+ inline constexpr std::array<std::string_view,
+ pluginListSize> pluginChoices = {{
+ "jerasure",
+ "isa"
+ }};
+
+ inline constexpr std::array<std::pair<ceph::io_exerciser::Sequence,
+ ceph::io_exerciser::Sequence>,
+ 0> sequencePairs = {{}};
+
+ inline constexpr std::array<std::string, 0> poolChoices = {{}};
+
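+    // Base selector: each Select* class below binds one of the choice
+    // tables above to a command line option; choose() applies the shared
+    // forced/default/random selection rules.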
+ template <typename T, int N, const std::array<T, N>& Ts>
+ class ProgramOptionSelector
+ {
+ public:
+ ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm,
+ const std::string& option_name,
+ bool set_forced,
+ bool select_first
+ );
+ virtual ~ProgramOptionSelector() = default;
+ bool isForced();
+ virtual const T choose();
+
+ protected:
+ ceph::util::random_number_generator<int>& rng;
+ static constexpr std::array<T, N> choices = Ts;
+
+ std::optional<T> force_value;
+ std::optional<T> first_value;
+
+ std::string option_name;
+ };
+
+ class SelectObjectSize
+ : public ProgramOptionSelector<std::pair<int, int>,
+ io_sequence::tester::objectSizeSize,
+ io_sequence::tester::objectSizeChoices>
+ {
+ public:
+ SelectObjectSize(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+ };
+
+ class SelectBlockSize
+ : public ProgramOptionSelector<uint64_t,
+ io_sequence::tester::blockSizeSize,
+ io_sequence::tester::blockSizeChoices>
+ {
+ public:
+ SelectBlockSize(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+ };
+
+ class SelectNumThreads
+ : public ProgramOptionSelector<int,
+ io_sequence::tester::threadArraySize,
+ io_sequence::tester::threadCountChoices>
+ {
+ public:
+ SelectNumThreads(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+ };
+
+ class SelectSeqRange
+ : public ProgramOptionSelector<std::pair<ceph::io_exerciser::Sequence,
+ ceph::io_exerciser::Sequence>,
+ 0, io_sequence::tester::sequencePairs>
+ {
+ public:
+ SelectSeqRange(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+
+ const std::pair<ceph::io_exerciser::Sequence,
+ ceph::io_exerciser::Sequence> choose() override;
+ };
+
+ class SelectErasureKM
+ : public ProgramOptionSelector<std::pair<int,int>,
+ io_sequence::tester::kmSize,
+ io_sequence::tester::kmChoices>
+ {
+ public:
+ SelectErasureKM(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+ };
+
+ class SelectErasurePlugin
+ : public ProgramOptionSelector<std::string_view,
+ io_sequence::tester::pluginListSize,
+ io_sequence::tester::pluginChoices>
+ {
+ public:
+ SelectErasurePlugin(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+ };
+
+ class SelectErasureChunkSize
+ : public ProgramOptionSelector<uint64_t,
+ io_sequence::tester::chunkSizeSize,
+ io_sequence::tester::chunkSizeChoices>
+ {
+ public:
+      SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng,
+                             po::variables_map vm);
+ };
+
+ class SelectECPool
+ : public ProgramOptionSelector<std::string,
+ 0,
+ io_sequence::tester::poolChoices>
+ {
+ public:
+ SelectECPool(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm,
+ librados::Rados& rados,
+ bool dry_run);
+ const std::string choose() override;
+
+ private:
+ void create_pool(librados::Rados& rados,
+ const std::string& pool_name,
+ const std::string& plugin,
+ uint64_t chunk_size,
+ int k, int m);
+
+ protected:
+ librados::Rados& rados;
+ bool dry_run;
+
+ SelectErasureKM skm;
+ SelectErasurePlugin spl;
+ SelectErasureChunkSize scs;
+ };
+
+ class TestObject
+ {
+ public:
+ TestObject( const std::string oid,
+ librados::Rados& rados,
+ boost::asio::io_context& asio,
+ ceph::io_sequence::tester::SelectBlockSize& sbs,
+ ceph::io_sequence::tester::SelectECPool& spl,
+ ceph::io_sequence::tester::SelectObjectSize& sos,
+ ceph::io_sequence::tester::SelectNumThreads& snt,
+ ceph::io_sequence::tester::SelectSeqRange& ssr,
+ ceph::util::random_number_generator<int>& rng,
+ ceph::mutex& lock,
+ ceph::condition_variable& cond,
+ bool dryrun,
+ bool verbose,
+ bool has_seqseed,
+ int seqseed);
+
+ int get_num_io();
+ bool readyForIo();
+ bool next();
+ bool finished();
+
+ protected:
+ std::unique_ptr<ceph::io_exerciser::Model> exerciser_model;
+ std::pair<int,int> obj_size_range;
+ std::pair<ceph::io_exerciser::Sequence,
+ ceph::io_exerciser::Sequence> seq_range;
+ ceph::io_exerciser::Sequence curseq;
+ std::unique_ptr<ceph::io_exerciser::IoSequence> seq;
+ std::unique_ptr<ceph::io_exerciser::IoOp> op;
+ bool done;
+ ceph::util::random_number_generator<int>& rng;
+ bool verbose;
+ bool has_seqseed;
+ int seqseed;
+ };
+ }
+}
\ No newline at end of file
diff --git a/src/test/rgw/test_rgw_lua.cc b/src/test/rgw/test_rgw_lua.cc
index b2e11e442a2..ad923023a6d 100644
--- a/src/test/rgw/test_rgw_lua.cc
+++ b/src/test/rgw/test_rgw_lua.cc
@@ -9,6 +9,7 @@
#include "rgw_lua_background.h"
#include "rgw_lua_data_filter.h"
#include "rgw_sal_config.h"
+#include "rgw_perf_counters.h"
using namespace std;
using namespace rgw;
@@ -184,9 +185,51 @@ inline std::unique_ptr<sal::RadosStore> make_store() {
return std::make_unique<StoreBundle>(std::move(context_pool));
};
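+// In-memory LuaManager stub used by these tests: get_script() returns the
+// lua_script member (after an optional read_time delay, in seconds), and
+// all other operations are no-ops.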
+class TestLuaManager : public rgw::sal::StoreLuaManager {
+ public:
+ std::string lua_script;
+ unsigned read_time = 0;
+ TestLuaManager() {
+ rgw_perf_start(g_cct);
+ }
+ int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override {
+ std::this_thread::sleep_for(std::chrono::seconds(read_time));
+ script = lua_script;
+ return 0;
+ }
+ int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override {
+ return 0;
+ }
+ int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override {
+ return 0;
+ }
+ int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override {
+ return 0;
+ }
+ int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override {
+ return 0;
+ }
+ int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override {
+ return 0;
+ }
+ int reload_packages(const DoutPrefixProvider* dpp, optional_yield y) override {
+ return 0;
+ }
+ ~TestLuaManager() {
+ rgw_perf_stop(g_cct);
+ }
+};
+
+void set_script(rgw::sal::LuaManager* manager, const std::string& script) {
+ static_cast<TestLuaManager*>(manager)->lua_script = script;
+}
+void set_read_time(rgw::sal::LuaManager* manager, unsigned read_time) {
+ static_cast<TestLuaManager*>(manager)->read_time = read_time;
+}
+
#define DEFINE_REQ_STATE RGWProcessEnv pe; \
auto store = make_store(); \
- pe.lua.manager = store->get_lua_manager(""); \
+ pe.lua.manager = std::make_unique<TestLuaManager>(); \
RGWEnv e; \
req_state s(g_cct, pe, &e, 0);
@@ -850,24 +893,12 @@ TEST(TestRGWLua, OpsLog)
}
class TestBackground : public rgw::lua::Background {
- const unsigned read_time;
-
-protected:
- int read_script() override {
- // don't read the object from the store
- std::this_thread::sleep_for(std::chrono::seconds(read_time));
- return 0;
- }
-
public:
- TestBackground(sal::RadosStore* store, const std::string& script, rgw::sal::LuaManager* manager, unsigned read_time = 0) :
+ TestBackground(sal::RadosStore* store, rgw::sal::LuaManager* manager) :
rgw::lua::Background(store,
g_cct,
manager,
- 1 /* run every second */),
- read_time(read_time) {
- // the script is passed in the constructor
- rgw_script = script;
+ 1 /* run every second */) {
}
~TestBackground() override {
@@ -878,20 +909,19 @@ public:
TEST(TestRGWLuaBackground, Start)
{
auto store = make_store();
- auto manager = store->get_lua_manager("");
+ auto manager = std::make_unique<TestLuaManager>();
{
// ctr and dtor without running
- TestBackground lua_background(store.get(), "", manager.get());
+ TestBackground lua_background(store.get(), manager.get());
}
{
// ctr and dtor with running
- TestBackground lua_background(store.get(), "", manager.get());
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
}
}
-
-constexpr auto wait_time = std::chrono::seconds(3);
+constexpr auto wait_time = std::chrono::milliseconds(100);
template<typename T>
const T& get_table_value(const TestBackground& b, const std::string& index) {
@@ -903,6 +933,15 @@ const T& get_table_value(const TestBackground& b, const std::string& index) {
}
}
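+// Poll the Lua perf counters until the background script has completed at
+// least one run (success or failure), giving up after max_tries iterations.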
+#define WAIT_FOR_BACKGROUND \
+{ \
+ unsigned max_tries = 100; \
+ do { \
+ std::this_thread::sleep_for(wait_time); \
+ --max_tries; \
+ } while (perfcounter->get(l_rgw_lua_script_ok) + perfcounter->get(l_rgw_lua_script_fail) == 0 && max_tries > 0); \
+}
+
TEST(TestRGWLuaBackground, Script)
{
const std::string script = R"(
@@ -912,10 +951,11 @@ TEST(TestRGWLuaBackground, Script)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world");
}
@@ -928,9 +968,10 @@ TEST(TestRGWLuaBackground, RequestScript)
)";
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), background_script, pe.lua.manager.get());
+ set_script(pe.lua.manager.get(), background_script);
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const std::string request_script = R"(
local key = "hello"
@@ -947,8 +988,9 @@ TEST(TestRGWLuaBackground, RequestScript)
ASSERT_EQ(rc, 0);
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from request");
// now we resume and let the background set the value
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from background");
}
@@ -965,14 +1007,16 @@ TEST(TestRGWLuaBackground, Pause)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// no change in len
EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size());
}
@@ -991,15 +1035,17 @@ TEST(TestRGWLuaBackground, PauseWhileReading)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get(), 2);
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ set_read_time(manager.get(), 2);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- constexpr auto long_wait_time = std::chrono::seconds(6);
- std::this_thread::sleep_for(long_wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(long_wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// one execution might occur after pause
EXPECT_TRUE(value_len + 1 >= get_table_value<std::string>(lua_background, "hello").size());
}
@@ -1013,14 +1059,16 @@ TEST(TestRGWLuaBackground, ReadWhilePaused)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.pause();
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "");
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world");
}
@@ -1037,18 +1085,21 @@ TEST(TestRGWLuaBackground, PauseResume)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// no change in len
EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size());
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
// should be a change in len
EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len);
}
@@ -1066,18 +1117,19 @@ TEST(TestRGWLuaBackground, MultipleStarts)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.start();
lua_background.shutdown();
lua_background.shutdown();
- std::this_thread::sleep_for(wait_time);
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
// should be a change in len
EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len);
}
@@ -1085,7 +1137,7 @@ TEST(TestRGWLuaBackground, MultipleStarts)
TEST(TestRGWLuaBackground, TableValues)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1107,7 +1159,7 @@ TEST(TestRGWLuaBackground, TableValues)
TEST(TestRGWLuaBackground, TablePersist)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1137,7 +1189,7 @@ TEST(TestRGWLuaBackground, TablePersist)
TEST(TestRGWLuaBackground, TableValuesFromRequest)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
const std::string request_script = R"(
@@ -1165,7 +1217,7 @@ TEST(TestRGWLuaBackground, TableValuesFromRequest)
TEST(TestRGWLuaBackground, TableInvalidValue)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
const std::string request_script = R"(
@@ -1191,7 +1243,7 @@ TEST(TestRGWLuaBackground, TableInvalidValue)
TEST(TestRGWLuaBackground, TableErase)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
RGW["size"] = 0
@@ -1229,7 +1281,7 @@ TEST(TestRGWLuaBackground, TableErase)
TEST(TestRGWLuaBackground, TableIterate)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1256,7 +1308,7 @@ TEST(TestRGWLuaBackground, TableIterate)
TEST(TestRGWLuaBackground, TableIterateWrite)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["a"] = 1
@@ -1286,7 +1338,7 @@ TEST(TestRGWLuaBackground, TableIterateWrite)
TEST(TestRGWLuaBackground, TableIncrement)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1306,7 +1358,7 @@ TEST(TestRGWLuaBackground, TableIncrement)
TEST(TestRGWLuaBackground, TableIncrementBy)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1328,7 +1380,7 @@ TEST(TestRGWLuaBackground, TableIncrementBy)
TEST(TestRGWLuaBackground, TableDecrement)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1348,7 +1400,7 @@ TEST(TestRGWLuaBackground, TableDecrement)
TEST(TestRGWLuaBackground, TableDecrementBy)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1370,7 +1422,7 @@ TEST(TestRGWLuaBackground, TableDecrementBy)
TEST(TestRGWLuaBackground, TableIncrementValueError)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
-- cannot increment string values
@@ -1405,7 +1457,7 @@ TEST(TestRGWLuaBackground, TableIncrementValueError)
TEST(TestRGWLuaBackground, TableIncrementError)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
-- missing argument
@@ -1494,7 +1546,7 @@ TEST(TestRGWLua, Data)
)";
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
s.host_id = "foo";
pe.lua.background = &lua_background;
lua::RGWObjFilter filter(&s, script);
diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc
index b56ca9a2f1c..91117cf5f2b 100644
--- a/src/tools/cephfs_mirror/PeerReplayer.cc
+++ b/src/tools/cephfs_mirror/PeerReplayer.cc
@@ -1282,6 +1282,12 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu
break;
}
+ r = pre_sync_check_and_open_handles(dir_root, current, boost::none, &fh);
+ if (r < 0) {
+ dout(5) << ": cannot proceed with sync: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
dout(20) << ": " << sync_stack.size() << " entries in stack" << dendl;
std::string e_name;
auto &entry = sync_stack.top();
@@ -1687,7 +1693,7 @@ int PeerReplayer::do_sync_snaps(const std::string &dir_root) {
double duration = 0;
for (; it != local_snap_map.end(); ++it) {
if (m_perf_counters) {
- start = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now().time_since_epoch()).count();
+ start = std::chrono::duration_cast<std::chrono::seconds>(clock::now().time_since_epoch()).count();
utime_t t;
t.set_from_double(start);
m_perf_counters->tset(l_cephfs_mirror_peer_replayer_last_synced_start, t);
@@ -1706,7 +1712,7 @@ int PeerReplayer::do_sync_snaps(const std::string &dir_root) {
}
if (m_perf_counters) {
m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snaps_synced);
- end = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now().time_since_epoch()).count();
+ end = std::chrono::duration_cast<std::chrono::seconds>(clock::now().time_since_epoch()).count();
utime_t t;
t.set_from_double(end);
m_perf_counters->tset(l_cephfs_mirror_peer_replayer_last_synced_end, t);
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index da54d441e0c..9dfe9d36c0c 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -136,10 +136,11 @@ void usage(ostream& out)
" getomapval <obj-name> <key> [file] show the value for the specified key\n"
" in the object's object map\n"
" setomapval <obj-name> <key> <val | --input-file file>\n"
-" rmomapkey <obj-name> <key> Remove key from the object map of <obj-name>\n"
+" rmomapkey <obj-name> <key> remove key from the object map of <obj-name>\n"
" clearomap <obj-name> [obj-name2 obj-name3...] clear all the omap keys for the specified objects\n"
-" getomapheader <obj-name> [file] Dump the hexadecimal value of the object map header of <obj-name>\n"
-" setomapheader <obj-name> <val> Set the value of the object map header of <obj-name>\n"
+" getomapheader <obj-name> [file] dump the hexadecimal value of the object map header of <obj-name>\n"
+" setomapheader <obj-name> <val | --input-file file>\n"
+" set the value of the object map header of <obj-name>\n"
" watch <obj-name> add watcher on this object\n"
" notify <obj-name> <message> notify watcher of this object with message\n"
" listwatchers <obj-name> list the watchers of this object\n"
@@ -2844,17 +2845,33 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
ret = 0;
}
} else if (strcmp(nargs[0], "setomapheader") == 0) {
- if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
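+      // Three positional arguments are needed (command, object name, value);
+      // --oid and --input-file each supply one of them.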
+ uint32_t min_args = 3;
+ if (obj_name) {
+ min_args--;
+ }
+ if (!input_file.empty()) {
+ min_args--;
+ }
+
+ if (!pool_name || nargs.size() < min_args) {
usage(cerr);
return 1;
}
- bufferlist bl;
if (!obj_name) {
obj_name = nargs[1];
- bl.append(nargs[2]); // val
+ }
+
+ bufferlist bl;
+ if (!input_file.empty()) {
+ string err;
+ ret = bl.read_file(input_file.c_str(), &err);
+ if (ret < 0) {
+ cerr << "error reading file " << input_file.c_str() << ": " << err << std::endl;
+ return 1;
+ }
} else {
- bl.append(nargs[1]); // val
+ bl.append(nargs[min_args - 1]); // val
}
ret = io_ctx.omap_set_header(*obj_name, bl);
if (ret < 0) {
diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc
index 71da0bd274a..95c8725aa33 100644
--- a/src/tools/rbd/Utils.cc
+++ b/src/tools/rbd/Utils.cc
@@ -478,10 +478,11 @@ int validate_snapshot_name(at::ArgumentModifier mod,
int get_image_options(const boost::program_options::variables_map &vm,
bool get_format, librbd::ImageOptions *opts) {
uint64_t order = 0, stripe_unit = 0, stripe_count = 0, object_size = 0;
- uint64_t features = 0, features_clear = 0;
+ uint64_t features = 0, features_set = 0, features_clear = 0;
std::string data_pool;
bool order_specified = true;
bool features_specified = false;
+ bool features_set_specified = false;
bool features_clear_specified = false;
bool stripe_specified = false;
@@ -509,6 +510,13 @@ int get_image_options(const boost::program_options::variables_map &vm,
stripe_specified = true;
}
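+  // Journal-based mirroring requires exclusive-lock and journaling, so
+  // request them via the FEATURES_SET image option.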
+ if (vm.count(at::IMAGE_MIRROR_IMAGE_MODE) &&
+ vm[at::IMAGE_MIRROR_IMAGE_MODE].as<librbd::mirror_image_mode_t>() ==
+ RBD_MIRROR_IMAGE_MODE_JOURNAL) {
+ features_set |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING);
+ features_set_specified = true;
+ }
+
if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) {
if (features_specified) {
features &= ~RBD_FEATURES_SINGLE_CLIENT;
@@ -581,6 +589,8 @@ int get_image_options(const boost::program_options::variables_map &vm,
opts->set(RBD_IMAGE_OPTION_ORDER, order);
if (features_specified)
opts->set(RBD_IMAGE_OPTION_FEATURES, features);
+ if (features_set_specified)
+ opts->set(RBD_IMAGE_OPTION_FEATURES_SET, features_set);
if (features_clear_specified) {
opts->set(RBD_IMAGE_OPTION_FEATURES_CLEAR, features_clear);
}