-rw-r--r-- .github/CODEOWNERS 4
-rw-r--r-- .github/labeler.yml 16
-rw-r--r-- .github/workflows/check-license.yml 13
-rw-r--r-- .githubmap 7
-rw-r--r-- .mailmap 8
-rw-r--r-- .organizationmap 11
-rw-r--r-- .peoplemap 2
-rw-r--r-- CodingStyle 6
-rw-r--r-- PendingReleaseNotes 8
-rw-r--r-- SubmittingPatches-backports.rst 51
-rwxr-xr-x container/build.sh 6
-rw-r--r-- doc/cephadm/operations.rst 69
-rw-r--r-- doc/cephfs/administration.rst 11
-rw-r--r-- doc/cephfs/cephfs-journal-tool.rst 8
-rw-r--r-- doc/cephfs/troubleshooting.rst 5
-rw-r--r-- doc/dev/cephfs-mirroring.rst 6
-rw-r--r-- doc/dev/developer_guide/essentials.rst 13
-rw-r--r-- doc/dev/kclient.rst 478
-rw-r--r-- doc/dev/radosgw/bucket_index.rst 10
-rw-r--r-- doc/governance.rst 71
-rw-r--r-- doc/man/8/mount.ceph.rst 5
-rw-r--r-- doc/mgr/smb.rst 5
-rw-r--r-- doc/monitoring/index.rst 24
-rw-r--r-- doc/radosgw/index.rst 2
-rw-r--r-- doc/radosgw/multisite.rst 2
-rw-r--r-- doc/radosgw/uadk-accel.rst 132
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.libsonnet 102
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 226
-rw-r--r-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml 1190
-rw-r--r-- monitoring/ceph-mixin/tox.ini 1
-rw-r--r-- qa/README 5
-rw-r--r-- qa/cephfs/overrides/ignorelist_health.yaml 3
-rw-r--r-- qa/cephfs/overrides/pg_health.yaml 2
-rwxr-xr-x qa/standalone/scrub/osd-recovery-scrub.sh 140
-rwxr-xr-x qa/standalone/scrub/osd-scrub-repair.sh 249
-rw-r--r-- qa/suites/crimson-rados/perf/deploy/ceph.yaml 1
-rw-r--r-- qa/suites/fs/libcephfs/tasks/client.yaml 1
-rw-r--r-- qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml 1
-rw-r--r-- qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml 4
-rw-r--r-- qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml 4
-rw-r--r-- qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml 4
-rw-r--r-- qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml 4
-rw-r--r-- qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml 4
-rw-r--r-- qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml 77
-rw-r--r-- qa/suites/upgrade/reef-x/parallel/0-start.yaml 2
-rw-r--r-- qa/suites/upgrade/reef-x/parallel/1-tasks.yaml 1
-rw-r--r-- qa/tasks/cbt.py 131
-rw-r--r-- qa/tasks/cephfs/filesystem.py 7
-rw-r--r-- qa/tasks/cephfs/test_admin.py 57
-rw-r--r-- qa/tasks/cephfs/test_nfs.py 30
-rw-r--r-- qa/tasks/cephfs/test_volumes.py 75
-rw-r--r-- qa/tasks/mgr/dashboard/test_osd.py 5
-rwxr-xr-x qa/workunits/client/test_oc_disabled.sh 5
-rwxr-xr-x qa/workunits/nvmeof/setup_subsystem.sh 2
-rw-r--r-- src/ceph-volume/ceph_volume/__init__.py 1
-rw-r--r-- src/ceph-volume/ceph_volume/api/lvm.py 17
-rw-r--r-- src/ceph-volume/ceph_volume/devices/lvm/zap.py 176
-rw-r--r-- src/ceph-volume/ceph_volume/tests/conftest.py 2
-rw-r--r-- src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py 23
-rw-r--r-- src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py 1
-rw-r--r-- src/ceph-volume/ceph_volume/tests/test_inventory.py 1
-rw-r--r-- src/ceph-volume/ceph_volume/util/arg_validators.py 14
-rw-r--r-- src/ceph-volume/ceph_volume/util/device.py 26
-rw-r--r-- src/ceph-volume/ceph_volume/util/disk.py 23
-rwxr-xr-x src/cephadm/cephadm.py 14
-rw-r--r-- src/cephadm/cephadmlib/constants.py 8
-rw-r--r-- src/cephadm/cephadmlib/container_types.py 50
-rw-r--r-- src/cephadm/cephadmlib/daemons/smb.py 9
-rw-r--r-- src/cephadm/cephadmlib/data_utils.py 12
-rw-r--r-- src/cephadm/samples/custom_container.json 2
-rw-r--r-- src/cephadm/tests/build/test_cephadm_build.py 4
-rw-r--r-- src/cephadm/tests/test_cephadm.py 36
-rw-r--r-- src/cephadm/tests/test_custom_container.py 2
-rw-r--r-- src/cephadm/tox.ini 3
-rw-r--r-- src/client/Client.cc 52
-rw-r--r-- src/client/Client.h 15
-rw-r--r-- src/common/Finisher.cc 33
-rw-r--r-- src/common/Finisher.h 105
-rw-r--r-- src/common/HeartbeatMap.cc 4
-rw-r--r-- src/common/HeartbeatMap.h 6
-rw-r--r-- src/common/LRUSet.h 1
-rw-r--r-- src/common/SloppyCRCMap.cc 2
-rw-r--r-- src/common/Thread.cc 4
-rw-r--r-- src/common/Thread.h 8
-rw-r--r-- src/common/TrackedOp.h 1
-rw-r--r-- src/common/ceph_context.h 15
-rw-r--r-- src/common/ceph_mutex.h 5
-rw-r--r-- src/common/cohort_lru.h 6
-rw-r--r-- src/common/config.cc 19
-rw-r--r-- src/common/config_obs_mgr.h 2
-rw-r--r-- src/common/config_proxy.h 1
-rw-r--r-- src/common/intrusive_lru.h 1
-rw-r--r-- src/common/io_exerciser/RadosIo.cc 47
-rw-r--r-- src/common/map_cacher.hpp 45
-rw-r--r-- src/common/mutex_debug.h 22
-rw-r--r-- src/common/options.h 4
-rw-r--r-- src/common/scrub_types.cc 14
-rw-r--r-- src/common/scrub_types.h 2
-rw-r--r-- src/crimson/common/log.h 4
-rw-r--r-- src/crimson/os/futurized_store.h 8
-rw-r--r-- src/crimson/os/seastore/cache.cc 8
-rw-r--r-- src/crimson/os/seastore/cached_extent.cc 10
-rw-r--r-- src/crimson/os/seastore/cached_extent.h 2
-rw-r--r-- src/crimson/os/seastore/extent_placement_manager.cc 14
-rw-r--r-- src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h 8
-rw-r--r-- src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h 2
-rw-r--r-- src/crimson/os/seastore/seastore.cc 874
-rw-r--r-- src/crimson/os/seastore/seastore.h 108
-rw-r--r-- src/crimson/os/seastore/seastore_types.cc 22
-rw-r--r-- src/crimson/os/seastore/transaction.h 19
-rw-r--r-- src/crimson/os/seastore/transaction_manager.cc 116
-rw-r--r-- src/crimson/os/seastore/transaction_manager.h 234
-rw-r--r-- src/crimson/osd/backfill_facades.h 6
-rw-r--r-- src/crimson/osd/backfill_state.cc 106
-rw-r--r-- src/crimson/osd/backfill_state.h 9
-rw-r--r-- src/crimson/osd/ops_executer.cc 54
-rw-r--r-- src/crimson/osd/ops_executer.h 44
-rw-r--r-- src/crimson/osd/osd.cc 24
-rw-r--r-- src/crimson/osd/osd.h 2
-rw-r--r-- src/crimson/osd/osd_operation.h 31
-rw-r--r-- src/crimson/osd/osd_operation_external_tracking.h 11
-rw-r--r-- src/crimson/osd/osd_operations/client_request.cc 135
-rw-r--r-- src/crimson/osd/osd_operations/client_request.h 9
-rw-r--r-- src/crimson/osd/osd_operations/common/pg_pipeline.h 40
-rw-r--r-- src/crimson/osd/osd_operations/internal_client_request.cc 180
-rw-r--r-- src/crimson/osd/osd_operations/internal_client_request.h 8
-rw-r--r-- src/crimson/osd/osd_operations/peering_event.h 9
-rw-r--r-- src/crimson/osd/osd_operations/snaptrim_event.cc 2
-rw-r--r-- src/crimson/osd/osd_operations/snaptrim_event.h 3
-rw-r--r-- src/crimson/osd/pg.cc 358
-rw-r--r-- src/crimson/osd/pg.h 70
-rw-r--r-- src/crimson/osd/pg_backend.cc 2
-rw-r--r-- src/crimson/osd/pg_recovery.cc 8
-rw-r--r-- src/crimson/osd/pg_recovery.h 3
-rw-r--r-- src/crimson/osd/shard_services.cc 14
-rw-r--r-- src/exporter/DaemonMetricCollector.cc 9
-rw-r--r-- src/exporter/DaemonMetricCollector.h 2
-rw-r--r-- src/log/Entry.h 10
-rw-r--r-- src/log/Log.cc 11
-rw-r--r-- src/mds/Beacon.cc 2
-rw-r--r-- src/mds/CDir.cc 7
-rw-r--r-- src/mds/CInode.cc 7
-rw-r--r-- src/mds/MDCache.cc 249
-rw-r--r-- src/mds/MDCache.h 2
-rw-r--r-- src/mgr/ActivePyModule.h 12
-rw-r--r-- src/mgr/ActivePyModules.cc 4
-rw-r--r-- src/mon/ConfigMap.cc 2
-rw-r--r-- src/mon/FSCommands.cc 11
-rw-r--r-- src/mon/MDSMonitor.cc 7
-rw-r--r-- src/mon/MDSMonitor.h 1
-rw-r--r-- src/mon/MgrMonitor.cc 2
-rwxr-xr-x src/mon/NVMeofGwMap.cc 3
-rwxr-xr-x src/mon/NVMeofGwMap.h 2
-rw-r--r-- src/mon/NVMeofGwMon.cc 22
-rwxr-xr-x src/mrgw.sh 2
-rwxr-xr-x src/mrun 2
-rw-r--r-- src/msg/Message.h 11
-rw-r--r-- src/msg/async/AsyncConnection.cc 7
-rw-r--r-- src/msg/async/AsyncConnection.h 2
-rw-r--r-- src/msg/async/Event.h 6
-rw-r--r-- src/msg/async/ProtocolV1.cc 56
-rw-r--r-- src/msg/async/ProtocolV1.h 7
-rw-r--r-- src/msg/async/ProtocolV2.cc 20
-rw-r--r-- src/msg/async/ProtocolV2.h 7
-rw-r--r-- src/msg/async/frames_v2.cc 20
-rwxr-xr-x src/mstart.sh 28
-rwxr-xr-x src/mstop.sh 2
-rw-r--r-- src/mypy-constrains.txt 2
-rw-r--r-- src/os/bluestore/BlueFS.cc 65
-rw-r--r-- src/os/bluestore/BlueRocksEnv.cc 14
-rw-r--r-- src/os/bluestore/BlueStore.cc 17
-rw-r--r-- src/osd/PG.cc 42
-rw-r--r-- src/osd/osd_types_fmt.h 2
-rw-r--r-- src/osd/scrubber/ScrubStore.cc 472
-rw-r--r-- src/osd/scrubber/ScrubStore.h 154
-rw-r--r-- src/osd/scrubber/pg_scrubber.cc 43
-rw-r--r-- src/osd/scrubber/pg_scrubber.h 10
-rw-r--r-- src/osdc/Journaler.h 14
-rw-r--r-- src/osdc/Objecter.h 95
-rw-r--r-- src/pybind/mgr/cephadm/ceph_volume.py 430
-rw-r--r-- src/pybind/mgr/cephadm/module.py 78
-rw-r--r-- src/pybind/mgr/cephadm/serve.py 9
-rw-r--r-- src/pybind/mgr/cephadm/services/cephadmservice.py 9
-rw-r--r-- src/pybind/mgr/cephadm/services/nvmeof.py 5
-rw-r--r-- src/pybind/mgr/cephadm/services/osd.py 45
-rw-r--r-- src/pybind/mgr/cephadm/services/smb.py 20
-rw-r--r-- src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 3
-rw-r--r-- src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 2
-rw-r--r-- src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 3
-rw-r--r-- src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 1
-rw-r--r-- src/pybind/mgr/cephadm/tests/ceph_volume_data.py 1
-rw-r--r-- src/pybind/mgr/cephadm/tests/conftest.py 7
-rw-r--r-- src/pybind/mgr/cephadm/tests/fixtures.py 4
-rw-r--r-- src/pybind/mgr/cephadm/tests/test_ceph_volume.py 231
-rw-r--r-- src/pybind/mgr/cephadm/tests/test_cephadm.py 2
-rw-r--r-- src/pybind/mgr/cephadm/tests/test_replace_device.py 53
-rw-r--r-- src/pybind/mgr/cephadm/tests/test_services.py 498
-rw-r--r-- src/pybind/mgr/cephadm/tests/test_spec.py 20
-rw-r--r-- src/pybind/mgr/cephadm/upgrade.py 12
-rw-r--r-- src/pybind/mgr/dashboard/controllers/nvmeof.py 96
-rw-r--r-- src/pybind/mgr/dashboard/controllers/osd.py 28
-rwxr-xr-x src/pybind/mgr/dashboard/controllers/rgw.py 24
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts 11
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts 25
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts 7
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts 14
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts 7
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts 48
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html 8
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts 4
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts 2
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts 3
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts 2
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html 6
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts 59
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts 14
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts 12
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html 5
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts 2
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html 10
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html 6
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts 43
-rwxr-xr-x src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts 7
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts 16
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts 4
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts 4
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts 9
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts 42
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts 6
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts 11
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts 2
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts 15
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts 15
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts 1
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts 2
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts 49
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts 8
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts 20
-rw-r--r-- src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss 7
-rw-r--r-- src/pybind/mgr/dashboard/openapi.yaml 111
-rw-r--r-- src/pybind/mgr/dashboard/services/nvmeof_conf.py 14
-rwxr-xr-x src/pybind/mgr/dashboard/services/rgw_client.py 92
-rw-r--r-- src/pybind/mgr/dashboard/tests/test_osd.py 3
-rw-r--r-- src/pybind/mgr/dashboard/tools.py 2
-rw-r--r-- src/pybind/mgr/mgr_util.py 7
-rw-r--r-- src/pybind/mgr/orchestrator/_interface.py 22
-rw-r--r-- src/pybind/mgr/orchestrator/module.py 68
-rw-r--r-- src/pybind/mgr/orchestrator/tests/test_orchestrator.py 2
-rw-r--r-- src/pybind/mgr/smb/enums.py 6
-rw-r--r-- src/pybind/mgr/smb/module.py 55
-rw-r--r-- src/pybind/mgr/smb/tests/test_smb.py 89
-rw-r--r-- src/pybind/mgr/telemetry/tox.ini 1
-rw-r--r-- src/pybind/mgr/tox.ini 3
-rw-r--r-- src/pybind/mgr/volumes/fs/async_job.py 2
-rw-r--r-- src/pybind/mgr/volumes/fs/operations/pin_util.py 2
-rw-r--r-- src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py 2
-rw-r--r-- src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py 8
-rw-r--r-- src/pybind/mgr/volumes/fs/stats_util.py 13
-rw-r--r-- src/python-common/ceph/deployment/drive_selection/selector.py 4
-rw-r--r-- src/python-common/ceph/deployment/inventory.py 10
-rw-r--r-- src/python-common/ceph/utils.py 15
-rw-r--r-- src/rgw/rgw_admin.cc 50
-rw-r--r-- src/rgw/rgw_lua_background.h 5
-rw-r--r-- src/rgw/rgw_pubsub.cc 21
-rw-r--r-- src/rgw/rgw_pubsub.h 11
-rw-r--r-- src/rgw/rgw_rest_pubsub.cc 9
-rw-r--r-- src/test/client/nonblocking.cc 4
-rw-r--r-- src/test/common/test_mutex_debug.cc 20
-rw-r--r-- src/test/crimson/test_backfill.cc 13
-rw-r--r-- src/test/exporter/test_exporter.cc 110
-rw-r--r-- src/test/libcephfs/test.cc 19
-rw-r--r-- src/test/osd/ceph_test_rados_io_sequence.cc 719
-rw-r--r-- src/test/osd/ceph_test_rados_io_sequence.h 58
-rw-r--r-- src/test/rgw/bucket_notification/test_bn.py 11
-rw-r--r-- src/test/rgw/test_rgw_lua.cc 178
-rwxr-xr-x src/tools/cephfs/shell/cephfs-shell 16
276 files changed, 8059 insertions, 3969 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d8d18693efc..3e81444ea3d 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -164,6 +164,10 @@ README* @ceph/doc-writers
/src/cls/rgw_gc @ceph/rgw
/src/cls/user @ceph/rgw
/src/cls/version @ceph/rgw
+/src/mrgw.sh @ceph/rgw
+/src/mrun @ceph/rgw
+/src/mstart.sh @ceph/rgw
+/src/mstop.sh @ceph/rgw
/src/rgw @ceph/rgw
/src/s3select @ceph/rgw
/src/spawn @ceph/rgw
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 1b50ff7c5a3..cc32be38501 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -263,6 +263,19 @@ rbd:
- systemd/rbdmap.service.in
- udev/50-rbd.rules
+nvmeof:
+ - qa/suites/nvmeof/**
+ - qa/tasks/nvmeof.py
+ - qa/workunits/nvmeof/**
+ - src/ceph_nvmeof_monitor_client.cc
+ - src/cephadm/cephadmlib/daemons/nvmeof.py
+ - src/messages/MNVMeofGw*
+ - src/mon/NVMeofGw*
+ - src/nvmeof/**
+ - src/pybind/mgr/cephadm/services/nvmeof.py
+ - src/pybind/mgr/cephadm/templates/services/nvmeof/**
+ - src/tools/ceph-dencoder/nvmeof*
+
rgw:
- qa/suites/rgw/**
- qa/tasks/rgw*
@@ -275,6 +288,9 @@ rgw:
- src/cls/rgw_gc/**
- src/cls/timeindex/**
- src/mrgw.sh
+ - src/mrun
+ - src/mstart.sh
+ - src/mstop.sh
- src/rgw/**
- src/test/cls_rgw/**
- src/test/librgw_*
diff --git a/.github/workflows/check-license.yml b/.github/workflows/check-license.yml
new file mode 100644
index 00000000000..d201ed71354
--- /dev/null
+++ b/.github/workflows/check-license.yml
@@ -0,0 +1,13 @@
+---
+name: "Check Incomatible Licenses"
+on: [pull_request]
+
+jobs:
+ check_pr:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check PR
+ uses: JJ/github-pr-contains-action@526dfe784d8604ea1c39b6c26609074de95b1ffd # releases/v14.1
+ with:
+ github-token: ${{github.token}}
+ diffDoesNotContain: "GNU General Public License"
diff --git a/.githubmap b/.githubmap
index 724de9c002d..1b622f68cb0 100644
--- a/.githubmap
+++ b/.githubmap
@@ -12,6 +12,7 @@ aaSharma14 Aashish Sharma <aasharma@redhat.com>
aclamk Adam Kupczyk <akupczyk@redhat.com>
adamemerson Adam C. Emerson <aemerson@redhat.com>
adk3798 Adam King <adking@redhat.com>
+afreen23 Afreen Misbah <afreen@ibm.com>
ajarr Ramana Raja <rraja@redhat.com>
alfonsomthd Alfonso Martínez <almartin@redhat.com>
alfredodeza Alfredo Deza <adeza@redhat.com>
@@ -27,7 +28,7 @@ b-ranto Boris Ranto <branto@redhat.com>
badone Brad Hubbard <bhubbard@redhat.com>
baruza Barbora Ančincová <bara@redhat.com>
bassamtabbara Bassam Tabbara <bassam.tabbara@quantum.com>
-batrick Patrick Donnelly <pdonnell@redhat.com>
+batrick Patrick Donnelly <pdonnell@ibm.com>
bigjust Justin Caratzas <jcaratza@redhat.com>
bk201 Kiefer Chang <kiefer.chang@suse.com>
BlaineEXE Blaine Gardner <bgardner@suse.com>
@@ -47,6 +48,7 @@ Devp00l Stephan Müller <smueller@suse.com>
dillaman Jason Dillaman <dillaman@redhat.com>
djgalloway David Galloway <dgallowa@redhat.com>
dmick Dan Mick <dmick@redhat.com>
+dnyanee1997 Dnyaneshwari talwekar <dtalweka@redhat.com>
dragonylffly Li Wang <laurence.liwang@gmail.com>
dsavineau Dimitri Savineau <dsavinea@redhat.com>
dvanders Dan van der Ster <dan.vanderster@clyso.com>
@@ -96,6 +98,7 @@ mikechristie Mike Christie <mchristi@redhat.com>
mogeb Mohamad Gebai <mgebai@suse.com>
MrFreezeex Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@cern.ch>
myoungwon Myoungwon Oh <myoungwon.oh@samsung.com>
+nmunet Naman Munet <nmunet@redhat.com>
Naveenaidu Naveen Naidu <naveen.naidu@ibm.com>
neha-ojha Neha Ojha <nojha@redhat.com>
NitzanMordhai Nitzan Mordechai <nmordech@redhat.com>
@@ -109,6 +112,8 @@ p-se Patrick Seidensal <pseidensal@suse.com>
pcuzner Paul Cuzner <pcuzner@redhat.com>
Pegonzal Pedro Gonzalez Gomez <pegonzal@redhat.com>
pereman2 Pere Diaz Bou <pdiazbou@redhat.com>
+prgoel-code Prachi Goel <prgoel@redhat.com>
+pujaoshahu Puja Shahu <pshahu@redhat.com>
rchagam Anjaneya Chagam <anjaneya.chagam@intel.com>
renhwztetecs huanwen ren <ren.huanwen@zte.com.cn>
ricardoasmarques Ricardo Marques <rimarques@suse.com>
diff --git a/.mailmap b/.mailmap
index 1c4ac95340a..75a1610b244 100644
--- a/.mailmap
+++ b/.mailmap
@@ -24,6 +24,7 @@ Adam Kupczyk <akupczyk@redhat.com> <aclamk@gmail.com>
Adam Kupczyk <akupczyk@redhat.com> <akucpzyk@redhat.com>
Adam Twardowski <adam.twardowski@gmail.com>
Adir Lev <adirl@mellanox.com>
+Afreen Misbah <afreen@ibm.com>
Ahoussi Armand <ahoussi.say@telecom-bretagne.eu> <delco225>
Ailing Zhang <zhangal1992@gmail.com> <ailzhang@users.noreply.github.com>
Aishwarya Mathuria <amathuri@redhat.com> amathuria <NOT@FOUND>
@@ -168,6 +169,7 @@ Dhairya Parmar <dparmar@redhat.com> dparmar18 <dparmar@redhat.com>
Dingdang Zhang <boqian.zy@alibaba-inc.com>
Dmitry Smirnov <onlyjob@member.fsf.org> <onlyjob@debian.org>
Dmitry Yatsushkevich <dyatsushkevich@mirantis.com> <dmitry.yatsushkevich@gmail.com>
+Dnyaneshwari talwekar <dtalweka@redhat.com>
Dominik Hannen <cantares1+github@gmail.com> <dhxgit@users.noreply.github.com>
Dongdong Tao <dongodng.tao@canonical.com>
Dongdong Tao <tdd21151186@gmail.com>
@@ -508,6 +510,7 @@ Myoungwon Oh <omwmw@sk.com>
Myoungwon Oh <omwmw@sk.com> <ommw@sk.com>
Na Xie <xie.na@h3c.com>
Nag Pavan Chilakam <nagpavan.chilakam@gmail.com> <55574442+nagpavan-chilakam@users.noreply.github.com>
+Naman Munet <nmunet@redhat.com>
Nancy Su <su_nan@inspur.com>
Nathan Cutler <ncutler@suse.com>
Nathan Cutler <ncutler@suse.com> <cutler@suse.cz>
@@ -544,7 +547,8 @@ Pan Liu <pan.liu@istuary.com> <liupan1111@gmail.com>
Parth Arora <paarora@redhat.com> parth-gr <paarora@redhat.com>
Pascal de Bruijn <pascal@unilogicnetworks.net>
Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
-Patrick Donnelly <pdonnell@redhat.com> <pdonell@redhat.com>
+Patrick Donnelly <pdonnell@ibm.com> <pdonnell@redhat.com>
+Patrick Donnelly <pdonnell@ibm.com> <batrick@batbytes.com>
Patrick McGarry <patrick@inktank.com>
Patrick McGarry <pmcgarry@redhat.com> <pmcgarry@gmail.com>
Patrick Seidensal <pseidensal@suse.com>
@@ -572,6 +576,8 @@ Pooja Gautam <pooja.gautam@ts.fujitsu.com>
Pritha Srivastava <prsrivas@redhat.com>
Pritha Srivastava <prsrivas@redhat.com> <pritha@dhcp35-190.lab.eng.blr.redhat.com>
Pritha Srivastava <prsrivas@redhat.com> <prsivas@redhat.com>
+Prachi Goel <prgoel@redhat.com>
+Puja Shahu <pshahu@redhat.com>
Qi Liang Hong <qilianghong@huawei.com>
Qiankun Zheng <zheng.qiankun@h3c.com>
Qinfei Liu <lucas.liuqinfei@huawei.com> <18138800392@163.com>
diff --git a/.organizationmap b/.organizationmap
index 3a601f4e2b2..4f3f7dfa7d9 100644
--- a/.organizationmap
+++ b/.organizationmap
@@ -346,21 +346,27 @@ Huayun <contact@huayun.com> Zheng Yin <zhengyin@huayun.com>
Huazhong University of Science and Technology <contact@hust.edu.cn> Luo Runbing <runsisi@hust.edu.cn>
HXT Semiconductor <contact@hxt-semitech.org> Jiang Yutang <yutang2.jiang@hxt-semitech.com>
IBM <contact@IBM.com> Adam Kupczyk <akupczyk@ibm.com>
+IBM <contact@IBM.com> Afreen Misbah <afreen@ibm.com>
IBM <contact@IBM.com> Aliaksei Makarau <aliaksei.makarau@ibm.com>
IBM <contact@IBM.com> Andrew Solomon <asolomon@us.ibm.com>
+IBM <contact@IBM.com> Dnyaneshwari talwekar <Dnyaneshwari.Talwekar@ibm.com>
IBM <contact@IBM.com> Guillaume Abrioux <gabrioux@ibm.com>
IBM <contact@IBM.com> Jonas Pfefferle <jpf@ibm.com>
IBM <contact@IBM.com> Laura Flores <lflores@ibm.com>
IBM <contact@IBM.com> Martin Ohmacht <mohmacht@us.ibm.com>
IBM <contact@IBM.com> Michel Normand <normand@linux.vnet.ibm.com>
+IBM <contact@IBM.com> Naman Munet <Naman.Munet@ibm.com>
IBM <contact@IBM.com> Naveen Naidu <naveen.naidu@ibm.com>
IBM <contact@IBM.com> Neeraj Pratap Singh <Neeraj.Pratap.Singh1@ibm.com>
IBM <contact@IBM.com> Or Ozeri <oro@il.ibm.com>
IBM <contact@IBM.com> Paul Cuzner <pcuzner@ibm.com>
+IBM <contact@IBM.com> Prachi Goel <PRACHI.GOEL2@ibm.com>
+IBM <contact@IBM.com> Puja Shahu <puja-shahu.omprakash@ibm.com>
IBM <contact@IBM.com> Samuel Matzek <smatzek@us.ibm.com>
IBM <contact@IBM.com> Shraddha Agrawal <shraddhaag@ibm.com>
IBM <contact@IBM.com> Kushal Deb <Kushal.Deb@ibm.com>
IBM <contact@IBM.com> Shweta Bhosale <Shweta.Bhosale1@ibm.com>
+IBM <contact@IBM.com> Patrick Donnelly <pdonnell@ibm.com>
IBM <contact@IBM.com> Sunil Angadi <Sunil.Angadi@ibm.com>
IBM <contact@IBM.com> Teoman Onay <tonay@ibm.com>
IBM <contact@ibm.com> Ulrich Weigand <ulrich.weigand@de.ibm.com>
@@ -584,6 +590,7 @@ Red Hat <contact@redhat.com> Adam King <adking@redhat.com>
Red Hat <contact@redhat.com> Adam King <adking@redhat.com>
Red Hat <contact@redhat.com> Adam Kupczyk <akupczyk@redhat.com>
Red Hat <contact@redhat.com> Ademar de Souza Reis Jr <areis@redhat.com>
+Red Hat <contact@redhat.com> Afreen Misbah <afrahman@redhat.com>
Red Hat <contact@redhat.com> Aishwarya Mathuria <amathuri@redhat.com>
Red Hat <contact@redhat.com> Albin Antony <aantony@redhat.com>
Red Hat <contact@redhat.com> Alex Elder <aelder@redhat.com>
@@ -620,6 +627,7 @@ Red Hat <contact@redhat.com> Deepika Upadhyay <dupadhya@redhat.com>
Red Hat <contact@redhat.com> Dhairya Parmar <dparmar@redhat.com>
Red Hat <contact@redhat.com> Dimitri Savineau <dsavinea@redhat.com>
Red Hat <contact@redhat.com> Divyansh Kamboj <dkamboj@redhat.com>
+Red Hat <contact@redhat.com> Dnyaneshwari talwekar <dtalweka@redhat.com>
Red Hat <contact@redhat.com> Douglas Fuller <dfuller@redhat.com>
Red Hat <contact@redhat.com> Ernesto Puerta <epuertat@redhat.com>
Red Hat <contact@redhat.com> Erwan Velu <erwan@redhat.com>
@@ -685,6 +693,7 @@ Red Hat <contact@redhat.com> Mike Hackett <mhackett@redhat.com>
Red Hat <contact@redhat.com> Mike Perez <miperez@redhat.com>
Red Hat <contact@redhat.com> Milan Broz <mbroz@redhat.com>
Red Hat <contact@redhat.com> Milind Changire <mchangir@redhat.com>
+Red Hat <contact@redhat.com> Naman Munet <nmunet@redhat.com>
Red Hat <contact@redhat.com> Nathan Weinberg <nweinber@redhat.com>
Red Hat <contact@redhat.com> Neeraj Pratap Singh <neesingh@redhat.com>
Red Hat <contact@redhat.com> Neha Ojha <nojha@redhat.com>
@@ -708,9 +717,11 @@ Red Hat <contact@redhat.com> Pere Diaz Bou <pdiazbou@redhat.com>
Red Hat <contact@redhat.com> Pete Zaitcev <zaitcev@redhat.com>
Red Hat <contact@redhat.com> Petr Lautrbach <plautrba@redhat.com>
Red Hat <contact@redhat.com> Petr Machata <pmachata@redhat.com>
+Red Hat <contact@redhat.com> Prachi Goel <prgoel@redhat.com>
Red Hat <contact@redhat.com> Prasanna Kumar Kalever <prasanna.kalever@redhat.com>
Red Hat <contact@redhat.com> Prashant D <pdhange@redhat.com>
Red Hat <contact@redhat.com> Pritha Srivastava <prsrivas@redhat.com>
+Red Hat <contact@redhat.com> Puja Shahu <pshahu@redhat.com>
Red Hat <contact@redhat.com> Radoslaw Zarzynski <rzarzynski@redhat.com>
Red Hat <contact@redhat.com> Rafael Quintero <rquinter@redhat.com>
Red Hat <contact@redhat.com> Ramakrishnan Periyasamy <rperiyas@redhat.com>
diff --git a/.peoplemap b/.peoplemap
index 507f50edb43..418e8505fb4 100644
--- a/.peoplemap
+++ b/.peoplemap
@@ -73,5 +73,5 @@ Yehuda Sadeh <ysadehwe@redhat.com> Yehuda Sadeh <yehuda@inktank.com>
Yuri Weinstein <yuriw@redhat.com> Yuri Weinstein <yuri.weinstein@inktank.com>
Zhi Zhang <zhangz.david@outlook.com> Zhi (David) Zhang <zhangz@yahoo-inc.com>
Zheng Yin <zhengyin@huayun.com> Zheng Yin <zhengyin@chinac.com>
-Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
+Patrick Donnelly <pdonnell@ibm.com> Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Myoungwon Oh <myoungwon.oh@samsung.com> Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <ohmyoungwon@gmail.com>
diff --git a/CodingStyle b/CodingStyle
index 659298f0e5a..019d23c7703 100644
--- a/CodingStyle
+++ b/CodingStyle
@@ -108,6 +108,12 @@ by section.
portability since `#pragma once` is widely supported and is known
to work on GCC and Clang.
+* Header Files -> Forward declarations:
+
+ Forward declarations of structs, unions, classes and enums can be
+ used to reduce header dependencies. This speeds up compile times
+ because the compiler has to process less code.
+
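+  For illustration (the names here are made up), a header that only uses a
+  type through a pointer or reference can declare it instead of including
+  the type's header:
+
+    // widget.h
+    class LogChannel;                 // forward declaration, no #include needed
+    void log_widget(LogChannel *ch);  // a pointer only needs the declaration
+
+  The corresponding widget.cc then includes the full definition only where
+  the type is actually used.
+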
The following guidelines have not been followed in the legacy code,
but are worth mentioning and should be followed strictly for new code:
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 1a4e26e747f..8af2a262dff 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -26,6 +26,14 @@
- osd_op_num_threads_per_shard_hdd = 5 (was 1)
For more details see https://tracker.ceph.com/issues/66289.
+* CephFS: Modifying the FS setting variable "max_mds" when a cluster is
+ unhealthy now requires users to pass the confirmation flag
+ (--yes-i-really-mean-it). This has been added as a precaution to tell users
+ that modifying "max_mds" may not help with troubleshooting or recovery
+ efforts; instead, it might further destabilize the cluster.
+
+
+
>=19.0.0
* cephx: key rotation is now possible using `ceph auth rotate`. Previously,
diff --git a/SubmittingPatches-backports.rst b/SubmittingPatches-backports.rst
index 0f96aec65c4..bb55088cb5f 100644
--- a/SubmittingPatches-backports.rst
+++ b/SubmittingPatches-backports.rst
@@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker
issue, just add a comment describing what changes you would like to make.
Someone with permissions will make the necessary modifications on your behalf.
-For straightforward backports, that's all that you (as the developer of the fix)
-need to do. Volunteers from the `Stable Releases and Backports team`_ will
-proceed to create Backport issues to track the necessary backports and stage the
-backports by opening GitHub PRs with the cherry-picks. If you don't want to
-wait, and provided you have sufficient permissions at https://tracker.ceph.com,
-you can `create Backport tracker issues` and `stage backports`_ yourself. In
-that case, read on.
-
+Authors of pull requests are responsible for creating associated backport pull
+requests. As long as you have sufficient permissions at
+https://tracker.ceph.com, you can `create Backport tracker issues`_ and `stage
+backports`_ yourself. Read the linked sections below to learn how to create
+backport tracker issues and how to stage backports.
.. _`create backport tracker issues`:
.. _`backport tracker issue`:
@@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting
Under ordinary circumstances, the developer who merges the ``main`` PR will flag
the ``main`` branch tracker issue for backport by changing the Status to "Pending
-Backport", and volunteers from the `Stable Releases and Backports team`_
-periodically create backport tracker issues by running the
-``backport-create-issue`` script. They also do the actual backporting. But that
-does take time and you may not want to wait.
+Backport".
You might be tempted to forge ahead and create the backport issues yourself.
Please don't do that - it is difficult (bordering on impossible) to get all the
@@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the
Milestone tag to the stable release the backport PR is targeting. For example,
if the PR is targeting "nautilus", set the Milestone tag to "nautilus".
-If you don't have sufficient GitHub permissions to set the Milestone, don't
-worry. Members of the `Stable Releases and Backports team`_ periodically run
-a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable
-branches and automatically adds the correct Milestone tag if it is missing.
-
Next, check which component label was applied to the ``main`` PR corresponding to
this backport, and double-check that that label is applied to the backport PR as
well. For example, if the ``main`` PR carries the component label "core", the
backport PR should also get that label.
-In general, it is the responsibility of the `Stable Releases and Backports
-team`_ to ensure that backport PRs are properly labelled. If in doubt, just
-leave the labelling to them.
-
.. _`backport PR reviewing`:
.. _`backport PR testing`:
.. _`backport PR merging`:
@@ -381,9 +366,8 @@ leave the labelling to them.
Reviewing, testing, and merging of backport PRs
-----------------------------------------------
-Once your backport PR is open and the Milestone is set properly, the
-`Stable Releases and Backports team` will take care of getting the PR
-reviewed and tested. Once the PR is reviewed and tested, it will be merged.
+Once your backport PR is open, it will be reviewed and tested. When the PR has
+been reviewed and tested, it will be merged.
If you would like to facilitate this process, you can solicit reviews and run
integration tests on the PR. In this case, add comments to the PR describing the
@@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the
PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches
unnecessarily complicates the release preparation process, which is done by
volunteers.)
-
-
-Stable Releases and Backports team
-----------------------------------
-
-Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers,
-which is charged with maintaining the stable releases and backporting bugfixes
-from the ``main`` branch to them. (That team maintains a wiki, accessible by
-clicking the `Stable Releases and Backports`_ link, which describes various
-workflows in the backporting lifecycle.)
-
-.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
-
-Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker
-issue). The volunteers from the Stable Releases and Backports team will
-backport the fix, run regression tests on it, and include it in one or more
-future point releases.
-
-
diff --git a/container/build.sh b/container/build.sh
index 7c97e2261c1..5edf469d2d2 100755
--- a/container/build.sh
+++ b/container/build.sh
@@ -136,9 +136,9 @@ if [[ ${CI_CONTAINER} == "true" ]] ; then
branch_repo_tag=$repopath/ceph:${BRANCH}
sha1_repo_tag=$repopath/ceph:${CEPH_SHA1}
- if [[ "${ARCH}" == "aarch64" ]] ; then
- branch_repo_tag=${branch_repo_tag}-aarch64
- sha1_repo_tag=${sha1_repo_tag}-aarch64
+ if [[ "${ARCH}" == "arm64" ]] ; then
+ branch_repo_tag=${branch_repo_tag}-arm64
+ sha1_repo_tag=${sha1_repo_tag}-arm64
fi
podman tag ${image_id} ${full_repo_tag}
diff --git a/doc/cephadm/operations.rst b/doc/cephadm/operations.rst
index 3b117c1bd6a..420ee655ac8 100644
--- a/doc/cephadm/operations.rst
+++ b/doc/cephadm/operations.rst
@@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster
# For each host:
cephadm rm-cluster --force --zap-osds --fsid <fsid>
+
+
+Replacing a device
+==================
+
+The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD.
+Previously, this process required manual intervention at various stages.
+With this new command, all necessary operations are performed automatically, streamlining the replacement process
+and improving the overall user experience.
+
+.. note:: This is only supported for OSDs deployed with LVM.
+
+.. prompt:: bash #
+
+ ceph orch device replace <host> <device-path>
+
+If the device being replaced is shared by multiple OSDs (for example, a DB/WAL device shared by several OSDs), the orchestrator will warn you.
+
+.. prompt:: bash #
+
+ [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd
+
+ Error EINVAL: /dev/vdd is a shared device.
+ Replacing /dev/vdd implies destroying OSD(s): ['0', '1'].
+ Please, *be very careful*, this can be a very dangerous operation.
+ If you know what you are doing, pass --yes-i-really-mean-it
+
+If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``.
+
+.. prompt:: bash #
+
+ [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it
+ Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced.
+
+``cephadm`` will have ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSDs as ``destroyed``, so that
+the OSD IDs are preserved:
+
+.. prompt:: bash #
+
+ [ceph: root@ceph-1 /]# ceph osd tree
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -1 0.97659 root default
+ -3 0.97659 host devel-1
+ 0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000
+ 1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000
+ 2 hdd 0.19530 osd.2 up 1.00000 1.00000
+ 3 hdd 0.19530 osd.3 up 1.00000 1.00000
+
+The device being replaced is then reported as ``being replaced``, which prevents ``cephadm`` from redeploying the OSDs too quickly:
+
+.. prompt:: bash #
+
+ [ceph: root@ceph-1 /]# ceph orch device ls
+ HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS
+ osd-1 /dev/vdb hdd 200G Yes 13s ago
+ osd-1 /dev/vdc hdd 200G Yes 13s ago
+ osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced
+ osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
+ osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
+
+If for any reason you need to clear the 'device replace header' on a device, then you can use ``ceph orch device replace <host> <device> --clear``:
+
+.. prompt:: bash #
+
+ [ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear
+ Replacement header cleared on /dev/vdk
+ [ceph: root@devel-1 /]#
+
+After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``).
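+
+If the OSDs do not get redeployed after a while, one thing worth checking is
+whether the corresponding OSD service spec was left ``unmanaged``, for example:
+
+.. prompt:: bash #
+
+ ceph orch ls osd --export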
diff --git a/doc/cephfs/administration.rst b/doc/cephfs/administration.rst
index 5760e67f73e..07646bff067 100644
--- a/doc/cephfs/administration.rst
+++ b/doc/cephfs/administration.rst
@@ -61,10 +61,17 @@ is a subset of the same information from the ``ceph fs dump`` command.
::
- ceph fs set <file system name> <var> <val>
+ ceph fs set <file system name> <var> <val> [--yes-i-really-mean-it]
Change a setting on a file system. These settings are specific to the named
-file system and do not affect other file systems.
+file system and do not affect other file systems. The confirmation flag is
+only needed when changing ``max_mds`` while the cluster is unhealthy.
+
+.. note:: It is mandatory to pass the confirmation flag (--yes-i-really-mean-it)
+ when modifying the FS setting variable ``max_mds`` while the cluster is
+ unhealthy. It has been added as a precaution to tell users that modifying
+ ``max_mds`` during troubleshooting or recovery might not help; instead, it
+ might further destabilize the cluster.
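+
+For example, to lower ``max_mds`` on an unhealthy cluster (the file system
+name ``cephfs`` below is just a placeholder), the flag must be appended::
+
+ ceph fs set cephfs max_mds 1 --yes-i-really-mean-it
+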
::
diff --git a/doc/cephfs/cephfs-journal-tool.rst b/doc/cephfs/cephfs-journal-tool.rst
index 4ad7304481f..3ae1139ceac 100644
--- a/doc/cephfs/cephfs-journal-tool.rst
+++ b/doc/cephfs/cephfs-journal-tool.rst
@@ -105,12 +105,12 @@ Example: header get/set
"write_pos": 4274947,
"expire_pos": 4194304,
"trimmed_pos": 4194303,
+ "stream_format": 1,
"layout": { "stripe_unit": 4194304,
- "stripe_count": 4194304,
+ "stripe_count": 1,
"object_size": 4194304,
- "cas_hash": 4194304,
- "object_stripe_unit": 4194304,
- "pg_pool": 4194304}}
+ "pool_id": 2,
+ "pool_ns": ""}}
# cephfs-journal-tool header set trimmed_pos 4194303
Updating trimmed_pos 0x400000 -> 0x3fffff
diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst
index 34de1b7501d..78d0a8f54d3 100644
--- a/doc/cephfs/troubleshooting.rst
+++ b/doc/cephfs/troubleshooting.rst
@@ -128,6 +128,11 @@ things to do:
That prevents any clients from establishing new sessions with the MDS.
+* **Don't tweak max_mds** Modifying the FS setting variable ``max_mds`` is
+ sometimes perceived as a good step during troubleshooting or recovery
+ efforts, but doing so might further destabilize the cluster. If ``max_mds``
+ must be changed in such circumstances, run the command to change ``max_mds``
+ with the confirmation flag (``--yes-i-really-mean-it``).
Expediting MDS journal trim
diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst
index a804a007599..e09fed213f2 100644
--- a/doc/dev/cephfs-mirroring.rst
+++ b/doc/dev/cephfs-mirroring.rst
@@ -17,12 +17,10 @@ Key Idea
--------
For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on
-readdir diff to identify changes in a directory tree. The diffs are applied to
+`CephFS Snapdiff Feature`_ to identify changes in a directory tree. The diffs are applied to
directory in the remote file system thereby only synchronizing files that have
changed between two snapshots.
-This feature is tracked here: https://tracker.ceph.com/issues/47034.
-
Currently, snapshot data is synchronized by bulk copying to the remote
filesystem.
@@ -407,3 +405,5 @@ Feature Status
--------------
`cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule).
+
+.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature
diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst
index cbde8779a66..7cce4c6f898 100644
--- a/doc/dev/developer_guide/essentials.rst
+++ b/doc/dev/developer_guide/essentials.rst
@@ -287,16 +287,13 @@ See :ref:`kubernetes-dev`
Backporting
-----------
-All bugfixes should be merged to the ``main`` branch before being
-backported. To flag a bugfix for backporting, make sure it has a
-`tracker issue`_ associated with it and set the ``Backport`` field to a
-comma-separated list of previous releases (e.g. "hammer,jewel") that you think
-need the backport.
-The rest (including the actual backporting) will be taken care of by the
-`Stable Releases and Backports`_ team.
+All bugfixes should be merged to the ``main`` branch before being backported.
+To flag a bugfix for backporting, make sure it has a `tracker issue`_
+associated with it and set the ``Backport`` field to a comma-separated list of
+previous releases (e.g. "hammer,jewel") that you think need the backport. You
+are responsible for the backporting of pull requests that you raise.
.. _`tracker issue`: http://tracker.ceph.com/
-.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
Dependabot
----------
diff --git a/doc/dev/kclient.rst b/doc/dev/kclient.rst
new file mode 100644
index 00000000000..fd4903ac1ab
--- /dev/null
+++ b/doc/dev/kclient.rst
@@ -0,0 +1,478 @@
+Testing changes to the Linux Kernel CephFS driver
+=================================================
+
+This walkthrough will explain one (opinionated) way to do testing of the Linux
+kernel client against a development cluster. We will try to minimize any
+assumptions about pre-existing knowledge of how to do kernel builds or any
+related best-practices.
+
+.. note:: There are many completely valid ways to do kernel development for
+ Ceph. This guide is a walkthrough of the author's own environment.
+ You may decide to do things very differently.
+
+Step One: build the kernel
+==========================
+
+Clone the kernel:
+
+.. code-block:: bash
+
+ git init linux && cd linux
+ git remote add torvalds git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+ git remote add ceph https://github.com/ceph/ceph-client.git
+ git fetch && git checkout torvalds/master
+
+
+Configure the kernel:
+
+.. code-block:: bash
+
+ make defconfig
+
+.. note:: You can alternatively use the `Ceph Kernel QA Config`_ for building the kernel.
+
+We now have a kernel config with reasonable defaults for the architecture you're
+building on. The next thing to do is to enable configs which will build Ceph and/or
+provide functionality we need to do testing.
+
+.. code-block:: bash
+
+ cat > ~/.ceph.config <<EOF
+ CONFIG_CEPH_FS=y
+ CONFIG_CEPH_FSCACHE=y
+ CONFIG_CEPH_FS_POSIX_ACL=y
+ CONFIG_CEPH_FS_SECURITY_LABEL=y
+ CONFIG_CEPH_LIB_PRETTYDEBUG=y
+ CONFIG_DYNAMIC_DEBUG=y
+ CONFIG_DYNAMIC_DEBUG_CORE=y
+ CONFIG_FRAME_POINTER=y
+ CONFIG_FSCACHE=y
+ CONFIG_FSCACHE_STATS=y
+ CONFIG_FS_ENCRYPTION=y
+ CONFIG_FS_ENCRYPTION_ALGS=y
+ CONFIG_KGDB=y
+ CONFIG_KGDB_SERIAL_CONSOLE=y
+ CONFIG_XFS_FS=y
+ EOF
+
+Beyond enabling Ceph-related configs, we are also enabling some useful
+debug configs and XFS (as an alternative to ext4 if needed for our root file
+system).
+
+.. note:: It is a good idea to not build anything as a kernel module. Otherwise, you would need to ``make modules_install`` on the root drive of the VM.
+
+Now, merge the configs.
+
+
+.. code-block:: bash
+
+
+ scripts/kconfig/merge_config.sh .config ~/.ceph.config
+
+
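+As a quick, optional sanity check, you can confirm that the Ceph options were
+merged in as built-ins rather than modules:
+
+.. code-block:: bash
+
+ grep -E '^CONFIG_CEPH' .config
+
+Every Ceph-related option should show ``=y``; any ``=m`` entries would require
+the ``make modules_install`` step mentioned in the note above.
+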
+Finally, build the kernel:
+
+.. code-block:: bash
+
+ make -j
+
+
+.. note:: This document does not discuss how to get relevant utilities for your
+ distribution to actually build the kernel, like gcc. Please use your search
+ engine of choice to learn how to do that.
+
+
+Step Two: create a VM
+=====================
+
+A virtual machine is a good choice for testing the kernel client for a few reasons:
+
+* You can more easily monitor and configure networking for the VM.
+* You can very rapidly test a change to the kernel (build -> mount in less than 10 seconds).
+* A fault in the kernel won't crash your machine.
+* You have a suite of tools available for analysis on the running kernel.
+
+The main decision for you to make is what Linux distribution you want to use.
+This document uses Arch Linux due to the author's familiarity. We also use LVM
+to create a volume. You may use partitions or whatever mechanism you like to
+create a block device. In general, this block device will be used repeatedly in
+testing. You may want to use snapshots to avoid a VM somehow corrupting your
+root disk and forcing you to start over.
+
+
+.. code-block:: bash
+
+ # create a volume
+ VOLUME_GROUP=foo
+ sudo lvcreate -L 256G "$VOLUME_GROUP" -n $(whoami)-vm-0
+ DEV="/dev/${VOLUME_GROUP}/$(whoami)-vm-0"
+ sudo mkfs.xfs "$DEV"
+ sudo mount "$DEV" /mnt
+ sudo pacstrap /mnt base base-devel vim less jq
+ sudo arch-chroot /mnt
+ # # delete root's password for ease of login
+ # passwd -d root
+ # mkdir -p /root/.ssh && echo "$YOUR_SSH_KEY_PUBKEY" >> /root/.ssh/authorized_keys
+ # exit
+ sudo umount /mnt
+
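+If you want the snapshot safety net mentioned above, one way to take it now is
+an LVM snapshot of the freshly installed root volume (the size and name below
+are only an example):
+
+.. code-block:: bash
+
+ # snapshot the pristine root volume so it can be restored later
+ sudo lvcreate -s -L 16G -n $(whoami)-vm-0-pristine "$DEV"
+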
+Once that's done, we should be able to run a VM:
+
+
+.. code-block:: bash
+
+ qemu-system-x86_64 -enable-kvm -kernel $(pwd)/arch/x86/boot/bzImage -drive file="$DEV",if=virtio,format=raw -append 'root=/dev/vda rw'
+
+You should see output like:
+
+::
+
+ VNC server running on ::1:5900
+
+You could view that console using:
+
+
+.. code-block:: bash
+
+ vncviewer 127.0.0.1:5900
+
+Congratulations, you have a VM running the kernel that you just built.
+
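+Once logged in on the VM's console, a quick way to confirm that it is really
+running the kernel you just built is to check the version and build time:
+
+.. code-block:: bash
+
+ uname -a
+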
+
+Step Three: Networking the VM
+=============================
+
+This is the "hard part" and requires the most customization depending on what
+you want to do. I currently have a development setup like:
+
+
+::
+
+ sepian netns
+ ______________
+ | |
+ | kernel VM | sepia-bounce VM vossi04.front.sepia.ceph.com
+ | ------- | | ------ -------
+ | | | | | 192.168.20.1 | | | |
+ | | |--|--|- <- wireguard -> | | <-- sepia vpn -> | |
+ | |_____| | | 192.168.20.2 |____| |_____|
+ | br0 |
+ |______________|
+
+
+The sepia-bounce VM is used as a bounce box to the sepia lab. It can proxy ssh
+connections, route any sepia-bound traffic, or serve as a DNS proxy. The use of
+a sepia-bounce VM is optional but can be useful, especially if you want to
+create numerous kernel VMs for testing.
+
+I like to use the vossi04 `developer playground`_ to build Ceph and set up a
+vstart cluster. It has sufficient resources to make building Ceph very fast
+(~5 minutes cold build) and local disk resources to run a decent vstart
+cluster.
+
+To avoid overcomplicating this document with the details of the sepia-bounce
+VM, I will note the following main configurations used for the purpose of
+testing the kernel:
+
+- setup a wireguard tunnel between the machine creating kernel VMs and the sepia-bounce VM
+- use ``systemd-resolved`` as a DNS resolver and listen on 192.168.20.2 (instead of just localhost)
+- connect to the sepia `VPN`_ and use `systemd resolved update script`_ to configure ``systemd-resolved`` to use the DNS servers acquired via DHCP from the sepia VPN
+- configure ``firewalld`` to allow wireguard traffic and to masquerade and forward traffic to the sepia vpn
+
+The next task is to connect the kernel VM to the sepia-bounce VM. A network
+namespace can be useful for this purpose to isolate traffic / routing rules for
+the VMs. I orchestrate this using a custom systemd one-shot unit that
+looks like:
+
+::
+
+ # create the net namespace
+ ExecStart=/usr/bin/ip netns add sepian
+ # bring lo up
+ ExecStart=/usr/bin/ip netns exec sepian ip link set dev lo up
+ # setup wireguard to sepia-bounce
+ ExecStart=/usr/bin/ip link add wg-sepian type wireguard
+ ExecStart=/usr/bin/wg setconf wg-sepian /etc/wireguard/wg-sepian.conf
+ # move the wireguard interface to the sepian netns
+ ExecStart=/usr/bin/ip link set wg-sepian netns sepian
+ # configure the static ip and bring it up
+ ExecStart=/usr/bin/ip netns exec sepian ip addr add 192.168.20.1/24 dev wg-sepian
+ ExecStart=/usr/bin/ip netns exec sepian ip link set wg-sepian up
+ # logging info
+ ExecStart=/usr/bin/ip netns exec sepian ip addr
+ ExecStart=/usr/bin/ip netns exec sepian ip route
+ # make wireguard the default route
+ ExecStart=/usr/bin/ip netns exec sepian ip route add default via 192.168.20.2 dev wg-sepian
+ # more logging
+ ExecStart=/usr/bin/ip netns exec sepian ip route
+ # add a bridge interface for VMs
+ ExecStart=/usr/bin/ip netns exec sepian ip link add name br0 type bridge
+ # configure the addresses and bring it up
+ ExecStart=/usr/bin/ip netns exec sepian ip addr add 192.168.0.1/24 dev br0
+ ExecStart=/usr/bin/ip netns exec sepian ip link set br0 up
+ # masquerade/forward traffic to sepia-bounce
+ ExecStart=/usr/bin/ip netns exec sepian iptables -t nat -A POSTROUTING -o wg-sepian -j MASQUERADE
+
+
+When using the network namespace, we will use ``ip netns exec``. It has a
+handy feature that automatically bind mounts files from ``/etc/netns/<name>/``
+over ``/etc`` for commands run this way:
+
+::
+
+ # cat /etc/netns/sepian/resolv.conf
+ nameserver 192.168.20.2
+
+That file will configure the libc name resolution stack to route DNS requests
+for applications to the ``systemd-resolved`` daemon running on sepia-bounce.
+Consequently, any application running in that netns will be able to resolve
+sepia hostnames:
+
+::
+
+ $ sudo ip netns exec sepian host vossi04.front.sepia.ceph.com
+ vossi04.front.sepia.ceph.com has address 172.21.10.4
+
+
+Okay, great. We have a network namespace that forwards traffic to the sepia
+VPN. The next mental step is to connect virtual machines running a kernel to
+the bridge we have configured. The straightforward way to do that is to create
+a "tap" device which connects to the bridge:
+
+.. code-block:: bash
+
+ sudo ip netns exec sepian qemu-system-x86_64 \
+ -enable-kvm \
+ -kernel $(pwd)/arch/x86/boot/bzImage \
+ -drive file="$DEV",if=virtio,format=raw \
+ -netdev tap,id=net0,ifname=tap0,script="$HOME/bin/qemu-br0",downscript=no \
+ -device virtio-net-pci,netdev=net0 \
+ -append 'root=/dev/vda rw'
+
+The new relevant bits here are (a) executing the VM in the netns we have
+constructed; (b) a ``-netdev`` command to configure a tap device; (c) a
+virtual network card for the VM. There is also a script ``$HOME/bin/qemu-br0``
+run by qemu to configure the tap device it creates for the VM:
+
+::
+
+ #!/bin/bash
+ tap=$1
+ ip link set "$tap" master br0
+ ip link set dev "$tap" up
+
+That simply plugs the new tap device into the bridge.
+
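+While the VM is running, one way to confirm that its tap device was actually
+enslaved to the bridge is:
+
+.. code-block:: bash
+
+ sudo ip netns exec sepian ip link show master br0
+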
+This is all well and good but we are now missing one last crucial step. What is
+the IP address of the VM? There are two options:
+
+1. configure a static IP, which requires modifying the network configuration
+ on the VM's root device
+2. use DHCP, with the VM root devices configured to always obtain their
+ ethernet addresses via DHCP
+
+The second option is more complicated to set up, since you must also run a
+DHCP server, but it provides the greatest flexibility for adding more VMs as
+needed when testing.
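+
+If you instead go with option 1, a minimal static configuration on the VM's
+root device could look like the following ``systemd-networkd`` profile (the
+interface name and addresses are only an example, and the service must be
+enabled in the VM):
+
+::
+
+ # /etc/systemd/network/20-wired.network
+ [Match]
+ Name=enp0s3
+
+ [Network]
+ Address=192.168.0.50/24
+ Gateway=192.168.0.1
+ DNS=192.168.20.2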
+
+The modified (or "hacked") standard dhcpd systemd service looks like:
+
+::
+
+ # cat sepian-dhcpd.service
+ [Unit]
+ Description=IPv4 DHCP server
+ After=network.target network-online.target sepian-netns.service
+ Wants=network-online.target
+ Requires=sepian-netns.service
+
+ [Service]
+ ExecStartPre=/usr/bin/touch /tmp/dhcpd.leases
+ ExecStartPre=/usr/bin/cat /etc/netns/sepian/dhcpd.conf
+ ExecStart=/usr/bin/dhcpd -f -4 -q -cf /etc/netns/sepian/dhcpd.conf -lf /tmp/dhcpd.leases
+ NetworkNamespacePath=/var/run/netns/sepian
+ RuntimeDirectory=dhcpd4
+ User=dhcp
+ AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_NET_RAW
+ ProtectSystem=full
+ ProtectHome=on
+ KillSignal=SIGINT
+ # We pull in network-online.target for a configured network connection.
+ # However this is not guaranteed to be the network connection our
+ # networks are configured for. So try to restart on failure with a delay
+ # of two seconds. Rate limiting kicks in after 12 seconds.
+ RestartSec=2s
+ Restart=on-failure
+ StartLimitInterval=12s
+
+ [Install]
+ WantedBy=multi-user.target
+
+Similarly, the referenced dhcpd.conf:
+
+::
+
+ # cat /etc/netns/sepian/dhcpd.conf
+ option domain-name-servers 192.168.20.2;
+ option subnet-mask 255.255.255.0;
+ option routers 192.168.0.1;
+ subnet 192.168.0.0 netmask 255.255.255.0 {
+ range 192.168.0.100 192.168.0.199;
+ }
+
+Importantly, this tells the VM to route traffic to 192.168.0.1 (the IP of the
+bridge in the netns) and DNS can be provided by 192.168.20.2 (via
+``systemd-resolved`` on the sepia-bounce VM).
+
+In the VM, the networking looks like:
+
+::
+
+ [root@archlinux ~]# ip link
+ 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
+ link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+ 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
+ link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
+ 3: sit0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000
+ link/sit 0.0.0.0 brd 0.0.0.0
+ [root@archlinux ~]# ip addr
+ 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
+ link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+ inet 127.0.0.1/8 scope host lo
+ valid_lft forever preferred_lft forever
+ inet6 ::1/128 scope host noprefixroute
+ valid_lft forever preferred_lft forever
+ 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
+ link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
+ inet 192.168.0.100/24 metric 1024 brd 192.168.0.255 scope global dynamic enp0s3
+ valid_lft 28435sec preferred_lft 28435sec
+ inet6 fe80::5054:ff:fe12:3456/64 scope link proto kernel_ll
+ valid_lft forever preferred_lft forever
+ 3: sit0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
+ link/sit 0.0.0.0 brd 0.0.0.0
+ [root@archlinux ~]# systemd-resolve --status
+ Global
+ Protocols: +LLMNR +mDNS -DNSOverTLS DNSSEC=no/unsupported
+ resolv.conf mode: stub
+ Fallback DNS Servers: 1.1.1.1#cloudflare-dns.com 9.9.9.9#dns.quad9.net 8.8.8.8#dns.google 2606:4700:4700::1111#cloudflare-dns.com 2620:fe::9#dns.quad9.net 2001:4860:4860::8888#dns.google
+
+ Link 2 (enp0s3)
+ Current Scopes: DNS LLMNR/IPv4 LLMNR/IPv6
+ Protocols: +DefaultRoute +LLMNR -mDNS -DNSOverTLS DNSSEC=no/unsupported
+ Current DNS Server: 192.168.20.2
+ DNS Servers: 192.168.20.2
+
+ Link 3 (sit0)
+ Current Scopes: none
+ Protocols: -DefaultRoute +LLMNR +mDNS -DNSOverTLS DNSSEC=no/unsupported
+
+
+Finally, some other networking configurations to consider:
+
+* Run the VM on your machine with full access to the host networking stack. If you have the sepia vpn, this will probably work without too much configuration.
+* Run the VM in a netns as above but also setup the sepia vpn in the same netns. This can help to avoid using a sepia-bounce VM. You'll still need to configure routing between the bridge and the sepia VPN.
+* Run the VM in a netns as above but only use a local vstart cluster (possibly in another VM) in the same netns.
+
+
+Step Four: mounting a CephFS file system in your VM
+===================================================
+
+This guide uses a vstart cluster on a machine in the sepia lab. Because the mon
+addresses change with every new vstart cluster, any static configuration we
+might set up for mounting CephFS via the kernel driver would quickly become
+stale. So, we should create a script to fetch the configuration for our vstart
+cluster prior to mounting:
+
+.. code-block:: bash
+
+ #!/bin/bash
+ # kmount.sh -- mount a vstart Ceph cluster on a remote machine
+
+ # the cephx client credential, vstart creates "client.fs" by default
+ NAME=fs
+ # static fs name, vstart creates an "a" file system by default
+ FS=a
+ # where to mount on the VM
+ MOUNTPOINT=/mnt
+ # cephfs mount point (root by default)
+ CEPHFS_MOUNTPOINT=/
+
+ function run {
+ printf '%s\n' "$*" >&2
+ "$@"
+ }
+
+ function mssh {
+ run ssh vossi04.front.sepia.ceph.com "cd ceph/build && (source vstart_environment.sh; $1)"
+ }
+
+ # create the minimum config (including mon addresses) and store it in the VM's ceph.conf. This is not used for mounting; we're storing it for potential use with `ceph` commands.
+ mssh "ceph config generate-minimal-conf" > /etc/ceph/ceph.conf
+ # get the vstart cluster's fsid
+ FSID=$(mssh "ceph fsid")
+ # get the auth key associated with client.fs
+ KEY=$(mssh "ceph auth get-key client.$NAME")
+ # dump the v2 mon addresses and format for the -o mon_addr mount option
+ MONS=$(mssh "ceph mon dump --format=json" | jq -r '.mons[] | .public_addrs.addrvec[] | select(.type == "v2") | .addr' | paste -s -d/)
+
+ # turn on kernel debugging (and any other debugging you'd like)
+ echo "module ceph +p" | tee /sys/kernel/debug/dynamic_debug/control
+ # do the mount! we use the new device syntax for this mount
+ run mount -t ceph "${NAME}@${FSID}.${FS}=${CEPHFS_MOUNTPOINT}" -o "mon_addr=${MONS},ms_mode=crc,name=${NAME},secret=${KEY},norequire_active_mds,noshare" "$MOUNTPOINT"
+
+The script would then be run like this:
+
+.. code-block:: bash
+
+ $ sudo ip netns exec sepian ssh root@192.168.0.100 ./kmount.sh
+ ...
+ mount -t ceph fs@c9653bca-110b-4f70-9f84-5a195b205e9a.a=/ -o mon_addr=172.21.10.4:40762/172.21.10.4:40764/172.21.10.4:40766,ms_mode=crc,name=fs,secret=AQD0jgln43pBCxAA7cJlZ4Px7J0UmiK4A4j3rA==,norequire_active_mds,noshare /mnt
+ $ sudo ip netns exec sepian ssh root@192.168.0.100 df -h /mnt
+ Filesystem Size Used Avail Use% Mounted on
+ fs@c9653bca-110b-4f70-9f84-5a195b205e9a.a=/ 169G 0 169G 0% /mnt
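+
+When you are finished, you can unmount the file system and switch off the kernel
+debugging that ``kmount.sh`` enabled, for example:
+
+.. code-block:: bash
+
+   $ sudo ip netns exec sepian ssh root@192.168.0.100 umount /mnt
+   $ sudo ip netns exec sepian ssh root@192.168.0.100 'echo "module ceph -p" | tee /sys/kernel/debug/dynamic_debug/control'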
+
+
+If you run into difficulties, possible causes include (a basic connectivity
+check follows this list):
+
+* The firewall on the node running the vstart cluster is blocking your connections.
+* A misconfiguration in your networking stack.
+* An incorrect configuration for the mount.
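+
+As a basic connectivity check, you can probe one of the vstart cluster's v2 mon
+ports from the VM and review the firewall on the vstart node. The address and
+port below are examples taken from the mount output above:
+
+.. code-block:: bash
+
+   # from the VM: verify a v2 mon port is reachable (nc flags vary by netcat flavor)
+   nc -vz 172.21.10.4 40762
+   # on the vstart node: list the active firewalld configuration (if firewalld is in use)
+   sudo firewall-cmd --list-all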
+
+
+Step Five: testing kernel changes in teuthology
+-----------------------------------------------
+
+There are three static branches in the `ceph kernel git repository`_ managed by the Ceph team:
+
+* `for-linus <https://github.com/ceph/ceph-client/tree/for-linus>`_: A branch managed by the primary Ceph maintainer to share changes with Linus Torvalds (upstream). Do not push to this branch.
+* `master <https://github.com/ceph/ceph-client/tree/master>`_: A staging ground for patches planned to be sent to Linus. Do not push to this branch.
+* `testing <https://github.com/ceph/ceph-client/tree/testing>`_: A staging ground for miscellaneous patches that need wider QA testing (via nightlies or regular Ceph QA testing). Push patches you believe to be nearly ready for upstream acceptance.
+
+You may also push a ``wip-$feature`` branch to the ``ceph-client.git``
+repository; Jenkins will build the branch, and you can view the results of the
+build in `Shaman <https://shaman.ceph.com/builds/kernel/>`_.
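+
+Pushing such a branch might look like the following; this assumes you have push
+access, and the ``ceph-client`` remote name is only an example:
+
+.. code-block:: bash
+
+   # add the kernel repository as a remote (the remote name is an example)
+   git remote add ceph-client git@github.com:ceph/ceph-client.git
+   # push your local branch as wip-$feature
+   git push ceph-client HEAD:wip-$feature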
+
+Once a kernel branch is built, you can test it via the ``fs`` CephFS QA suite:
+
+.. code-block:: bash
+
+ $ teuthology-suite ... --suite fs --kernel wip-$feature --filter k-testing
+
+
+The ``k-testing`` filter selects the fragment which normally sets the
+``testing`` branch of the kernel for routine QA. That is, the ``fs`` suite
+regularly runs tests against whatever is in the ``testing`` branch of the
+kernel. We are overriding that choice of kernel branch via the ``--kernel
+wip-$feature`` switch.
+
+.. note:: Without filtering for ``k-testing``, the ``fs`` suite will also run jobs using ceph-fuse or the stock kernel, libcephfs tests, and other tests that may not be of interest when evaluating changes to the kernel.
+
+The actual override is controlled using Lua merge scripts in the
+``k-testing.yaml`` fragment. See that file for more details.
+
+
+.. _VPN: https://wiki.sepia.ceph.com/doku.php?id=vpnaccess
+.. _systemd resolved update script: https://wiki.archlinux.org/title/Systemd-resolved
+.. _Ceph Kernel QA Config: https://github.com/ceph/ceph-build/tree/899d0848a0f487f7e4cee773556aaf9529b8db26/kernel/build
+.. _developer playground: https://wiki.sepia.ceph.com/doku.php?id=devplayground#developer_playgrounds
+.. _ceph kernel git repository: https://github.com/ceph/ceph-client
diff --git a/doc/dev/radosgw/bucket_index.rst b/doc/dev/radosgw/bucket_index.rst
index 6764641e0f5..ceff57b58cf 100644
--- a/doc/dev/radosgw/bucket_index.rst
+++ b/doc/dev/radosgw/bucket_index.rst
@@ -32,7 +32,7 @@ For a given bucket, the index may be split into several rados objects, called bu
The default shard count for new buckets is 11, but can be overridden in the zonegroup's ``bucket_index_max_shards`` or ceph.conf's ``rgw_override_bucket_index_max_shards``. As the number of objects in a bucket grows, its index shard count will also increase as a result of dynamic resharding.
-Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/rgw_reshard.cc``.
+Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/driver/rados/rgw_reshard.cc``.
-----------------
Index Transaction
@@ -46,7 +46,7 @@ To keep the bucket index consistent, all object writes or deletes must also upda
Object writes and deletes may race with each other, so a given object may have more than one prepared transaction at a time. RGW considers an object entry to be 'pending' if there are any outstanding transactions, or 'completed' otherwise.
-This transaction is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``.
+This transaction is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``.
-------
Listing
@@ -56,7 +56,7 @@ When listing objects, RGW will read all entries (pending and completed) from the
If an RGW crashes in the middle of an `Index Transaction`_, an index entry may get stuck in this 'pending' state. When bucket listing encounters these pending entries, it also sends information from the head object back to the bucket index so it can update the entry and resolve its stale transactions. This message is called 'dir suggest', because the bucket index treats it as a hint or suggestion.
-Bucket listing is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``.
+Bucket listing is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``.
--------------------
S3 Object Versioning
@@ -66,9 +66,9 @@ For versioned buckets, the bucket index contains an entry for each object versio
RGW stores a head object in the rgw.buckets.data pool for each object version. This rados object's oid is a combination of the object name and its version id.
-In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::follow_olh()``.
+In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::follow_olh()``.
-To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index.
+To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/driver/rados/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index.
.. _ListObjectsV2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html
.. _ListObjectVersions: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html
diff --git a/doc/governance.rst b/doc/governance.rst
index 5ceb2fd43a1..bc88560f18a 100644
--- a/doc/governance.rst
+++ b/doc/governance.rst
@@ -21,57 +21,15 @@ Bodies
Ceph Executive Council
======================
-.. _exec-council-responsibilities:
-
-Ceph Executive Council Responsibilities
----------------------------------------
-
-- Spokesperson
-
- - welcome/keynote for cephalocon
-
- - maintaining slides and presenting about the project
+Responsibilities
+----------------
- - Community focal point (user interaction, conference talks, mailing list,
- etc)
+ * Arbiter in cases where decisions cannot be reached by consensus
+ * Distribute key responsibilities amongst themselves or others
+ * Point of contact for the project
+ * Representatives for Ceph foundation board meetings
+ * Ensure things get done
-- Community
-
- - managing community manager
-
- - LF Program Manager person, Social Media person
-
- - liase with the ambassadors
-
- - make sure ceph events happen, successfully: cephalocon, ceph days, cds, user/dev, cdm
-
- - coordinating with LF
-
- - creating program committee
-
- - recordings on youtube
-
- - getting sponsors for events
-
- - communications, schedule, venue decisions
-
- - coordinate blog posts
-
-
-- Ceph Foundation
-
- - ensure foundation is healthy: financials, operations
-
- - represent the CLT on the Board
-
- - present project status regularly (yearly)
-
- - collect member ideas / feedback
-
- - ensure members feel valued
-
- - guide the members how to support the project (events, testing, marketing, hardware, ...)
-
Membership
----------
@@ -124,7 +82,7 @@ Current Members
* Casey Bodley <cbodley@redhat.com>
* Dan van der Ster <dan.vanderster@clyso.com>
* David Orman <ormandj@1111systems.com>
- * Ernesto Puerta <epuerta@redhat.com>
+ * Ernesto Puerta <epuertat@redhat.com>
* Gregory Farnum <gfarnum@redhat.com>
* Haomai Wang <haomai@xsky.com>
* Ilya Dryomov <idryomov@redhat.com>
@@ -138,7 +96,7 @@ Current Members
* Mike Perez <miperez@redhat.com>
* Myoungwon Oh <myoungwon.oh@samsung.com>
* Neha Ojha <nojha@redhat.com>
- * Patrick Donnelly <pdonnell@redhat.com>
+ * Patrick Donnelly <pdonnell@ibm.com>
* Sam Just <sjust@redhat.com>
* Vikhyat Umrao <vikhyat@redhat.com>
* Xie Xingguo <xie.xingguo@zte.com.cn>
@@ -146,6 +104,17 @@ Current Members
* Yingxin Cheng <yingxin.cheng@intel.com>
* Yuri Weinstein <yweinste@redhat.com>
* Zac Dover <zac.dover@proton.me>
+ * Laura Flores <lflores@redhat.com>
+ * Venky Shankar <vshankar@redhat.com>
+ * Guillaume Abrioux <gabrioux@redhat.com>
+ * Anthony D'Atri <anthony.datri@gmail.com>
+ * Joseph Mundackal <jmundackal@bloomberg.net>
+ * Gaurav Sitlani <gsitlani@ibm.com>
+ * Afreen Misbah <afreen@ibm.com>
+ * Radoslaw Zarzynski <rzarzyns@redhat.com>
+ * Matan Breizman <mbreizma@redhat.com>
+ * Yaarit Hatuka <yhatuka@ibm.com>
+ * Adam C. Emerson <aemerson@redhat.com>
.. _ctl:
diff --git a/doc/man/8/mount.ceph.rst b/doc/man/8/mount.ceph.rst
index 7ecdeb5e852..553e190bdac 100644
--- a/doc/man/8/mount.ceph.rst
+++ b/doc/man/8/mount.ceph.rst
@@ -192,12 +192,13 @@ Advanced
:command:`wsync`
Execute all namespace operations synchronously. This ensures that the
namespace operation will only complete after receiving a reply from
- the MDS. This is the default.
+ the MDS.
:command:`nowsync`
Allow the client to do namespace operations asynchronously. When this
option is enabled, a namespace operation may complete before the MDS
- replies, if it has sufficient capabilities to do so.
+ replies, if it has sufficient capabilities to do so. This has been the
+ default since kernel version 5.16.
:command:`crush_location=x`
Specify the location of the client in terms of CRUSH hierarchy (since 5.8).
diff --git a/doc/mgr/smb.rst b/doc/mgr/smb.rst
index 05e6369ddf1..3252c485a9a 100644
--- a/doc/mgr/smb.rst
+++ b/doc/mgr/smb.rst
@@ -96,6 +96,11 @@ clustering
enables clustering regardless of the placement count. A value of ``never``
disables clustering regardless of the placement count. If unspecified,
``default`` is assumed.
+public_addrs
+ Optional. A string in the form of <ipaddress/prefixlength>[%<destination interface>].
+ Supported only when using Samba's clustering. Assign "virtual" IP
+ addresses that will be managed by the clustering subsystem and may automatically
+ move between nodes running Samba containers.
Remove Cluster
++++++++++++++
diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst
index 794fdf84195..afccd9ab16a 100644
--- a/doc/monitoring/index.rst
+++ b/doc/monitoring/index.rst
@@ -64,6 +64,30 @@ in:
It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard.
+Ceph daemon health metrics
+==========================
+
+The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket.
+
+The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy, and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons.
+
+Labels:
+
+- ``ceph_daemon``: Identifier of the Ceph daemon exposing an admin socket on the host.
+- ``hostname``: Name of the host where the Ceph daemon is running.
+
+Example:
+
+.. code-block:: bash
+
+ ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1
+ ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0
+
+To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression:
+
+.. code-block:: bash
+
+ ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0
+
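+To run the same check against the Prometheus HTTP API (for example, from a shell
+script), a query along these lines can be used; the Prometheus host and port are
+placeholders:
+
+.. code-block:: bash
+
+   # replace prometheus.example.com:9090 with your Prometheus endpoint
+   curl -sG 'http://prometheus.example.com:9090/api/v1/query' \
+        --data-urlencode 'query=ceph_daemon_socket_up == 0'
+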
Performance metrics
===================
diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst
index da92692fa8b..3085e1a528f 100644
--- a/doc/radosgw/index.rst
+++ b/doc/radosgw/index.rst
@@ -88,4 +88,4 @@ Cluster with one API and then retrieve that data with the other API.
D3N Data Cache <d3n_datacache>
Cloud Transition <cloud-transition>
Metrics <metrics>
-
+ UADK Acceleration for Compression <uadk-accel>
diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst
index 6a21b7479e6..d6925c8ed9c 100644
--- a/doc/radosgw/multisite.rst
+++ b/doc/radosgw/multisite.rst
@@ -507,7 +507,7 @@ For example:
Updating the Period
-------------------
-After updating the master zone configuration, update the period:
+After updating the secondary zone configuration, update the period:
.. prompt:: bash #
diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst
new file mode 100644
index 00000000000..fdf99f891f0
--- /dev/null
+++ b/doc/radosgw/uadk-accel.rst
@@ -0,0 +1,132 @@
+===============================================
+UADK Acceleration for Compression
+===============================================
+
+UADK is a framework for applications to access hardware accelerators in a
+unified, secure, and efficient way. UADK is comprised of UACCE, libwd and many
+other algorithm libraries.
+
+See `Compressor UADK Support`_.
+
+
+UADK in the Software Stack
+==========================
+
+UADK is a general-purpose user space accelerator framework that uses shared
+virtual addressing (SVA) to provide a unified programming interface for hardware
+acceleration of cryptographic and compression algorithms.
+
+UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE),
+which enables hardware accelerators that support SVA to adapt to UADK.
+
+Currently, HiSilicon Kunpeng hardware accelerators have been registered with
+UACCE. Through the UADK framework, users can run cryptographic and compression
+algorithms using hardware accelerators instead of CPUs, freeing up CPU computing
+power and improving computing performance.
+
+A user can access the hardware accelerators by performing user-mode operations on
+the character devices, or through frameworks that have already integrated UADK
+support (for example, OpenSSL libcrypto, DPDK, and the Linux Kernel Crypto
+Framework).
+
+See `OpenSSL UADK Engine`_.
+
+UADK Environment Setup
+======================
+
+UADK consists of UACCE, vendors’ drivers, and an algorithm layer. UADK requires the
+hardware accelerator to support SVA, and the operating system to support IOMMU and
+SVA. Hardware accelerators from different vendors are registered with UACCE as
+different character devices through the vendors’ kernel-mode drivers.
+
+::
+
+ +----------------------------------+
+ | apps |
+ +----+------------------------+----+
+ | |
+ | |
+ +-------+--------+ +-------+-------+
+ | scheduler | | alg libraries |
+ +-------+--------+ +-------+-------+
+ | |
+ | |
+ | |
+ | +--------+------+
+ | | vendor drivers|
+ | +-+-------------+
+ | |
+ | |
+ +--+------------------+--+
+ | libwd |
+ User +----+-------------+-----+
+ --------------------------------------------------
+ Kernel +--+-----+ +------+
+ | uacce | | smmu |
+ +---+----+ +------+
+ |
+ +---+------------------+
+ | vendor kernel driver |
+ +----------------------+
+ --------------------------------------------------
+ +----------------------+
+ | HW Accelerators |
+ +----------------------+
+
+Configuration
+=============
+
+#. Kernel Requirement
+
+Users need to make sure that UACCE is supported in the Linux kernel. The kernel version
+should be at least v5.9, with SVA (Shared Virtual Addressing) enabled.
+
+UACCE may be built as a module or built into the kernel. Here is an example configuration
+that builds UACCE with hardware accelerator support for the HiSilicon Kunpeng platform.
+
+ .. prompt:: bash $
+
+ CONFIG_IOMMU_SVA_LIB=y
+ CONFIG_ARM_SMMU=y
+ CONFIG_ARM_SMMU_V3=y
+ CONFIG_ARM_SMMU_V3_SVA=y
+ CONFIG_PCI_PASID=y
+ CONFIG_UACCE=y
+ CONFIG_CRYPTO_DEV_HISI_QM=y
+ CONFIG_CRYPTO_DEV_HISI_ZIP=y
+
+Make sure all of the above kernel configuration options are selected.
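+
+To verify on a running kernel that UACCE support is present, one possible check is
+shown below (the config file path and the presence of ``/sys/class/uacce/`` depend
+on your distribution and on the accelerator drivers that are loaded):
+
+.. prompt:: bash $
+
+   # paths below may vary by distribution and loaded drivers
+   grep -E 'CONFIG_UACCE|CONFIG_CRYPTO_DEV_HISI_ZIP' /boot/config-$(uname -r)
+   ls /sys/class/uacce/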
+
+#. UADK enablement
+
+If the architecture is aarch64, the build will automatically download the UADK source code
+and build the static library. On other architectures, users can enable it with the build
+parameter ``-DWITH_UADK=true``.
+
+#. Manual UADK build
+
+As described above, UADK is enabled automatically, so there is no need to build it manually.
+Developers who are interested in UADK can refer to the steps below to build it.
+
+ .. prompt:: bash $
+
+ git clone https://github.com/Linaro/uadk.git
+ cd uadk
+ mkdir build
+ ./autogen.sh
+ ./configure --prefix=$PWD/build
+ make
+ make install
+
+   .. note:: Without ``--prefix``, UADK will be installed to ``/usr/local/lib`` by
+      default. If you get the error "cannot find -lnuma", please install
+      the ``libnuma-dev`` package.
+
+#. Configure
+
+ Edit the Ceph configuration file (usually ``ceph.conf``) to enable UADK
+ support for *zlib* compression::
+
+ uadk_compressor_enabled=true
+
+ The default value in `global.yaml.in` for `uadk_compressor_enabled` is false.
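+
+   On a running cluster, the same option can also be set centrally with the
+   ``ceph config`` command; the ``global`` target below is an example and can be
+   narrowed as needed:
+
+   .. prompt:: bash $
+
+      # "global" is an example target; a narrower section may be used instead
+      ceph config set global uadk_compressor_enabled true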
+
+.. _Compressor UADK Support: https://github.com/ceph/ceph/pull/58336
+.. _OpenSSL UADK Engine: https://github.com/Linaro/uadk_engine
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index fa2899b22c1..cde1a736f8c 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -45,8 +45,8 @@
'for': '30s',
expr: |||
(
- (ceph_health_detail{name="MON_DOWN"} == 1) * on() (
- count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
+ (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (
+ count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1)
)
) == 1
|||,
@@ -54,22 +54,20 @@
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down',
summary: 'Monitor quorum is at risk%(cluster)s' % $.MultiClusterSummary(),
- description: '{{ $min := query "floor(count(ceph_mon_metadata) / 2) + 1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
+ description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=\'%s\'}) / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=\'%s\'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
},
},
{
alert: 'CephMonDown',
'for': '30s',
expr: |||
- count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
+ (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1)))
|||,
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down',
summary: 'One or more monitors down%(cluster)s' % $.MultiClusterSummary(),
- description: |||
- {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
- |||,
+ description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=\'%s\'} == 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=\'%s\'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
},
},
{
@@ -112,11 +110,11 @@
rules: [
{
alert: 'CephOSDDownHigh',
- expr: 'count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10',
+ expr: 'count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.1' },
annotations: {
summary: 'More than 10%% of OSDs are down%(cluster)s' % $.MultiClusterSummary(),
- description: '{{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
+ description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=\'%s\'} == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf "count (ceph_osd_up{cluster=\'%s\'})" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
},
},
{
@@ -126,7 +124,7 @@
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.8' },
annotations: {
summary: 'An OSD host is offline%(cluster)s' % $.MultiClusterSummary(),
- description: 'The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}',
+ description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}',
},
},
{
@@ -137,9 +135,7 @@
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down',
summary: 'An OSD has been marked down%(cluster)s' % $.MultiClusterSummary(),
- description: |||
- {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
- |||,
+ description: '{{ $num := printf "count(ceph_osd_up{cluster=\'%s\'} == 0) " .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
},
},
{
@@ -235,7 +231,7 @@
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.7' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany',
- summary: 'Too many devices are predicted to fail, unable to resolve%(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Too many devices are predicted to fail%(cluster)s, unable to resolve' % $.MultiClusterSummary(),
description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated.',
},
},
@@ -298,7 +294,7 @@
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.1' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages',
- summary: 'CephFS filesystem is damaged%(cluster)s.' % $.MultiClusterSummary(),
+ summary: 'CephFS filesystem is damaged%(cluster)s' % $.MultiClusterSummary(),
description: 'Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support.',
},
},
@@ -390,7 +386,7 @@
expr: 'up{job="ceph"} == 0',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.2' },
annotations: {
- summary: 'The mgr/prometheus module is not available%(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The mgr/prometheus module is not available',
description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.",
},
},
@@ -507,7 +503,7 @@
expr: 'node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.1' },
annotations: {
- summary: 'Root filesystem is dangerously full%(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Root filesystem is dangerously full',
description: 'Root volume is dangerously full: {{ $value | humanize }}% free.',
},
},
@@ -527,7 +523,7 @@
||| % $._config,
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
annotations: {
- summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
+ summary: 'One or more NICs reports packet drops',
description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
},
},
@@ -564,7 +560,7 @@
},
{
alert: 'CephNodeDiskspaceWarning',
- expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0',
+ expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0',
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' },
annotations: {
summary: 'Host filesystem free space is getting low%(cluster)s' % $.MultiClusterSummary(),
@@ -573,7 +569,7 @@
},
{
alert: 'CephNodeInconsistentMTU',
- expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )',
+ expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
summary: 'MTU settings across Ceph hosts are inconsistent%(cluster)s' % $.MultiClusterSummary(),
@@ -611,7 +607,7 @@
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full',
summary: 'Pool is full - writes are blocked%(cluster)s' % $.MultiClusterSummary(),
- description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)",
+ description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf \"topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))\" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)",
},
},
{
@@ -647,7 +643,7 @@
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
- summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
+ summary: '{{ $labels.ceph_daemon }} operations are slow to complete%(cluster)s' % $.MultiClusterSummary(),
description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
},
},
@@ -763,7 +759,7 @@
expr: 'absent(up{job="ceph"})',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.12.1' },
annotations: {
- summary: 'The scrape job for Ceph is missing from Prometheus%(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The scrape job for Ceph is missing from Prometheus',
description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.",
},
},
@@ -775,7 +771,7 @@
{
alert: 'CephObjectMissing',
'for': '30s',
- expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1',
+ expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.1' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound',
@@ -807,31 +803,31 @@
{
alert: 'CephRBDMirrorImagesPerDaemonHigh',
'for': '1m',
- expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
+ expr: 'sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.2' },
annotations: {
- summary: 'Number of image replications are now above %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
- description: 'Number of image replications per daemon is not suppossed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
+ summary: 'Number of image replications are now above %(CephRBDMirrorImagesPerDaemonThreshold)s%(cluster)s' % [$._config.CephRBDMirrorImagesPerDaemonThreshold, $.MultiClusterSummary()],
+ description: 'Number of image replications per daemon is not supposed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config,
},
},
{
alert: 'CephRBDMirrorImagesNotInSync',
'for': '1m',
- expr: 'sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0',
+ expr: 'sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.3' },
annotations: {
- summary: 'Some of the RBD mirror images are not in sync with the remote counter parts.',
+ summary: 'Some of the RBD mirror images are not in sync with the remote counter parts%(cluster)s' % $.MultiClusterSummary(),
description: 'Both local and remote RBD mirror images should be in sync.',
},
},
{
alert: 'CephRBDMirrorImagesNotInSyncVeryHigh',
'for': '1m',
- expr: 'count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)',
+ expr: 'count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)',
labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.4' },
annotations: {
- summary: 'Number of unsynchronized images are very high.',
- description: 'More than 10% of the images have synchronization problems',
+ summary: 'Number of unsynchronized images are very high%(cluster)s' % $.MultiClusterSummary(),
+ description: 'More than 10% of the images have synchronization problems.',
},
},
{
@@ -840,7 +836,7 @@
expr: 'rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > %.2f' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold],
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.5' },
annotations: {
- summary: 'The replication network usage has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'],
+ summary: 'The replication network usage%(cluster)s has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$.MultiClusterSummary(), $._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'],
description: 'Detected a heavy increase in bandwidth for rbd replications (over %d%s) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'],
},
},
@@ -852,50 +848,50 @@
{
alert: 'NVMeoFSubsystemNamespaceLimit',
'for': '1m',
- expr: '(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit',
+ expr: '(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces %(cluster)s' % $.MultiClusterSummary(),
+ summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces%(cluster)s' % $.MultiClusterSummary(),
description: 'Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}',
},
},
{
alert: 'NVMeoFTooManyGateways',
'for': '1m',
- expr: 'count(ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster],
+ expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'Max supported gateways exceeded %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Max supported gateways exceeded%(cluster)s' % $.MultiClusterSummary(),
description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config,
},
},
{
alert: 'NVMeoFMaxGatewayGroupSize',
'for': '1m',
- expr: 'count by(group) (ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup],
+ expr: 'count(ceph_nvmeof_gateway_info) by (cluster,group) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded%(cluster)s' % $.MultiClusterSummary(),
description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config,
},
},
{
alert: 'NVMeoFSingleGatewayGroup',
'for': '5m',
- expr: 'count by(group) (ceph_nvmeof_gateway_info) == 1',
+ expr: 'count(ceph_nvmeof_gateway_info) by(cluster,group) == 1',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible%(cluster)s' % $.MultiClusterSummary(),
description: 'Although a single member gateway group is valid, it should only be used for test purposes',
},
},
{
alert: 'NVMeoFHighGatewayCPU',
'for': '10m',
- expr: 'label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU],
+ expr: 'label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high%(cluster)s' % $.MultiClusterSummary(),
description: 'Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores',
},
},
@@ -905,27 +901,27 @@
expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security%(cluster)s' % $.MultiClusterSummary(),
description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss',
},
},
{
alert: 'NVMeoFTooManySubsystems',
'for': '1m',
- expr: 'count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway],
+ expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'The number of subsystems defined to the gateway exceeds supported values %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The number of subsystems defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(),
description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported',
},
},
{
alert: 'NVMeoFVersionMismatch',
'for': '1h',
- expr: 'count(count by(version) (ceph_nvmeof_gateway_info)) > 1',
+ expr: 'count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'The cluster has different NVMe-oF gateway releases active %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Too many different NVMe-oF gateway releases active%(cluster)s' % $.MultiClusterSummary(),
description: 'This may indicate an issue with deployment. Check cephadm logs',
},
},
@@ -935,17 +931,17 @@
expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'The number of clients connected to {{ $labels.nqn }} is too high %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The number of clients connected to {{ $labels.nqn }} is too high%(cluster)s' % $.MultiClusterSummary(),
description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config,
},
},
{
alert: 'NVMeoFHighHostCPU',
'for': '10m',
- expr: '100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU],
+ expr: '100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU],
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }}) %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }})%(cluster)s' % $.MultiClusterSummary(),
description: 'High CPU on a gateway host can lead to CPU contention and performance degradation',
},
},
@@ -955,7 +951,7 @@
expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}',
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' },
annotations: {
- summary: 'Network interface {{ $labels.device }} is down %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Network interface {{ $labels.device }} is down%(cluster)s' % $.MultiClusterSummary(),
description: 'A NIC used by one or more subsystems is in a down state',
},
},
@@ -965,7 +961,7 @@
expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
- summary: 'Network interface {{ $labels.device }} is not running in full duplex mode %(cluster)s' % $.MultiClusterSummary(),
+ summary: 'Network interface {{ $labels.device }} is not running in full duplex mode%(cluster)s' % $.MultiClusterSummary(),
description: 'Until this is resolved, performance from the gateway will be degraded',
},
},
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 84452e5845a..ba6a6ded0a3 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -3,8 +3,8 @@ groups:
rules:
- alert: "CephHealthError"
annotations:
- description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information."
- summary: "Ceph is in the ERROR state"
+ description: "The cluster state has been HEALTH_ERROR for more than 5 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information."
+ summary: "Ceph is in the ERROR state on cluster {{ $labels.cluster }}"
expr: "ceph_health_status == 2"
for: "5m"
labels:
@@ -13,8 +13,8 @@ groups:
type: "ceph_default"
- alert: "CephHealthWarning"
annotations:
- description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information."
- summary: "Ceph is in the WARNING state"
+ description: "The cluster state has been HEALTH_WARN for more than 15 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information."
+ summary: "Ceph is in the WARNING state on cluster {{ $labels.cluster }}"
expr: "ceph_health_status == 1"
for: "15m"
labels:
@@ -24,13 +24,13 @@ groups:
rules:
- alert: "CephMonDownQuorumAtRisk"
annotations:
- description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+ description: "{{ $min := printf \"floor(count(ceph_mon_metadata{cluster='%s'}) / 2) + 1\" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
- summary: "Monitor quorum is at risk"
+ summary: "Monitor quorum is at risk on cluster {{ $labels.cluster }}"
expr: |
(
- (ceph_health_detail{name="MON_DOWN"} == 1) * on() (
- count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
+ (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (
+ count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1)
)
) == 1
for: "30s"
@@ -40,12 +40,11 @@ groups:
type: "ceph_default"
- alert: "CephMonDown"
annotations:
- description: |
- {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
+ description: "{{ $down := printf \"count(ceph_mon_quorum_status{cluster='%s'} == 0)\" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $down 1.0 }}{{ $s = \"s\" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
- summary: "One or more monitors down"
+ summary: "One or more monitors down on cluster {{ $labels.cluster }}"
expr: |
- count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
+ (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1)))
for: "30s"
labels:
severity: "warning"
@@ -54,7 +53,7 @@ groups:
annotations:
description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit"
- summary: "Filesystem space on at least one monitor is critically low"
+ summary: "Filesystem space on at least one monitor is critically low on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1"
for: "1m"
labels:
@@ -65,7 +64,7 @@ groups:
annotations:
description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low"
- summary: "Drive space on at least one monitor is approaching full"
+ summary: "Drive space on at least one monitor is approaching full on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1"
for: "5m"
labels:
@@ -75,7 +74,7 @@ groups:
annotations:
description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew"
- summary: "Clock skew detected among monitors"
+ summary: "Clock skew detected among monitors on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1"
for: "1m"
labels:
@@ -85,17 +84,17 @@ groups:
rules:
- alert: "CephOSDDownHigh"
annotations:
- description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
- summary: "More than 10% of OSDs are down"
- expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10"
+ description: "{{ $value | humanize }}% or {{ with printf \"count (ceph_osd_up{cluster='%s'} == 0)\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf \"count (ceph_osd_up{cluster='%s'})\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+ summary: "More than 10% of OSDs are down on cluster {{ $labels.cluster }}"
+ expr: "count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.1"
severity: "critical"
type: "ceph_default"
- alert: "CephOSDHostDown"
annotations:
- description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
- summary: "An OSD host is offline"
+ description: "The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
+ summary: "An OSD host is offline on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1"
for: "5m"
labels:
@@ -104,10 +103,9 @@ groups:
type: "ceph_default"
- alert: "CephOSDDown"
annotations:
- description: |
- {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
+ description: "{{ $num := printf \"count(ceph_osd_up{cluster='%s'} == 0) \" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $num 1.0 }}{{ $s = \"s\" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s \"\" }}is{{ else }}are{{ end }} down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down"
- summary: "An OSD has been marked down"
+ summary: "An OSD has been marked down on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1"
for: "5m"
labels:
@@ -118,7 +116,7 @@ groups:
annotations:
description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull"
- summary: "OSD(s) running low on free space (NEARFULL)"
+ summary: "OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1"
for: "5m"
labels:
@@ -129,7 +127,7 @@ groups:
annotations:
description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full"
- summary: "OSD full, writes blocked"
+ summary: "OSD full, writes blocked on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0"
for: "1m"
labels:
@@ -140,7 +138,7 @@ groups:
annotations:
description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull"
- summary: "OSD(s) too full for backfill operations"
+ summary: "OSD(s) too full for backfill operations on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0"
for: "1m"
labels:
@@ -150,7 +148,7 @@ groups:
annotations:
description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs"
- summary: "OSD reports a high number of read errors"
+ summary: "OSD reports a high number of read errors on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1"
for: "30s"
labels:
@@ -159,7 +157,7 @@ groups:
- alert: "CephOSDTimeoutsPublicNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
- summary: "Network issues delaying OSD heartbeats (public network)"
+ summary: "Network issues delaying OSD heartbeats (public network) on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1"
for: "1m"
labels:
@@ -168,7 +166,7 @@ groups:
- alert: "CephOSDTimeoutsClusterNetwork"
annotations:
description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
- summary: "Network issues delaying OSD heartbeats (cluster network)"
+ summary: "Network issues delaying OSD heartbeats (cluster network) on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1"
for: "1m"
labels:
@@ -178,7 +176,7 @@ groups:
annotations:
description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch"
- summary: "OSD size inconsistency error"
+ summary: "OSD size inconsistency error on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1"
for: "1m"
labels:
@@ -188,7 +186,7 @@ groups:
annotations:
description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2"
- summary: "Device(s) predicted to fail soon"
+ summary: "Device(s) predicted to fail soon on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1"
for: "1m"
labels:
@@ -198,7 +196,7 @@ groups:
annotations:
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
- summary: "Too many devices are predicted to fail, unable to resolve"
+ summary: "Too many devices are predicted to fail on cluster {{ $labels.cluster }}, unable to resolve"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
for: "1m"
labels:
@@ -209,7 +207,7 @@ groups:
annotations:
description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use"
- summary: "Device failure is predicted, but unable to relocate data"
+ summary: "Device failure is predicted, but unable to relocate data on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1"
for: "1m"
labels:
@@ -219,8 +217,8 @@ groups:
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds"
- summary: "Network issues are causing OSDs to flap (mark each other down)"
- expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
+ summary: "Network issues are causing OSDs to flap (mark each other down) on cluster {{ $labels.cluster }}"
+ expr: "(rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.4"
severity: "warning"
@@ -229,7 +227,7 @@ groups:
annotations:
description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors"
- summary: "Device read errors detected"
+ summary: "Device read errors detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1"
for: "30s"
labels:
@@ -238,12 +236,12 @@ groups:
- alert: "CephPGImbalance"
annotations:
description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
- summary: "PGs are not balanced across OSDs"
+ summary: "PGs are not balanced across OSDs on cluster {{ $labels.cluster }}"
expr: |
abs(
- ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
+ ((ceph_osd_numpg > 0) - on (cluster,job) group_left avg(ceph_osd_numpg > 0) by (cluster,job)) /
on (job) group_left avg(ceph_osd_numpg > 0) by (job)
- ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+ ) * on (cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.4.5"
@@ -255,7 +253,7 @@ groups:
annotations:
description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
- summary: "CephFS filesystem is damaged."
+ summary: "CephFS filesystem is damaged on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0"
for: "1m"
labels:
@@ -266,7 +264,7 @@ groups:
annotations:
description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down"
- summary: "CephFS filesystem is offline"
+ summary: "CephFS filesystem is offline on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0"
for: "1m"
labels:
@@ -277,7 +275,7 @@ groups:
annotations:
description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded"
- summary: "CephFS filesystem is degraded"
+ summary: "CephFS filesystem is degraded on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0"
for: "1m"
labels:
@@ -288,7 +286,7 @@ groups:
annotations:
description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max"
- summary: "Ceph MDS daemon count is lower than configured"
+ summary: "Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0"
for: "1m"
labels:
@@ -298,7 +296,7 @@ groups:
annotations:
description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby"
- summary: "Ceph filesystem standby daemons too few"
+ summary: "Ceph filesystem standby daemons too few on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0"
for: "1m"
labels:
@@ -308,7 +306,7 @@ groups:
annotations:
description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds"
- summary: "MDS daemon failed, no further standby available"
+ summary: "MDS daemon failed, no further standby available on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0"
for: "1m"
labels:
@@ -319,7 +317,7 @@ groups:
annotations:
description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
- summary: "CephFS filesystem in read only mode due to write error(s)"
+ summary: "CephFS filesystem in read only mode due to write error(s) on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0"
for: "1m"
labels:
@@ -332,7 +330,7 @@ groups:
annotations:
description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash"
- summary: "A manager module has recently crashed"
+ summary: "A manager module has recently crashed on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1"
for: "5m"
labels:
@@ -354,8 +352,8 @@ groups:
- alert: "CephPGsInactive"
annotations:
description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests."
- summary: "One or more placement groups are inactive"
- expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
+ summary: "One or more placement groups are inactive on cluster {{ $labels.cluster }}"
+ expr: "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
for: "5m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.1"
@@ -364,8 +362,8 @@ groups:
- alert: "CephPGsUnclean"
annotations:
description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure."
- summary: "One or more placement groups are marked unclean"
- expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
+ summary: "One or more placement groups are marked unclean on cluster {{ $labels.cluster }}"
+ expr: "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
for: "15m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.7.2"
@@ -375,7 +373,7 @@ groups:
annotations:
description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged"
- summary: "Placement group damaged, manual intervention needed"
+ summary: "Placement group damaged, manual intervention needed on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1"
for: "5m"
labels:
@@ -386,7 +384,7 @@ groups:
annotations:
description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full"
- summary: "OSDs are too full for recovery"
+ summary: "OSDs are too full for recovery on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1"
for: "1m"
labels:
@@ -397,7 +395,7 @@ groups:
annotations:
description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
- summary: "PG is unavailable, blocking I/O"
+ summary: "PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O"
expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1"
for: "1m"
labels:
@@ -408,7 +406,7 @@ groups:
annotations:
description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full"
- summary: "Backfill operations are blocked due to lack of free space"
+ summary: "Backfill operations are blocked due to lack of free space on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1"
for: "1m"
labels:
@@ -419,7 +417,7 @@ groups:
annotations:
description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed"
- summary: "Placement group(s) have not been scrubbed"
+ summary: "Placement group(s) have not been scrubbed on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1"
for: "5m"
labels:
@@ -429,7 +427,7 @@ groups:
annotations:
description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs"
- summary: "Placement groups per OSD is too high"
+ summary: "Placement groups per OSD is too high on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1"
for: "1m"
labels:
@@ -439,7 +437,7 @@ groups:
annotations:
description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed"
- summary: "Placement group(s) have not been deep scrubbed"
+ summary: "Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1"
for: "5m"
labels:
@@ -479,7 +477,7 @@ groups:
- alert: "CephNodeNetworkPacketErrors"
annotations:
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
- summary: "One or more NICs reports packet errors"
+ summary: "One or more NICs reports packet errors on cluster {{ $labels.cluster }}"
expr: |
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
@@ -498,7 +496,7 @@ groups:
- alert: "CephNodeNetworkBondDegraded"
annotations:
description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
- summary: "Degraded Bond on Node {{ $labels.instance }}"
+ summary: "Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster }}"
expr: |
node_bonding_slaves - node_bonding_active != 0
labels:
@@ -507,8 +505,8 @@ groups:
- alert: "CephNodeDiskspaceWarning"
annotations:
description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
- summary: "Host filesystem free space is getting low"
- expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0"
+ summary: "Host filesystem free space is getting low on cluster {{ $labels.cluster }}"
+ expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
severity: "warning"
@@ -516,8 +514,8 @@ groups:
- alert: "CephNodeInconsistentMTU"
annotations:
description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
- summary: "MTU settings across Ceph hosts are inconsistent"
- expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
+ summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}"
+ expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
labels:
severity: "warning"
type: "ceph_default"
@@ -526,8 +524,8 @@ groups:
- alert: "CephPoolGrowthWarning"
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
- summary: "Pool growth rate may soon exceed capacity"
- expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
+ summary: "Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster }}"
+ expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, instance) group_right() ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"
@@ -535,16 +533,16 @@ groups:
- alert: "CephPoolBackfillFull"
annotations:
description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
- summary: "Free space in a pool is too low for recovery/backfill"
+ summary: "Free space in a pool is too low for recovery/backfill on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0"
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephPoolFull"
annotations:
- description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
+ description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf \"topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))\" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full"
- summary: "Pool is full - writes are blocked"
+ summary: "Pool is full - writes are blocked on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0"
for: "1m"
labels:
@@ -554,7 +552,7 @@ groups:
- alert: "CephPoolNearFull"
annotations:
description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
- summary: "One or more Ceph pools are nearly full"
+ summary: "One or more Ceph pools are nearly full on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0"
for: "5m"
labels:
@@ -566,7 +564,7 @@ groups:
annotations:
description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
- summary: "OSD operations are slow to complete"
+ summary: "OSD operations are slow to complete on cluster {{ $labels.cluster }}"
expr: "ceph_healthcheck_slow_ops > 0"
for: "30s"
labels:
@@ -576,7 +574,7 @@ groups:
annotations:
description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
- summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+ summary: "{{ $labels.ceph_daemon }} operations are slow to complete on cluster {{ $labels.cluster }}"
expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
for: "30s"
labels:
@@ -587,7 +585,7 @@ groups:
- alert: "CephadmUpgradeFailed"
annotations:
description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
- summary: "Ceph version upgrade has failed"
+ summary: "Ceph version upgrade has failed on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0"
for: "30s"
labels:
@@ -597,7 +595,7 @@ groups:
- alert: "CephadmDaemonFailed"
annotations:
description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
- summary: "A ceph daemon managed by cephadm is down"
+ summary: "A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0"
for: "30s"
labels:
@@ -608,7 +606,7 @@ groups:
annotations:
description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused"
- summary: "Orchestration tasks via cephadm are PAUSED"
+ summary: "Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0"
for: "1m"
labels:
@@ -619,7 +617,7 @@ groups:
- alert: "HardwareStorageError"
annotations:
description: "Some storage devices are in error. Check `ceph health detail`."
- summary: "Storage devices error(s) detected"
+ summary: "Storage devices error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
for: "30s"
labels:
@@ -629,7 +627,7 @@ groups:
- alert: "HardwareMemoryError"
annotations:
description: "DIMM error(s) detected. Check `ceph health detail`."
- summary: "DIMM error(s) detected"
+ summary: "DIMM error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
for: "30s"
labels:
@@ -639,7 +637,7 @@ groups:
- alert: "HardwareProcessorError"
annotations:
description: "Processor error(s) detected. Check `ceph health detail`."
- summary: "Processor error(s) detected"
+ summary: "Processor error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
for: "30s"
labels:
@@ -649,7 +647,7 @@ groups:
- alert: "HardwareNetworkError"
annotations:
description: "Network error(s) detected. Check `ceph health detail`."
- summary: "Network error(s) detected"
+ summary: "Network error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
for: "30s"
labels:
@@ -659,7 +657,7 @@ groups:
- alert: "HardwarePowerError"
annotations:
description: "Power supply error(s) detected. Check `ceph health detail`."
- summary: "Power supply error(s) detected"
+ summary: "Power supply error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
for: "30s"
labels:
@@ -669,7 +667,7 @@ groups:
- alert: "HardwareFanError"
annotations:
description: "Fan error(s) detected. Check `ceph health detail`."
- summary: "Fan error(s) detected"
+ summary: "Fan error(s) detected on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
for: "30s"
labels:
@@ -694,8 +692,8 @@ groups:
annotations:
description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound"
- summary: "Object(s) marked UNFOUND"
- expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1"
+ summary: "Object(s) marked UNFOUND on cluster {{ $labels.cluster }}"
+ expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.1"
@@ -707,7 +705,7 @@ groups:
annotations:
description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash"
- summary: "One or more Ceph daemons have crashed, and are pending acknowledgement"
+ summary: "One or more Ceph daemons have crashed, and are pending acknowledgement on cluster {{ $labels.cluster }}"
expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1"
for: "1m"
labels:
@@ -718,9 +716,9 @@ groups:
rules:
- alert: "CephRBDMirrorImagesPerDaemonHigh"
annotations:
- description: "Number of image replications per daemon is not suppossed to go beyond threshold 100"
- summary: "Number of image replications are now above 100"
- expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100"
+ description: "Number of image replications per daemon is not supposed to go beyond threshold 100"
+ summary: "Number of image replications are now above 100 on cluster {{ $labels.cluster }}"
+ expr: "sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.2"
@@ -729,8 +727,8 @@ groups:
- alert: "CephRBDMirrorImagesNotInSync"
annotations:
description: "Both local and remote RBD mirror images should be in sync."
- summary: "Some of the RBD mirror images are not in sync with the remote counter parts."
- expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0"
+ summary: "Some of the RBD mirror images are not in sync with the remote counter parts on cluster {{ $labels.cluster }}"
+ expr: "sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.3"
@@ -738,9 +736,9 @@ groups:
type: "ceph_default"
- alert: "CephRBDMirrorImagesNotInSyncVeryHigh"
annotations:
- description: "More than 10% of the images have synchronization problems"
- summary: "Number of unsynchronized images are very high."
- expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)"
+ description: "More than 10% of the images have synchronization problems."
+ summary: "Number of unsynchronized images are very high on cluster {{ $labels.cluster }}"
+ expr: "count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.4"
@@ -749,7 +747,7 @@ groups:
- alert: "CephRBDMirrorImageTransferBandwidthHigh"
annotations:
description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
- summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
+ summary: "The replication network usage on cluster {{ $labels.cluster }} has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80"
for: "1m"
labels:
@@ -761,8 +759,8 @@ groups:
- alert: "NVMeoFSubsystemNamespaceLimit"
annotations:
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
- summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces "
- expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
+ summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}"
+ expr: "(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
for: "1m"
labels:
severity: "warning"
@@ -770,17 +768,17 @@ groups:
- alert: "NVMeoFTooManyGateways"
annotations:
description: "You may create many gateways, but 4 is the tested limit"
- summary: "Max supported gateways exceeded "
- expr: "count(ceph_nvmeof_gateway_info) > 4.00"
+ summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFMaxGatewayGroupSize"
annotations:
- description: "You may create many gateways in a gateway group, but 2 is the tested limit"
- summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded "
- expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00"
+ description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
for: "1m"
labels:
severity: "warning"
@@ -788,8 +786,8 @@ groups:
- alert: "NVMeoFSingleGatewayGroup"
annotations:
description: "Although a single member gateway group is valid, it should only be used for test purposes"
- summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible "
- expr: "count by(group) (ceph_nvmeof_gateway_info) == 1"
+ summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible on cluster {{ $labels.cluster }}"
+ expr: "count(ceph_nvmeof_gateway_info) by(cluster,group) == 1"
for: "5m"
labels:
severity: "warning"
@@ -797,8 +795,8 @@ groups:
- alert: "NVMeoFHighGatewayCPU"
annotations:
description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
- summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high "
- expr: "label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00"
+ summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster {{ $labels.cluster }}"
+ expr: "label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00"
for: "10m"
labels:
severity: "warning"
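
NVMeoFHighGatewayCPU above (and NVMeoFHighHostCPU further down) first rewrite the instance label from host:port to a bare host name so gateway metrics can be averaged and joined per host; label_replace copies a regex capture from one label into another without touching the series values. A minimal sketch, with a hypothetical rule name:

    - record: "example:nvmeof_reactor_busy:by_host"  # hypothetical rule name
      # label_replace(v, dst, replacement, src, regex): "(.*):.*" captures the
      # host part of "host:port" and writes it back into the instance label
      expr: |
        label_replace(
          avg by (cluster, instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),
          "instance", "$1", "instance", "(.*):.*"
        )
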
@@ -806,7 +804,7 @@ groups:
- alert: "NVMeoFGatewayOpenSecurity"
annotations:
description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
- summary: "Subsystem {{ $labels.nqn }} has been defined without host level security "
+ summary: "Subsystem {{ $labels.nqn }} has been defined without host level security on cluster {{ $labels.cluster }}"
expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}"
for: "5m"
labels:
@@ -815,8 +813,8 @@ groups:
- alert: "NVMeoFTooManySubsystems"
annotations:
description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
- summary: "The number of subsystems defined to the gateway exceeds supported values "
- expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
+ summary: "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
+ expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
for: "1m"
labels:
severity: "warning"
@@ -824,8 +822,8 @@ groups:
- alert: "NVMeoFVersionMismatch"
annotations:
description: "This may indicate an issue with deployment. Check cephadm logs"
- summary: "The cluster has different NVMe-oF gateway releases active "
- expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1"
+ summary: "Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster }}"
+ expr: "count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1"
for: "1h"
labels:
severity: "warning"
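
NVMeoFVersionMismatch above counts distinct gateway versions per cluster with a nested count: the inner aggregation leaves one series per (cluster, version) pair, and the outer count by (cluster) counts those series, i.e. how many different versions each cluster is running. The same idiom as a hypothetical recording rule:

    - record: "example:nvmeof_gateway_versions:count"  # hypothetical rule name
      # inner count: one series per (cluster, version); outer count: number of
      # distinct versions seen in each cluster
      expr: "count by (cluster) (count by (cluster, version) (ceph_nvmeof_gateway_info))"
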
@@ -833,7 +831,7 @@ groups:
- alert: "NVMeoFHighClientCount"
annotations:
description: "The supported limit for clients connecting to a subsystem is 32"
- summary: "The number of clients connected to {{ $labels.nqn }} is too high "
+ summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
expr: "ceph_nvmeof_subsystem_host_count > 32.00"
for: "1m"
labels:
@@ -842,8 +840,8 @@ groups:
- alert: "NVMeoFHighHostCPU"
annotations:
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
- summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) "
- expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
+ summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) on cluster {{ $labels.cluster }}"
+ expr: "100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
for: "10m"
labels:
severity: "warning"
@@ -851,7 +849,7 @@ groups:
- alert: "NVMeoFInterfaceDown"
annotations:
description: "A NIC used by one or more subsystems is in a down state"
- summary: "Network interface {{ $labels.device }} is down "
+ summary: "Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster }}"
expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}"
for: "30s"
labels:
@@ -861,7 +859,7 @@ groups:
- alert: "NVMeoFInterfaceDuplex"
annotations:
description: "Until this is resolved, performance from the gateway will be degraded"
- summary: "Network interface {{ $labels.device }} is not running in full duplex mode "
+ summary: "Network interface {{ $labels.device }} is not running in full duplex mode on cluster {{ $labels.cluster }}"
expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}"
for: "30s"
labels:
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 40d6f4d0983..a269ff74227 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -5,13 +5,13 @@ tests:
# health error
- interval: 5m
input_series:
- - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '2 2 2 2 2 2 2'
promql_expr_test:
- expr: ceph_health_status == 2
eval_time: 5m
exp_samples:
- - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ - labels: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}'
value: 2
alert_rule_test:
- eval_time: 1m
@@ -25,20 +25,21 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.2.1
type: ceph_default
severity: critical
+ cluster: mycluster
exp_annotations:
- summary: Ceph is in the ERROR state
- description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
+ summary: Ceph is in the ERROR state on cluster mycluster
+ description: The cluster state has been HEALTH_ERROR for more than 5 minutes on cluster mycluster. Please check 'ceph health detail' for more information.
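
These test hunks begin at line 5 of the file, so the promtool header is never shown; a unit test file is a standalone YAML document with rule_files and evaluation_interval keys ahead of the tests list. A minimal, self-contained sketch (rule-file path and series values are invented) showing that the count by (cluster) form of the OSD-down ratio evaluates per cluster, so a second, healthy cluster in the input neither skews the ratio nor appears in the result:

    # hypothetical stand-alone test file; exercised with "promtool test rules <file>"
    rule_files:
      - prometheus_alerts.yml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          - series: 'ceph_osd_up{ceph_daemon="osd.0",job="ceph",cluster="a"}'
            values: '0 0 0 0 0'
          - series: 'ceph_osd_up{ceph_daemon="osd.1",job="ceph",cluster="a"}'
            values: '1 1 1 1 1'
          - series: 'ceph_osd_up{ceph_daemon="osd.0",job="ceph",cluster="b"}'
            values: '1 1 1 1 1'
        promql_expr_test:
          # cluster "a" has 1 of 2 OSDs down (50%); cluster "b" has none down,
          # so only one sample survives the >= 10 filter
          - expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10
            eval_time: 1m
            exp_samples:
              - labels: '{cluster="a"}'
                value: 50
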
# health warning
- interval: 5m
input_series:
- - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_status == 1
eval_time: 15m
exp_samples:
- - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
+ - labels: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 10m
@@ -51,45 +52,46 @@ tests:
job: ceph
type: ceph_default
severity: warning
+ cluster: mycluster
exp_annotations:
- summary: Ceph is in the WARNING state
- description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
+ summary: Ceph is in the WARNING state on cluster mycluster
+ description: The cluster state has been HEALTH_WARN for more than 15 minutes on cluster mycluster. Please check 'ceph health detail' for more information.
# 10% OSDs down
- interval: 1m
input_series:
- - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '0 0 0 0 0'
- - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore",
+ public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
promql_expr_test:
- - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+ - expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10
eval_time: 1m
exp_samples:
- - labels: '{}'
+ - labels: '{cluster="mycluster"}'
value: 3.333333333333333E+01
alert_rule_test:
- eval_time: 1m
@@ -99,39 +101,40 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.4.1
type: ceph_default
severity: critical
+ cluster: mycluster
exp_annotations:
- summary: More than 10% of OSDs are down
+ summary: More than 10% of OSDs are down on cluster mycluster
description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
# flapping OSD
- interval: 1s
input_series:
- - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1+1x100'
- - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1+0x100'
- - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1+0x100'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
+ cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
+ public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
promql_expr_test:
- expr: |
@@ -142,7 +145,7 @@ tests:
eval_time: 1m
exp_samples:
- labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
- job="ceph"}'
+ job="ceph",cluster="mycluster"}'
value: 1.2200000000000001E+01
alert_rule_test:
- eval_time: 5m
@@ -155,54 +158,39 @@ tests:
job: ceph
oid: 1.3.6.1.4.1.50495.1.2.1.4.4
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
- summary: Network issues are causing OSDs to flap (mark each other down)
+ summary: Network issues are causing OSDs to flap (mark each other down) on cluster mycluster
description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
# high pg count deviation
- interval: 1m
input_series:
- - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
- job="ceph"}'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '100 100 100 100 100 160'
- - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
- job="ceph"}'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '100 100 100 100 100 320'
- - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
- job="ceph"}'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '100 100 100 100 100 160'
- - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
- job="ceph"}'
+ - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '100 100 100 100 100 160'
- - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
- ceph_version="ceph version 17.0.0-189-g3558fd72
- (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
- cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
- hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
- public_addr="172.20.0.2"}'
+ - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",ceph_version="ceph version 17.0.0-189-g3558fd72
+ (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
+ hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}'
values: '1 1 1 1 1 1'
promql_expr_test:
- expr: |
@@ -215,8 +203,7 @@ tests:
eval_time: 5m
exp_samples:
- - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
- job="ceph"}'
+ - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",job="ceph",cluster="mycluster"}'
value: 6E-01
alert_rule_test:
- eval_time: 10m
@@ -229,44 +216,40 @@ tests:
job: ceph
oid: 1.3.6.1.4.1.50495.1.2.1.4.5
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: PGs are not balanced across OSDs
+ summary: PGs are not balanced across OSDs on cluster mycluster
description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
# pgs inactive
- interval: 1m
input_series:
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="1"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="1"}'
values: '1 1 1 1 1 1 1 1'
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="2"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="2"}'
values: '1 1 1 1 1 1 1 1'
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="3"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}'
values: '1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}'
values: '1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}'
values: '32 32 32 32 32 32 32 32'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}'
values: '33 32 32 32 32 33 33 32'
- - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}'
values: '32 32 32 32 32 32 32 32'
- - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
+ - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}'
values: '32 32 32 32 32 32 32 32'
promql_expr_test:
- expr: ceph_pool_metadata * on(pool_id,instance) group_left()
(ceph_pg_total - ceph_pg_active) > 0
eval_time: 5m
exp_samples:
- - labels: '{instance="ceph:9283", job="ceph",
- name="device_health_metrics",
- pool_id="3"}'
- value: 1
+ - labels: '{instance="ceph:9283", job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}'
+ value: 1
alert_rule_test:
- eval_time: 5m
alertname: CephPGsInactive
@@ -278,46 +261,39 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.7.1
pool_id: 3
severity: critical
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: One or more placement groups are inactive
+ summary: One or more placement groups are inactive on cluster mycluster
description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
          # pgs unclean
- interval: 1m
input_series:
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="1"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="2"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="2"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="device_health_metrics",pool_id="3"}'
+ - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
- values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
- 32 32 32'
- - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
- values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
- 33 33'
- - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
+ - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}'
+ values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
- values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
- 32 32'
- - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
- values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
- 32 32'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
+ - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}'
+ values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32'
promql_expr_test:
- expr: ceph_pool_metadata * on(pool_id,instance) group_left()
(ceph_pg_total - ceph_pg_clean) > 0
eval_time: 15m
exp_samples:
- - labels: '{instance="ceph:9283", job="ceph",
- name="device_health_metrics", pool_id="3"}'
+ - labels: '{instance="ceph:9283", job="ceph",cluster="mycluster",name="device_health_metrics", pool_id="3"}'
value: 1
alert_rule_test:
- eval_time: 16m
@@ -330,32 +306,33 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.7.2
pool_id: 3
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: One or more placement groups are marked unclean
+ summary: One or more placement groups are marked unclean on cluster mycluster
description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
# root volume full
- interval: 1m
input_series:
- series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
- --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
- mountpoint="/"}'
+ --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
+ mountpoint="/"}'
values: '35336400896 35336400896 35336400896 35336400896 35336400896
- 3525385519.104 3533640089'
+ 3525385519.104 3533640089'
- series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
- --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
- mountpoint="/"}'
+ --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
+ mountpoint="/"}'
values: '73445531648 73445531648 73445531648 73445531648 73445531648
- 73445531648 73445531648'
+ 73445531648 73445531648'
promql_expr_test:
- expr: node_filesystem_avail_bytes{mountpoint="/"} /
node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
eval_time: 5m
exp_samples:
- labels: '{device="/dev/mapper/fedora_localhost --live-home",
- fstype="ext4", instance="node-exporter", job="node-exporter",
- mountpoint="/"}'
+ fstype="ext4", instance="node-exporter", job="node-exporter",
+ mountpoint="/"}'
value: 4.8E+00
alert_rule_test:
- eval_time: 10m
@@ -377,17 +354,13 @@ tests:
# network packets dropped
- interval: 1m
input_series:
- - series: 'node_network_receive_drop_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_receive_drop_total{device="eth0",instance="node-exporter",job="node-exporter"}'
values: '0+600x10'
- - series: 'node_network_transmit_drop_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_transmit_drop_total{device="eth0",instance="node-exporter",job="node-exporter"}'
values: '0+600x10'
- - series: 'node_network_receive_packets_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_receive_packets_total{device="eth0",instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
- - series: 'node_network_transmit_packets_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_transmit_packets_total{device="eth0",instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
promql_expr_test:
- expr: |
@@ -404,8 +377,7 @@ tests:
eval_time: 5m
exp_samples:
- - labels: '{device="eth0", instance="node-exporter",
- job="node-exporter"}'
+ - labels: '{device="eth0", instance="node-exporter",job="node-exporter"}'
value: 8E-1
alert_rule_test:
- eval_time: 5m
@@ -425,17 +397,13 @@ tests:
# network packets errors
- interval: 1m
input_series:
- - series: 'node_network_receive_errs_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_receive_errs_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0+600x10'
- - series: 'node_network_transmit_errs_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_transmit_errs_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0+600x10'
- - series: 'node_network_transmit_packets_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_transmit_packets_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0+750x10'
- - series: 'node_network_receive_packets_total{device="eth0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_network_receive_packets_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0+750x10'
promql_expr_test:
- expr: |
@@ -452,8 +420,7 @@ tests:
eval_time: 5m
exp_samples:
- - labels: '{device="eth0", instance="node-exporter",
- job="node-exporter"}'
+ - labels: '{device="eth0", instance="node-exporter",job="node-exporter",cluster="mycluster"}'
value: 8E-01
alert_rule_test:
- eval_time: 5m
@@ -466,26 +433,24 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.8.3
severity: warning
type: ceph_default
- exp_annotations:
- summary: One or more NICs reports packet errors
+ cluster: mycluster
+ exp_annotations:
+ summary: One or more NICs reports packet errors on cluster mycluster
description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
# Bond is missing a peer
- interval: 1m
input_series:
- - series: 'node_bonding_active{master="bond0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_bonding_active{master="bond0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '3'
- - series: 'node_bonding_slaves{master="bond0",
- instance="node-exporter",job="node-exporter"}'
+ - series: 'node_bonding_slaves{master="bond0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '4'
promql_expr_test:
- expr: |
node_bonding_slaves - node_bonding_active != 0
eval_time: 5m
exp_samples:
- - labels: '{master="bond0", instance="node-exporter",
- job="node-exporter"}'
+ - labels: '{master="bond0", instance="node-exporter",job="node-exporter",cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 5m
@@ -497,23 +462,22 @@ tests:
job: node-exporter
severity: warning
type: ceph_default
+ cluster: mycluster
exp_annotations:
- summary: Degraded Bond on Node node-exporter
+ summary: Degraded Bond on Node node-exporter on cluster mycluster
description: "Bond bond0 is degraded on Node node-exporter."
# Node Storage disk space filling up
- interval: 1m
# 20GB = 21474836480, 256MB = 268435456
input_series:
- - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
- fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
+ - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",fstype="xfs",instance="node-1",mountpoint="/rootfs",cluster="mycluster"}'
values: '21474836480-268435456x48'
- - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
- fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
+ - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",fstype="xfs",instance="node-2",mountpoint="/rootfs",cluster="mycluster"}'
values: '21474836480+0x48'
- - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
+ - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com",cluster="mycluster"}'
values: 1+0x48
- - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
+ - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com",cluster="mycluster"}'
values: 1+0x48
promql_expr_test:
- expr: |
@@ -521,8 +485,7 @@ tests:
on(instance) group_left(nodename) node_uname_info < 0
eval_time: 5m
exp_samples:
- - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
- mountpoint="/rootfs",nodename="node-1.unittests.com"}'
+ - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",mountpoint="/rootfs",nodename="node-1.unittests.com",cluster="mycluster"}'
value: -1.912602624E+12
alert_rule_test:
- eval_time: 5m
@@ -537,72 +500,60 @@ tests:
instance: node-1
mountpoint: /rootfs
nodename: node-1.unittests.com
+ cluster: mycluster
exp_annotations:
- summary: Host filesystem free space is getting low
+ summary: Host filesystem free space is getting low on cluster mycluster
description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
+
# MTU Mismatch
- interval: 1m
input_series:
- - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1500 1500 1500 1500 1500'
- - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1500 1500 1500 1500 1500'
- - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1500 1500 1500 1500 1500'
- - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1500 1500 1500 1500 1500'
- - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '9000 9000 9000 9000 9000'
- - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
values: '2200 2200 2200 2200 2200'
- - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
- job="node-exporter"}'
+ - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
values: '2400 2400 2400 2400 2400'
- - series: 'node_network_up{device="eth0",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0 0 0 0 0'
- - series: 'node_network_up{device="eth1",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '0 0 0 0 0'
- - series: 'node_network_up{device="eth2",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'node_network_up{device="eth3",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'node_network_up{device="eth4",instance="node-exporter",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'node_network_up{device="eth4",instance="hostname1",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'node_network_up{device="eth4",instance="hostname2",
- job="node-exporter"}'
+ - series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}'
values: '0 0 0 0 0'
promql_expr_test:
- expr: |
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
- max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
- quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+ max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+ quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
or
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
- min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
- quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
+ min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
+ quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
eval_time: 1m
exp_samples:
- - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
+ - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}'
value: 9000
- - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
+ - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}'
value: 2200
alert_rule_test:
- eval_time: 1m
@@ -614,8 +565,9 @@ tests:
job: node-exporter
severity: warning
type: ceph_default
+ cluster: "mycluster"
exp_annotations:
- summary: MTU settings across Ceph hosts are inconsistent
+ summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
- exp_labels:
device: eth4
@@ -623,51 +575,52 @@ tests:
job: node-exporter
severity: warning
type: ceph_default
+ cluster: "mycluster"
exp_annotations:
- summary: MTU settings across Ceph hosts are inconsistent
+ summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster
description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
          # pool full: the data series has 6 pools, but topk(5) is used to ensure the
          # results are working as expected
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="POOL_FULL"}'
+ - series: 'ceph_health_detail{name="POOL_FULL", cluster="mycluster"}'
values: '0 0 0 1 1 1 1 1 1 1 1'
- - series: 'ceph_pool_percent_used{pool_id="1"}'
+ - series: 'ceph_pool_percent_used{pool_id="1", cluster="mycluster"}'
values: '32+0x10'
- - series: 'ceph_pool_percent_used{pool_id="2"}'
+ - series: 'ceph_pool_percent_used{pool_id="2", cluster="mycluster"}'
values: '96+0x10'
- - series: 'ceph_pool_percent_used{pool_id="3"}'
+ - series: 'ceph_pool_percent_used{pool_id="3", cluster="mycluster"}'
values: '90+0x10'
- - series: 'ceph_pool_percent_used{pool_id="4"}'
+ - series: 'ceph_pool_percent_used{pool_id="4", cluster="mycluster"}'
values: '72+0x10'
- - series: 'ceph_pool_percent_used{pool_id="5"}'
+ - series: 'ceph_pool_percent_used{pool_id="5", cluster="mycluster"}'
values: '19+0x10'
- - series: 'ceph_pool_percent_used{pool_id="6"}'
+ - series: 'ceph_pool_percent_used{pool_id="6", cluster="mycluster"}'
values: '10+0x10'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="cephfs_data",pool_id="1"}'
+ name="cephfs_data",pool_id="1", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="rbd",pool_id="2"}'
+ name="rbd",pool_id="2", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="iscsi",pool_id="3"}'
+ name="iscsi",pool_id="3", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="default.rgw.index",pool_id="4"}'
+ name="default.rgw.index",pool_id="4", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="default.rgw.log",pool_id="5"}'
+ name="default.rgw.log",pool_id="5", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
- name="dummy",pool_id="6"}'
+ name="dummy",pool_id="6", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="POOL_FULL"} > 0
eval_time: 5m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
+ - labels: '{__name__="ceph_health_detail", name="POOL_FULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -678,23 +631,24 @@ tests:
- exp_labels:
name: POOL_FULL
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.1
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
- summary: Pool is full - writes are blocked
+ summary: Pool is full - writes are blocked on cluster mycluster
description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
+
# slow OSD ops
- interval : 1m
input_series:
- - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
+ - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph",cluster="mycluster"}'
values: '1+0x120'
promql_expr_test:
- expr: ceph_healthcheck_slow_ops > 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
- job="ceph"}'
+ - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",job="ceph",cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 20m
@@ -704,23 +658,23 @@ tests:
instance: ceph:9283
job: ceph
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
- summary: OSD operations are slow to complete
+ summary: OSD operations are slow to complete on cluster mycluster
description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
# slow daemon ops
- interval : 1m
input_series:
- - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+ - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283", job="ceph", type="SLOW_OPS", cluster="mycluster"}'
values: '1+0x120'
promql_expr_test:
- expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
- job="ceph", type="SLOW_OPS"}'
+ - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1", instance="ceph:9283", cluster="mycluster",job="ceph", type="SLOW_OPS"}'
value: 1
alert_rule_test:
- eval_time: 20m
@@ -731,22 +685,23 @@ tests:
ceph_daemon: "osd.1"
job: ceph
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
- summary: osd.1 operations are slow to complete
+ summary: osd.1 operations are slow to complete on cluster mycluster
description: "osd.1 operations are taking too long to process (complaint time exceeded)"
# CEPHADM orchestrator alert triggers
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
+ - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
+ - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -757,20 +712,21 @@ tests:
- exp_labels:
name: UPGRADE_EXCEPTION
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.11.2
exp_annotations:
- summary: Ceph version upgrade has failed
+ summary: Ceph version upgrade has failed on cluster mycluster
description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
+ - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
+ - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -781,20 +737,21 @@ tests:
- exp_labels:
name: CEPHADM_FAILED_DAEMON
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.11.1
exp_annotations:
- summary: A ceph daemon managed by cephadm is down
+ summary: A ceph daemon managed by cephadm is down on cluster mycluster
description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
+ - series: 'ceph_health_detail{name="CEPHADM_PAUSED", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
+ - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -805,21 +762,23 @@ tests:
- exp_labels:
name: CEPHADM_PAUSED
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
- summary: Orchestration tasks via cephadm are PAUSED
+ summary: Orchestration tasks via cephadm are PAUSED on cluster mycluster
description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
+
# MDS
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
+ - series: 'ceph_health_detail{name="MDS_DAMAGE", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
+ - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -830,21 +789,22 @@ tests:
- exp_labels:
name: MDS_DAMAGE
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.1
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
- summary: CephFS filesystem is damaged.
+ summary: CephFS filesystem is damaged on cluster mycluster
description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
+ - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
+ - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -855,21 +815,22 @@ tests:
- exp_labels:
name: MDS_HEALTH_READ_ONLY
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.2
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
- summary: CephFS filesystem in read only mode due to write error(s)
+ summary: CephFS filesystem in read only mode due to write error(s) on cluster mycluster
description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
+ - series: 'ceph_health_detail{name="MDS_ALL_DOWN", cluster="mycluster"}'
values: '0 0 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
+ - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -880,21 +841,22 @@ tests:
- exp_labels:
name: MDS_ALL_DOWN
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.3
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
- summary: CephFS filesystem is offline
+ summary: CephFS filesystem is offline on cluster mycluster
description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="FS_DEGRADED"}'
+ - series: 'ceph_health_detail{name="FS_DEGRADED", cluster="mycluster"}'
values: '0 0 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="FS_DEGRADED"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
+ - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -905,21 +867,22 @@ tests:
- exp_labels:
name: FS_DEGRADED
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.4
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
- summary: CephFS filesystem is degraded
+ summary: CephFS filesystem is degraded on cluster mycluster
description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
+ - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY", cluster="mycluster"}'
values: '0 0 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
+ - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -930,20 +893,21 @@ tests:
- exp_labels:
name: MDS_INSUFFICIENT_STANDBY
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
- summary: Ceph filesystem standby daemons too few
+ summary: Ceph filesystem standby daemons too few on cluster mycluster
description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
+ - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS", cluster="mycluster"}'
values: '0 0 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
+ - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -954,21 +918,22 @@ tests:
- exp_labels:
name: FS_WITH_FAILED_MDS
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.5
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
- summary: MDS daemon failed, no further standby available
+ summary: MDS daemon failed, no further standby available on cluster mycluster
description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
+ - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX", cluster="mycluster"}'
values: '0 0 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
+ - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -979,10 +944,11 @@ tests:
- exp_labels:
name: MDS_UP_LESS_THAN_MAX
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
- summary: Ceph MDS daemon count is lower than configured
+ summary: Ceph MDS daemon count is lower than configured on cluster mycluster
description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
# MGR
- interval: 1m
@@ -1012,13 +978,13 @@ tests:
description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
+ - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
+ - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1029,24 +995,26 @@ tests:
- exp_labels:
name: RECENT_MGR_MODULE_CRASH
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.6.1
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
- summary: A manager module has recently crashed
+ summary: A manager module has recently crashed on cluster mycluster
description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
+
# MON
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
+ - series: 'ceph_health_detail{name="MON_DISK_CRIT", cluster="mycluster"}'
values: '0+0x2 1+0x10'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a", cluster="mycluster"}'
values: '1+0x13'
promql_expr_test:
- expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
+ - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1057,23 +1025,24 @@ tests:
- exp_labels:
name: "MON_DISK_CRIT"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.3.2
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
- summary: Filesystem space on at least one monitor is critically low
+ summary: Filesystem space on at least one monitor is critically low on cluster mycluster
description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
+ - series: 'ceph_health_detail{name="MON_DISK_LOW", cluster="mycluster"}'
values: '0+0x2 1+0x10'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a", cluster="mycluster"}'
values: '1+0x13'
promql_expr_test:
- expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
+ - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1084,20 +1053,21 @@ tests:
- exp_labels:
name: "MON_DISK_LOW"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
- summary: Drive space on at least one monitor is approaching full
+ summary: Drive space on at least one monitor is approaching full on cluster mycluster
description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
+ - series: 'ceph_health_detail{name="MON_CLOCK_SKEW", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
+ - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1108,34 +1078,35 @@ tests:
- exp_labels:
name: "MON_CLOCK_SKEW"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
- summary: Clock skew detected among monitors
+ summary: Clock skew detected among monitors on cluster mycluster
description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
# Check 3 mons one down, quorum at risk
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="MON_DOWN"}'
+ - series: 'ceph_health_detail{name="MON_DOWN", cluster="mycluster"}'
values: '0+0x2 1+0x12'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c", cluster="mycluster"}'
values: '1+0x2 0+0x12'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3", cluster="mycluster"}'
values: '1+0x14'
promql_expr_test:
- - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
+ - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (count(ceph_mon_quorum_status == 1) by (cluster) == bool (floor(count(ceph_mon_metadata) by (cluster) / 2) + 1))) == 1
eval_time: 3m
exp_samples:
- - labels: '{}'
+ - labels: '{cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1146,40 +1117,41 @@ tests:
exp_alerts:
- exp_labels:
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.3.1
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
- summary: Monitor quorum is at risk
+ summary: Monitor quorum is at risk on cluster mycluster
description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
# check 5 mons, 1 down - warning only
- interval: 1m
input_series:
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
+ - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e", cluster="mycluster"}'
values: '1+0x2 0+0x12'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4", cluster="mycluster"}'
values: '1+0x14'
- - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
+ - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5", cluster="mycluster"}'
values: '1+0x14'
promql_expr_test:
- - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
+ - expr: (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1)))
eval_time: 3m
exp_samples:
- - labels: '{}'
+ - labels: '{cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1189,21 +1161,23 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
- summary: One or more monitors down
- description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
+ summary: One or more monitors down on cluster mycluster
+ description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5"
+
# Device Health
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
+ - series: 'ceph_health_detail{name="DEVICE_HEALTH", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
+ - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1214,20 +1188,21 @@ tests:
- exp_labels:
name: "DEVICE_HEALTH"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
- summary: Device(s) predicted to fail soon
+ summary: Device(s) predicted to fail soon on cluster mycluster
description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
+ - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
+ - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1238,21 +1213,22 @@ tests:
- exp_labels:
name: "DEVICE_HEALTH_TOOMANY"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.7
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
- summary: Too many devices are predicted to fail, unable to resolve
+ summary: Too many devices are predicted to fail on cluster mycluster, unable to resolve
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
+ - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
+ - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1263,25 +1239,27 @@ tests:
- exp_labels:
name: "DEVICE_HEALTH_IN_USE"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
- summary: Device failure is predicted, but unable to relocate data
+ summary: Device failure is predicted, but unable to relocate data on cluster mycluster
description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
+
# OSD
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
+ - series: 'ceph_health_detail{name="OSD_HOST_DOWN", cluster="mycluster"}'
values: '0+0x2 1+0x10'
- - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}'
values: '1+0x2 0+0x10'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1", cluster="mycluster"}'
values: '1+0x12'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1292,20 +1270,21 @@ tests:
- exp_labels:
name: "OSD_HOST_DOWN"
severity: warning
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.8
exp_annotations:
- summary: An OSD host is offline
+ summary: An OSD host is offline on cluster mycluster
description: "The following OSDs are down: - ceph-osd-1 : osd.0"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
+ - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1316,19 +1295,20 @@ tests:
- exp_labels:
name: "OSD_SLOW_PING_TIME_FRONT"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: Network issues delaying OSD heartbeats (public network)
+ summary: Network issues delaying OSD heartbeats (public network) on cluster mycluster
description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
+ - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1339,19 +1319,20 @@ tests:
- exp_labels:
name: "OSD_SLOW_PING_TIME_BACK"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: Network issues delaying OSD heartbeats (cluster network)
+ summary: Network issues delaying OSD heartbeats (cluster network) on cluster mycluster
description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
+ - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
+ - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1362,20 +1343,21 @@ tests:
- exp_labels:
name: "BLUESTORE_DISK_SIZE_MISMATCH"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
- summary: OSD size inconsistency error
+ summary: OSD size inconsistency error on cluster mycluster
description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
+ - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
+ - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1386,32 +1368,33 @@ tests:
- exp_labels:
name: "BLUESTORE_SPURIOUS_READ_ERRORS"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
- summary: Device read errors detected
+ summary: Device read errors detected on cluster mycluster
description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_DOWN"}'
+ - series: 'ceph_health_detail{name="OSD_DOWN", cluster="mycluster"}'
values: '0+0x2 1+0x10'
- - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}'
values: '1+0x12'
- - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1", cluster="mycluster"}'
values: '1+0x2 0+0x10'
- - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2", cluster="mycluster"}'
values: '1+0x12'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1", cluster="mycluster"}'
values: '1+0x12'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2", cluster="mycluster"}'
values: '1+0x12'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3", cluster="mycluster"}'
values: '1+0x12'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_DOWN"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_DOWN", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1422,21 +1405,22 @@ tests:
- exp_labels:
name: "OSD_DOWN"
severity: warning
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.2
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
- summary: An OSD has been marked down
- description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
+ summary: An OSD has been marked down on cluster mycluster
+ description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
+ - series: 'ceph_health_detail{name="OSD_NEARFULL", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1447,21 +1431,22 @@ tests:
- exp_labels:
name: "OSD_NEARFULL"
severity: warning
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.3
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
- summary: OSD(s) running low on free space (NEARFULL)
+ summary: OSD(s) running low on free space (NEARFULL) on cluster mycluster
description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_FULL"}'
+ - series: 'ceph_health_detail{name="OSD_FULL", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_FULL"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_FULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1472,21 +1457,22 @@ tests:
- exp_labels:
name: "OSD_FULL"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.6
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
- summary: OSD full, writes blocked
+ summary: OSD full, writes blocked on cluster mycluster
description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
+ - series: 'ceph_health_detail{name="OSD_BACKFILLFULL", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1497,20 +1483,21 @@ tests:
- exp_labels:
name: "OSD_BACKFILLFULL"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
- summary: OSD(s) too full for backfill operations
+ summary: OSD(s) too full for backfill operations on cluster mycluster
description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
+ - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
+ - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1521,38 +1508,40 @@ tests:
- exp_labels:
name: "OSD_TOO_MANY_REPAIRS"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
- summary: OSD reports a high number of read errors
+ summary: OSD reports a high number of read errors on cluster mycluster
description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
+
# Pools
# trigger percent full prediction on pools 1 and 2 only
- interval: 12h
input_series:
- - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+ - series: 'ceph_pool_percent_used{pool_id="1", instance="9090", cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+ - series: 'ceph_pool_percent_used{pool_id="1", instance="8090", cluster="mycluster"}'
values: '78 89 79 98 78'
- - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+ - series: 'ceph_pool_percent_used{pool_id="2", instance="9090", cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
+ - series: 'ceph_pool_percent_used{pool_id="2", instance="8090", cluster="mycluster"}'
values: '22 22 23 23 24'
- - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated", cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated", cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated", cluster="mycluster"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated", cluster="mycluster"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: |
- (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+ (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster, pool_id, instance)
group_right() ceph_pool_metadata) >= 95
eval_time: 36h
exp_samples:
- - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+ - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated", cluster="mycluster"}'
value: 1.435E+02 # 142%
alert_rule_test:
- eval_time: 48h
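The pool-growth expression above is the usual PromQL metadata-enrichment join: ceph_pool_metadata has a constant value of 1 and only contributes labels, group_right() makes it the "many" side so the result inherits its name and type labels, and the labels listed in on() decide which percent-used series each metadata series pairs with. Adding cluster to on() is what keeps the pairing from crossing clusters that happen to reuse pool_id and instance values. An annotated restatement of the same expression, for illustration only:

    (
      predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5)  # %-used projected 5 days ahead from a 2-day window
        * on(cluster, pool_id, instance) group_right()           # join per cluster/pool/instance, keep metadata labels
      ceph_pool_metadata                                         # constant 1; carries the pool name and type
    ) >= 95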
@@ -1563,20 +1552,21 @@ tests:
name: default.rgw.index
pool_id: 1
severity: warning
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
exp_annotations:
- summary: Pool growth rate may soon exceed capacity
+ summary: Pool growth rate may soon exceed capacity on cluster mycluster
description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
+ - series: 'ceph_health_detail{name="POOL_BACKFILLFULL", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
+ - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1587,20 +1577,21 @@ tests:
- exp_labels:
name: "POOL_BACKFILLFULL"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: Free space in a pool is too low for recovery/backfill
+ summary: Free space in a pool is too low for recovery/backfill on cluster mycluster
description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
+ - series: 'ceph_health_detail{name="POOL_NEAR_FULL", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
+ - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1611,21 +1602,22 @@ tests:
- exp_labels:
name: "POOL_NEAR_FULL"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: One or more Ceph pools are nearly full
+ summary: One or more Ceph pools are nearly full on cluster mycluster
description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
# PGs
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
+ - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED",cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
+ - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1636,20 +1628,21 @@ tests:
- exp_labels:
name: "PG_NOT_SCRUBBED"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
- summary: Placement group(s) have not been scrubbed
+ summary: Placement group(s) have not been scrubbed on cluster mycluster
description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_DAMAGED"}'
+ - series: 'ceph_health_detail{name="PG_DAMAGED",cluster="mycluster"}'
values: '0+0x4 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
eval_time: 5m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
+ - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1660,21 +1653,22 @@ tests:
- exp_labels:
name: "PG_DAMAGED"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.4
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
- summary: Placement group damaged, manual intervention needed
+ summary: Placement group damaged, manual intervention needed on cluster mycluster
description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
+ - series: 'ceph_health_detail{name="TOO_MANY_PGS",cluster="mycluster"}'
values: '0+0x4 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
eval_time: 5m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
+ - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1685,20 +1679,21 @@ tests:
- exp_labels:
name: "TOO_MANY_PGS"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
- summary: Placement groups per OSD is too high
+ summary: Placement groups per OSD is too high on cluster mycluster
description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
+ - series: 'ceph_health_detail{name="PG_RECOVERY_FULL", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
+ - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1709,21 +1704,22 @@ tests:
- exp_labels:
name: "PG_RECOVERY_FULL"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.5
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
- summary: OSDs are too full for recovery
+ summary: OSDs are too full for recovery on cluster mycluster
description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
+ - series: 'ceph_health_detail{name="PG_BACKFILL_FULL", cluster="mycluster"}'
values: '0+0x2 1+0x20'
promql_expr_test:
- expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
+ - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL", cluster="mycluster"}'
value: 0
alert_rule_test:
- eval_time: 1m
@@ -1734,17 +1730,18 @@ tests:
- exp_labels:
name: "PG_BACKFILL_FULL"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.6
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
- summary: Backfill operations are blocked due to lack of free space
+ summary: Backfill operations are blocked due to lack of free space on cluster mycluster
description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
+ - series: 'ceph_health_detail{name="PG_AVAILABILITY", cluster="mycluster"}'
values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_health_detail{name="OSD_DOWN"}'
+ - series: 'ceph_health_detail{name="OSD_DOWN", cluster="mycluster"}'
values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
promql_expr_test:
- expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
@@ -1767,21 +1764,22 @@ tests:
- exp_labels:
name: "PG_AVAILABILITY"
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.3
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
- summary: PG is unavailable, blocking I/O
+ summary: PG is unavailable on cluster mycluster, blocking I/O
description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
+ - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED", cluster="mycluster"}'
values: '0+0x2 1+0x10'
promql_expr_test:
- expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
eval_time: 3m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
+ - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -1792,10 +1790,11 @@ tests:
- exp_labels:
name: "PG_NOT_DEEP_SCRUBBED"
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
- summary: Placement group(s) have not been deep scrubbed
+ summary: Placement group(s) have not been deep scrubbed on cluster mycluster
description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
# Prometheus
@@ -1821,25 +1820,26 @@ tests:
exp_annotations:
summary: The scrape job for Ceph is missing from Prometheus
description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
+
# RADOS
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
+ - series: 'ceph_health_detail{name="OBJECT_UNFOUND", cluster="mycluster"}'
values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.1", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
+ - series: 'ceph_osd_up{ceph_daemon="osd.2", cluster="mycluster"}'
values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.0", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.1", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
+ - series: 'ceph_osd_metadata{ceph_daemon="osd.2", cluster="mycluster"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+ - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right (cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by (cluster)) == 1
eval_time: 1m
exp_samples:
alert_rule_test:
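The OBJECT_UNFOUND expression above depends on the bool modifier: without it, a comparison between the two counts acts as a filter and simply drops the sample when the counts differ, whereas == bool always yields a 0-or-1 sample per cluster that the health series can be multiplied by. A minimal sketch of the difference, using hypothetical metrics a and b:

    # filtering form: keeps the left-hand count only where the two counts are equal
    count(a == 1) by (cluster) == count(b) by (cluster)

    # bool form: always returns 0 or 1 per cluster, so it can gate another expression
    count(a == 1) by (cluster) == bool count(b) by (cluster)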
@@ -1853,16 +1853,18 @@ tests:
exp_alerts:
- exp_labels:
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.10.1
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
- summary: Object(s) marked UNFOUND
+ summary: Object(s) marked UNFOUND on cluster mycluster
description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
+
# Generic Alerts
- interval: 1m
input_series:
- - series: 'ceph_health_detail{name="RECENT_CRASH"}'
+ - series: 'ceph_health_detail{name="RECENT_CRASH", cluster="mycluster"}'
values: '0 0 0 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_detail{name="RECENT_CRASH"} == 1
@@ -1880,11 +1882,12 @@ tests:
- exp_labels:
name: RECENT_CRASH
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.1.2
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
- summary: One or more Ceph daemons have crashed, and are pending acknowledgement
+ summary: One or more Ceph daemons have crashed, and are pending acknowledgement on cluster mycluster
description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.
# new rbdmirror alerts tests
@@ -1892,21 +1895,21 @@ tests:
# alert: CephRBDMirrorImagesPerDaemonHigh
- interval: 1m
input_series:
- - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data", cluster="mycluster"}'
values: '0+0x20 1+1x130'
- - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data", cluster="mycluster"}'
values: '1+1x130 131+0x20'
# prometheus query test
promql_expr_test:
# negative test where there are no samples
- - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
+ - expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
eval_time: 50m
exp_samples:
# second positive test
- - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
+ - expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100
eval_time: 70m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628", namespace="default"}'
+ - labels: '{ceph_daemon="client.admin.40628", namespace="default", cluster="mycluster"}'
value: 121
# prometheus alert test
alert_rule_test:
@@ -1921,31 +1924,32 @@ tests:
- exp_labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.2"
severity: "critical"
+ cluster: mycluster
type: "ceph_default"
ceph_daemon: "client.admin.40628"
namespace: "default"
exp_annotations:
- description: "Number of image replications per daemon is not suppossed to go beyond threshold 100"
- summary: "Number of image replications are now above 100"
+ summary: "Number of image replications are now above 100 on cluster mycluster"
+ description: "Number of image replications per daemon is not supposed to go beyond threshold 100"
# alert: CephRBDMirrorImagesNotInSync
- interval: 1m
input_series:
- - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}'
values: '1.678+0x20 2.03+0x20 3.21+0x20'
- - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}'
values: '1.678+0x20 2.03+0x20 2.03+0x20'
# prometheus query test
promql_expr_test:
# negative test where there are no samples
- - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
+ - expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
eval_time: 30m
exp_samples:
# second positive test
- - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
+ - expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0
eval_time: 45m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}'
+ - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data", cluster="mycluster"}'
value: 1.1800000000000002
# prometheus alert test
alert_rule_test:
@@ -1962,48 +1966,49 @@ tests:
pool: "data"
oid: "1.3.6.1.4.1.50495.1.2.1.10.3"
severity: "critical"
+ cluster: mycluster
type: "ceph_default"
ceph_daemon: "client.admin.40628"
namespace: "default"
exp_annotations:
+ summary: "Some of the RBD mirror images are not in sync with the remote counter parts on cluster mycluster"
description: "Both local and remote RBD mirror images should be in sync."
- summary: "Some of the RBD mirror images are not in sync with the remote counter parts."
# alert: CephRBDMirrorImagesNotInSyncVeryHigh
- interval: 1m
input_series:
- - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}'
values: '1.678+0x20 2.03+0x20 3.21+0x20'
- - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}'
values: '1.678+0x20 2.03+0x20 2.03+0x20'
- - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x20 3.301+0x14 3.301+0x26'
- - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x20 3.301+0x14 7.13+0x26'
- - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x20 3.301+0x14 3.301+0x26'
- - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x20 3.301+0x14 7.13+0x26'
- - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x65'
- - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}'
+ - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data",cluster="mycluster"}'
values: '2.189+0x65'
- - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628"}'
+ - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628",cluster="mycluster"}'
values: '1+0x20 2+0x45'
# prometheus query test
promql_expr_test:
# test each query individually
# query 1
- - expr: count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0)
+ - expr: count by (ceph_daemon, cluster) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0)
eval_time: 45m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628",cluster="mycluster"}'
value: 3
# query 2
- - expr: sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1
+ - expr: sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots) * .1
eval_time: 45m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628",cluster="mycluster"}'
value: 0.2
# prometheus alert test
alert_rule_test:
@@ -2019,15 +2024,16 @@ tests:
ceph_daemon: "client.admin.40628"
oid: "1.3.6.1.4.1.50495.1.2.1.10.4"
severity: "critical"
+ cluster: mycluster
type: "ceph_default"
exp_annotations:
- description: "More than 10% of the images have synchronization problems"
- summary: "Number of unsynchronized images are very high."
+ summary: "Number of unsynchronized images are very high on cluster mycluster"
+ description: "More than 10% of the images have synchronization problems."
# alert: "CephRBDMirrorImageTransferBandwidthHigh"
- interval: 1m
input_series:
- - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628"}'
+ - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628", cluster="mycluster"}'
values: '0+0x10 1+0x5 10+30x25 736+200x30'
# prometheus query test
promql_expr_test:
@@ -2036,25 +2042,25 @@ tests:
- expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
eval_time: 5m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}'
value: 0.0
# rate 2
- expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
eval_time: 20m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}'
value: 0.33
# rate 3
- expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
eval_time: 40m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}'
value: 0.5
# rate 4
- expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m])
eval_time: 50m
exp_samples:
- - labels: '{ceph_daemon="client.admin.40628"}'
+ - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}'
value: 3.3333333333333335
# prometheus alert test
alert_rule_test:
@@ -2070,20 +2076,21 @@ tests:
ceph_daemon: "client.admin.40628"
oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
severity: "warning"
+ cluster: mycluster
type: "ceph_default"
exp_annotations:
+ summary: "The replication network usage on cluster mycluster has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
- summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}'
+ - series: 'ceph_health_detail{name="HARDWARE_STORAGE", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2094,20 +2101,21 @@ tests:
- exp_labels:
name: HARDWARE_STORAGE
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.1
exp_annotations:
- summary: Storage devices error(s) detected
+ summary: Storage devices error(s) detected on cluster mycluster
description: "Some storage devices are in error. Check `ceph health detail`."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}'
+ - series: 'ceph_health_detail{name="HARDWARE_MEMORY", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2118,20 +2126,21 @@ tests:
- exp_labels:
name: HARDWARE_MEMORY
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.2
exp_annotations:
- summary: DIMM error(s) detected
+ summary: DIMM error(s) detected on cluster mycluster
description: "DIMM error(s) detected. Check `ceph health detail`."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}'
+ - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2142,20 +2151,21 @@ tests:
- exp_labels:
name: HARDWARE_PROCESSOR
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.3
exp_annotations:
- summary: Processor error(s) detected
+ summary: Processor error(s) detected on cluster mycluster
description: "Processor error(s) detected. Check `ceph health detail`."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}'
+ - series: 'ceph_health_detail{name="HARDWARE_NETWORK", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2166,20 +2176,21 @@ tests:
- exp_labels:
name: HARDWARE_NETWORK
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.4
exp_annotations:
- summary: Network error(s) detected
+ summary: Network error(s) detected on cluster mycluster
description: "Network error(s) detected. Check `ceph health detail`."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_POWER"}'
+ - series: 'ceph_health_detail{name="HARDWARE_POWER", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_POWER"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2190,20 +2201,21 @@ tests:
- exp_labels:
name: HARDWARE_POWER
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.5
exp_annotations:
- summary: Power supply error(s) detected
+ summary: Power supply error(s) detected on cluster mycluster
description: "Power supply error(s) detected. Check `ceph health detail`."
- interval: 30s
input_series:
- - series: 'ceph_health_detail{name="HARDWARE_FANS"}'
+ - series: 'ceph_health_detail{name="HARDWARE_FANS", cluster="mycluster"}'
values: '1+0x40'
promql_expr_test:
- expr: ceph_health_detail{name="HARDWARE_FANS"} > 0
eval_time: 2m
exp_samples:
- - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}'
+ - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 1m
@@ -2214,35 +2226,36 @@ tests:
- exp_labels:
name: HARDWARE_FANS
severity: critical
+ cluster: mycluster
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.13.6
exp_annotations:
- summary: Fan error(s) detected
+ summary: Fan error(s) detected on cluster mycluster
description: "Fan error(s) detected. Check `ceph health detail`."
# nvmeof Tests
# NVMeoFSubsystemNamespaceLimit
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah", cluster="mycluster"}'
values: '5x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1", cluster="mycluster"}'
+ values: '1x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2", cluster="mycluster"}'
values: '1x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3", cluster="mycluster"}'
values: '1x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4", cluster="mycluster"}'
values: '1x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5", cluster="mycluster"}'
values: '1x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5"}'
+ - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6", cluster="mycluster"}'
values: '1x10'
- - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6"}'
- values: '1x10'
promql_expr_test:
- - expr: (count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit
+ - expr: (count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit
eval_time: 1m
exp_samples:
- - labels: '{nqn="wah"}'
+ - labels: '{nqn="wah",cluster="mycluster"}'
value: 6
alert_rule_test:
- eval_time: 5m
@@ -2251,29 +2264,30 @@ tests:
- exp_labels:
nqn: wah
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "wah subsystem has reached its maximum number of namespaces "
+ summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster"
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah"
# NVMeoFTooManyGateways
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1",cluster="mycluster"}'
values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2",cluster="mycluster"}'
values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
- values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
- values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5"}'
- values: '1+0x20'
- promql_expr_test:
- - expr: count(ceph_nvmeof_gateway_info) > 4.00
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
+ values: '1+0x20'
+ promql_expr_test:
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00
eval_time: 1m
exp_samples:
- - labels: '{}'
+ - labels: '{cluster="mycluster"}'
value: 5
alert_rule_test:
- eval_time: 5m
@@ -2281,30 +2295,35 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "Max supported gateways exceeded "
+ summary: "Max supported gateways exceeded on cluster mycluster"
description: "You may create many gateways, but 4 is the tested limit"
# NVMeoFMaxGatewayGroupSize
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1"}'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.9",cluster="mycluster"}'
values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3"}'
- values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}'
- values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}'
- values: '1+0x20'
- promql_expr_test:
- - expr: count by(group) (ceph_nvmeof_gateway_info) > 2.00
+ - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
+ values: '1+0x20'
+ promql_expr_test:
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00
eval_time: 1m
exp_samples:
- - labels: '{group="group-1"}'
- value: 3
+ - labels: '{cluster="mycluster",group="group-1"}'
+ value: 5
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFMaxGatewayGroupSize
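Grouping the count by both cluster and group means the limit is evaluated per gateway group within each cluster, which is why the expected sample above carries both labels. A reduced sketch of the aggregation, with values taken from the hunk above:

    # one count per (cluster, group); group-1 has 5 gateways, so only it crosses the tested limit of 4
    count(ceph_nvmeof_gateway_info) by (cluster, group) > 4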
@@ -2312,25 +2331,26 @@ tests:
- exp_labels:
group: group-1
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "Max gateways within a gateway group (group-1) exceeded "
- description: "You may create many gateways in a gateway group, but 2 is the tested limit"
+ summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
+ description: "You may create many gateways in a gateway group, but 4 is the tested limit"
# NVMeoFSingleGatewayGroup
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}'
- values: '1+0x20'
- - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}'
- values: '1+0x20'
promql_expr_test:
- - expr: count by(group) (ceph_nvmeof_gateway_info) == 1
+ - expr: count by(group, cluster) (ceph_nvmeof_gateway_info) == 1
eval_time: 1m
exp_samples:
- - labels: '{group="group-1"}'
+ - labels: '{group="group-1", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 5m
@@ -2339,21 +2359,22 @@ tests:
- exp_labels:
group: group-1
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "The gateway group group-1 consists of a single gateway - HA is not possible "
- description: "Although a single member gateway group is valid, it should only be used for test purposes"
+ summary: "The gateway group group-1 consists of a single gateway - HA is not possible on cluster mycluster"
+ description: "Although a single member gateway group is valid, it should only be used for test purposes"
# NVMeoFHighGatewayCPU
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008"}'
+ - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008",cluster="mycluster"}'
values: '880+5080x20'
promql_expr_test:
- - expr: label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80
+ - expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80
eval_time: 5m
exp_samples:
- - labels: '{instance="node-1"}'
+ - labels: '{instance="node-1", cluster="mycluster"}'
value: 8.466666666666667E+01
alert_rule_test:
- eval_time: 15m
@@ -2362,23 +2383,24 @@ tests:
- exp_labels:
instance: node-1
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "CPU used by node-1 NVMe-oF Gateway is high "
- description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
-
+ summary: "CPU used by node-1 NVMe-oF Gateway is high on cluster mycluster"
+ description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
+
# NVMeoFGatewayOpenSecurity
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no"}'
+ - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no", cluster="mycluster"}'
values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes"}'
+ - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes", cluster="mycluster"}'
values: '1+0x10'
promql_expr_test:
- expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes"}'
+ - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 5m
@@ -2388,53 +2410,54 @@ tests:
allow_any_host: yes
nqn: nqn.bad
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "Subsystem nqn.bad has been defined without host level security "
- description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
+ summary: "Subsystem nqn.bad has been defined without host level security on cluster mycluster"
+ description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
# NVMeoFTooManySubsystems
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1"}'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7",cluster="mycluster"}'
values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10"}'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8",cluster="mycluster"}'
values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16"}'
- values: '1+0x10'
- - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17"}'
- values: '1+0x10'
- promql_expr_test:
- - expr: count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16",cluster="mycluster"}'
+ values: '1+0x10'
+ - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17",cluster="mycluster"}'
+ values: '1+0x10'
+ promql_expr_test:
+ - expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16
eval_time: 1m
exp_samples:
- - labels: '{gateway_host="node-1"}'
+ - labels: '{gateway_host="node-1", cluster="mycluster"}'
value: 17
alert_rule_test:
- eval_time: 5m
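The subsystem-count expression above uses label_replace() to derive a host-level label before aggregating: label_replace(v, dst, replacement, src, regex) copies the part of the instance label captured by "(.*):.*" (the host name without the port) into a new gateway_host label, and the count is then grouped by that label plus cluster. A trimmed sketch of the pattern, for illustration only:

    count by (gateway_host, cluster) (
      label_replace(ceph_nvmeof_subsystem_metadata,
                    "gateway_host", "$1",      # new label, set to the first capture group
                    "instance", "(.*):.*")     # capture everything before the port
    ) > 16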
@@ -2443,23 +2466,24 @@ tests:
- exp_labels:
gateway_host: node-1
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "The number of subsystems defined to the gateway exceeds supported values "
- description: "Although you may continue to create subsystems in node-1, the configuration may not be supported"
+ summary: "The number of subsystems defined to the gateway exceeds supported values on cluster mycluster"
+ description: "Although you may continue to create subsystems in node-1, the configuration may not be supported"
# NVMeoFVersionMismatch
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_gateway_info{version="0.0.7"}'
+ - series: 'ceph_nvmeof_gateway_info{version="0.0.7",cluster="mycluster"}'
values: '1+0x80'
- - series: 'ceph_nvmeof_gateway_info{version="1.0.0"}'
+ - series: 'ceph_nvmeof_gateway_info{version="1.0.0",cluster="mycluster"}'
values: '1+0x80'
promql_expr_test:
- - expr: count(count by(version) (ceph_nvmeof_gateway_info)) > 1
+ - expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1
eval_time: 1m
exp_samples:
- - labels: '{}'
+ - labels: '{cluster="mycluster"}'
value: 2
alert_rule_test:
- eval_time: 1h
@@ -2467,23 +2491,24 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "The cluster has different NVMe-oF gateway releases active "
- description: "This may indicate an issue with deployment. Check cephadm logs"
+ summary: "Too many different NVMe-oF gateway releases active on cluster mycluster"
+ description: "This may indicate an issue with deployment. Check cephadm logs"
# NVMeoFHighClientCount
- interval: 1m
input_series:
- - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1"}'
+ - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}'
values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
- - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2"}'
+ - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}'
values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
promql_expr_test:
- expr: ceph_nvmeof_subsystem_host_count > 32.00
eval_time: 15m
exp_samples:
- - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1"}'
+ - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}'
value: 38
alert_rule_test:
- eval_time: 20m
@@ -2492,55 +2517,57 @@ tests:
- exp_labels:
nqn: nqn1
severity: warning
+ cluster: mycluster
type: ceph_default
exp_annotations:
- summary: "The number of clients connected to nqn1 is too high "
- description: "The supported limit for clients connecting to a subsystem is 32"
-
+ summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
+ description: "The supported limit for clients connecting to a subsystem is 32"
+
# NVMeoFHighHostCPU
- interval: 1m
input_series:
- - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0"}'
+ - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0",cluster="mycluster"}'
values: '0+18x10 180+9x20'
- - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1"}'
+ - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1",cluster="mycluster"}'
values: '0+18x10 180+9x20'
- - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008"}'
+ - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008",cluster="mycluster"}'
values: '1.00+0x20'
promql_expr_test:
- expr: 100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= 80
eval_time: 16m
exp_samples:
- - labels: '{host="node-1",instance="node-1:10008"}'
+ - labels: '{host="node-1",instance="node-1:10008",cluster="mycluster"}'
value: 85
alert_rule_test:
# negative match at 15m
- eval_time: 15m
alertname: NVMeoFHighHostCPU
- # positive match at 25m
+ # positive match at 25m
- eval_time: 25m
alertname: NVMeoFHighHostCPU
exp_alerts:
- exp_labels:
instance: node-1:10008
host: node-1
+ cluster: mycluster
severity: warning
type: ceph_default
exp_annotations:
- summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) "
- description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
+ summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) on cluster mycluster"
+ description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
# NVMeoFInterfaceDown - triggered on eth0 only
- interval: 30s
input_series:
- - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0"}'
+ - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0", cluster="mycluster"}'
+ values: '1+0x30'
+ - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", device="eth1", cluster="mycluster"}'
values: '1+0x30'
- - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", device="eth1"}'
- values: '1+0x30'
promql_expr_test:
- expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}
eval_time: 1m
exp_samples:
- - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down"}'
+ - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 5m
@@ -2550,24 +2577,25 @@ tests:
oid: 1.3.6.1.4.1.50495.1.2.1.14.1
operstate: down
device: eth0
+ cluster: mycluster
severity: warning
type: ceph_default
exp_annotations:
- summary: "Network interface eth0 is down "
- description: "A NIC used by one or more subsystems is in a down state"
+ summary: "Network interface eth0 is down on cluster mycluster"
+ description: "A NIC used by one or more subsystems is in a down state"
# NVMeoFInterfaceDuplex - triggered on eth1 only
- interval: 30s
input_series:
- - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0"}'
- values: '1+0x30'
- - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1"}'
+ - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0", cluster="mycluster"}'
+ values: '1+0x30'
+ - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1", cluster="mycluster"}'
values: '1+0x30'
promql_expr_test:
- expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}
eval_time: 30s
exp_samples:
- - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half"}'
+ - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half", cluster="mycluster"}'
value: 1
alert_rule_test:
- eval_time: 5m
@@ -2576,18 +2604,19 @@ tests:
- exp_labels:
duplex: half
device: eth1
+ cluster: mycluster
severity: warning
type: ceph_default
exp_annotations:
- summary: "Network interface eth1 is not running in full duplex mode "
- description: "Until this is resolved, performance from the gateway will be degraded"
+ summary: "Network interface eth1 is not running in full duplex mode on cluster mycluster"
+ description: "Until this is resolved, performance from the gateway will be degraded"
- # NVMeoFHighReadLatency
+ # NVMeoFHighReadLatency
- interval: 30s
input_series:
- - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1"}'
- values: '0+1680x10 19800+3000x20'
- - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1"}'
+ - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}'
+ values: '0+1680x10 19800+3000x20'
+ - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}'
values: '0+286000x10 2980000+120000x20'
promql_expr_test:
- expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02
@@ -2610,14 +2639,14 @@ tests:
type: ceph_default
exp_annotations:
summary: "The average read latency over the last 5 mins has reached 10 ms or more on node-1"
- description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+ description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
- # NVMeoFHighWriteLatency
+ # NVMeoFHighWriteLatency
- interval: 30s
input_series:
- - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1"}'
- values: '0+1680x10 19800+3000x20'
- - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1"}'
+ - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}'
+ values: '0+1680x10 19800+3000x20'
+ - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}'
values: '0+286000x10 2980000+120000x20'
promql_expr_test:
- expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[1m]) / rate(ceph_nvmeof_bdev_writes_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02
@@ -2641,4 +2670,3 @@ tests:
exp_annotations:
summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1"
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
- \ No newline at end of file
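Note: the alert unit tests above follow the promtool rule-test format (input_series, promql_expr_test, alert_rule_test). A minimal sketch of running them locally, assuming promtool is installed and the test file lives at tests_alerts/test_alerts.yml under monitoring/ceph-mixin:

    # hedged example: exercise the ceph-mixin alert unit tests with promtool
    cd monitoring/ceph-mixin
    promtool test rules tests_alerts/test_alerts.yml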
diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini
index 90ac311edba..8d1ec872e7f 100644
--- a/monitoring/ceph-mixin/tox.ini
+++ b/monitoring/ceph-mixin/tox.ini
@@ -26,6 +26,7 @@ allowlist_externals =
jsonnet
jsonnetfmt
sh
+ ./lint-jsonnet.sh
description =
check: Ensure that auto-generated files matches the current version
fix: Update generated files from jsonnet file with latest changes
diff --git a/qa/README b/qa/README
index f9b8988c6f9..a6a95c479bc 100644
--- a/qa/README
+++ b/qa/README
@@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or
ubuntu, chosen randomly.
The teuthology code can be found in https://github.com/ceph/teuthology.git
+
+Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git
+CBT will not support cosbench beyond release tag v0.3; therefore, no qa suite should use cosbench.
+cosbench support has been removed from qa/tasks/cbt.py.
+
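Note: as a rough illustration of the point above, obtaining CBT for a perf run amounts to cloning the master branch onto the test node (the target directory below is illustrative; qa/tasks/cbt.py handles this automatically):

    # hedged sketch of what checkout_cbt() in qa/tasks/cbt.py effectively does
    git clone -b master https://github.com/ceph/cbt.git "$TESTDIR/cbt"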
diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml
index 678548fe2cc..94b42579777 100644
--- a/qa/cephfs/overrides/ignorelist_health.yaml
+++ b/qa/cephfs/overrides/ignorelist_health.yaml
@@ -21,3 +21,6 @@ overrides:
- overall HEALTH_
- Replacing daemon
- deprecated feature inline_data
+ - BLUESTORE_SLOW_OP_ALERT
+ - slow operation indications in BlueStore
+ - experiencing slow operations in BlueStore
diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml
index 1740134a2e0..07ca62e01fb 100644
--- a/qa/cephfs/overrides/pg_health.yaml
+++ b/qa/cephfs/overrides/pg_health.yaml
@@ -9,3 +9,5 @@ overrides:
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 4eac1106e8d..843e9b9901b 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -234,146 +234,6 @@ function wait_background_check() {
return $return_code
}
-# osd_scrub_during_recovery=true make sure scrub happens
-# update 26.8.24: the test should be redesigned. The current version is not
-# reliable, and playing around with the timeouts and such won't fix the
-# design issues.
-function TEST_recovery_scrub_2() {
- local dir=$1
- local poolname=test
- return 0
-
- TESTDATA="testdata.$$"
- OSDS=8
- PGS=32
- OBJECTS=40
-
- setup $dir || return 1
- run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
- run_mgr $dir x --mgr_stats_period=1 || return 1
- local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 "
- ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
- ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
- ceph_osd_args+="--osd_stats_update_period_scrubbing=2 "
- ceph_osd_args+="--mgr_stats_period=1"
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=1 \
- $ceph_osd_args || return 1
- done
-
- # Create a pool with $PGS pgs
- create_pool $poolname $PGS $PGS
- wait_for_clean || return 1
- poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
-
- dd if=/dev/urandom of=$TESTDATA bs=1M count=50
- for i in $(seq 1 $OBJECTS)
- do
- rados -p $poolname put obj${i} $TESTDATA
- done
- rm -f $TESTDATA
-
- ceph osd pool set $poolname size 3
-
- ceph pg dump pgs
-
- # note that the following will be needed if the mclock scheduler is specified
- ceph tell osd.* config get osd_mclock_override_recovery_settings
-
- # the '_max_active' is expected to be 0
- ceph tell osd.1 config get osd_recovery_max_active
- # both next parameters are expected to be >=3
- ceph tell osd.1 config set osd_recovery_max_active_hdd 6
- ceph tell osd.1 config set osd_recovery_max_active_ssd 6
- ceph tell osd.1 config get osd_recovery_max_active_hdd
- ceph tell osd.1 config get osd_recovery_max_active_ssd
-
- # Wait for recovery to start
- count=0
- while(true)
- do
- #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
- ceph pg dump pgs
- if test $(ceph --format json pg dump pgs |
- jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
- then
- break
- fi
- sleep 2
- if test "$count" -eq "10"
- then
- echo "Not enough recovery started simultaneously"
- return 1
- fi
- count=$(expr $count + 1)
- done
- ceph pg dump pgs
-
- pids=""
- recov_scrub_count=0
- for pg in $(seq 0 $(expr $PGS - 1))
- do
- run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
- done
- wait_background_check pids
- return_code=$?
- if [ $return_code -ne 0 ]; then return $return_code; fi
-
- ERRORS=0
- if test $recov_scrub_count -eq 0
- then
- echo "No scrubs occurred while PG recovering"
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
- pid=$(cat $pidfile)
- if ! kill -0 $pid
- then
- echo "OSD crash occurred"
- #tail -100 $dir/osd.0.log
- ERRORS=$(expr $ERRORS + 1)
- fi
-
- # Work around for http://tracker.ceph.com/issues/38195
- kill_daemons $dir #|| return 1
-
- declare -a err_strings
- ## we do not expect a refusal to scrub
- err_strings[0]="recovery in progress.*scrubs"
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- grep "recovery in progress.*scrubs" $dir/osd.${osd}.log
- done
- for err_string in "${err_strings[@]}"
- do
- found=false
- for osd in $(seq 0 $(expr $OSDS - 1))
- do
- if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
- then
- found=true
- fi
- done
- if [ "$found" = "true" ]; then
- echo "Found log message not expected '$err_string'"
- ERRORS=$(expr $ERRORS + 1)
- fi
- done
-
- teardown $dir || return 1
-
- if [ $ERRORS != "0" ];
- then
- echo "TEST FAILED WITH $ERRORS ERRORS"
- return 1
- fi
-
- echo "TEST PASSED"
- return 0
-}
-
main osd-recovery-scrub "$@"
# Local Variables:
diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 59564f7e37e..491e46603f7 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() {
['pool_name']="testpool"
['extras']=" --osd_scrub_auto_repair=true"
)
- local extr_dbg=3
standard_scrub_cluster $dir cluster_conf
local poolid=${cluster_conf['pool_id']}
local poolname=${cluster_conf['pool_name']}
@@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() {
grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1
}
+#
+# Testing the "split scrub store" feature: shallow scrubs do not
+# purge deep errors from the store.
+#
+# Corrupt one copy of a replicated pool, creating both shallow and deep errors.
+# Then shallow-scrub the pool and verify that the deep errors are still present.
+#
+function TEST_dual_store_replicated_cluster() {
+ local dir=$1
+ local poolname=csr_pool
+ local total_objs=19
+ local extr_dbg=1 # note: 3 and above leave some temp files around
+
+ run_mon $dir a --osd_pool_default_size=2 || return 1
+ run_mgr $dir x --mgr_stats_period=1 || return 1
+ local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+ ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 "
+ ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 "
+ for osd in $(seq 0 1)
+ do
+ run_osd $dir $osd $ceph_osd_args || return 1
+ done
+
+ create_rbd_pool || return 1
+ wait_for_clean || return 1
+
+ create_pool foo 1 || return 1
+ create_pool $poolname 1 1 || return 1
+ wait_for_clean || return 1
+
+ ceph osd pool set $poolname noscrub 1
+ ceph osd pool set $poolname nodeep-scrub 1
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+ add_something $dir $poolname $objname || return 1
+
+ rados --pool $poolname setomapheader $objname hdr-$objname || return 1
+ rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1
+ done
+
+ # Increase file 1 MB + 1KB
+ dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025
+ rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1
+ rm -f $dir/new.ROBJ19
+
+ local pg=$(get_pg $poolname ROBJ0)
+ local primary=$(get_primary $poolname ROBJ0)
+
+ # Compute an old omap digest and save oi
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \
+ config set osd_deep_scrub_update_digest_min_age 0
+ pg_deep_scrub $pg
+
+ for i in $(seq 1 $total_objs) ; do
+ objname=ROBJ${i}
+
+ # Alternate corruption between osd.0 and osd.1
+ local osd=$(expr $i % 2)
+
+ case $i in
+ 1)
+ # Size (deep scrub data_digest too)
+ local payload=UVWXYZZZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 2)
+ # digest (deep scrub only)
+ local payload=UVWXYZ
+ echo $payload > $dir/CORRUPT
+ objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1
+ ;;
+
+ 3)
+ # missing
+ objectstore_tool $dir $osd $objname remove || return 1
+ ;;
+
+ 4)
+ # Modify omap value (deep scrub only)
+ objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1
+ ;;
+
+ 5)
+ # Delete omap key (deep scrub only)
+ objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1
+ ;;
+
+ 6)
+ # Add extra omap key (deep scrub only)
+ echo extra > $dir/extra-val
+ objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1
+ rm $dir/extra-val
+ ;;
+
+ 7)
+ # Modify omap header (deep scrub only)
+ echo -n newheader > $dir/hdr
+ objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1
+ rm $dir/hdr
+ ;;
+
+ 8)
+ rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1
+ rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1
+
+ # Break xattrs
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1
+ objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1
+ echo -n val3-$objname > $dir/newval
+ objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1
+ rm $dir/bad-val $dir/newval
+ ;;
+
+ 9)
+ objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi
+ echo -n D > $dir/change
+ rados --pool $poolname put $objname $dir/change
+ objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi
+ rm $dir/oi $dir/change
+ ;;
+
+ # ROBJ10 must be handled after digests are re-computed by a deep scrub below
+ # ROBJ11 must be handled with config change before deep scrub
+ # ROBJ12 must be handled with config change before scrubs
+ # ROBJ13 must be handled before scrubs
+
+ 14)
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1
+ objectstore_tool $dir 1 $objname rm-attr _ || return 1
+ rm $dir/bad-val
+ ;;
+
+ 15)
+ objectstore_tool $dir $osd $objname rm-attr _ || return 1
+ ;;
+
+ 16)
+ objectstore_tool $dir 0 $objname rm-attr snapset || return 1
+ echo -n bad-val > $dir/bad-val
+ objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1
+ ;;
+
+ 17)
+      # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ17
+ echo $payload > $dir/new.ROBJ17
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1
+ ;;
+
+ 18)
+      # Deep-scrub only (all replicas are different from the object info)
+ local payload=ROBJ18
+ echo $payload > $dir/new.ROBJ18
+ objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1
+ objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1
+ # Make one replica have a different object info, so a full repair must happen too
+ objectstore_tool $dir $osd $objname corrupt-info || return 1
+ ;;
+
+ 19)
+ # Set osd-max-object-size smaller than this object's size
+
+ esac
+ done
+
+ local pg=$(get_pg $poolname ROBJ0)
+
+ ceph tell osd.\* injectargs -- --osd-max-object-size=1048576
+
+ inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+ inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+ inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1
+
+ # first sequence: the final shallow scrub should not override any of the deep errors
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_1b_s.json
+
+ pg_deep_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_2s.json
+
+ pg_scrub $pg
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_3.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_3s.json
+
+ diff -u $dir/dp_results.json $dir/sh2_results.json || return 1
+
+ # inject a read error, which is a special case: the scrub encountering the read error
+ # would override the previously collected shard info.
+ inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0
+
+ pg_deep_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json
+ # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_4s.json
+
+ pg_scrub $pg
+
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \
+ python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json
+ (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \
+ jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json
+
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \
+ $dir/sh2Part2_w13_results.json
+ rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\
+ jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json
+
+ # the shallow scrub results should differ from the results of the deep
+ # scrub preceding it, but the difference should be limited to ROBJ13
+ diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1
+ diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1
+
+ ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it
+ return 0
+}
+
main osd-scrub-repair "$@"
diff --git a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
index 0f6021975a4..50d170f5022 100644
--- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml
+++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml
@@ -10,3 +10,4 @@ tasks:
osd:
debug monc: 20
flavor: crimson
+- ssh_keys:
diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml
index da841373220..42ca9336c8e 100644
--- a/qa/suites/fs/libcephfs/tasks/client.yaml
+++ b/qa/suites/fs/libcephfs/tasks/client.yaml
@@ -12,3 +12,4 @@ tasks:
clients:
client.0:
- client/test.sh
+ - client/test_oc_disabled.sh
diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
index 713adb9628a..96e4353e99c 100644
--- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
+++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml
@@ -2,3 +2,4 @@ overrides:
ceph:
log-ignorelist:
- OSD_DOWN
+ - osd.*is down
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
index 2e4741e8140..7c97edae552 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
index 2e873a04bab..9ef37004427 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
index 83d16e4cb2c..12cb50b408d 100644
--- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
index 6db0c0d4e18..b4755a6433b 100644
--- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
@@ -1,14 +1,14 @@
tasks:
- nvmeof:
installer: host.a
- gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
+ gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
rbd:
pool_name: mypool
image_name_prefix: myimage
gateway_config:
subsystems_count: 3
namespaces_count: 20 # each subsystem
- cli_image: quay.io/ceph/nvmeof-cli:1.2
+ cli_image: quay.io/ceph/nvmeof-cli:latest
- cephadm.wait_for_service:
service: nvmeof.mypool.mygroup0
diff --git a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
index 0080d3bf730..c6bec082843 100644
--- a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
+++ b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml
@@ -131,8 +131,10 @@ tasks:
- ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo
- while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done
- ceph orch ps
+ - ceph versions
# verify all rgw daemons on same version and version hash matches what we are upgrading to
- - ceph versions | jq -e '.rgw | length == 1'
+ # `ceph versions` might not get updated immediately for rgw so retry this
+ - time timeout 60 bash -c "until ceph versions | jq -e '.rgw | length == 1'; do sleep 2; done"
- ceph versions | jq -e '.rgw | keys' | grep $sha1
- ceph orch upgrade status
- ceph health detail
diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
new file mode 100644
index 00000000000..5207fd415b7
--- /dev/null
+++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
@@ -0,0 +1,77 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - CEPHADM_FAILED_DAEMON
+ log-only-match:
+ - CEPHADM_
+roles:
+- - host.a
+ - mon.a
+ - mgr.a
+ - osd.0
+- - host.b
+ - mon.b
+ - mgr.b
+ - osd.1
+- - host.c
+ - mon.c
+ - osd.2
+tasks:
+- install:
+- cephadm:
+- cephadm.shell:
+ host.c:
+ - |
+ set -ex
+ # Deploy monitoring stack
+ ceph orch apply node-exporter
+ ceph orch apply grafana
+ ceph orch apply alertmanager
+ ceph orch apply prometheus
+ sleep 240
+ # generate SSL certificate
+ openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*"
+ # Generate a mgmt.spec template
+ cat << EOT > /tmp/mgmt.spec
+ service_type: mgmt-gateway
+ service_id: foo
+ placement:
+ hosts:
+ - ${HOSTNAME}
+ spec:
+ ssl_protocols:
+ - TLSv1.2
+ - TLSv1.3
+ ssl_ciphers:
+ - AES128-SHA
+ - AES256-SHA
+ enable_health_check_endpoint: True
+ EOT
+ # Add generated certificates to spec file
+ echo " ssl_certificate: |" >> /tmp/mgmt.spec
+ while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
+ echo " ssl_certificate_key: |" >> /tmp/mgmt.spec
+ while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec
+ # Apply spec
+ ceph orch apply -i /tmp/mgmt.spec
+- cephadm.wait_for_service:
+ service: mgmt-gateway
+- cephadm.shell:
+ host.a:
+ - |
+ set -ex
+ # retrieve mgmt hostname and ip
+ MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
+ MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
+ # check mgmt-gateway health
+ curl -k -s https://${MGMT_GTW_IP}/health
+ curl -k -s https://${MGMT_GTW_IP}:29443/health
+ # wait for background services to be reconfigured following mgmt-gateway installation
+ sleep 180
+ # check grafana endpoints are responsive and database health is okay
+ curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
+ # check prometheus endpoints are responsive
+ curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
+ # check alertmanager endpoints are responsive
+ curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
+
diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
index 3814ea3efdb..146bd57960d 100644
--- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
@@ -31,3 +31,5 @@ overrides:
conf:
osd:
osd shutdown pgref assert: true
+ log-ignorelist:
+ - PG_DEGRADED
diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
index bf3005fad45..ce4e0cc228b 100644
--- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
@@ -6,6 +6,7 @@ overrides:
- MON_DOWN
- out of quorum
- PG_AVAILABILITY
+ - PG_DEGRADED
tasks:
- install:
branch: reef
diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py
index 84e096520b4..e6a9dc8223c 100644
--- a/qa/tasks/cbt.py
+++ b/qa/tasks/cbt.py
@@ -47,22 +47,11 @@ class CBT(Task):
benchmark_config = self.config.get('benchmarks')
benchmark_type = next(iter(benchmark_config.keys()))
+
if benchmark_type in ['librbdfio', 'fio']:
testdir = misc.get_testdir(self.ctx)
benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio')
- if benchmark_type == 'cosbench':
- # create cosbench_dir and cosbench_xml_dir
- testdir = misc.get_testdir(self.ctx)
- benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos')
- benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml')
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']])
- benchmark_config['cosbench']['controller'] = osd_hosts[0]
-
- # set auth details
- remotes_and_roles = self.ctx.cluster.remotes.items()
- ips = [host for (host, port) in
- (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
- benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0])
+
client_endpoints_config = self.config.get('client_endpoints', None)
monitoring_profiles = self.config.get('monitoring_profiles', {})
@@ -117,77 +106,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- # install cosbench
- self.log.info('install dependencies for cosbench')
- if system_type == 'rpm':
- cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl']
- else:
- cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl']
- self.first_mon.run(args=install_cmd + cosbench_depends)
- testdir = misc.get_testdir(self.ctx)
- cosbench_version = '0.4.2.c3'
- cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip'
- os_version = misc.get_system_type(self.first_mon, False, True)
-
- # additional requirements for bionic
- if os_version == '18.04':
- self.first_mon.run(
- args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*'])
- # use our own version of cosbench
- cosbench_version = 'cosbench-0.4.2.c3.1'
- # contains additional parameter "-N" to nc
- cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip'
- cosbench_dir = os.path.join(testdir, cosbench_version)
- self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir])
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version
- ]
- )
- else:
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'wget',
- cosbench_location, run.Raw('&&'),
- 'unzip', '{name}.zip'.format(name=cosbench_version)
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'ln', '-s', cosbench_version, 'cos',
- ]
- )
- self.first_mon.run(
- args=[
- 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'),
- 'chmod', '+x', run.Raw('*.sh'),
- ]
- )
-
- # start cosbench and check info
- self.log.info('start cosbench')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'start-all.sh'
- ]
- )
- self.log.info('check cosbench info')
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'cli.sh', 'info'
- ]
- )
-
def checkout_cbt(self):
testdir = misc.get_testdir(self.ctx)
repo = self.config.get('repo', 'https://github.com/ceph/cbt.git')
@@ -269,51 +187,6 @@ class CBT(Task):
]
)
- if benchmark_type == 'cosbench':
- os_version = misc.get_system_type(self.first_mon, False, True)
- if os_version == '18.04':
- cosbench_version = 'cosbench-0.4.2.c3.1'
- else:
- cosbench_version = '0.4.2.c3'
- # note: stop-all requires 'nc'
- self.first_mon.run(
- args=[
- 'cd', testdir, run.Raw('&&'),
- 'cd', 'cos', run.Raw('&&'),
- 'sh', 'stop-all.sh',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'sudo', 'killall', '-9', 'java',
- run.Raw('||'), 'true'
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/cos'.format(tdir=testdir),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version),
- ]
- )
- self.first_mon.run(
- args=[
- 'rm', '--one-file-system', '-rf', '--',
- '{tdir}/xml'.format(tdir=testdir),
- ]
- )
# Collect cbt performance data
cbt_performance = CBTperformance()
cbt_performance.collect(self.ctx, self.config)
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 1c00a49077d..2b7fd2ee569 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -640,8 +640,11 @@ class FilesystemBase(MDSClusterBase):
def set_joinable(self, joinable=True):
self.set_var("joinable", joinable)
- def set_max_mds(self, max_mds):
- self.set_var("max_mds", "%d" % max_mds)
+ def set_max_mds(self, max_mds, confirm=True):
+ if confirm:
+ self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it')
+ else:
+ self.set_var("max_mds", f"{max_mds}",)
def set_session_timeout(self, timeout):
self.set_var("session_timeout", "%d" % timeout)
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index 6a583cb4d0f..00a68dd0183 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -2683,3 +2683,60 @@ class TestMDSFail(TestAdminCommands):
errmsgs=health_warn)
self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
+
+
+class TestFSSetMaxMDS(TestAdminCommands):
+
+ def test_when_unhealthy_without_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>" without the
+ confirmation flag (--yes-i-really-mean-it) fails when cluster is
+ unhealthy.
+ '''
+ self.gen_health_warn_mds_cache_oversized()
+
+ with self.assertRaises(CommandFailedError) as cfe:
+ self.fs.set_max_mds(2, confirm=False)
+ self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
+
+ def test_when_unhealthy_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully when cluster is unhealthy.
+ '''
+ self.gen_health_warn_mds_cache_oversized()
+
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+ def test_when_mds_trim_without_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>" without the
+ confirmation flag (--yes-i-really-mean-it) fails when cluster has
+ MDS_TRIM health warning.
+ '''
+ self.gen_health_warn_mds_trim()
+
+ with self.assertRaises(CommandFailedError) as cfe:
+ self.fs.set_max_mds(2, confirm=False)
+ self.assertEqual(cfe.exception.exitstatus, errno.EPERM)
+
+ def test_when_mds_trim_when_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM
+ health warning.
+ '''
+ self.gen_health_warn_mds_trim()
+
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
+
+ def test_when_healthy_with_confirm(self):
+ '''
+ Test that command "ceph fs set <fsname> max_mds <num>
+ --yes-i-really-mean-it" runs successfully also when cluster is
+ healthy.
+ '''
+ self.fs.set_max_mds(2, confirm=True)
+ self.assertEqual(self.fs.get_var('max_mds'), 2)
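Note: a minimal sketch of the CLI behaviour these tests cover (the filesystem name is illustrative): without the confirmation flag the command is rejected while health warnings such as MDS_TRIM or an oversized MDS cache are present, and with the flag it proceeds.

    # hedged example of the command under test
    ceph fs set cephfs max_mds 2                          # expected to fail with EPERM while the cluster is unhealthy
    ceph fs set cephfs max_mds 2 --yes-i-really-mean-it   # expected to succeed regardless of health warnings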
diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py
index 932d504d47f..19076ea44b3 100644
--- a/qa/tasks/cephfs/test_nfs.py
+++ b/qa/tasks/cephfs/test_nfs.py
@@ -8,6 +8,7 @@ from io import BytesIO, StringIO
from tasks.mgr.mgr_test_case import MgrTestCase
from teuthology import contextutil
from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import Raw
log = logging.getLogger(__name__)
@@ -319,7 +320,7 @@ class TestNFS(MgrTestCase):
else:
log.warning(f'{e}, retrying')
- def _test_mnt(self, pseudo_path, port, ip, check=True):
+ def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False):
'''
Test mounting of created exports
:param pseudo_path: It is the pseudo root name
@@ -347,12 +348,27 @@ class TestNFS(MgrTestCase):
self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt'])
try:
+            # Clean up the volumes directory created by 'subvolume create' in some tests
+ self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes'])
self.ctx.cluster.run(args=['touch', '/mnt/test'])
out_mnt = self._sys_cmd(['ls', '/mnt'])
self.assertEqual(out_mnt, b'test\n')
+ if datarw:
+ self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1'])
+ out_test1 = self._sys_cmd(['cat', '/mnt/test1'])
+ self.assertEqual(out_test1, b'test data\n')
finally:
self.ctx.cluster.run(args=['sudo', 'umount', '/mnt'])
+ def _test_data_read_write(self, pseudo_path, port, ip):
+ '''
+ Check if read/write works fine
+ '''
+ try:
+ self._test_mnt(pseudo_path, port, ip, True, True)
+ except CommandFailedError as e:
+ self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}")
+
def _write_to_read_only_export(self, pseudo_path, port, ip):
'''
Check if write to read only export fails
@@ -599,6 +615,18 @@ class TestNFS(MgrTestCase):
self._write_to_read_only_export(self.pseudo_path, port, ip)
self._test_delete_cluster()
+ def test_data_read_write(self):
+ '''
+        Test data read and write on an export.
+ '''
+ self._test_create_cluster()
+ self._create_export(export_id='1', create_fs=True,
+ extra_cmd=['--pseudo-path', self.pseudo_path])
+ port, ip = self._get_port_ip_info()
+ self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
+ self._test_data_read_write(self.pseudo_path, port, ip)
+ self._test_delete_cluster()
+
def test_cluster_info(self):
'''
Test cluster info outputs correct ip and hostname
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
index 2baefd72c3f..2ee3b6ac052 100644
--- a/qa/tasks/cephfs/test_volumes.py
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -2388,7 +2388,7 @@ class TestSubvolumes(TestVolumesHelper):
self._fs_cmd("subvolume", "create", self.volname, subvolume)
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
# get earmark
@@ -2401,7 +2401,7 @@ class TestSubvolumes(TestVolumesHelper):
self._fs_cmd("subvolume", "create", self.volname, subvolume)
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
# remove earmark
@@ -2559,7 +2559,7 @@ class TestSubvolumes(TestVolumesHelper):
self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature))
# set earmark
- earmark = "smb.test"
+ earmark = "smb"
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark)
subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume))
@@ -7876,7 +7876,22 @@ class TestCloneProgressReporter(TestVolumesHelper):
self.run_ceph_cmd('fs subvolume snapshot rm --force '
f'--format json {v} {sv} {ss}')
- self.run_ceph_cmd(f'fs subvolume rm {v} {sv}')
+ try:
+ self.run_ceph_cmd(f'fs subvolume rm {v} {sv}')
+ except CommandFailedError as e:
+ if e.exitstatus == errno.ENOENT:
+ log.info(
+                    'ignoring this error; the subvolume was perhaps deleted '
+                    'during the test, making the snapshot deleted above a '
+                    'retained snapshot (one kept despite the deletion of its '
+                    'subvolume). when a retained snapshot is deleted, the '
+                    'subvolume directory is deleted along with it, but until '
+                    'then the subvolume is still reported by the "subvolume '
+                    'ls" command, which is probably what caused the '
+                    'confusion here')
+ pass
+ else:
+ raise
# verify trash dir is clean
self._wait_for_trash_empty()
@@ -8090,6 +8105,58 @@ class TestCloneProgressReporter(TestVolumesHelper):
# and not cancelling these clone doesnt affect this test case.
self.cancel_clones_and_ignore_if_finished(c)
+ def test_clone_after_subvol_is_removed(self):
+ '''
+ Initiate cloning after source subvolume has been deleted but with
+ snapshots retained and then test that, when this clone is in progress,
+ one progress bar is printed in output of command "ceph status" that
+ shows progress of this clone.
+ '''
+ v = self.volname
+ sv = 'sv1'
+ ss = 'ss1'
+        # XXX: "clone" must be part of the clone name for the sake of tearDown()
+ c = 'ss1clone1'
+
+ # XXX: without setting mds_snap_rstat to true rstats are not updated on
+        # a subvolume snapshot and therefore the clone progress bar will not show
+ # any progress.
+ self.config_set('mds', 'mds_snap_rstat', 'true')
+
+ self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777')
+ size = self._do_subvolume_io(sv, None, None, 10, 1024)
+
+ self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}')
+ self.wait_till_rbytes_is_right(v, sv, size)
+
+ self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots')
+ self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}')
+
+ with safe_while(tries=15, sleep=10) as proceed:
+ while proceed():
+ pev = self.get_pevs_from_ceph_status(c)
+
+ if len(pev) < 1:
+ continue
+ elif len(pev) > 1:
+ raise RuntimeError('For 1 clone "ceph status" output has 2 '
+ 'progress bars, it should have only 1 '
+ f'progress bar.\npev -\n{pev}')
+
+ # ensure that exactly 1 progress bar for cloning is present in
+ # "ceph status" output
+ msg = ('"progress_events" dict in "ceph status" output must have '
+ f'exactly one entry.\nprogress_event dict -\n{pev}')
+ self.assertEqual(len(pev), 1, msg)
+
+ pev_msg = tuple(pev.values())[0]['message']
+ self.assertIn('1 ongoing clones', pev_msg)
+ break
+
+ # allowing clone jobs to finish will consume too much time and space
+        # and not cancelling these clones doesn't affect this test case.
+ self.cancel_clones_and_ignore_if_finished(c)
+
def test_clones_equal_to_cloner_threads(self):
'''
Test that one progress bar is printed in output of "ceph status" output
diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py
index 07c69ddc47c..be7afccf331 100644
--- a/qa/tasks/mgr/dashboard/test_osd.py
+++ b/qa/tasks/mgr/dashboard/test_osd.py
@@ -11,6 +11,7 @@ from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple,
class OsdTest(DashboardTestCase):
AUTH_ROLES = ['cluster-manager']
+ _VERSION = '1.1'
@classmethod
def setUpClass(cls):
@@ -24,7 +25,7 @@ class OsdTest(DashboardTestCase):
@DashboardTestCase.RunAs('test', 'test', ['block-manager'])
def test_access_permissions(self):
- self._get('/api/osd')
+ self._get('/api/osd', version=self._VERSION)
self.assertStatus(403)
self._get('/api/osd/0')
self.assertStatus(403)
@@ -33,7 +34,7 @@ class OsdTest(DashboardTestCase):
self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True))
def test_list(self):
- data = self._get('/api/osd')
+ data = self._get('/api/osd', version=self._VERSION)
self.assertStatus(200)
self.assertGreaterEqual(len(data), 1)
diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh
new file mode 100755
index 00000000000..88552aa50bd
--- /dev/null
+++ b/qa/workunits/client/test_oc_disabled.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+set -ex
+
+ceph_test_client --client_oc=false
diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh
index fb72e1d6402..cc4024323eb 100755
--- a/qa/workunits/nvmeof/setup_subsystem.sh
+++ b/qa/workunits/nvmeof/setup_subsystem.sh
@@ -29,7 +29,7 @@ list_subsystems () {
# add all subsystems
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
- sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn
+ sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append
done
list_subsystems
diff --git a/src/ceph-volume/ceph_volume/__init__.py b/src/ceph-volume/ceph_volume/__init__.py
index b10100c0218..814619cfddd 100644
--- a/src/ceph-volume/ceph_volume/__init__.py
+++ b/src/ceph-volume/ceph_volume/__init__.py
@@ -6,6 +6,7 @@ from collections import namedtuple
sys_info = namedtuple('sys_info', ['devices'])
sys_info.devices = dict()
logger = logging.getLogger(__name__)
+BEING_REPLACED_HEADER: str = 'CEPH_DEVICE_BEING_REPLACED'
class AllowLoopDevices:
diff --git a/src/ceph-volume/ceph_volume/api/lvm.py b/src/ceph-volume/ceph_volume/api/lvm.py
index 16cbc08b262..fc376f891fd 100644
--- a/src/ceph-volume/ceph_volume/api/lvm.py
+++ b/src/ceph-volume/ceph_volume/api/lvm.py
@@ -10,6 +10,8 @@ from itertools import repeat
from math import floor
from ceph_volume import process, util, conf
from ceph_volume.exceptions import SizeAllocationError
+from typing import Any, Dict
+
logger = logging.getLogger(__name__)
@@ -807,13 +809,16 @@ LV_CMD_OPTIONS = ['--noheadings', '--readonly', '--separator=";"', '-a',
'--units=b', '--nosuffix']
-class Volume(object):
+class Volume:
"""
Represents a Logical Volume from LVM, with some top-level attributes like
``lv_name`` and parsed tags as a dictionary of key/value pairs.
"""
- def __init__(self, **kw):
+ def __init__(self, **kw: str) -> None:
+ self.lv_path: str = ''
+ self.lv_name: str = ''
+ self.lv_uuid: str = ''
for k, v in kw.items():
setattr(self, k, v)
self.lv_api = kw
@@ -824,13 +829,13 @@ class Volume(object):
self.encrypted = self.tags.get('ceph.encrypted', '0') == '1'
self.used_by_ceph = 'ceph.osd_id' in self.tags
- def __str__(self):
+ def __str__(self) -> str:
return '<%s>' % self.lv_api['lv_path']
- def __repr__(self):
+ def __repr__(self) -> str:
return self.__str__()
- def as_dict(self):
+ def as_dict(self) -> Dict[str, Any]:
obj = {}
obj.update(self.lv_api)
obj['tags'] = self.tags
@@ -839,7 +844,7 @@ class Volume(object):
obj['path'] = self.lv_path
return obj
- def report(self):
+ def report(self) -> Dict[str, Any]:
if not self.used_by_ceph:
return {
'name': self.lv_name,
diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
index c1bef82c109..388f6aeea27 100644
--- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py
+++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
@@ -5,12 +5,12 @@ import time
from textwrap import dedent
-from ceph_volume import decorators, terminal, process
+from ceph_volume import decorators, terminal, process, BEING_REPLACED_HEADER
from ceph_volume.api import lvm as api
from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict
from ceph_volume.util.device import Device
from ceph_volume.systemd import systemctl
-from typing import List
+from typing import Any, Dict, List
logger = logging.getLogger(__name__)
mlogger = terminal.MultiLogger(__name__)
@@ -95,29 +95,29 @@ def zap_data(path):
'conv=fsync'
])
-
-def find_associated_devices(osd_id=None, osd_fsid=None):
+def find_associated_devices(osd_id: str = '', osd_fsid: str = '') -> List[api.Volume]:
"""
From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the
system that match those tag values, further detect if any partitions are
part of the OSD, and then return the set of LVs and partitions (if any).
"""
lv_tags = {}
- if osd_id:
- lv_tags['ceph.osd_id'] = osd_id
- if osd_fsid:
- lv_tags['ceph.osd_fsid'] = osd_fsid
+ lv_tags = {key: value for key, value in {
+ 'ceph.osd_id': osd_id,
+ 'ceph.osd_fsid': osd_fsid
+ }.items() if value}
lvs = api.get_lvs(tags=lv_tags)
+
if not lvs:
raise RuntimeError('Unable to find any LV for zapping OSD: '
- '%s' % osd_id or osd_fsid)
-
+ f'{osd_id or osd_fsid}')
devices_to_zap = ensure_associated_lvs(lvs, lv_tags)
- return [Device(path) for path in set(devices_to_zap) if path]
+ return [Device(path) for path in set(devices_to_zap) if path]
-def ensure_associated_lvs(lvs, lv_tags={}):
+def ensure_associated_lvs(lvs: List[api.Volume],
+ lv_tags: Dict[str, Any] = {}) -> List[str]:
"""
Go through each LV and ensure if backing devices (journal, wal, block)
are LVs or partitions, so that they can be accurately reported.
@@ -166,14 +166,14 @@ def ensure_associated_lvs(lvs, lv_tags={}):
return list(set(verified_devices))
-class Zap(object):
-
+class Zap:
help = 'Removes all data and filesystems from a logical volume or partition.'
- def __init__(self, argv):
+ def __init__(self, argv: List[str]) -> None:
self.argv = argv
+ self.osd_ids_to_zap: List[str] = []
- def unmount_lv(self, lv):
+ def unmount_lv(self, lv: api.Volume) -> None:
if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'):
lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id'])
else:
@@ -186,40 +186,95 @@ class Zap(object):
if dmcrypt and dmcrypt_uuid:
self.dmcrypt_close(dmcrypt_uuid)
- def zap_lv(self, device):
+ def _write_replacement_header(self, device: str) -> None:
+ """Write a replacement header to a device.
+
+ This method writes the string defined in `BEING_REPLACED_HEADER`
+ to the specified device. This header indicates that the device
+ is in the process of being replaced.
+
+ Args:
+ device (str): The path to the device on which the replacement
+ header will be written.
+ """
+ disk._dd_write(device,
+ BEING_REPLACED_HEADER)
+
+ def clear_replace_header(self) -> bool:
+ """Safely erase the replacement header on a device if it is marked as being replaced.
+
+ This method checks whether the given device is marked as being replaced
+ (`device.is_being_replaced`). If true, it proceeds to erase the replacement header
+ from the device using the `_erase_replacement_header` method. The method returns
+ a boolean indicating whether any action was taken.
+
+        Note:
+            The device is read from `self.args.clear_replace_header` (a Device
+            object carrying the device path and its replacement status); this
+            method takes no explicit arguments.
+
+ Returns:
+ bool: True if the replacement header was successfully erased, False if the
+ device was not marked as being replaced or no action was necessary.
+ """
+ result: bool = False
+ device: Device = self.args.clear_replace_header
+ if device.is_being_replaced:
+ self._erase_replacement_header(device.path)
+ result = True
+ return result
+
+ def _erase_replacement_header(self, device: str) -> None:
+ """Erase the replacement header on a device.
+
+ This method writes a sequence of null bytes (`0x00`) over the area of the device
+ where the replacement header is stored, effectively erasing it.
+
+ Args:
+ device (str): The path to the device from which the replacement header will be erased.
+ """
+ disk._dd_write(device,
+ b'\x00' * len(BEING_REPLACED_HEADER))
+
+ def zap_lv(self, device: Device) -> None:
"""
Device examples: vg-name/lv-name, /dev/vg-name/lv-name
Requirements: Must be a logical volume (LV)
"""
- lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name':
- device.vg_name})
+ lv: api.Volume = device.lv_api
self.unmount_lv(lv)
-
+ self.parent_device: str = disk.get_parent_device_from_mapper(lv.lv_path)
zap_device(device.path)
if self.args.destroy:
lvs = api.get_lvs(filters={'vg_name': device.vg_name})
- if lvs == []:
- mlogger.info('No LVs left, exiting', device.vg_name)
- return
- elif len(lvs) <= 1:
+ if len(lvs) <= 1:
mlogger.info('Only 1 LV left in VG, will proceed to destroy '
'volume group %s', device.vg_name)
pvs = api.get_pvs(filters={'lv_uuid': lv.lv_uuid})
api.remove_vg(device.vg_name)
for pv in pvs:
api.remove_pv(pv.pv_name)
+ replacement_args: Dict[str, bool] = {
+ 'block': self.args.replace_block,
+ 'db': self.args.replace_db,
+ 'wal': self.args.replace_wal
+ }
+ if replacement_args.get(lv.tags.get('ceph.type'), False):
+ mlogger.info(f'Marking {self.parent_device} as being replaced')
+ self._write_replacement_header(self.parent_device)
else:
mlogger.info('More than 1 LV left in VG, will proceed to '
'destroy LV only')
mlogger.info('Removing LV because --destroy was given: %s',
device.path)
+ if self.args.replace_block:
+ mlogger.info(f'--replace-block passed but the device still has {str(len(lvs))} LV(s)')
api.remove_lv(device.path)
elif lv:
# just remove all lvm metadata, leaving the LV around
lv.clear_tags()
- def zap_partition(self, device):
+ def zap_partition(self, device: Device) -> None:
"""
Device example: /dev/sda1
Requirements: Must be a partition
@@ -247,7 +302,7 @@ class Zap(object):
mlogger.info("Destroying partition since --destroy was used: %s" % device.path)
disk.remove_partition(device)
- def zap_lvm_member(self, device):
+ def zap_lvm_member(self, device: Device) -> None:
"""
An LVM member may have more than one LV and or VG, for example if it is
a raw device with multiple partitions each belonging to a different LV
@@ -267,7 +322,7 @@ class Zap(object):
- def zap_raw_device(self, device):
+ def zap_raw_device(self, device: Device) -> None:
"""
Any whole (raw) device passed in as input will be processed here,
checking for LVM membership and partitions (if any).
@@ -287,10 +342,19 @@ class Zap(object):
self.zap_partition(Device('/dev/%s' % part_name))
zap_device(device.path)
+ # TODO(guits): I leave this commented out, this should be part of a separate patch in order to
+ # support device replacement with raw-based OSDs
+ # if self.args.replace_block:
+ # disk._dd_write(device.path, 'CEPH_DEVICE_BEING_REPLACED')
@decorators.needs_root
- def zap(self, devices=None):
- devices = devices or self.args.devices
+ def zap(self) -> None:
+ """Zap a device.
+
+ Raises:
+ SystemExit: When the device is a mapper and not a mpath device.
+ """
+ devices = self.args.devices
for device in devices:
mlogger.info("Zapping: %s", device.path)
@@ -317,21 +381,21 @@ class Zap(object):
)
@decorators.needs_root
- def zap_osd(self):
+ def zap_osd(self) -> None:
if self.args.osd_id and not self.args.no_systemd:
osd_is_running = systemctl.osd_is_active(self.args.osd_id)
if osd_is_running:
mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id)
mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id)
raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id)
- devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
- self.zap(devices)
+ self.args.devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
+ self.zap()
- def dmcrypt_close(self, dmcrypt_uuid):
+ def dmcrypt_close(self, dmcrypt_uuid: str) -> None:
mlogger.info("Closing encrypted volume %s", dmcrypt_uuid)
encryption.dmcrypt_close(mapping=dmcrypt_uuid, skip_path_check=True)
- def main(self):
+ def main(self) -> None:
sub_command_help = dedent("""
Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume.
If given a path to a logical volume it must be in the format of vg/lv. Any
@@ -419,12 +483,56 @@ class Zap(object):
help='Skip systemd unit checks',
)
+ parser.add_argument(
+ '--replace-block',
+ dest='replace_block',
+ action='store_true',
+ help='Mark the block device as unavailable.'
+ )
+
+ parser.add_argument(
+ '--replace-db',
+ dest='replace_db',
+ action='store_true',
+ help='Mark the db device as unavailable.'
+ )
+
+ parser.add_argument(
+ '--replace-wal',
+ dest='replace_wal',
+ action='store_true',
+ help='Mark the wal device as unavailable.'
+ )
+
+ parser.add_argument(
+ '--clear-replace-header',
+ dest='clear_replace_header',
+ type=arg_validators.ValidClearReplaceHeaderDevice(),
+ help='Clear the replacement header on the given device.'
+ )
+
if len(self.argv) == 0:
print(sub_command_help)
return
self.args = parser.parse_args(self.argv)
+ if self.args.clear_replace_header:
+ rc: bool = False
+ try:
+ rc = self.clear_replace_header()
+ except Exception as e:
+ raise SystemExit(e)
+ if rc:
+ mlogger.info(f'Replacement header cleared on {self.args.clear_replace_header}')
+ else:
+ mlogger.info(f'No replacement header detected on {self.args.clear_replace_header}, nothing to do.')
+ raise SystemExit(not rc)
+
+ if self.args.replace_block or self.args.replace_db or self.args.replace_wal:
+ self.args.destroy = True
+ mlogger.info('--replace-block|db|wal passed, enforcing --destroy.')
+
if self.args.osd_id or self.args.osd_fsid:
self.zap_osd()
else:
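
For context, here is a minimal sketch of the replacement-header round trip that the zap.py changes rely on, using a regular temp file in place of a block device. Only the 26-byte marker and the zero padding mirror what disk._dd_write()/_dd_read() do; the helper names below are illustrative, not part of the change.

import tempfile

BEING_REPLACED_HEADER = 'CEPH_DEVICE_BEING_REPLACED'   # 26 bytes

def write_replacement_header(path: str) -> None:
    # Marker followed by a zero pad, written at the start of the device.
    with open(path, 'r+b') as f:
        f.write(BEING_REPLACED_HEADER.encode('utf-8')
                + b'\x00' * len(BEING_REPLACED_HEADER))

def is_being_replaced(path: str) -> bool:
    # Read back the first 26 bytes and compare them to the marker.
    with open(path, 'rb') as f:
        return f.read(len(BEING_REPLACED_HEADER)) == BEING_REPLACED_HEADER.encode('utf-8')

with tempfile.NamedTemporaryFile() as dev:
    dev.truncate(4096)                  # stand-in for /dev/foo
    print(is_being_replaced(dev.name))  # False
    write_replacement_header(dev.name)
    print(is_being_replaced(dev.name))  # True
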
diff --git a/src/ceph-volume/ceph_volume/tests/conftest.py b/src/ceph-volume/ceph_volume/tests/conftest.py
index ee58081d97d..e6bf31737b6 100644
--- a/src/ceph-volume/ceph_volume/tests/conftest.py
+++ b/src/ceph-volume/ceph_volume/tests/conftest.py
@@ -360,7 +360,7 @@ def device_info(monkeypatch, patch_bluestore_label):
has_bluestore_label=False):
if devices:
for dev in devices.keys():
- devices[dev]['device_nodes'] = os.path.basename(dev)
+ devices[dev]['device_nodes'] = [os.path.basename(dev)]
else:
devices = {}
lsblk = lsblk if lsblk else {}
diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
index d630a7a6bf8..efe52c053ff 100644
--- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
+++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
@@ -7,11 +7,30 @@ from ceph_volume.api import lvm as api
from ceph_volume.devices.lvm import zap
-class TestZap(object):
- def test_invalid_osd_id_passed(self):
+class TestZap:
+ def test_invalid_osd_id_passed(self) -> None:
with pytest.raises(SystemExit):
zap.Zap(argv=['--osd-id', 'foo']).main()
+ @patch('ceph_volume.util.disk._dd_write', Mock())
+ @patch('ceph_volume.util.arg_validators.Device')
+ def test_clear_replace_header_is_being_replaced(self, m_device: Mock) -> None:
+ m_dev = m_device.return_value
+ m_dev.is_being_replaced = True
+ with pytest.raises(SystemExit) as e:
+ zap.Zap(argv=['--clear', '/dev/foo']).main()
+ assert e.value.code == 0
+
+ @patch('ceph_volume.util.disk._dd_write', Mock())
+ @patch('ceph_volume.util.arg_validators.Device')
+ def test_clear_replace_header_is_not_being_replaced(self, m_device: Mock) -> None:
+ m_dev = m_device.return_value
+ m_dev.is_being_replaced = False
+ with pytest.raises(SystemExit) as e:
+ zap.Zap(argv=['--clear', '/dev/foo']).main()
+ assert e.value.code == 1
+
+
class TestFindAssociatedDevices(object):
def test_no_lvs_found_that_match_id(self, monkeypatch, device_info):
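
The exit-code assertions in the new TestZap cases above follow from `raise SystemExit(not rc)` in Zap.main(): a cleared header raises SystemExit(False) (code 0), an absent header raises SystemExit(True) (code 1). A tiny self-contained pytest sketch of that mapping; the fake_main helper is invented purely for illustration.

import pytest

def fake_main(cleared: bool) -> None:
    # Mirrors `raise SystemExit(not rc)` in Zap.main().
    raise SystemExit(not cleared)

def test_exit_code_when_cleared() -> None:
    with pytest.raises(SystemExit) as e:
        fake_main(cleared=True)
    assert e.value.code == 0     # SystemExit(False) compares equal to 0

def test_exit_code_when_not_cleared() -> None:
    with pytest.raises(SystemExit) as e:
        fake_main(cleared=False)
    assert e.value.code == 1     # SystemExit(True) compares equal to 1
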
diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py
index f4f50b06f8a..fd7c468037c 100644
--- a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py
+++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py
@@ -159,6 +159,7 @@ class TestRawBlueStore:
@patch('ceph_volume.objectstore.rawbluestore.encryption_utils.rename_mapper', Mock(return_value=MagicMock()))
@patch('ceph_volume.util.disk.get_bluestore_header')
+ @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_close', Mock(return_value=MagicMock()))
@patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_open', Mock(return_value=MagicMock()))
def test_activate_dmcrypt_tpm(self, m_bs_header, rawbluestore, fake_lsblk_all, mock_raw_direct_report, is_root) -> None:
m_bs_header.return_value = {
diff --git a/src/ceph-volume/ceph_volume/tests/test_inventory.py b/src/ceph-volume/ceph_volume/tests/test_inventory.py
index 785d8b56e86..832c0836642 100644
--- a/src/ceph-volume/ceph_volume/tests/test_inventory.py
+++ b/src/ceph-volume/ceph_volume/tests/test_inventory.py
@@ -126,6 +126,7 @@ class TestInventory(object):
'lvs',
'device_id',
'lsm_data',
+ 'being_replaced'
]
expected_sys_api_keys = [
diff --git a/src/ceph-volume/ceph_volume/util/arg_validators.py b/src/ceph-volume/ceph_volume/util/arg_validators.py
index 99e7d039e74..e75b34e550e 100644
--- a/src/ceph-volume/ceph_volume/util/arg_validators.py
+++ b/src/ceph-volume/ceph_volume/util/arg_validators.py
@@ -7,6 +7,9 @@ from ceph_volume.util import disk
from ceph_volume.util.encryption import set_dmcrypt_no_workqueue
+mlogger = terminal.MultiLogger(__name__)
+
+
def valid_osd_id(val):
return str(int(val))
@@ -70,6 +73,17 @@ class ValidZapDevice(ValidDevice):
return self._device
+class ValidClearReplaceHeaderDevice(ValidDevice):
+ def __call__(self, dev_path: str) -> str:
+ super().get_device(dev_path)
+ return self._format_device(self._is_valid_device())
+
+ def _is_valid_device(self) -> Device:
+ if not self._device.is_being_replaced:
+ mlogger.info(f'{self.dev_path} has no replacement header.')
+ return self._device
+
+
class ValidDataDevice(ValidDevice):
def __call__(self, dev_path):
super().get_device(dev_path)
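
As a standalone illustration of the argparse type-validator pattern that ValidClearReplaceHeaderDevice plugs into: the validator class and its check below are simplified, hypothetical stand-ins, not the real ceph-volume validator.

import argparse

class ValidReplaceTarget:
    """Hypothetical stand-in for a ceph-volume argument validator."""
    def __call__(self, dev_path: str) -> str:
        if not dev_path.startswith('/dev/'):
            raise argparse.ArgumentTypeError(f'{dev_path} does not look like a device path')
        return dev_path

parser = argparse.ArgumentParser()
parser.add_argument('--clear-replace-header', type=ValidReplaceTarget())
print(parser.parse_args(['--clear-replace-header', '/dev/foo']))
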
diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py
index 9c2c11e7f31..82ee3266e3f 100644
--- a/src/ceph-volume/ceph_volume/util/device.py
+++ b/src/ceph-volume/ceph_volume/util/device.py
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
-
+# type: ignore
import logging
import os
from functools import total_ordering
-from ceph_volume import sys_info, allow_loop_devices
+from ceph_volume import sys_info, allow_loop_devices, BEING_REPLACED_HEADER
from ceph_volume.api import lvm
from ceph_volume.util import disk, system
from ceph_volume.util.lsmdisk import LSMDisk
from ceph_volume.util.constants import ceph_disk_guids
+from typing import List, Tuple
logger = logging.getLogger(__name__)
@@ -92,6 +93,7 @@ class Device(object):
'sys_api',
'device_id',
'lsm_data',
+ 'being_replaced'
]
pretty_report_sys_fields = [
'actuators',
@@ -136,6 +138,7 @@ class Device(object):
self._exists = None
self._is_lvm_member = None
self.ceph_device = False
+ self.being_replaced: bool = self.is_being_replaced
self._parse()
if self.path in sys_info.devices.keys():
self.device_nodes = sys_info.devices[self.path]['device_nodes']
@@ -298,7 +301,7 @@ class Device(object):
rot=self.rotational,
available=self.available,
model=self.model,
- device_nodes=self.device_nodes
+ device_nodes=','.join(self.device_nodes)
)
def json_report(self):
@@ -590,7 +593,7 @@ class Device(object):
return [vg_free]
@property
- def has_partitions(self):
+ def has_partitions(self) -> bool:
'''
Boolean to determine if a given device has partitions.
'''
@@ -598,7 +601,14 @@ class Device(object):
return True
return False
- def _check_generic_reject_reasons(self):
+ @property
+ def is_being_replaced(self) -> bool:
+ '''
+ Boolean to indicate if the device is being replaced.
+ '''
+ return disk._dd_read(self.path, 26) == BEING_REPLACED_HEADER
+
+ def _check_generic_reject_reasons(self) -> List[str]:
reasons = [
('id_bus', 'usb', 'id_bus'),
('ro', '1', 'read-only'),
@@ -639,9 +649,11 @@ class Device(object):
rejected.append('Has partitions')
if self.has_fs:
rejected.append('Has a FileSystem')
+ if self.is_being_replaced:
+ rejected.append('Is being replaced')
return rejected
- def _check_lvm_reject_reasons(self):
+ def _check_lvm_reject_reasons(self) -> Tuple[bool, List[str]]:
rejected = []
if self.vgs:
available_vgs = [vg for vg in self.vgs if int(vg.vg_free_count) > 10]
@@ -654,7 +666,7 @@ class Device(object):
return len(rejected) == 0, rejected
- def _check_raw_reject_reasons(self):
+ def _check_raw_reject_reasons(self) -> Tuple[bool, List[str]]:
rejected = self._check_generic_reject_reasons()
if len(self.vgs) > 0:
rejected.append('LVM detected')
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index 78c140597d6..30ee56808c7 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -7,7 +7,7 @@ import json
from ceph_volume import process, allow_loop_devices
from ceph_volume.api import lvm
from ceph_volume.util.system import get_file_contents
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Union
logger = logging.getLogger(__name__)
@@ -857,13 +857,14 @@ def get_devices(_sys_block_path='/sys/block', device=''):
device_slaves = os.listdir(os.path.join(sysdir, 'slaves'))
metadata['partitions'] = get_partitions_facts(sysdir)
+ metadata['device_nodes'] = []
if device_slaves:
- metadata['device_nodes'] = ','.join(device_slaves)
+ metadata['device_nodes'].extend(device_slaves)
else:
if block[2] == 'part':
- metadata['device_nodes'] = block[3]
+ metadata['device_nodes'].append(block[3])
else:
- metadata['device_nodes'] = devname
+ metadata['device_nodes'].append(devname)
metadata['actuators'] = None
if os.path.isdir(sysdir + "/queue/independent_access_ranges/"):
@@ -979,7 +980,7 @@ def _dd_read(device: str, count: int, skip: int = 0) -> str:
return result
-def _dd_write(device: str, data: str, skip: int = 0) -> None:
+def _dd_write(device: str, data: Union[str, bytes], skip: int = 0) -> None:
"""Write bytes to a device
Args:
@@ -991,10 +992,14 @@ def _dd_write(device: str, data: str, skip: int = 0) -> None:
OSError: If there is an error opening or writing to the device.
Exception: If any other error occurs during the write operation.
"""
+
+ if isinstance(data, str):
+ data = data.encode('utf-8')
+
try:
with open(device, 'r+b') as b:
b.seek(skip)
- b.write(data.encode('utf-8'))
+ b.write(data)
except OSError:
logger.warning(f"Can't write to {device}")
raise
@@ -1370,8 +1375,8 @@ class UdevData:
"""
result: str = self.path
if self.is_lvm:
- vg: str = self.environment.get('DM_VG_NAME')
- lv: str = self.environment.get('DM_LV_NAME')
+ vg: str = self.environment.get('DM_VG_NAME', '')
+ lv: str = self.environment.get('DM_LV_NAME', '')
result = f'/dev/{vg}/{lv}'
return result
@@ -1385,6 +1390,6 @@ class UdevData:
"""
result: str = self.path
if self.is_lvm:
- name: str = self.environment.get('DM_NAME')
+ name: str = self.environment.get('DM_NAME', '')
result = f'/dev/mapper/{name}'
return result
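
The _dd_write() change above only normalizes str input to bytes before seeking and writing; a compact sketch of the resulting behaviour, simplified and without the OSError logging:

from typing import Union

def dd_write(device: str, data: Union[str, bytes], skip: int = 0) -> None:
    # Accept either str or bytes; seek to `skip` bytes into the device, then write.
    if isinstance(data, str):
        data = data.encode('utf-8')
    with open(device, 'r+b') as b:
        b.seek(skip)
        b.write(data)
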
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index 1ab98a0ac4f..e32e2bc49f3 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -29,6 +29,7 @@ from glob import glob
from io import StringIO
from threading import Thread, Event
from pathlib import Path
+from configparser import ConfigParser
from cephadmlib.constants import (
# default images
@@ -142,6 +143,7 @@ from cephadmlib.container_types import (
SidecarContainer,
extract_uid_gid,
is_container_running,
+ get_mgr_images,
)
from cephadmlib.decorators import (
deprecated_command,
@@ -2954,7 +2956,7 @@ def command_bootstrap(ctx):
mounts = {}
mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro'
try:
- out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
+ out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
logger.info(out)
except Exception:
ctx.error_code = -errno.EINVAL
@@ -4679,6 +4681,13 @@ def command_rescan_disks(ctx: CephadmContext) -> str:
return f'Ok. {len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)'
+def command_list_images(ctx: CephadmContext) -> None:
+ """this function will list the default images used by different services"""
+ cp_obj = ConfigParser()
+ cp_obj['mgr'] = get_mgr_images()
+ # print default images
+ cp_obj.write(sys.stdout)
+
##################################
@@ -5542,6 +5551,9 @@ def _get_parser():
'disk-rescan', help='rescan all HBAs to detect new/removed devices')
parser_disk_rescan.set_defaults(func=command_rescan_disks)
+ parser_list_images = subparsers.add_parser(
+ 'list-images', help='list all the default images')
+ parser_list_images.set_defaults(func=command_list_images)
return parser
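
The new `cephadm list-images` subcommand simply dumps the mapping from get_mgr_images() through ConfigParser, so the output is an INI-style [mgr] section. A rough sketch of the shape of that output, showing only two keys with the default images taken from the constants in this change:

import sys
from configparser import ConfigParser

cp = ConfigParser()
cp['mgr'] = {
    'mgr/cephadm/container_image_prometheus': 'quay.io/prometheus/prometheus:v2.51.0',
    'mgr/cephadm/container_image_grafana': 'quay.io/ceph/grafana:10.4.8',
}
cp.write(sys.stdout)
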
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index d25eb1391e0..354c3782398 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -5,15 +5,15 @@ DEFAULT_IMAGE = 'quay.ceph.io/ceph-ci/ceph:main'
DEFAULT_IMAGE_IS_MAIN = True
DEFAULT_IMAGE_RELEASE = 'squid'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
-DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0'
+DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
+DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
+DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
@@ -22,7 +22,7 @@ DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
-DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this
+DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this
# ------------------------------------------------------------------------------
LATEST_STABLE_RELEASE = 'squid'
diff --git a/src/cephadm/cephadmlib/container_types.py b/src/cephadm/cephadmlib/container_types.py
index 665c4d89652..791a545538a 100644
--- a/src/cephadm/cephadmlib/container_types.py
+++ b/src/cephadm/cephadmlib/container_types.py
@@ -8,7 +8,28 @@ import os
from typing import Dict, List, Optional, Any, Union, Tuple, Iterable, cast
from .call_wrappers import call, call_throws, CallVerbosity
-from .constants import DEFAULT_TIMEOUT
+from .constants import (
+ DEFAULT_TIMEOUT,
+ # default container images
+ DEFAULT_ALERT_MANAGER_IMAGE,
+ DEFAULT_GRAFANA_IMAGE,
+ DEFAULT_LOKI_IMAGE,
+ DEFAULT_NODE_EXPORTER_IMAGE,
+ DEFAULT_PROMETHEUS_IMAGE,
+ DEFAULT_PROMTAIL_IMAGE,
+ DEFAULT_HAPROXY_IMAGE,
+ DEFAULT_KEEPALIVED_IMAGE,
+ DEFAULT_NVMEOF_IMAGE,
+ DEFAULT_SNMP_GATEWAY_IMAGE,
+ DEFAULT_ELASTICSEARCH_IMAGE,
+ DEFAULT_JAEGER_COLLECTOR_IMAGE,
+ DEFAULT_JAEGER_AGENT_IMAGE,
+ DEFAULT_JAEGER_QUERY_IMAGE,
+ DEFAULT_SMB_IMAGE,
+ DEFAULT_SMBMETRICS_IMAGE,
+ DEFAULT_NGINX_IMAGE,
+ DEFAULT_OAUTH2_PROXY_IMAGE,
+)
from .container_engines import Docker, Podman
from .context import CephadmContext
from .daemon_identity import DaemonIdentity, DaemonSubIdentity
@@ -660,3 +681,30 @@ def enable_shared_namespaces(
cc = f'container:{name}'
for n in ns:
_replace_container_arg(args, n.to_option(cc))
+
+
+def get_mgr_images() -> dict:
+ """Return dict of default mgr images"""
+ mgr_prefix = 'mgr/cephadm/container_image_'
+ mgr_images = {}
+ mgr_images[mgr_prefix + 'prometheus'] = DEFAULT_PROMETHEUS_IMAGE
+ mgr_images[mgr_prefix + 'alertmanager'] = DEFAULT_ALERT_MANAGER_IMAGE
+ mgr_images[mgr_prefix + 'grafana'] = DEFAULT_GRAFANA_IMAGE
+ mgr_images[mgr_prefix + 'loki'] = DEFAULT_LOKI_IMAGE
+ mgr_images[mgr_prefix + 'promtail'] = DEFAULT_PROMTAIL_IMAGE
+ mgr_images[mgr_prefix + 'node_exporter'] = DEFAULT_NODE_EXPORTER_IMAGE
+ mgr_images[mgr_prefix + 'haproxy'] = DEFAULT_HAPROXY_IMAGE
+ mgr_images[mgr_prefix + 'keepalived'] = DEFAULT_KEEPALIVED_IMAGE
+ mgr_images[mgr_prefix + 'nvmeof'] = DEFAULT_NVMEOF_IMAGE
+ mgr_images[mgr_prefix + 'snmp_gateway'] = DEFAULT_SNMP_GATEWAY_IMAGE
+ mgr_images[mgr_prefix + 'elasticsearch'] = DEFAULT_ELASTICSEARCH_IMAGE
+ mgr_images[
+ mgr_prefix + 'jaeger_collector'
+ ] = DEFAULT_JAEGER_COLLECTOR_IMAGE
+ mgr_images[mgr_prefix + 'jaeger_agent'] = DEFAULT_JAEGER_AGENT_IMAGE
+ mgr_images[mgr_prefix + 'jaeger_query'] = DEFAULT_JAEGER_QUERY_IMAGE
+ mgr_images[mgr_prefix + 'smb'] = DEFAULT_SMB_IMAGE
+ mgr_images[mgr_prefix + 'smbmetrics'] = DEFAULT_SMBMETRICS_IMAGE
+ mgr_images[mgr_prefix + 'nginx'] = DEFAULT_NGINX_IMAGE
+ mgr_images[mgr_prefix + 'oauth2_proxy'] = DEFAULT_OAUTH2_PROXY_IMAGE
+ return mgr_images
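
One hedged way such a mapping could be consumed downstream, purely for illustration and not part of this change, is to turn each entry into a `ceph config set mgr <key> <image>` invocation:

def to_config_set_commands(mgr_images: dict) -> list:
    # Build one `ceph config set mgr ...` command per image override.
    return [f'ceph config set mgr {key} {image}'
            for key, image in sorted(mgr_images.items())]

print('\n'.join(to_config_set_commands({
    'mgr/cephadm/container_image_loki': 'quay.io/ceph/loki:3.0.0',
})))
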
diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py
index 74cb13f4ab0..82f886e72ec 100644
--- a/src/cephadm/cephadmlib/daemons/smb.py
+++ b/src/cephadm/cephadmlib/daemons/smb.py
@@ -72,6 +72,7 @@ class Config:
instance_id: str
source_config: str
samba_debug_level: int
+ ctdb_log_level: str
debug_delay: int
domain_member: bool
clustered: bool
@@ -98,6 +99,7 @@ class Config:
domain_member: bool,
clustered: bool,
samba_debug_level: int = 0,
+ ctdb_log_level: str = '',
debug_delay: int = 0,
join_sources: Optional[List[str]] = None,
user_sources: Optional[List[str]] = None,
@@ -119,6 +121,7 @@ class Config:
self.domain_member = domain_member
self.clustered = clustered
self.samba_debug_level = samba_debug_level
+ self.ctdb_log_level = ctdb_log_level
self.debug_delay = debug_delay
self.join_sources = join_sources or []
self.user_sources = user_sources or []
@@ -370,6 +373,8 @@ class CTDBDaemonContainer(SambaContainerCommon):
# make conditional?
# CAP_NET_ADMIN is needed for event script to add public ips to iface
cargs.append('--cap-add=NET_ADMIN')
+ # CAP_NET_RAW is needed to send gratuitous ARPs/tickle ACKs via raw sockets
+ cargs.append('--cap-add=NET_RAW')
return cargs
@@ -756,7 +761,7 @@ class SMB(ContainerDaemonForm):
def _write_ctdb_stub_config(self, path: pathlib.Path) -> None:
reclock_cmd = ' '.join(_MUTEX_SUBCMD + [self._cfg.cluster_lock_uri])
nodes_cmd = ' '.join(_NODES_SUBCMD)
- stub_config = {
+ stub_config: Dict[str, Any] = {
'samba-container-config': 'v0',
'ctdb': {
# recovery_lock is passed directly to ctdb: needs '!' prefix
@@ -768,6 +773,8 @@ class SMB(ContainerDaemonForm):
),
},
}
+ if self._cfg.ctdb_log_level:
+ stub_config['ctdb']['log_level'] = self._cfg.ctdb_log_level
with file_utils.write_new(path) as fh:
json.dump(stub_config, fh)
diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py
index 2f4674752cc..0ab8b38d2b5 100644
--- a/src/cephadm/cephadmlib/data_utils.py
+++ b/src/cephadm/cephadmlib/data_utils.py
@@ -165,17 +165,17 @@ def is_fsid(s):
def normalize_image_digest(digest: str) -> str:
"""
Normal case:
- >>> normalize_image_digest('ceph/ceph', 'docker.io')
- 'docker.io/ceph/ceph'
+ >>> normalize_image_digest('ceph/ceph', 'quay.io')
+ 'quay.io/ceph/ceph'
No change:
- >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io')
+ >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io')
'quay.ceph.io/ceph/ceph'
- >>> normalize_image_digest('docker.io/ubuntu', 'docker.io')
- 'docker.io/ubuntu'
+ >>> normalize_image_digest('quay.io/ubuntu', 'quay.io')
+ 'quay.io/ubuntu'
- >>> normalize_image_digest('localhost/ceph', 'docker.io')
+ >>> normalize_image_digest('localhost/ceph', 'quay.io')
'localhost/ceph'
"""
known_shortnames = [
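
A minimal stand-in that satisfies the doctests above: the default registry is prefixed only when the image reference has no registry component. The real normalize_image_digest() additionally consults the known-shortnames list that follows; this sketch omits it.

def normalize(digest: str, default_registry: str = 'quay.io') -> str:
    first = digest.split('/', 1)[0]
    has_registry = '.' in first or ':' in first or first == 'localhost'
    return digest if has_registry else f'{default_registry}/{digest}'

assert normalize('ceph/ceph') == 'quay.io/ceph/ceph'
assert normalize('quay.ceph.io/ceph/ceph') == 'quay.ceph.io/ceph/ceph'
assert normalize('quay.io/ubuntu') == 'quay.io/ubuntu'
assert normalize('localhost/ceph') == 'localhost/ceph'
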
diff --git a/src/cephadm/samples/custom_container.json b/src/cephadm/samples/custom_container.json
index 194a44d2abb..210cf1e3e55 100644
--- a/src/cephadm/samples/custom_container.json
+++ b/src/cephadm/samples/custom_container.json
@@ -1,5 +1,5 @@
{
- "image": "docker.io/prom/alertmanager:v0.20.0",
+ "image": "quay.io/prometheus/alertmanager:v0.20.0",
"ports": [9093, 9094],
"args": [
"-p", "9093:9093",
diff --git a/src/cephadm/tests/build/test_cephadm_build.py b/src/cephadm/tests/build/test_cephadm_build.py
index 1465c2c5efe..c2995a76d4b 100644
--- a/src/cephadm/tests/build/test_cephadm_build.py
+++ b/src/cephadm/tests/build/test_cephadm_build.py
@@ -34,12 +34,12 @@ CONTAINERS = {
},
'ubuntu-20.04': {
'name': 'cephadm-build-test:ubuntu-20-04-py3',
- 'base_image': 'docker.io/library/ubuntu:20.04',
+ 'base_image': 'quay.io/library/ubuntu:20.04',
'script': 'apt update && apt install -y python3-venv',
},
'ubuntu-22.04': {
'name': 'cephadm-build-test:ubuntu-22-04-py3',
- 'base_image': 'docker.io/library/ubuntu:22.04',
+ 'base_image': 'quay.io/library/ubuntu:22.04',
'script': 'apt update && apt install -y python3-venv',
},
}
diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py
index 928982de70b..f27b9bcd362 100644
--- a/src/cephadm/tests/test_cephadm.py
+++ b/src/cephadm/tests/test_cephadm.py
@@ -533,12 +533,12 @@ class TestCephAdm(object):
def test_get_image_info_from_inspect(self):
# podman
- out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]"""
+ out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]"""
r = _cephadm.get_image_info_from_inspect(out, 'registry/ceph/ceph:latest')
print(r)
assert r == {
'image_id': '204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1',
- 'repo_digests': ['docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992']
+ 'repo_digests': ['quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992']
}
# docker
@@ -550,13 +550,13 @@ class TestCephAdm(object):
}
# multiple digests (podman)
- out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]"""
+ out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]"""
r = _cephadm.get_image_info_from_inspect(out, 'registry/prom/prometheus:latest')
assert r == {
'image_id': 'e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42',
'repo_digests': [
- 'docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4',
- 'docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a',
+ 'quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4',
+ 'quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a',
]
}
@@ -604,7 +604,7 @@ class TestCephAdm(object):
'')
out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC
quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=cinfo):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
@@ -613,7 +613,7 @@ class TestCephAdm(object):
# make sure first valid image is used when no container_info is found
out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC
quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=None):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
@@ -621,12 +621,12 @@ class TestCephAdm(object):
# make sure images without digest are discarded (no container_info is found)
out = '''quay.ceph.io/ceph-ci/ceph@|||
- docker.io/ceph/ceph@|||
- docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
+ quay.io/ceph/ceph@|||
+ quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC'''
with mock.patch('cephadm.call_throws', return_value=(out, '', '')):
with mock.patch('cephadm.get_container_info', return_value=None):
image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine)
- assert image == 'docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508'
+ assert image == 'quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508'
@@ -2409,7 +2409,7 @@ class TestSNMPGateway:
def test_unit_run_V2c(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V2c_config)
ctx.fsid = fsid
@@ -2434,11 +2434,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl')
def test_unit_run_V3_noPriv(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V3_no_priv_config)
ctx.fsid = fsid
@@ -2463,11 +2463,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000')
def test_unit_run_V3_Priv(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.V3_priv_config)
ctx.fsid = fsid
@@ -2492,11 +2492,11 @@ class TestSNMPGateway:
)
with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f:
run_cmd = f.readlines()[-1].rstrip()
- assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES')
+ assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES')
def test_unit_run_no_dest(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.no_destination_config)
ctx.fsid = fsid
@@ -2512,7 +2512,7 @@ class TestSNMPGateway:
def test_unit_run_bad_version(self, cephadm_fs):
fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6'
- with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
+ with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx:
import json
ctx.config_json = json.dumps(self.bad_version_config)
ctx.fsid = fsid
diff --git a/src/cephadm/tests/test_custom_container.py b/src/cephadm/tests/test_custom_container.py
index c185b0908df..197ed38dca3 100644
--- a/src/cephadm/tests/test_custom_container.py
+++ b/src/cephadm/tests/test_custom_container.py
@@ -47,7 +47,7 @@ class TestCustomContainer(unittest.TestCase):
]
]
},
- image='docker.io/library/hello-world:latest'
+ image='quay.io/hello-world/hello-world:latest'
)
def test_entrypoint(self):
diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini
index 70e9a411238..20608c1681c 100644
--- a/src/cephadm/tox.ini
+++ b/src/cephadm/tox.ini
@@ -49,7 +49,8 @@ deps =
flake8-quotes
commands =
flake8 --config=tox.ini {posargs:cephadm.py cephadmlib}
- bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 11'
+ bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1'
+ bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25'
# Downstream distributions may choose to alter this "docker.io" number,
# to make sure no new references to docker.io are creeping in unnoticed.
diff --git a/src/client/Client.cc b/src/client/Client.cc
index e208cf76675..f8373095b38 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -6125,6 +6125,10 @@ int Client::may_open(Inode *in, int flags, const UserPerm& perms)
int r = 0;
switch (in->mode & S_IFMT) {
case S_IFLNK:
+#if defined(__linux__) && defined(O_PATH)
+ if (flags & O_PATH)
+ break;
+#endif
r = -CEPHFS_ELOOP;
goto out;
case S_IFDIR:
@@ -7953,6 +7957,12 @@ int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, c
return r;
}
+ if (!strcmp(relpath, "")) {
+ if (!dirinode.get()->is_symlink())
+ return -CEPHFS_ENOENT;
+ return _readlink(dirinode.get(), buf, size);
+ }
+
InodeRef in;
filepath path(relpath);
r = path_walk(path, &in, perms, false, 0, dirinode);
@@ -10798,7 +10808,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r)
goto success;
}
- clnt->put_cap_ref(in, CEPH_CAP_FILE_RD);
// reverify size
{
r = clnt->_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
@@ -10810,14 +10819,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r)
if ((uint64_t)pos >= in->size)
goto success;
- {
- int have_caps2 = 0;
- r = clnt->get_caps(f, CEPH_CAP_FILE_RD, have_caps, &have_caps2, -1);
- if (r < 0) {
- goto error;
- }
- }
-
wanted = left;
retry();
clnt->client_lock.unlock();
@@ -10971,6 +10972,20 @@ retry:
// branch below but in a non-blocking fashion. The code in _read_sync
// is duplicated and modified and exists in
// C_Read_Sync_NonBlocking::finish().
+
+ // trim read based on file size?
+ if ((offset >= in->size) || (size == 0)) {
+ // the read starts at or beyond EOF, or the requested length is zero, so just
+ // release the managed pointers and complete the C_Read_Finisher immediately with 0 bytes
+
+ Context *iof = iofinish.release();
+ crf.release();
+ iof->complete(0);
+
+ // Signal async completion
+ return 0;
+ }
+
C_Read_Sync_NonBlocking *crsa =
new C_Read_Sync_NonBlocking(this, iofinish.release(), f, in, f->pos,
offset, size, bl, filer.get(), have);
@@ -11399,10 +11414,18 @@ int64_t Client::_write_success(Fh *f, utime_t start, uint64_t fpos,
return r;
}
+void Client::C_Lock_Client_Finisher::finish(int r)
+{
+ std::scoped_lock lock(clnt->client_lock);
+ onfinish->complete(r);
+}
+
void Client::C_Write_Finisher::finish_io(int r)
{
bool fini;
+ ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock));
+
clnt->put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
if (r >= 0) {
@@ -11438,6 +11461,8 @@ void Client::C_Write_Finisher::finish_fsync(int r)
bool fini;
client_t const whoami = clnt->whoami; // For the benefit of ldout prefix
+ ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock));
+
ldout(clnt->cct, 3) << "finish_fsync r = " << r << dendl;
fsync_finished = true;
@@ -11598,6 +11623,7 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
std::unique_ptr<Context> iofinish = nullptr;
std::unique_ptr<C_Write_Finisher> cwf = nullptr;
+ std::unique_ptr<Context> filer_iofinish = nullptr;
if (in->inline_version < CEPH_INLINE_NONE) {
if (endoff > cct->_conf->client_max_inline_size ||
@@ -11709,7 +11735,10 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
if (onfinish == nullptr) {
// We need a safer condition to wait on.
cond_iofinish = new C_SaferCond();
- iofinish.reset(cond_iofinish);
+ filer_iofinish.reset(cond_iofinish);
+ } else {
+ // Register a wrapper callback for C_Write_Finisher that takes 'client_lock'
+ filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get()));
}
get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
@@ -11717,11 +11746,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
offset, size, bl, ceph::real_clock::now(), 0,
in->truncate_size, in->truncate_seq,
- iofinish.get());
+ filer_iofinish.get());
if (onfinish) {
// handle non-blocking caller (onfinish != nullptr), we can now safely
// release all the managed pointers
+ filer_iofinish.release();
iofinish.release();
onuninline.release();
cwf.release();
diff --git a/src/client/Client.h b/src/client/Client.h
index 5a1e69394d0..f8c39e2fdd6 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -1409,6 +1409,21 @@ private:
void finish(int r) override;
};
+ // A wrapper callback that takes 'client_lock' and then finishes the wrapped context.
+ // One use case is filer->write_trunc(), which does not hold client_lock in the
+ // callback it invokes, so this wrapper is used in such cases.
+ class C_Lock_Client_Finisher : public Context {
+ public:
+ C_Lock_Client_Finisher(Client *clnt, Context *onfinish)
+ : clnt(clnt), onfinish(onfinish) {}
+
+ private:
+ Client *clnt;
+ Context *onfinish;
+
+ void finish(int r) override;
+ };
+
class C_Write_Finisher : public Context {
public:
void finish_io(int r);
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
index ff931faffc1..43550f35197 100644
--- a/src/common/Finisher.cc
+++ b/src/common/Finisher.cc
@@ -2,11 +2,40 @@
// vim: ts=8 sw=2 smarttab
#include "Finisher.h"
+#include "common/perf_counters.h"
+
+#include <fmt/core.h>
#define dout_subsys ceph_subsys_finisher
#undef dout_prefix
#define dout_prefix *_dout << "finisher(" << this << ") "
+Finisher::Finisher(CephContext *cct_) :
+ cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")),
+ thread_name("fn_anonymous"),
+ finisher_thread(this) {}
+
+Finisher::Finisher(CephContext *cct_, std::string_view name, std::string &&tn) :
+ cct(cct_), finisher_lock(ceph::make_mutex(fmt::format("Finisher::{}", name))),
+ thread_name(std::move(tn)),
+ finisher_thread(this) {
+ PerfCountersBuilder b(cct, fmt::format("finisher-{}", name),
+ l_finisher_first, l_finisher_last);
+ b.add_u64(l_finisher_queue_len, "queue_len");
+ b.add_time_avg(l_finisher_complete_lat, "complete_latency");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ logger->set(l_finisher_queue_len, 0);
+ logger->set(l_finisher_complete_lat, 0);
+}
+
+Finisher::~Finisher() {
+ if (logger && cct) {
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ }
+}
+
void Finisher::start()
{
ldout(cct, 10) << __func__ << dendl;
@@ -20,7 +49,7 @@ void Finisher::stop()
finisher_stop = true;
// we don't have any new work to do, but we want the worker to wake up anyway
// to process the stop condition.
- finisher_cond.notify_all();
+ finisher_cond.notify_one();
finisher_lock.unlock();
finisher_thread.join(); // wait until the worker exits completely
ldout(cct, 10) << __func__ << " finish" << dendl;
@@ -40,7 +69,7 @@ void Finisher::wait_for_empty()
bool Finisher::is_empty()
{
- std::unique_lock ul(finisher_lock);
+ const std::lock_guard l{finisher_lock};
return finisher_queue.empty();
}
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 9091d0b892a..acee6594ca4 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -19,10 +19,8 @@
#include "include/common_fwd.h"
#include "common/Thread.h"
#include "common/ceph_mutex.h"
-#include "common/perf_counters.h"
#include "common/Cond.h"
-
/// Finisher queue length performance counter ID.
enum {
l_finisher_first = 997082,
@@ -37,23 +35,23 @@ enum {
* contexts to complete is thread-safe.
*/
class Finisher {
- CephContext *cct;
+ CephContext *const cct;
ceph::mutex finisher_lock; ///< Protects access to queues and finisher_running.
ceph::condition_variable finisher_cond; ///< Signaled when there is something to process.
ceph::condition_variable finisher_empty_cond; ///< Signaled when the finisher has nothing more to process.
- bool finisher_stop; ///< Set when the finisher should stop.
- bool finisher_running; ///< True when the finisher is currently executing contexts.
- bool finisher_empty_wait; ///< True mean someone wait finisher empty.
+ bool finisher_stop = false; ///< Set when the finisher should stop.
+ bool finisher_running = false; ///< True when the finisher is currently executing contexts.
+ bool finisher_empty_wait = false; ///< True when someone is waiting for the finisher to become empty.
/// Queue for contexts for which complete(0) will be called.
std::vector<std::pair<Context*,int>> finisher_queue;
std::vector<std::pair<Context*,int>> in_progress_queue;
- std::string thread_name;
+ const std::string thread_name;
/// Performance counter for the finisher's queue length.
/// Only active for named finishers.
- PerfCounters *logger;
+ PerfCounters *logger = nullptr;
void *finisher_thread_entry();
@@ -66,56 +64,34 @@ class Finisher {
public:
/// Add a context to complete, optionally specifying a parameter for the complete function.
void queue(Context *c, int r = 0) {
- std::unique_lock ul(finisher_lock);
- bool was_empty = finisher_queue.empty();
- finisher_queue.push_back(std::make_pair(c, r));
- if (was_empty) {
- finisher_cond.notify_one();
+ {
+ const std::lock_guard l{finisher_lock};
+ const bool should_notify = finisher_queue.empty() && !finisher_running;
+ finisher_queue.push_back(std::make_pair(c, r));
+ if (should_notify) {
+ finisher_cond.notify_one();
+ }
}
+
if (logger)
logger->inc(l_finisher_queue_len);
}
- void queue(std::list<Context*>& ls) {
+ // TODO use C++20 concept checks instead of SFINAE
+ template<typename T>
+ auto queue(T &ls) -> decltype(std::distance(ls.begin(), ls.end()), void()) {
{
- std::unique_lock ul(finisher_lock);
- if (finisher_queue.empty()) {
- finisher_cond.notify_all();
- }
- for (auto i : ls) {
- finisher_queue.push_back(std::make_pair(i, 0));
- }
- if (logger)
- logger->inc(l_finisher_queue_len, ls.size());
- }
- ls.clear();
- }
- void queue(std::deque<Context*>& ls) {
- {
- std::unique_lock ul(finisher_lock);
- if (finisher_queue.empty()) {
- finisher_cond.notify_all();
- }
- for (auto i : ls) {
+ const std::lock_guard l{finisher_lock};
+ const bool should_notify = finisher_queue.empty() && !finisher_running;
+ for (Context *i : ls) {
finisher_queue.push_back(std::make_pair(i, 0));
}
- if (logger)
- logger->inc(l_finisher_queue_len, ls.size());
- }
- ls.clear();
- }
- void queue(std::vector<Context*>& ls) {
- {
- std::unique_lock ul(finisher_lock);
- if (finisher_queue.empty()) {
- finisher_cond.notify_all();
+ if (should_notify) {
+ finisher_cond.notify_one();
}
- for (auto i : ls) {
- finisher_queue.push_back(std::make_pair(i, 0));
- }
- if (logger)
- logger->inc(l_finisher_queue_len, ls.size());
}
+ if (logger)
+ logger->inc(l_finisher_queue_len, ls.size());
ls.clear();
}
@@ -137,36 +113,17 @@ class Finisher {
bool is_empty();
+ std::string_view get_thread_name() const noexcept {
+ return thread_name;
+ }
+
/// Construct an anonymous Finisher.
/// Anonymous finishers do not log their queue length.
- explicit Finisher(CephContext *cct_) :
- cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")),
- finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
- thread_name("fn_anonymous"), logger(0),
- finisher_thread(this) {}
+ explicit Finisher(CephContext *cct_);
/// Construct a named Finisher that logs its queue length.
- Finisher(CephContext *cct_, std::string name, std::string tn) :
- cct(cct_), finisher_lock(ceph::make_mutex("Finisher::" + name)),
- finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
- thread_name(tn), logger(0),
- finisher_thread(this) {
- PerfCountersBuilder b(cct, std::string("finisher-") + name,
- l_finisher_first, l_finisher_last);
- b.add_u64(l_finisher_queue_len, "queue_len");
- b.add_time_avg(l_finisher_complete_lat, "complete_latency");
- logger = b.create_perf_counters();
- cct->get_perfcounters_collection()->add(logger);
- logger->set(l_finisher_queue_len, 0);
- logger->set(l_finisher_complete_lat, 0);
- }
-
- ~Finisher() {
- if (logger && cct) {
- cct->get_perfcounters_collection()->remove(logger);
- delete logger;
- }
- }
+ Finisher(CephContext *cct_, std::string_view name, std::string &&tn);
+ ~Finisher();
};
/// Context that is completed asynchronously on the supplied finisher.
diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc
index 54442709229..246cec9460b 100644
--- a/src/common/HeartbeatMap.cc
+++ b/src/common/HeartbeatMap.cc
@@ -43,11 +43,11 @@ HeartbeatMap::~HeartbeatMap()
ceph_assert(m_workers.empty());
}
-heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
+heartbeat_handle_d *HeartbeatMap::add_worker(string&& name, pthread_t thread_id)
{
std::unique_lock locker{m_rwlock};
ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
- heartbeat_handle_d *h = new heartbeat_handle_d(name);
+ heartbeat_handle_d *h = new heartbeat_handle_d(std::move(name));
ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout),
"heartbeat_handle_d timeout");
ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout),
diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h
index 6f486b21ca8..401042cc271 100644
--- a/src/common/HeartbeatMap.h
+++ b/src/common/HeartbeatMap.h
@@ -48,15 +48,15 @@ struct heartbeat_handle_d {
ceph::timespan suicide_grace = ceph::timespan::zero();
std::list<heartbeat_handle_d*>::iterator list_item;
- explicit heartbeat_handle_d(const std::string& n)
- : name(n)
+ explicit heartbeat_handle_d(std::string&& n)
+ : name(std::move(n))
{ }
};
class HeartbeatMap {
public:
// register/unregister
- heartbeat_handle_d *add_worker(const std::string& name, pthread_t thread_id);
+ heartbeat_handle_d *add_worker(std::string&& name, pthread_t thread_id);
void remove_worker(const heartbeat_handle_d *h);
// reset the timeout so that it expects another touch within grace amount of time
diff --git a/src/common/LRUSet.h b/src/common/LRUSet.h
index b62956ba460..c8c66e85458 100644
--- a/src/common/LRUSet.h
+++ b/src/common/LRUSet.h
@@ -43,6 +43,7 @@ class LRUSet {
// lru
boost::intrusive::list<
Node,
+ boost::intrusive::constant_time_size<false>,
boost::intrusive::member_hook<Node,
boost::intrusive::list_member_hook<>,
&Node::lru_item>
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
index ec9cbdf53a6..f82a70701d2 100644
--- a/src/common/SloppyCRCMap.cc
+++ b/src/common/SloppyCRCMap.cc
@@ -73,7 +73,7 @@ void SloppyCRCMap::truncate(uint64_t offset)
offset -= offset % block_size;
std::map<uint64_t,uint32_t>::iterator p = crc_map.lower_bound(offset);
while (p != crc_map.end())
- crc_map.erase(p++);
+ p = crc_map.erase(p);
}
void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 9a7a31923c1..3903e8c0ed7 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -83,7 +83,7 @@ void *Thread::entry_wrapper()
if (pid && cpuid >= 0)
_set_affinity(cpuid);
- ceph_pthread_setname(pthread_self(), thread_name.c_str());
+ ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str());
return entry();
}
@@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize)
void Thread::create(const char *name, size_t stacksize)
{
ceph_assert(strlen(name) < 16);
- thread_name = name;
+ Thread::thread_name = name;
int ret = try_create(stacksize);
if (ret != 0) {
diff --git a/src/common/Thread.h b/src/common/Thread.h
index 5242fb5f307..d3892c1b36b 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -20,11 +20,14 @@
#include <string_view>
#include <system_error>
#include <thread>
+#include <cstring>
#include <pthread.h>
#include <sys/types.h>
+#include "include/ceph_assert.h"
#include "include/compat.h"
+#include "include/spinlock.h"
extern pid_t ceph_gettid();
@@ -33,7 +36,7 @@ class Thread {
pthread_t thread_id;
pid_t pid;
int cpuid;
- std::string thread_name;
+ static inline thread_local std::string thread_name;
void *entry_wrapper();
@@ -61,6 +64,9 @@ class Thread {
int join(void **prval = 0);
int detach();
int set_affinity(int cpuid);
+ static const std::string get_thread_name() {
+ return Thread::thread_name;
+ }
};
// Functions for with std::thread
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index f4cf8902dac..57d73038364 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -243,6 +243,7 @@ private:
public:
typedef boost::intrusive::list<
TrackedOp,
+ boost::intrusive::constant_time_size<false>,
boost::intrusive::member_hook<
TrackedOp,
boost::intrusive::list_member_hook<>,
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index f1877647877..6a02d5c5bf1 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -282,10 +282,20 @@ public:
void set_mon_addrs(const MonMap& mm);
void set_mon_addrs(const std::vector<entity_addrvec_t>& in) {
auto ptr = std::make_shared<std::vector<entity_addrvec_t>>(in);
+#if defined(__GNUC__) && __GNUC__ < 12
+ // workaround for GCC 11 bug
atomic_store_explicit(&_mon_addrs, std::move(ptr), std::memory_order_relaxed);
+#else
+ _mon_addrs.store(std::move(ptr), std::memory_order_relaxed);
+#endif
}
std::shared_ptr<std::vector<entity_addrvec_t>> get_mon_addrs() const {
+#if defined(__GNUC__) && __GNUC__ < 12
+ // workaround for GCC 11 bug
auto ptr = atomic_load_explicit(&_mon_addrs, std::memory_order_relaxed);
+#else
+ auto ptr = _mon_addrs.load(std::memory_order_relaxed);
+#endif
return ptr;
}
@@ -306,7 +316,12 @@ private:
int _crypto_inited;
+#if defined(__GNUC__) && __GNUC__ < 12
+ // workaround for GCC 11 bug
std::shared_ptr<std::vector<entity_addrvec_t>> _mon_addrs;
+#else
+ std::atomic<std::shared_ptr<std::vector<entity_addrvec_t>>> _mon_addrs;
+#endif
/* libcommon service thread.
* SIGHUP wakes this thread, which then reopens logfiles */
diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h
index 059d81f2ac3..6ed8c56d5da 100644
--- a/src/common/ceph_mutex.h
+++ b/src/common/ceph_mutex.h
@@ -83,7 +83,6 @@ namespace ceph {
return {};
}
- static constexpr bool mutex_debugging = false;
#define ceph_mutex_is_locked(m) true
#define ceph_mutex_is_locked_by_me(m) true
}
@@ -131,8 +130,6 @@ namespace ceph {
return {std::forward<Args>(args)...};
}
- static constexpr bool mutex_debugging = true;
-
// debug methods
#define ceph_mutex_is_locked(m) ((m).is_locked())
#define ceph_mutex_is_not_locked(m) (!(m).is_locked())
@@ -186,8 +183,6 @@ namespace ceph {
return {};
}
- static constexpr bool mutex_debugging = false;
-
// debug methods. Note that these can blindly return true
// because any code that does anything other than assert these
// are true is broken.
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h
index af2baaa5c67..86ced8d183c 100644
--- a/src/common/cohort_lru.h
+++ b/src/common/cohort_lru.h
@@ -15,6 +15,12 @@
#include <boost/intrusive/list.hpp>
#include <boost/intrusive/slist.hpp>
+#include <cstdint>
+#include <atomic>
+#include <mutex>
+#include <algorithm>
+#include <functional>
+#include <vector>
#ifdef __CEPH__
# include "include/ceph_assert.h"
diff --git a/src/common/config.cc b/src/common/config.cc
index e151e94bb90..3a5ee91c347 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -24,6 +24,8 @@
#include "common/hostname.h"
#include "common/dout.h"
+#include <fmt/core.h>
+
/* Don't use standard Ceph logging in this file.
* We can't use logging until it's initialized, and a lot of the necessary
* initialization happens here.
@@ -131,14 +133,11 @@ md_config_t::md_config_t(ConfigValues& values,
// Define the debug_* options as well.
subsys_options.reserve(values.subsys.get_num());
for (unsigned i = 0; i < values.subsys.get_num(); ++i) {
- string name = string("debug_") + values.subsys.get_name(i);
- subsys_options.push_back(
- Option(name, Option::TYPE_STR, Option::LEVEL_ADVANCED));
+ subsys_options.emplace_back(
+ fmt::format("debug_{}", values.subsys.get_name(i)), Option::TYPE_STR, Option::LEVEL_ADVANCED);
Option& opt = subsys_options.back();
- opt.set_default(stringify(values.subsys.get_log_level(i)) + "/" +
- stringify(values.subsys.get_gather_level(i)));
- string desc = string("Debug level for ") + values.subsys.get_name(i);
- opt.set_description(desc.c_str());
+ opt.set_default(fmt::format("{}/{}", values.subsys.get_log_level(i), values.subsys.get_gather_level(i)));
+ opt.set_description(fmt::format("Debug level for {}", values.subsys.get_name(i)).c_str());
opt.set_flag(Option::FLAG_RUNTIME);
opt.set_long_description("The value takes the form 'N' or 'N/M' where N and M are values between 0 and 99. N is the debug level to log (all values below this are included), and M is the level to gather and buffer in memory. In the event of a crash, the most recent items <= M are dumped to the log file.");
opt.set_subsys(i);
@@ -158,7 +157,7 @@ md_config_t::md_config_t(ConfigValues& values,
} else {
// normalize to M/N
n = m;
- *value = stringify(m) + "/" + stringify(n);
+ *value = fmt::format("{}/{}", m, n);
}
} else {
*error_message = "value must take the form N or N/M, where N and M are integers";
@@ -775,7 +774,7 @@ int md_config_t::parse_option(ConfigValues& values,
option_name = opt.name;
if (ceph_argparse_witharg(
args, i, &val, err,
- string(string("--default-") + opt.name).c_str(), (char*)NULL)) {
+ fmt::format("--default-{}", opt.name).c_str(), (char*)NULL)) {
if (!err.str().empty()) {
error_message = err.str();
ret = -EINVAL;
@@ -1268,7 +1267,7 @@ Option::value_t md_config_t::_expand_meta(
<< Option::to_str(*i->second) << "\n";
}
}
- return Option::value_t(std::string("$") + o->name);
+ return Option::value_t(fmt::format("${}", o->name));
} else {
// recursively evaluate!
string n;
diff --git a/src/common/config_obs_mgr.h b/src/common/config_obs_mgr.h
index 759930df92d..5336538e438 100644
--- a/src/common/config_obs_mgr.h
+++ b/src/common/config_obs_mgr.h
@@ -75,7 +75,7 @@ typename ObserverMgr<ConfigObs>::config_obs_wptr ObserverMgr<ConfigObs>::remove_
for (auto o = observers.begin(); o != observers.end(); ) {
if (*o->second == observer) {
ptr = std::move(o->second);
- observers.erase(o++);
+ o = observers.erase(o);
found_obs = true;
} else {
++o;
diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h
index b9b47d9cef4..12a273b8c84 100644
--- a/src/common/config_proxy.h
+++ b/src/common/config_proxy.h
@@ -31,7 +31,6 @@ class ConfigProxy {
using rev_obs_map_t = ObsMgr::rev_obs_map;
void _call_observers(rev_obs_map_t& rev_obs) {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
for (auto& [obs, keys] : rev_obs) {
(*obs)->handle_conf_change(*this, keys);
}
diff --git a/src/common/intrusive_lru.h b/src/common/intrusive_lru.h
index 564cceef1cc..3ed3625d8a0 100644
--- a/src/common/intrusive_lru.h
+++ b/src/common/intrusive_lru.h
@@ -125,6 +125,7 @@ class intrusive_lru {
using lru_list_t = boost::intrusive::list<
base_t,
+ boost::intrusive::constant_time_size<false>,
boost::intrusive::member_hook<
base_t,
boost::intrusive::list_member_hook<>,
diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc
index 3f907ccf474..44b82260263 100644
--- a/src/common/io_exerciser/RadosIo.cc
+++ b/src/common/io_exerciser/RadosIo.cc
@@ -81,7 +81,7 @@ RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1,
bool RadosIo::readyForIoOp(IoOp &op)
{
- ceph_assert(lock.is_locked_by_me()); //Must be called with lock held
+ ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held
if (!om->readyForIoOp(op)) {
return false;
}
@@ -118,7 +118,8 @@ void RadosIo::applyIoOp(IoOp &op)
op_info = std::make_shared<AsyncOpInfo>(0, op.length1);
op_info->bl1 = db->generate_data(0, op.length1);
op_info->wop.write_full(op_info->bl1);
- auto create_cb = [this] (boost::system::error_code ec) {
+ auto create_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
@@ -132,7 +133,8 @@ void RadosIo::applyIoOp(IoOp &op)
start_io();
op_info = std::make_shared<AsyncOpInfo>();
op_info->wop.remove();
- auto remove_cb = [this] (boost::system::error_code ec) {
+ auto remove_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
@@ -148,9 +150,13 @@ void RadosIo::applyIoOp(IoOp &op)
op_info->rop.read(op.offset1 * block_size,
op.length1 * block_size,
&op_info->bl1, nullptr);
- auto read_cb = [this, op_info] (boost::system::error_code ec, bufferlist bl) {
+ auto read_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
+ bufferlist bl) {
ceph_assert(ec == boost::system::errc::success);
- db->validate(op_info->bl1, op_info->offset1, op_info->length1);
+ ceph_assert(db->validate(op_info->bl1,
+ op_info->offset1,
+ op_info->length1));
finish_io();
};
librados::async_operate(asio, io, oid,
@@ -174,10 +180,15 @@ void RadosIo::applyIoOp(IoOp &op)
op.length2 * block_size,
&op_info->bl2, nullptr);
auto read2_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
bufferlist bl) {
ceph_assert(ec == boost::system::errc::success);
- db->validate(op_info->bl1, op_info->offset1, op_info->length1);
- db->validate(op_info->bl2, op_info->offset2, op_info->length2);
+ ceph_assert(db->validate(op_info->bl1,
+ op_info->offset1,
+ op_info->length1));
+ ceph_assert(db->validate(op_info->bl2,
+ op_info->offset2,
+ op_info->length2));
finish_io();
};
librados::async_operate(asio, io, oid,
@@ -202,11 +213,18 @@ void RadosIo::applyIoOp(IoOp &op)
op.length3 * block_size,
&op_info->bl3, nullptr);
auto read3_cb = [this, op_info] (boost::system::error_code ec,
+ version_t ver,
bufferlist bl) {
ceph_assert(ec == boost::system::errc::success);
- db->validate(op_info->bl1, op_info->offset1, op_info->length1);
- db->validate(op_info->bl2, op_info->offset2, op_info->length2);
- db->validate(op_info->bl3, op_info->offset3, op_info->length3);
+ ceph_assert(db->validate(op_info->bl1,
+ op_info->offset1,
+ op_info->length1));
+ ceph_assert(db->validate(op_info->bl2,
+ op_info->offset2,
+ op_info->length2));
+ ceph_assert(db->validate(op_info->bl3,
+ op_info->offset3,
+ op_info->length3));
finish_io();
};
librados::async_operate(asio, io, oid,
@@ -222,7 +240,8 @@ void RadosIo::applyIoOp(IoOp &op)
op_info->bl1 = db->generate_data(op.offset1, op.length1);
op_info->wop.write(op.offset1 * block_size, op_info->bl1);
- auto write_cb = [this] (boost::system::error_code ec) {
+ auto write_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
@@ -241,7 +260,8 @@ void RadosIo::applyIoOp(IoOp &op)
op_info->bl2 = db->generate_data(op.offset2, op.length2);
op_info->wop.write(op.offset1 * block_size, op_info->bl1);
op_info->wop.write(op.offset2 * block_size, op_info->bl2);
- auto write2_cb = [this] (boost::system::error_code ec) {
+ auto write2_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
@@ -263,7 +283,8 @@ void RadosIo::applyIoOp(IoOp &op)
op_info->wop.write(op.offset1 * block_size, op_info->bl1);
op_info->wop.write(op.offset2 * block_size, op_info->bl2);
op_info->wop.write(op.offset3 * block_size, op_info->bl3);
- auto write3_cb = [this] (boost::system::error_code ec) {
+ auto write3_cb = [this] (boost::system::error_code ec,
+ version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
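
Editor's note: every completion lambda in this file gains a version_t parameter, matching an async_operate completion that now also reports the object version. A self-contained sketch of the new handler shape (the version_t typedef here is a stand-in, not the Ceph header):

    #include <boost/system/error_code.hpp>
    #include <cassert>
    #include <cstdint>

    using version_t = std::uint64_t;  // stand-in for the Ceph typedef

    // The exerciser only checks for success and ignores the version for now.
    auto write_cb = [](boost::system::error_code ec, version_t /*ver*/) {
      assert(!ec);        // any I/O error is treated as fatal
      // finish_io() would run here in RadosIo::applyIoOp
    };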
diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp
index 4d843be75dc..95353425de9 100644
--- a/src/common/map_cacher.hpp
+++ b/src/common/map_cacher.hpp
@@ -16,6 +16,7 @@
#define MAPCACHER_H
#include "include/Context.h"
+#include "include/expected.hpp"
#include "common/sharedptr_registry.hpp"
namespace MapCacher {
@@ -130,6 +131,50 @@ public:
return -EINVAL;
} ///< @return error value, 0 on success, -ENOENT if no more entries
+ /// Fetch first key/value std::pair after specified key
+ struct PosAndData {
+ K last_key;
+ V data;
+ };
+ using MaybePosAndData = tl::expected<PosAndData, int>;
+
+ MaybePosAndData get_1st_after_key(
+ K key ///< [in] key after which to get next
+ )
+ {
+ ceph_assert(driver);
+ while (true) {
+ std::pair<K, boost::optional<V>> cached;
+ bool got_cached = in_progress.get_next(key, &cached);
+
+ ///\todo a driver->get_next() that returns an expected<K, V> would be nice
+ bool got_store{false};
+ std::pair<K, V> store;
+ int r = driver->get_next(key, &store);
+ if (r < 0 && r != -ENOENT) {
+ return tl::unexpected(r);
+ } else if (r == 0) {
+ got_store = true;
+ }
+
+ if (!got_cached && !got_store) {
+ return tl::unexpected(-ENOENT);
+ } else if (got_cached && (!got_store || store.first >= cached.first)) {
+ if (cached.second) {
+ return PosAndData{cached.first, *cached.second};
+ } else {
+ key = cached.first;
+ continue; // value was cached as removed, recurse
+ }
+ } else {
+ return PosAndData{store.first, store.second};
+ }
+ }
+ ceph_abort(); // not reachable
+ return tl::unexpected(-EINVAL);
+ }
+
+
/// Adds operation setting keys to Transaction
void set_keys(
const std::map<K, V> &keys, ///< [in] keys/values to std::set
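
Editor's note: get_1st_after_key() reports either the next key/value pair or a negative errno through tl::expected, so callers branch on the result instead of threading an output parameter plus a return code. A hedged consumption sketch; cache, process() and handle_error() are hypothetical:

    // K = std::string, V = ceph::bufferlist in this sketch.
    auto next = cache.get_1st_after_key(current_key);
    if (next.has_value()) {
      process(next->last_key, next->data);   // the pair after current_key
    } else if (next.error() == -ENOENT) {
      // iteration finished: nothing cached or stored after current_key
    } else {
      handle_error(next.error());            // propagate the negative errno
    }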
diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h
index c1a4ff2a435..d56d0ebee99 100644
--- a/src/common/mutex_debug.h
+++ b/src/common/mutex_debug.h
@@ -169,20 +169,16 @@ public:
}
bool try_lock(bool no_lockdep = false) {
- bool locked = try_lock_impl();
- if (locked) {
- if (enable_lockdep(no_lockdep))
- _locked();
- _post_lock();
- }
- return locked;
+ ceph_assert(recursive || !is_locked_by_me());
+ return _try_lock(no_lockdep);
}
void lock(bool no_lockdep = false) {
+ ceph_assert(recursive || !is_locked_by_me());
if (enable_lockdep(no_lockdep))
_will_lock(recursive);
- if (try_lock(no_lockdep))
+ if (_try_lock(no_lockdep))
return;
lock_impl();
@@ -198,6 +194,16 @@ public:
unlock_impl();
}
+private:
+ bool _try_lock(bool no_lockdep) {
+ bool locked = try_lock_impl();
+ if (locked) {
+ if (enable_lockdep(no_lockdep))
+ _locked();
+ _post_lock();
+ }
+ return locked;
+ }
};
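
Editor's note: lock() and try_lock() now assert up front that a non-recursive mutex is not already held by the caller, and the lockdep/ownership bookkeeping moves into a private _try_lock() so that lock() no longer re-enters the public try_lock() (which would evaluate the new assertion a second time). Illustrative effect, assuming a debug build where ceph::mutex resolves to mutex_debug:

    ceph::mutex m = ceph::make_mutex("example::lock");
    m.lock();
    // m.lock();      // would now fail fast at ceph_assert(recursive || !is_locked_by_me())
    // m.try_lock();  // likewise asserts rather than quietly returning false
    m.unlock();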
diff --git a/src/common/options.h b/src/common/options.h
index ad39936d43a..abded4cc0dd 100644
--- a/src/common/options.h
+++ b/src/common/options.h
@@ -207,8 +207,8 @@ struct Option {
typedef std::function<int(std::string *, std::string *)> validator_fn_t;
validator_fn_t validator;
- Option(std::string const &name, type_t t, level_t l)
- : name(name), type(t), level(l)
+ Option(std::string &&name, type_t t, level_t l)
+ : name(std::move(name)), type(t), level(l)
{
// While value_t is nullable (via std::monostate), we don't ever
// want it set that way in an Option instance: within an instance,
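
Editor's note: taking the name by rvalue reference lets the constructor move the string into the member instead of copying it, which helps when the many Option instances are built from string literals at startup. A hedged call-site illustration (the option name is only an example):

    // A temporary std::string binds to std::string&& and is moved into Option::name:
    Option o1("osd_example_option", Option::TYPE_INT, Option::LEVEL_ADVANCED);

    // A named lvalue no longer binds directly; copy or move it explicitly:
    std::string n = "osd_example_option";
    Option o2(std::string(n), Option::TYPE_INT, Option::LEVEL_ADVANCED);  // copy
    Option o3(std::move(n),   Option::TYPE_INT, Option::LEVEL_ADVANCED);  // move; n is left valid but unspecified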
diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc
index b03a3cab70c..4b4d191e09c 100644
--- a/src/common/scrub_types.cc
+++ b/src/common/scrub_types.cc
@@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const
ENCODE_FINISH(bl);
}
+bufferlist inconsistent_obj_wrapper::encode() const
+{
+ bufferlist bl;
+ encode(bl);
+ return bl;
+}
+
void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp)
{
DECODE_START(2, bp);
@@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const
ENCODE_FINISH(bl);
}
+bufferlist inconsistent_snapset_wrapper::encode() const
+{
+ bufferlist bl;
+ encode(bl);
+ return bl;
+}
+
void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp)
{
DECODE_START(2, bp);
diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h
index dd206f56f60..d86fc12b6c8 100644
--- a/src/common/scrub_types.h
+++ b/src/common/scrub_types.h
@@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t {
const pg_shard_t &primary);
void set_version(uint64_t ver) { version = ver; }
void encode(ceph::buffer::list& bl) const;
+ ceph::buffer::list encode() const;
void decode(ceph::buffer::list::const_iterator& bp);
};
@@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t {
void set_size_mismatch();
void encode(ceph::buffer::list& bl) const;
+ ceph::buffer::list encode() const;
void decode(ceph::buffer::list::const_iterator& bp);
};
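
Editor's note: the new zero-argument encode() overloads wrap the existing encode-into-a-bufferlist path so callers can build the encoded form in a single expression. Roughly (queue_inconsistency() is a hypothetical consumer):

    // Before: declare, encode, then use
    bufferlist bl;
    wrapper.encode(bl);
    queue_inconsistency(bl);

    // After: one expression, convenient when the result is passed straight on
    queue_inconsistency(wrapper.encode());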
diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h
index 4f564ac044d..c38b225c94b 100644
--- a/src/crimson/common/log.h
+++ b/src/crimson/common/log.h
@@ -90,7 +90,7 @@ static inline seastar::log_level to_log_level(int level) {
#define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \
LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__)
#define SUBLOGDPPI(subname_, level_, MSG, dpp, ...) \
- LOGGER(subname_).log(level_, "{} {}: " MSG, \
+ LOGGER(subname_).log(level_, "{} {} {}: " MSG, \
interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__)
#define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__)
#define SUBTRACEDPPI(subname_, ...) SUBLOGDPPI(subname_, seastar::log_level::trace, __VA_ARGS__)
@@ -106,7 +106,7 @@ static inline seastar::log_level to_log_level(int level) {
#define LOGDPP(level_, MSG, dpp, ...) \
LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__)
#define LOGDPPI(level_, MSG, dpp, ...) \
- LOCAL_LOGGER.log(level_, "{} {}: " MSG, \
+ LOCAL_LOGGER.log(level_, "{} {} {}: " MSG, \
interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__)
#define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__)
#define TRACEDPPI(...) LOGDPPI(seastar::log_level::trace, __VA_ARGS__)
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
index fe09cc54510..0dca695ba3a 100644
--- a/src/crimson/os/futurized_store.h
+++ b/src/crimson/os/futurized_store.h
@@ -75,14 +75,15 @@ public:
CollectionRef c,
const ghobject_t& oid) = 0;
- using omap_values_t = std::map<std::string, ceph::bufferlist, std::less<>>;
+ using omap_values_t = attrs_t;
using omap_keys_t = std::set<std::string>;
virtual read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
const omap_keys_t& keys) = 0;
- virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ using omap_values_paged_t = std::tuple<bool, omap_values_t>;
+ virtual read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
const std::optional<std::string> &start ///< [in] start, empty for begin
@@ -147,7 +148,8 @@ public:
return seastar::now();
}
- virtual read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ using fiemap_ret_t = std::map<uint64_t, uint64_t>;
+ virtual read_errorator::future<fiemap_ret_t> fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index cf8d3c0891d..5dcb7514ee1 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -990,8 +990,12 @@ void Cache::mark_transaction_conflicted(
}
efforts.mutate_delta_bytes += delta_stat.bytes;
- for (auto &i: t.pre_alloc_list) {
- epm.mark_space_free(i->get_paddr(), i->get_length());
+ if (t.get_pending_ool()) {
+ t.get_pending_ool()->is_conflicted = true;
+ } else {
+ for (auto &i: t.pre_alloc_list) {
+ epm.mark_space_free(i->get_paddr(), i->get_length());
+ }
}
auto& ool_stats = t.get_ool_write_stats();
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
index cdad6dfb1b0..76c18bde667 100644
--- a/src/crimson/os/seastore/cached_extent.cc
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -158,12 +158,14 @@ parent_tracker_t::~parent_tracker_t() {
std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
{
- out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length()
+ out << "LBAMapping(" << rhs.get_key()
+ << "~0x" << std::hex << rhs.get_length() << std::dec
<< "->" << rhs.get_val();
if (rhs.is_indirect()) {
- out << " indirect(" << rhs.get_intermediate_base() << "~"
- << rhs.get_intermediate_key() << "~"
- << rhs.get_intermediate_length() << ")";
+ out << ",indirect(" << rhs.get_intermediate_base()
+ << "~0x" << std::hex << rhs.get_intermediate_length()
+ << "@0x" << rhs.get_intermediate_offset() << std::dec
+ << ")";
}
out << ")";
return out;
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 6c5c6c6fcc2..6025725aa33 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -350,7 +350,7 @@ public:
<< ", modify_time=" << sea_time_point_printer_t{modify_time}
<< ", paddr=" << get_paddr()
<< ", prior_paddr=" << prior_poffset_str
- << ", length=" << get_length()
+ << std::hex << ", length=0x" << get_length() << std::dec
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
<< ", refcount=" << use_count()
diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc
index 34ac199eed8..0458fbfed74 100644
--- a/src/crimson/os/seastore/extent_placement_manager.cc
+++ b/src/crimson/os/seastore/extent_placement_manager.cc
@@ -987,7 +987,19 @@ RandomBlockOolWriter::alloc_write_ool_extents(
return alloc_write_iertr::now();
}
return seastar::with_gate(write_guard, [this, &t, &extents] {
- return do_write(t, extents);
+ seastar::lw_shared_ptr<rbm_pending_ool_t> ptr =
+ seastar::make_lw_shared<rbm_pending_ool_t>();
+ ptr->pending_extents = t.get_pre_alloc_list();
+ assert(!t.is_conflicted());
+ t.set_pending_ool(ptr);
+ return do_write(t, extents
+ ).finally([this, ptr=ptr] {
+ if (ptr->is_conflicted) {
+ for (auto &e : ptr->pending_extents) {
+ rb_cleaner->mark_space_free(e->get_paddr(), e->get_length());
+ }
+ }
+ });
});
}
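
Editor's note: together with the Cache hunk above, this moves the release of a conflicted transaction's pre-allocated out-of-line extents: Cache now only flags the shared rbm_pending_ool_t, and the writer's finally block frees the space once its in-flight write completes, presumably so the extents are not reused while the out-of-line write is still running. The essential shape, condensed from the two hunks:

    // writer side (condensed sketch)
    auto pending = seastar::make_lw_shared<rbm_pending_ool_t>();
    pending->pending_extents = t.get_pre_alloc_list();
    t.set_pending_ool(pending);
    return do_write(t, extents).finally([cleaner, pending] {
      if (pending->is_conflicted) {   // set by Cache::mark_transaction_conflicted
        for (auto &e : pending->pending_extents) {
          cleaner->mark_space_free(e->get_paddr(), e->get_length());
        }
      }
    });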
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
index 5d6fa3cb1b1..ef10ff9623b 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -173,16 +173,22 @@ public:
if (!parent_modified()) {
return;
}
+ LOG_PREFIX(BtreeLBAMapping::maybe_fix_pos);
auto &p = static_cast<LBALeafNode&>(*parent);
p.maybe_fix_mapping_pos(*this);
+ SUBDEBUGT(seastore_lba, "fixed pin {}",
+ ctx.trans, static_cast<LBAMapping&>(*this));
}
LBAMappingRef refresh_with_pending_parent() final {
+ LOG_PREFIX(BtreeLBAMapping::refresh_with_pending_parent);
assert(is_parent_valid() && !is_parent_viewable());
auto &p = static_cast<LBALeafNode&>(*parent);
auto &viewable_p = static_cast<LBALeafNode&>(
*p.find_pending_version(ctx.trans, get_key()));
- return viewable_p.get_mapping(ctx, get_key());
+ auto new_pin = viewable_p.get_mapping(ctx, get_key());
+ SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin));
+ return new_pin;
}
protected:
std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
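
Editor's note: most of the remaining hunks in this series follow the same crimson logging convention: LOG_PREFIX(...) introduces a local FNAME that the macros print, the *T variants (DEBUGT, SUBDEBUGT, ...) additionally take the transaction as the first formatted argument, and continuations capture FNAME explicitly so they can keep logging. A hedged sketch of the "..."/"done" bracketing pattern used below (do_work() is hypothetical):

    seastar::future<> SeaStore::example()
    {
      LOG_PREFIX(SeaStore::example);   // introduces FNAME for the macros below
      INFO("...");
      return do_work().then([FNAME] {  // FNAME must be captured to log in continuations
        INFO("done");
      });
    }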
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
index 960ea6ba411..397a014a7c3 100644
--- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -925,7 +925,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
std::ostringstream sos;
sos << "Node" << NODE_TYPE << FIELD_TYPE
<< "@" << extent.get_laddr()
- << "+" << std::hex << extent.get_length() << std::dec
+ << "+0x" << std::hex << extent.get_length() << std::dec
<< "Lv" << (unsigned)level()
<< (is_level_tail() ? "$" : "");
name = sos.str();
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 15774332373..d90edbb20db 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -17,6 +17,7 @@
#include "common/safe_io.h"
#include "include/stringify.h"
#include "os/Transaction.h"
+#include "osd/osd_types_fmt.h"
#include "crimson/common/buffer_io.h"
@@ -30,8 +31,6 @@
#include "crimson/os/seastore/onode_manager.h"
#include "crimson/os/seastore/object_data_handler.h"
-
-using std::string;
using crimson::common::local_conf;
template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
@@ -42,8 +41,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
auto format(op_type_t op, FormatContext& ctx) const {
std::string_view name = "unknown";
switch (op) {
- case op_type_t::TRANSACTION:
- name = "transaction";
+ case op_type_t::DO_TRANSACTION:
+ name = "do_transaction";
break;
case op_type_t::READ:
name = "read";
@@ -63,8 +62,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t>
case op_type_t::OMAP_GET_VALUES:
name = "omap_get_values";
break;
- case op_type_t::OMAP_LIST:
- name = "omap_list";
+ case op_type_t::OMAP_GET_VALUES2:
+ name = "omap_get_values2";
break;
case op_type_t::MAX:
name = "unknown";
@@ -143,14 +142,14 @@ void SeaStore::Shard::register_metrics()
namespace sm = seastar::metrics;
using op_type_t = crimson::os::seastore::op_type_t;
std::pair<op_type_t, sm::label_instance> labels_by_op_type[] = {
- {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")},
- {op_type_t::READ, sm::label_instance("latency", "READ")},
- {op_type_t::WRITE, sm::label_instance("latency", "WRITE")},
- {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")},
- {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")},
- {op_type_t::STAT, sm::label_instance("latency", "STAT")},
- {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")},
- {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")},
+ {op_type_t::DO_TRANSACTION, sm::label_instance("latency", "DO_TRANSACTION")},
+ {op_type_t::READ, sm::label_instance("latency", "READ")},
+ {op_type_t::WRITE, sm::label_instance("latency", "WRITE")},
+ {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")},
+ {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")},
+ {op_type_t::STAT, sm::label_instance("latency", "STAT")},
+ {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")},
+ {op_type_t::OMAP_GET_VALUES2, sm::label_instance("latency", "OMAP_GET_VALUES2")},
};
for (auto& [op_type, label] : labels_by_op_type) {
@@ -194,6 +193,9 @@ void SeaStore::Shard::register_metrics()
seastar::future<> SeaStore::start()
{
+ LOG_PREFIX(SeaStore::start);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
#ifndef NDEBUG
bool is_test = true;
@@ -214,19 +216,30 @@ seastar::future<> SeaStore::start()
}).then([this, is_test] {
ceph_assert(device);
return shard_stores.start(root, device.get(), is_test);
+ }).then([FNAME] {
+ INFO("done");
});
}
seastar::future<> SeaStore::test_start(DeviceRef device_obj)
{
+ LOG_PREFIX(SeaStore::test_start);
+ INFO("...");
+
ceph_assert(device_obj);
ceph_assert(root == "");
device = std::move(device_obj);
- return shard_stores.start_single(root, device.get(), true);
+ return shard_stores.start_single(root, device.get(), true
+ ).then([FNAME] {
+ INFO("done");
+ });
}
seastar::future<> SeaStore::stop()
{
+ LOG_PREFIX(SeaStore::stop);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return seastar::do_for_each(secondaries, [](auto& sec_dev) {
return sec_dev->stop();
@@ -239,17 +252,28 @@ seastar::future<> SeaStore::stop()
}
}).then([this] {
return shard_stores.stop();
+ }).then([FNAME] {
+ INFO("done");
});
}
SeaStore::mount_ertr::future<> SeaStore::test_mount()
{
+ LOG_PREFIX(SeaStore::test_mount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return shard_stores.local().mount_managers();
+ return shard_stores.local().mount_managers(
+ ).then([FNAME] {
+ INFO("done");
+ });
}
SeaStore::mount_ertr::future<> SeaStore::mount()
{
+ LOG_PREFIX(SeaStore::mount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return device->mount(
).safe_then([this] {
@@ -278,11 +302,13 @@ SeaStore::mount_ertr::future<> SeaStore::mount()
return set_secondaries();
});
});
- }).safe_then([this] {
- return shard_stores.invoke_on_all([](auto &local_store) {
- return local_store.mount_managers();
- });
});
+ }).safe_then([this] {
+ return shard_stores.invoke_on_all([](auto &local_store) {
+ return local_store.mount_managers();
+ });
+ }).safe_then([FNAME] {
+ INFO("done");
}).handle_error(
crimson::ct_error::assert_all{
"Invalid error in SeaStore::mount"
@@ -302,9 +328,14 @@ seastar::future<> SeaStore::Shard::mount_managers()
seastar::future<> SeaStore::umount()
{
+ LOG_PREFIX(SeaStore::umount);
+ INFO("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.invoke_on_all([](auto &local_store) {
return local_store.umount();
+ }).then([FNAME] {
+ INFO("done");
});
}
@@ -332,7 +363,7 @@ seastar::future<> SeaStore::Shard::umount()
onode_manager.reset();
}).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::umount"
+ "Invalid error in SeaStoreS::umount"
}
);
}
@@ -345,15 +376,15 @@ seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid)
auto [ret, fsid] = tuple;
std::string str_fsid = stringify(new_osd_fsid);
if (ret == -1) {
- return write_meta("fsid", stringify(new_osd_fsid));
+ return write_meta("fsid", stringify(new_osd_fsid));
} else if (ret == 0 && fsid != str_fsid) {
- ERROR("on-disk fsid {} != provided {}",
- fsid, stringify(new_osd_fsid));
- throw std::runtime_error("store fsid error");
- } else {
+ ERROR("on-disk fsid {} != provided {}",
+ fsid, stringify(new_osd_fsid));
+ throw std::runtime_error("store fsid error");
+ } else {
return seastar::now();
- }
- });
+ }
+ });
}
seastar::future<>
@@ -379,6 +410,8 @@ SeaStore::Shard::mkfs_managers()
"mkfs_seastore",
[this](auto& t)
{
+ LOG_PREFIX(SeaStoreS::mkfs_managers);
+ DEBUGT("...", t);
return onode_manager->mkfs(t
).si_then([this, &t] {
return collection_manager->mkfs(t);
@@ -412,15 +445,22 @@ seastar::future<> SeaStore::set_secondaries()
SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid)
{
+ LOG_PREFIX(SeaStore::test_mkfs);
+ INFO("uuid={} ...", new_osd_fsid);
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ return read_meta("mkfs_done"
+ ).then([this, new_osd_fsid, FNAME](auto tuple) {
auto [done, value] = tuple;
if (done == 0) {
+ ERROR("failed");
return seastar::now();
}
return shard_stores.local().mkfs_managers(
).then([this, new_osd_fsid] {
return prepare_meta(new_osd_fsid);
+ }).then([FNAME] {
+ INFO("done");
});
});
}
@@ -448,27 +488,29 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid)
SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
{
+ LOG_PREFIX(SeaStore::mkfs);
+ INFO("uuid={}, root={} ...", new_osd_fsid, root);
+
ceph_assert(seastar::this_shard_id() == primary_core);
- return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) {
+ return read_meta("mkfs_done"
+ ).then([this, new_osd_fsid, FNAME](auto tuple) {
auto [done, value] = tuple;
if (done == 0) {
+ ERROR("failed");
return seastar::now();
} else {
return seastar::do_with(
secondary_device_set_t(),
- [this, new_osd_fsid](auto& sds) {
+ [this, new_osd_fsid, FNAME](auto& sds) {
auto fut = seastar::now();
- LOG_PREFIX(SeaStore::mkfs);
- DEBUG("root: {}", root);
if (!root.empty()) {
fut = seastar::open_directory(root
- ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable {
+ ).then([this, &sds, new_osd_fsid, FNAME](seastar::file rdir) mutable {
std::unique_ptr<seastar::file> root_f =
std::make_unique<seastar::file>(std::move(rdir));
auto sub = root_f->list_directory(
- [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<>
+ [this, &sds, new_osd_fsid, FNAME](auto de) mutable -> seastar::future<>
{
- LOG_PREFIX(SeaStore::mkfs);
DEBUG("found file: {}", de.name);
if (de.name.find("block.") == 0
&& de.name.length() > 6 /* 6 for "block." */) {
@@ -533,6 +575,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
return prepare_meta(new_osd_fsid);
}).safe_then([this] {
return umount();
+ }).safe_then([FNAME] {
+ INFO("done");
}).handle_error(
crimson::ct_error::assert_all{
"Invalid error in SeaStore::mkfs"
@@ -542,18 +586,22 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
});
}
-using coll_core_t = FuturizedStore::coll_core_t;
+using coll_core_t = SeaStore::coll_core_t;
seastar::future<std::vector<coll_core_t>>
SeaStore::list_collections()
{
+ LOG_PREFIX(SeaStore::list_collections);
+ DEBUG("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.map([](auto &local_store) {
return local_store.list_collections();
- }).then([](std::vector<std::vector<coll_core_t>> results) {
+ }).then([FNAME](std::vector<std::vector<coll_core_t>> results) {
std::vector<coll_core_t> collections;
for (auto& colls : results) {
collections.insert(collections.end(), colls.begin(), colls.end());
}
+ DEBUG("got {} collections", collections.size());
return seastar::make_ready_future<std::vector<coll_core_t>>(
std::move(collections));
});
@@ -561,14 +609,18 @@ SeaStore::list_collections()
store_statfs_t SeaStore::Shard::stat() const
{
- return transaction_manager->store_stat();
+ LOG_PREFIX(SeaStoreS::stat);
+ auto ss = transaction_manager->store_stat();
+ DEBUG("stat={}", ss);
+ return ss;
}
seastar::future<store_statfs_t> SeaStore::stat() const
{
- ceph_assert(seastar::this_shard_id() == primary_core);
LOG_PREFIX(SeaStore::stat);
- DEBUG("");
+ DEBUG("...");
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
return shard_stores.map_reduce0(
[](const SeaStore::Shard &local_store) {
return local_store.stat();
@@ -578,19 +630,30 @@ seastar::future<store_statfs_t> SeaStore::stat() const
ss.add(ret);
return std::move(ss);
}
- ).then([](store_statfs_t ss) {
+ ).then([FNAME](store_statfs_t ss) {
+ DEBUG("done, stat={}", ss);
return seastar::make_ready_future<store_statfs_t>(std::move(ss));
});
}
seastar::future<store_statfs_t> SeaStore::pool_statfs(int64_t pool_id) const
{
- //TODO
- return SeaStore::stat();
+ LOG_PREFIX(SeaStore::pool_statfs);
+ DEBUG("pool_id={} ...", pool_id);
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ //TODO
+ return SeaStore::stat(
+ ).then([FNAME, pool_id](store_statfs_t ss) {
+ DEBUG("done, pool_id={}, ret={}", pool_id, ss);
+ return seastar::make_ready_future<store_statfs_t>(std::move(ss));
+ });
}
seastar::future<> SeaStore::report_stats()
{
+ LOG_PREFIX(SeaStore::report_stats);
+ DEBUG("...");
+
ceph_assert(seastar::this_shard_id() == primary_core);
shard_device_stats.resize(seastar::smp::count);
shard_io_stats.resize(seastar::smp::count);
@@ -609,8 +672,7 @@ seastar::future<> SeaStore::report_stats()
local_store.get_io_stats(report_detail, seconds);
shard_cache_stats[seastar::this_shard_id()] =
local_store.get_cache_stats(report_detail, seconds);
- }).then([this] {
- LOG_PREFIX(SeaStore);
+ }).then([this, FNAME] {
auto now = seastar::lowres_clock::now();
if (last_tp == seastar::lowres_clock::time_point::min()) {
last_tp = now;
@@ -857,24 +919,26 @@ SeaStore::Shard::list_objects(CollectionRef ch,
"list_objects",
[this, ch, start, end, &limit, &ret](auto &t)
{
+ LOG_PREFIX(SeaStoreS::list_objects);
+ DEBUGT("cid={} start={} end={} limit={} ...",
+ t, ch->get_cid(), start, end, limit);
return get_coll_bits(
ch, t
- ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) {
+ ).si_then([FNAME, this, ch, &t, start, end, &limit, &ret](auto bits) {
if (!bits) {
+ DEBUGT("no bits, return none", t);
return list_iertr::make_ready_future<
OnodeManager::list_onodes_bare_ret
>(std::make_tuple(
std::vector<ghobject_t>(),
ghobject_t::get_max()));
} else {
- LOG_PREFIX(SeaStore::list_objects);
- DEBUGT("start {}, end {}, limit {}, bits {}",
- t, start, end, limit, *bits);
+ DEBUGT("bits={} ...", t, *bits);
auto filter = SeaStore::get_objs_range(ch, *bits);
using list_iertr = OnodeManager::list_onodes_iertr;
using repeat_ret = list_iertr::future<seastar::stop_iteration>;
return trans_intr::repeat(
- [this, &t, &ret, &limit, end,
+ [this, FNAME, &t, &ret, &limit, end,
filter, ranges = get_ranges(ch, start, end, filter)
]() mutable -> repeat_ret {
if (limit == 0 || ranges.empty()) {
@@ -886,11 +950,10 @@ SeaStore::Shard::list_objects(CollectionRef ch,
auto pstart = ite->first;
auto pend = ite->second;
ranges.pop_front();
- LOG_PREFIX(SeaStore::list_objects);
- DEBUGT("pstart {}, pend {}, limit {}", t, pstart, pend, limit);
+ DEBUGT("pstart {}, pend {}, limit {} ...", t, pstart, pend, limit);
return onode_manager->list_onodes(
t, pstart, pend, limit
- ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end]
+ ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end, FNAME]
(auto &&_ret) mutable {
auto &next_objects = std::get<0>(_ret);
auto &ret_objects = std::get<0>(ret);
@@ -901,7 +964,6 @@ SeaStore::Shard::list_objects(CollectionRef ch,
std::get<1>(ret) = std::get<1>(_ret);
assert(limit >= next_objects.size());
limit -= next_objects.size();
- LOG_PREFIX(SeaStore::list_objects);
DEBUGT("got {} objects, left limit {}",
t, next_objects.size(), limit);
assert(limit == 0 ||
@@ -914,10 +976,13 @@ SeaStore::Shard::list_objects(CollectionRef ch,
seastar::stop_iteration
>(seastar::stop_iteration::no);
});
- }).si_then([&ret] {
- return list_iertr::make_ready_future<
- OnodeManager::list_onodes_bare_ret>(std::move(ret));
- });
+ }
+ ).si_then([&ret, FNAME] {
+ DEBUG("got {} objects, next={}",
+ std::get<0>(ret).size(), std::get<1>(ret));
+ return list_iertr::make_ready_future<
+ OnodeManager::list_onodes_bare_ret>(std::move(ret));
+ });
}
});
}).safe_then([&ret](auto&& _ret) {
@@ -927,7 +992,7 @@ SeaStore::Shard::list_objects(CollectionRef ch,
return std::move(ret);
}).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::list_objects"
+ "Invalid error in SeaStoreS::list_objects"
}
);
}).finally([this] {
@@ -939,23 +1004,26 @@ SeaStore::Shard::list_objects(CollectionRef ch,
seastar::future<CollectionRef>
SeaStore::Shard::create_new_collection(const coll_t& cid)
{
- LOG_PREFIX(SeaStore::create_new_collection);
- DEBUG("{}", cid);
+ LOG_PREFIX(SeaStoreS::create_new_collection);
+ DEBUG("cid={}", cid);
return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
}
seastar::future<CollectionRef>
SeaStore::Shard::open_collection(const coll_t& cid)
{
- LOG_PREFIX(SeaStore::open_collection);
- DEBUG("{}", cid);
- return list_collections().then([cid, this] (auto colls_cores) {
+ LOG_PREFIX(SeaStoreS::open_collection);
+ DEBUG("cid={} ...", cid);
+ return list_collections(
+ ).then([cid, this, FNAME] (auto colls_cores) {
if (auto found = std::find(colls_cores.begin(),
colls_cores.end(),
std::make_pair(cid, seastar::this_shard_id()));
found != colls_cores.end()) {
+ DEBUG("cid={} exists", cid);
return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
} else {
+ DEBUG("cid={} not exists", cid);
return seastar::make_ready_future<CollectionRef>();
}
});
@@ -965,6 +1033,8 @@ seastar::future<>
SeaStore::Shard::set_collection_opts(CollectionRef c,
const pool_opts_t& opts)
{
+ LOG_PREFIX(SeaStoreS::set_collection_opts);
+ DEBUG("cid={}, opts={} not implemented", c->get_cid(), opts);
//TODO
return seastar::now();
}
@@ -986,6 +1056,8 @@ SeaStore::Shard::list_collections()
"list_collections",
[this, &ret](auto& t)
{
+ LOG_PREFIX(SeaStoreS::list_collections);
+ DEBUGT("...", t);
return transaction_manager->read_collection_root(t
).si_then([this, &t](auto coll_root) {
return collection_manager->list(coll_root, t);
@@ -1004,7 +1076,7 @@ SeaStore::Shard::list_collections()
}
).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::list_collections"
+ "Invalid error in SeaStoreS::list_collections"
}
).finally([this] {
assert(shard_stats.pending_read_num);
@@ -1012,6 +1084,42 @@ SeaStore::Shard::list_collections()
});
}
+SeaStore::base_iertr::future<ceph::bufferlist>
+SeaStore::Shard::_read(
+ Transaction& t,
+ Onode& onode,
+ uint64_t offset,
+ std::size_t len,
+ uint32_t op_flags)
+{
+ LOG_PREFIX(SeaStoreS::_read);
+ size_t size = onode.get_layout().size;
+ if (offset >= size) {
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none",
+ t, offset, len, size, op_flags);
+ return seastar::make_ready_future<ceph::bufferlist>();
+ }
+
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...",
+ t, offset, len, size, op_flags);
+ size_t corrected_len = (len == 0) ?
+ size - offset :
+ std::min(size - offset, len);
+
+ return ObjectDataHandler(max_object_size).read(
+ ObjectDataHandler::context_t{
+ *transaction_manager,
+ t,
+ onode,
+ },
+ offset,
+ corrected_len
+ ).si_then([FNAME, &t](auto bl) {
+ DEBUGT("got bl length=0x{:x}", t, bl.length());
+ return bl;
+ });
+}
+
SeaStore::Shard::read_errorator::future<ceph::bufferlist>
SeaStore::Shard::read(
CollectionRef ch,
@@ -1020,9 +1128,6 @@ SeaStore::Shard::read(
size_t len,
uint32_t op_flags)
{
- LOG_PREFIX(SeaStore::read);
- DEBUG("oid {} offset {} len {}", oid, offset, len);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1030,29 +1135,11 @@ SeaStore::Shard::read(
ch,
oid,
Transaction::src_t::READ,
- "read_obj",
+ "read",
op_type_t::READ,
- [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret {
- size_t size = onode.get_layout().size;
-
- if (offset >= size) {
- return seastar::make_ready_future<ceph::bufferlist>();
- }
-
- size_t corrected_len = (len == 0) ?
- size - offset :
- std::min(size - offset, len);
-
- return ObjectDataHandler(max_object_size).read(
- ObjectDataHandler::context_t{
- *transaction_manager,
- t,
- onode,
- },
- offset,
- corrected_len);
- }
- ).finally([this] {
+ [this, offset, len, op_flags](auto &t, auto &onode) {
+ return _read(t, onode, offset, len, op_flags);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
@@ -1063,9 +1150,7 @@ SeaStore::Shard::exists(
CollectionRef c,
const ghobject_t& oid)
{
- LOG_PREFIX(SeaStore::exists);
- DEBUG("oid {}", oid);
-
+ LOG_PREFIX(SeaStoreS::exists);
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1073,12 +1158,14 @@ SeaStore::Shard::exists(
c,
oid,
Transaction::src_t::READ,
- "oid_exists",
+ "exists",
op_type_t::READ,
- [](auto&, auto&) {
+ [FNAME](auto& t, auto&) {
+ DEBUGT("exists", t);
return seastar::make_ready_future<bool>(true);
}).handle_error(
- crimson::ct_error::enoent::handle([] {
+ crimson::ct_error::enoent::handle([FNAME] {
+ DEBUG("not exists");
return seastar::make_ready_future<bool>(false);
}),
crimson::ct_error::assert_all{"unexpected error"}
@@ -1095,66 +1182,78 @@ SeaStore::Shard::readv(
interval_set<uint64_t>& m,
uint32_t op_flags)
{
+ LOG_PREFIX(SeaStoreS::readv);
+ DEBUG("cid={} oid={} op_flags=0x{:x} {} intervals",
+ ch->get_cid(), _oid, op_flags, m.num_intervals());
+
return seastar::do_with(
_oid,
ceph::bufferlist{},
- [=, this, &m](auto &oid, auto &ret) {
+ [ch, op_flags, this, FNAME, &m](auto &oid, auto &ret) {
return crimson::do_for_each(
m,
- [=, this, &oid, &ret](auto &p) {
+ [ch, op_flags, this, &oid, &ret](auto &p) {
return read(
ch, oid, p.first, p.second, op_flags
).safe_then([&ret](auto bl) {
ret.claim_append(bl);
});
- }).safe_then([&ret] {
+ }).safe_then([&ret, FNAME] {
+ DEBUG("got bl length=0x{:x}", ret.length());
return read_errorator::make_ready_future<ceph::bufferlist>
(std::move(ret));
});
});
- return read_errorator::make_ready_future<ceph::bufferlist>();
}
using crimson::os::seastore::omap_manager::BtreeOMapManager;
+SeaStore::Shard::_omap_get_value_ret
+SeaStore::Shard::_get_attr(
+ Transaction& t,
+ Onode& onode,
+ std::string_view name) const
+{
+ LOG_PREFIX(SeaStoreS::_get_attr);
+ auto& layout = onode.get_layout();
+ if (name == OI_ATTR && layout.oi_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ if (name == SS_ATTR && layout.ss_size) {
+ ceph::bufferlist bl;
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ DEBUGT("name={} ...", t, name);
+ return _omap_get_value(
+ t,
+ layout.xattr_root.get(
+ onode.get_metadata_hint(device->get_block_size())),
+ name);
+}
+
SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
std::string_view name) const
{
- auto c = static_cast<SeastoreCollection*>(ch.get());
- LOG_PREFIX(SeaStore::get_attr);
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
return repeat_with_onode<ceph::bufferlist>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
"get_attr",
op_type_t::GET_ATTR,
- [=, this](auto &t, auto& onode) -> _omap_get_value_ret {
- auto& layout = onode.get_layout();
- if (name == OI_ATTR && layout.oi_size) {
- ceph::bufferlist bl;
- bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
- return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
- }
- if (name == SS_ATTR && layout.ss_size) {
- ceph::bufferlist bl;
- bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
- return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
- }
- return _omap_get_value(
- t,
- layout.xattr_root.get(
- onode.get_metadata_hint(device->get_block_size())),
- name);
- }
- ).handle_error(
+ [this, name](auto &t, auto& onode) {
+ return _get_attr(t, onode, name);
+ }).handle_error(
crimson::ct_error::input_output_error::assert_failure{
"EIO when getting attrs"},
crimson::ct_error::pass_further_all{}
@@ -1164,48 +1263,53 @@ SeaStore::Shard::get_attr(
});
}
+SeaStore::base_iertr::future<SeaStore::Shard::attrs_t>
+SeaStore::Shard::_get_attrs(
+ Transaction& t,
+ Onode& onode)
+{
+ LOG_PREFIX(SeaStoreS::_get_attrs);
+ DEBUGT("...", t);
+ auto& layout = onode.get_layout();
+ return omap_list(onode, layout.xattr_root, t, std::nullopt,
+ OMapManager::omap_list_config_t()
+ .with_inclusive(false, false)
+ .without_max()
+ ).si_then([&layout, &t, FNAME](auto p) {
+ auto& attrs = std::get<1>(p);
+ DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}",
+ t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size);
+ ceph::bufferlist bl;
+ if (layout.oi_size) {
+ bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
+ attrs.emplace(OI_ATTR, std::move(bl));
+ }
+ if (layout.ss_size) {
+ bl.clear();
+ bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
+ attrs.emplace(SS_ATTR, std::move(bl));
+ }
+ return seastar::make_ready_future<attrs_t>(std::move(attrs));
+ });
+}
+
SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
SeaStore::Shard::get_attrs(
CollectionRef ch,
const ghobject_t& oid)
{
- LOG_PREFIX(SeaStore::get_attrs);
- auto c = static_cast<SeastoreCollection*>(ch.get());
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
return repeat_with_onode<attrs_t>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
- "get_addrs",
+ "get_attrs",
op_type_t::GET_ATTRS,
- [=, this](auto &t, auto& onode) {
- auto& layout = onode.get_layout();
- return omap_list(onode, layout.xattr_root, t, std::nullopt,
- OMapManager::omap_list_config_t()
- .with_inclusive(false, false)
- .without_max()
- ).si_then([&layout, &t, FNAME](auto p) {
- auto& attrs = std::get<1>(p);
- ceph::bufferlist bl;
- if (layout.oi_size) {
- bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size));
- attrs.emplace(OI_ATTR, std::move(bl));
- DEBUGT("set oi from onode layout", t);
- }
- if (layout.ss_size) {
- bl.clear();
- bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size));
- attrs.emplace(SS_ATTR, std::move(bl));
- DEBUGT("set ss from onode layout", t);
- }
- return seastar::make_ready_future<omap_values_t>(std::move(attrs));
- });
- }
- ).handle_error(
+ [this](auto &t, auto& onode) {
+ return _get_attrs(t, onode);
+ }).handle_error(
crimson::ct_error::input_output_error::assert_failure{
"EIO when getting attrs"},
crimson::ct_error::pass_further_all{}
@@ -1215,6 +1319,23 @@ SeaStore::Shard::get_attrs(
});
}
+seastar::future<struct stat> SeaStore::Shard::_stat(
+ Transaction& t,
+ Onode& onode,
+ const ghobject_t& oid)
+{
+ LOG_PREFIX(SeaStoreS::_stat);
+ struct stat st;
+ auto &olayout = onode.get_layout();
+ st.st_size = olayout.size;
+ st.st_blksize = device->get_block_size();
+ st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
+ st.st_nlink = 1;
+ DEBUGT("oid={}, size={}, blksize={}",
+ t, oid, st.st_size, st.st_blksize);
+ return seastar::make_ready_future<struct stat>(st);
+}
+
seastar::future<struct stat> SeaStore::Shard::stat(
CollectionRef c,
const ghobject_t& oid)
@@ -1222,26 +1343,17 @@ seastar::future<struct stat> SeaStore::Shard::stat(
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- LOG_PREFIX(SeaStore::stat);
return repeat_with_onode<struct stat>(
c,
oid,
Transaction::src_t::READ,
"stat",
op_type_t::STAT,
- [=, this](auto &t, auto &onode) {
- struct stat st;
- auto &olayout = onode.get_layout();
- st.st_size = olayout.size;
- st.st_blksize = device->get_block_size();
- st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
- st.st_nlink = 1;
- DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size);
- return seastar::make_ready_future<struct stat>(st);
- }
- ).handle_error(
+ [this, oid](auto &t, auto &onode) {
+ return _stat(t, onode, oid);
+ }).handle_error(
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::stat"
+ "Invalid error in SeaStoreS::stat"
}
).finally([this] {
assert(shard_stats.pending_read_num);
@@ -1257,6 +1369,22 @@ SeaStore::Shard::omap_get_header(
return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
}
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
+SeaStore::Shard::do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const omap_keys_t& keys)
+{
+ LOG_PREFIX(SeaStoreS::do_omap_get_values);
+ DEBUGT("{} keys ...", t, keys.size());
+ omap_root_t omap_root = onode.get_layout().omap_root.get(
+ onode.get_metadata_hint(device->get_block_size()));
+ return _omap_get_values(
+ t,
+ std::move(omap_root),
+ keys);
+}
+
SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
@@ -1266,22 +1394,15 @@ SeaStore::Shard::omap_get_values(
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- auto c = static_cast<SeastoreCollection*>(ch.get());
return repeat_with_onode<omap_values_t>(
- c,
+ ch,
oid,
Transaction::src_t::READ,
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
[this, keys](auto &t, auto &onode) {
- omap_root_t omap_root = onode.get_layout().omap_root.get(
- onode.get_metadata_hint(device->get_block_size()));
- return _omap_get_values(
- t,
- std::move(omap_root),
- keys);
- }
- ).finally([this] {
+ return do_omap_get_values(t, onode, keys);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
@@ -1298,58 +1419,62 @@ SeaStore::Shard::_omap_get_value(
std::move(root),
std::string(key),
[&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret {
- if (root.is_null()) {
+ LOG_PREFIX(SeaStoreS::_omap_get_value);
+ if (root.is_null()) {
+ DEBUGT("key={} is absent because of null root", t, key);
+ return crimson::ct_error::enodata::make();
+ }
+ return manager.omap_get_value(root, t, key
+ ).si_then([&key, &t, FNAME](auto opt) -> _omap_get_value_ret {
+ if (!opt) {
+ DEBUGT("key={} is absent", t, key);
return crimson::ct_error::enodata::make();
}
- return manager.omap_get_value(root, t, key
- ).si_then([](auto opt) -> _omap_get_value_ret {
- if (!opt) {
- return crimson::ct_error::enodata::make();
- }
- return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt));
- });
- }
- );
+ DEBUGT("key={}, value length=0x{:x}", t, key, opt->length());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt));
+ });
+ });
}
-SeaStore::Shard::_omap_get_values_ret
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::_omap_get_values(
Transaction &t,
omap_root_t &&omap_root,
const omap_keys_t &keys) const
{
+ LOG_PREFIX(SeaStoreS::_omap_get_values);
if (omap_root.is_null()) {
+ DEBUGT("{} keys are absent because of null root", t, keys.size());
return seastar::make_ready_future<omap_values_t>();
}
return seastar::do_with(
BtreeOMapManager(*transaction_manager),
std::move(omap_root),
omap_values_t(),
- [&](auto &manager, auto &root, auto &ret) {
- return trans_intr::do_for_each(
- keys.begin(),
- keys.end(),
- [&](auto &key) {
- return manager.omap_get_value(
- root,
- t,
- key
- ).si_then([&ret, &key](auto &&p) {
- if (p) {
- bufferlist bl;
- bl.append(*p);
- ret.emplace(
- std::move(key),
- std::move(bl));
- }
- return seastar::now();
- });
+ [&t, &keys, FNAME](auto &manager, auto &root, auto &ret) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&t, &manager, &root, &ret](auto &key) {
+ return manager.omap_get_value(
+ root,
+ t,
+ key
+ ).si_then([&ret, &key](auto &&p) {
+ if (p) {
+ bufferlist bl;
+ bl.append(*p);
+ ret.emplace(
+ std::move(key),
+ std::move(bl));
}
- ).si_then([&ret] {
- return std::move(ret);
+ return seastar::now();
});
- }
- );
+ }).si_then([&t, &ret, &keys, FNAME] {
+ DEBUGT("{} keys got {} values", t, keys.size(), ret.size());
+ return std::move(ret);
+ });
+ });
}
SeaStore::Shard::omap_list_ret
@@ -1377,51 +1502,74 @@ SeaStore::Shard::omap_list(
});
}
-SeaStore::Shard::omap_get_values_ret_t
+SeaStore::base_iertr::future<SeaStore::Shard::omap_values_paged_t>
+SeaStore::Shard::do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const std::optional<std::string>& start)
+{
+ LOG_PREFIX(SeaStoreS::do_omap_get_values);
+ DEBUGT("start={} ...", t, start.has_value() ? *start : "");
+ return omap_list(
+ onode,
+ onode.get_layout().omap_root,
+ t,
+ start,
+ OMapManager::omap_list_config_t()
+ .with_inclusive(false, false)
+ .without_max()
+ ).si_then([FNAME, &t](omap_values_paged_t ret) {
+ DEBUGT("got {} values, complete={}",
+ t, std::get<1>(ret).size(), std::get<0>(ret));
+ return ret;
+ });
+}
+
+SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<std::string> &start)
{
- auto c = static_cast<SeastoreCollection*>(ch.get());
- LOG_PREFIX(SeaStore::omap_get_values);
- DEBUG("{} {}", c->get_cid(), oid);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- using ret_bare_t = std::tuple<bool, SeaStore::Shard::omap_values_t>;
- return repeat_with_onode<ret_bare_t>(
- c,
+ return repeat_with_onode<omap_values_paged_t>(
+ ch,
oid,
Transaction::src_t::READ,
- "omap_list",
- op_type_t::OMAP_LIST,
+ "omap_get_values2",
+ op_type_t::OMAP_GET_VALUES2,
[this, start](auto &t, auto &onode) {
- return omap_list(
- onode,
- onode.get_layout().omap_root,
- t,
- start,
- OMapManager::omap_list_config_t()
- .with_inclusive(false, false)
- .without_max());
- }
- ).finally([this] {
+ return do_omap_get_values(t, onode, start);
+ }).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
});
}
-SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap(
+SeaStore::base_iertr::future<SeaStore::Shard::fiemap_ret_t>
+SeaStore::Shard::_fiemap(
Transaction &t,
Onode &onode,
uint64_t off,
uint64_t len) const
{
+ LOG_PREFIX(SeaStoreS::_fiemap);
+ size_t size = onode.get_layout().size;
+ if (off >= size) {
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none",
+ t, off, len, size);
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
+ }
+ DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...",
+ t, off, len, size);
+ size_t adjust_len = (len == 0) ?
+ size - off:
+ std::min(size - off, len);
return seastar::do_with(
ObjectDataHandler(max_object_size),
- [=, this, &t, &onode] (auto &objhandler) {
+ [this, off, adjust_len, &t, &onode](auto &objhandler) {
return objhandler.fiemap(
ObjectDataHandler::context_t{
*transaction_manager,
@@ -1429,39 +1577,31 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap(
onode,
},
off,
- len);
+ adjust_len);
+ }).si_then([FNAME, &t](auto ret) {
+ DEBUGT("got {} intervals", t, ret.size());
+ return ret;
});
}
-SeaStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>>
+SeaStore::Shard::read_errorator::future<SeaStore::Shard::fiemap_ret_t>
SeaStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
uint64_t len)
{
- LOG_PREFIX(SeaStore::fiemap);
- DEBUG("oid: {}, off: {}, len: {} ", oid, off, len);
-
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
- return repeat_with_onode<std::map<uint64_t, uint64_t>>(
+ return repeat_with_onode<fiemap_ret_t>(
ch,
oid,
Transaction::src_t::READ,
- "fiemap_read",
+ "fiemap",
op_type_t::READ,
- [=, this](auto &t, auto &onode) -> _fiemap_ret {
- size_t size = onode.get_layout().size;
- if (off >= size) {
- INFOT("fiemap offset is over onode size!", t);
- return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
- }
- size_t adjust_len = (len == 0) ?
- size - off:
- std::min(size - off, len);
- return _fiemap(t, onode, off, adjust_len);
+ [this, off, len](auto &t, auto &onode) {
+ return _fiemap(t, onode, off, len);
}).finally([this] {
assert(shard_stats.pending_read_num);
--(shard_stats.pending_read_num);
@@ -1469,7 +1609,7 @@ SeaStore::Shard::fiemap(
}
void SeaStore::Shard::on_error(ceph::os::Transaction &t) {
- LOG_PREFIX(SeaStore::on_error);
+ LOG_PREFIX(SeaStoreS::on_error);
ERROR(" transaction dump:\n");
JSONFormatter f(true);
f.open_object_section("transaction");
@@ -1490,17 +1630,22 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
++(shard_stats.starting_io_num);
// repeat_with_internal_context ensures ordering via collection lock
+ auto num_bytes = _t.get_num_bytes();
return repeat_with_internal_context(
_ch,
std::move(_t),
Transaction::src_t::MUTATE,
"do_transaction",
- op_type_t::TRANSACTION,
- [this](auto &ctx) {
- return with_trans_intr(*ctx.transaction, [&, this](auto &t) {
- LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks);
- SUBDEBUGT(seastore_t, "start with {} objects",
- t, ctx.iter.objects.size());
+ op_type_t::DO_TRANSACTION,
+ [this, num_bytes](auto &ctx) {
+ LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks);
+ return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) {
+ DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...",
+ t, ctx.ch->get_cid(),
+ ctx.ext_transaction.get_num_ops(),
+ num_bytes,
+ ctx.iter.colls.size(),
+ ctx.iter.objects.size());
#ifndef NDEBUG
TRACET(" transaction dump:\n", t);
JSONFormatter f(true);
@@ -1534,6 +1679,8 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
}).si_then([this, &ctx] {
return transaction_manager->submit_transaction(*ctx.transaction);
});
+ }).safe_then([FNAME, &ctx] {
+ DEBUGT("done", *ctx.transaction);
});
}
).finally([this] {
@@ -1573,27 +1720,31 @@ SeaStore::Shard::_do_transaction_step(
std::vector<OnodeRef> &d_onodes,
ceph::os::Transaction::iterator &i)
{
- LOG_PREFIX(SeaStore::Shard::_do_transaction_step);
+ LOG_PREFIX(SeaStoreS::_do_transaction_step);
auto op = i.decode_op();
- SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op);
using ceph::os::Transaction;
- if (op->op == Transaction::OP_NOP)
+ if (op->op == Transaction::OP_NOP) {
+ DEBUGT("op NOP", *ctx.transaction);
return tm_iertr::now();
+ }
switch (op->op) {
case Transaction::OP_RMCOLL:
{
coll_t cid = i.get_cid(op->cid);
+ DEBUGT("op RMCOLL, cid={} ...", *ctx.transaction, cid);
return _remove_collection(ctx, cid);
}
case Transaction::OP_MKCOLL:
{
coll_t cid = i.get_cid(op->cid);
+ DEBUGT("op MKCOLL, cid={} ...", *ctx.transaction, cid);
return _create_collection(ctx, cid, op->split_bits);
}
case Transaction::OP_COLL_HINT:
{
+ DEBUGT("op COLL_HINT", *ctx.transaction);
ceph::bufferlist hint;
i.decode_bl(hint);
return tm_iertr::now();
@@ -1611,14 +1762,18 @@ SeaStore::Shard::_do_transaction_step(
create = true;
}
if (!onodes[op->oid]) {
+ const ghobject_t& oid = i.get_oid(op->oid);
if (!create) {
- fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op {}, get oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, oid);
+ fut = onode_manager->get_onode(*ctx.transaction, oid);
} else {
- fut = onode_manager->get_or_create_onode(
- *ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op {}, get_or_create oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, oid);
+ fut = onode_manager->get_or_create_onode(*ctx.transaction, oid);
}
}
- return fut.si_then([&, op](auto get_onode) {
+ return fut.si_then([&, op, this, FNAME](auto get_onode) {
OnodeRef &o = onodes[op->oid];
if (!o) {
assert(get_onode);
@@ -1628,11 +1783,13 @@ SeaStore::Shard::_do_transaction_step(
if ((op->op == Transaction::OP_CLONE
|| op->op == Transaction::OP_COLL_MOVE_RENAME)
&& !d_onodes[op->dest_oid]) {
+ const ghobject_t& dest_oid = i.get_oid(op->dest_oid);
+ DEBUGT("op {}, get_or_create dest oid={} ...",
+ *ctx.transaction, (uint32_t)op->op, dest_oid);
//TODO: use when_all_succeed after making onode tree
// support parallel extents loading
- return onode_manager->get_or_create_onode(
- *ctx.transaction, i.get_oid(op->dest_oid)
- ).si_then([&, op](auto dest_onode) {
+ return onode_manager->get_or_create_onode(*ctx.transaction, dest_oid
+ ).si_then([&onodes, &d_onodes, op](auto dest_onode) {
assert(dest_onode);
auto &d_o = onodes[op->dest_oid];
assert(!d_o);
@@ -1644,13 +1801,13 @@ SeaStore::Shard::_do_transaction_step(
} else {
return OnodeManager::get_or_create_onode_iertr::now();
}
- }).si_then([&, op, this]() -> tm_ret {
- LOG_PREFIX(SeaStore::_do_transaction_step);
+ }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret {
+ const ghobject_t& oid = i.get_oid(op->oid);
try {
switch (op->op) {
case Transaction::OP_REMOVE:
{
- TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid));
+ DEBUGT("op REMOVE, oid={} ...", *ctx.transaction, oid);
return _remove(ctx, onodes[op->oid]
).si_then([&onodes, &d_onodes, op] {
onodes[op->oid].reset();
@@ -1660,6 +1817,7 @@ SeaStore::Shard::_do_transaction_step(
case Transaction::OP_CREATE:
case Transaction::OP_TOUCH:
{
+ DEBUGT("op CREATE/TOUCH, oid={} ...", *ctx.transaction, oid);
return _touch(ctx, onodes[op->oid]);
}
case Transaction::OP_WRITE:
@@ -1669,6 +1827,8 @@ SeaStore::Shard::_do_transaction_step(
uint32_t fadvise_flags = i.get_fadvise_flags();
ceph::bufferlist bl;
i.decode_bl(bl);
+ DEBUGT("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...",
+ *ctx.transaction, oid, off, len, fadvise_flags);
return _write(
ctx, onodes[op->oid], off, len, std::move(bl),
fadvise_flags);
@@ -1676,6 +1836,7 @@ SeaStore::Shard::_do_transaction_step(
case Transaction::OP_TRUNCATE:
{
uint64_t off = op->off;
+ DEBUGT("op TRUNCATE, oid={}, 0x{:x} ...", *ctx.transaction, oid, off);
return _truncate(ctx, onodes[op->oid], off);
}
case Transaction::OP_SETATTR:
@@ -1684,80 +1845,96 @@ SeaStore::Shard::_do_transaction_step(
std::map<std::string, bufferlist> to_set;
ceph::bufferlist& bl = to_set[name];
i.decode_bl(bl);
+ DEBUGT("op SETATTR, oid={}, attr name={}, value length=0x{:x} ...",
+ *ctx.transaction, oid, name, bl.length());
return _setattrs(ctx, onodes[op->oid], std::move(to_set));
}
case Transaction::OP_SETATTRS:
{
std::map<std::string, bufferlist> to_set;
i.decode_attrset(to_set);
+ DEBUGT("op SETATTRS, oid={}, attrs size={} ...",
+ *ctx.transaction, oid, to_set.size());
return _setattrs(ctx, onodes[op->oid], std::move(to_set));
}
case Transaction::OP_RMATTR:
{
std::string name = i.decode_string();
+ DEBUGT("op RMATTR, oid={}, attr name={} ...",
+ *ctx.transaction, oid, name);
return _rmattr(ctx, onodes[op->oid], name);
}
case Transaction::OP_RMATTRS:
{
+ DEBUGT("op RMATTRS, oid={} ...", *ctx.transaction, oid);
return _rmattrs(ctx, onodes[op->oid]);
}
case Transaction::OP_OMAP_SETKEYS:
{
std::map<std::string, ceph::bufferlist> aset;
i.decode_attrset(aset);
+ DEBUGT("op OMAP_SETKEYS, oid={}, omap size={} ...",
+ *ctx.transaction, oid, aset.size());
return _omap_set_values(ctx, onodes[op->oid], std::move(aset));
}
case Transaction::OP_OMAP_SETHEADER:
{
ceph::bufferlist bl;
i.decode_bl(bl);
+ DEBUGT("op OMAP_SETHEADER, oid={}, length=0x{:x} ...",
+ *ctx.transaction, oid, bl.length());
return _omap_set_header(ctx, onodes[op->oid], std::move(bl));
}
case Transaction::OP_OMAP_RMKEYS:
{
omap_keys_t keys;
i.decode_keyset(keys);
+ DEBUGT("op OMAP_RMKEYS, oid={}, omap size={} ...",
+ *ctx.transaction, oid, keys.size());
return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys));
}
case Transaction::OP_OMAP_RMKEYRANGE:
{
- string first, last;
+ std::string first, last;
first = i.decode_string();
last = i.decode_string();
+ DEBUGT("op OMAP_RMKEYRANGE, oid={}, first={}, last={} ...",
+ *ctx.transaction, oid, first, last);
return _omap_rmkeyrange(
ctx, onodes[op->oid],
std::move(first), std::move(last));
}
case Transaction::OP_OMAP_CLEAR:
{
+ DEBUGT("op OMAP_CLEAR, oid={} ...", *ctx.transaction, oid);
return _omap_clear(ctx, onodes[op->oid]);
}
case Transaction::OP_ZERO:
{
objaddr_t off = op->off;
extent_len_t len = op->len;
+ DEBUGT("op ZERO, oid={}, 0x{:x}~0x{:x} ...",
+ *ctx.transaction, oid, off, len);
return _zero(ctx, onodes[op->oid], off, len);
}
case Transaction::OP_SETALLOCHINT:
{
+ DEBUGT("op SETALLOCHINT, oid={}, not implemented",
+ *ctx.transaction, oid);
// TODO
return tm_iertr::now();
}
case Transaction::OP_CLONE:
{
- TRACET("cloning {} to {}",
- *ctx.transaction,
- i.get_oid(op->oid),
- i.get_oid(op->dest_oid));
+ DEBUGT("op CLONE, oid={}, dest oid={} ...",
+ *ctx.transaction, oid, i.get_oid(op->dest_oid));
return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]);
}
case Transaction::OP_COLL_MOVE_RENAME:
{
+ DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...",
+ *ctx.transaction, oid, i.get_oid(op->dest_oid));
ceph_assert(op->cid == op->dest_cid);
- TRACET("renaming {} to {}",
- *ctx.transaction,
- i.get_oid(op->oid),
- i.get_oid(op->dest_oid));
return _rename(
ctx, onodes[op->oid], d_onodes[op->dest_oid]
).si_then([&onodes, &d_onodes, op] {
@@ -1793,7 +1970,7 @@ SeaStore::Shard::_do_transaction_step(
return seastar::now();
}),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::do_transaction_step"
+ "Invalid error in SeaStoreS::do_transaction_step"
}
);
}
@@ -1829,7 +2006,7 @@ SeaStore::Shard::_rename(
).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_rename"}
+ "Invalid error in SeaStoreS::_rename"}
);
}
@@ -1850,7 +2027,7 @@ SeaStore::Shard::_remove_omaps(
).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_remove"
+ "Invalid error in SeaStoreS::_remove_omaps"
}
);
});
@@ -1863,8 +2040,6 @@ SeaStore::Shard::_remove(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_remove);
- DEBUGT("onode={}", *ctx.transaction, *onode);
return _remove_omaps(
ctx,
onode,
@@ -1892,7 +2067,7 @@ SeaStore::Shard::_remove(
}).handle_error_interruptible(
crimson::ct_error::input_output_error::pass_further(),
crimson::ct_error::assert_all(
- "Invalid error in SeaStore::_remove"
+ "Invalid error in SeaStoreS::_remove"
)
);
}
@@ -1902,8 +2077,6 @@ SeaStore::Shard::_touch(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_touch);
- DEBUGT("onode={}", *ctx.transaction, *onode);
return tm_iertr::now();
}
@@ -1915,8 +2088,6 @@ SeaStore::Shard::_write(
ceph::bufferlist &&_bl,
uint32_t fadvise_flags)
{
- LOG_PREFIX(SeaStore::_write);
- DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
const auto &object_size = onode->get_layout().size;
if (offset + len > object_size) {
onode->update_onode_size(
@@ -2007,8 +2178,6 @@ SeaStore::Shard::_clone(
OnodeRef &onode,
OnodeRef &d_onode)
{
- LOG_PREFIX(SeaStore::_clone);
- DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode);
return seastar::do_with(
ObjectDataHandler(max_object_size),
[this, &ctx, &onode, &d_onode](auto &objHandler) {
@@ -2034,9 +2203,10 @@ SeaStore::Shard::_zero(
objaddr_t offset,
extent_len_t len)
{
- LOG_PREFIX(SeaStore::_zero);
- DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len);
if (offset + len >= max_object_size) {
+ LOG_PREFIX(SeaStoreS::_zero);
+ ERRORT("0x{:x}~0x{:x} >= 0x{:x}",
+ *ctx.transaction, offset, len, max_object_size);
return crimson::ct_error::input_output_error::make();
}
const auto &object_size = onode->get_layout().size;
@@ -2092,8 +2262,6 @@ SeaStore::Shard::_omap_set_values(
OnodeRef &onode,
std::map<std::string, ceph::bufferlist> &&aset)
{
- LOG_PREFIX(SeaStore::_omap_set_values);
- DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size());
return _omap_set_kvs(
onode,
onode->get_layout().omap_root,
@@ -2112,8 +2280,6 @@ SeaStore::Shard::_omap_set_header(
OnodeRef &onode,
ceph::bufferlist &&header)
{
- LOG_PREFIX(SeaStore::_omap_set_header);
- DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length());
std::map<std::string, bufferlist> to_set;
to_set[OMAP_HEADER_XATTR_KEY] = header;
return _setattrs(ctx, onode,std::move(to_set));
@@ -2124,10 +2290,8 @@ SeaStore::Shard::_omap_clear(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_omap_clear);
- DEBUGT("{} {} keys", *ctx.transaction, *onode);
- return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY))
- .si_then([this, &ctx, &onode]() -> tm_ret {
+ return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)
+ ).si_then([this, &ctx, &onode]() -> tm_ret {
if (auto omap_root = onode->get_layout().omap_root.get(
onode->get_metadata_hint(device->get_block_size()));
omap_root.is_null()) {
@@ -2142,8 +2306,8 @@ SeaStore::Shard::_omap_clear(
auto &omap_root) {
return omap_manager.omap_clear(
omap_root,
- *ctx.transaction)
- .si_then([&] {
+ *ctx.transaction
+ ).si_then([&] {
if (omap_root.must_update()) {
onode->update_omap_root(*ctx.transaction, omap_root);
}
@@ -2159,8 +2323,6 @@ SeaStore::Shard::_omap_rmkeys(
OnodeRef &onode,
omap_keys_t &&keys)
{
- LOG_PREFIX(SeaStore::_omap_rmkeys);
- DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size());
auto omap_root = onode->get_layout().omap_root.get(
onode->get_metadata_hint(device->get_block_size()));
if (omap_root.is_null()) {
@@ -2201,10 +2363,9 @@ SeaStore::Shard::_omap_rmkeyrange(
std::string first,
std::string last)
{
- LOG_PREFIX(SeaStore::_omap_rmkeyrange);
- DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last);
if (first > last) {
- ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last);
+ LOG_PREFIX(SeaStoreS::_omap_rmkeyrange);
+ ERRORT("range error, first:{} > last:{}", *ctx.transaction, first, last);
ceph_abort();
}
auto omap_root = onode->get_layout().omap_root.get(
@@ -2247,8 +2408,6 @@ SeaStore::Shard::_truncate(
OnodeRef &onode,
uint64_t size)
{
- LOG_PREFIX(SeaStore::_truncate);
- DEBUGT("onode={} size={}", *ctx.transaction, *onode, size);
onode->update_onode_size(*ctx.transaction, size);
return seastar::do_with(
ObjectDataHandler(max_object_size),
@@ -2269,9 +2428,7 @@ SeaStore::Shard::_setattrs(
OnodeRef &onode,
std::map<std::string, bufferlist>&& aset)
{
- LOG_PREFIX(SeaStore::_setattrs);
- DEBUGT("onode={}", *ctx.transaction, *onode);
-
+ LOG_PREFIX(SeaStoreS::_setattrs);
auto fut = tm_iertr::now();
auto& layout = onode->get_layout();
if (auto it = aset.find(OI_ATTR); it != aset.end()) {
@@ -2333,8 +2490,6 @@ SeaStore::Shard::_rmattr(
OnodeRef &onode,
std::string name)
{
- LOG_PREFIX(SeaStore::_rmattr);
- DEBUGT("onode={}", *ctx.transaction, *onode);
auto& layout = onode->get_layout();
if ((name == OI_ATTR) && (layout.oi_size > 0)) {
onode->clear_object_info(*ctx.transaction);
@@ -2356,7 +2511,7 @@ SeaStore::Shard::_xattr_rmattr(
OnodeRef &onode,
std::string &&name)
{
- LOG_PREFIX(SeaStore::_xattr_rmattr);
+ LOG_PREFIX(SeaStoreS::_xattr_rmattr);
DEBUGT("onode={}", *ctx.transaction, *onode);
auto xattr_root = onode->get_layout().xattr_root.get(
onode->get_metadata_hint(device->get_block_size()));
@@ -2384,8 +2539,6 @@ SeaStore::Shard::_rmattrs(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_rmattrs);
- DEBUGT("onode={}", *ctx.transaction, *onode);
onode->clear_object_info(*ctx.transaction);
onode->clear_snapset(*ctx.transaction);
return _xattr_clear(ctx, onode);
@@ -2396,7 +2549,7 @@ SeaStore::Shard::_xattr_clear(
internal_context_t &ctx,
OnodeRef &onode)
{
- LOG_PREFIX(SeaStore::_xattr_clear);
+ LOG_PREFIX(SeaStoreS::_xattr_clear);
DEBUGT("onode={}", *ctx.transaction, *onode);
auto xattr_root = onode->get_layout().xattr_root.get(
onode->get_metadata_hint(device->get_block_size()));
@@ -2446,7 +2599,7 @@ SeaStore::Shard::_create_collection(
}).handle_error_interruptible(
tm_iertr::pass_further{},
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_create_collection"
+ "Invalid error in SeaStoreS::_create_collection"
}
);
}
@@ -2478,7 +2631,7 @@ SeaStore::Shard::_remove_collection(
}).handle_error_interruptible(
tm_iertr::pass_further{},
crimson::ct_error::assert_all{
- "Invalid error in SeaStore::_create_collection"
+ "Invalid error in SeaStoreS::_create_collection"
}
);
}
@@ -2489,40 +2642,53 @@ SeaStore::Shard::_get_collection(const coll_t& cid)
return new SeastoreCollection{cid};
}
+seastar::future<> SeaStore::write_meta(
+ const std::string& key,
+ const std::string& value) {
+ LOG_PREFIX(SeaStore::write_meta);
+ DEBUG("key={} value={} ...", key, value);
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return seastar::do_with(key, value,
+ [this, FNAME](auto& key, auto& value) {
+ return shard_stores.local().write_meta(key, value
+ ).then([this, &key, &value] {
+ return mdstore->write_meta(key, value);
+ }).safe_then([FNAME, &key, &value] {
+ DEBUG("key={} value={} done", key, value);
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ );
+ });
+}
+
seastar::future<> SeaStore::Shard::write_meta(
const std::string& key,
const std::string& value)
{
- LOG_PREFIX(SeaStore::write_meta);
- DEBUG("key: {}; value: {}", key, value);
-
++(shard_stats.io_num);
++(shard_stats.pending_io_num);
// For TM::submit_transaction()
++(shard_stats.processing_inlock_io_num);
- return seastar::do_with(
- key, value,
- [this, FNAME](auto& key, auto& value) {
- return repeat_eagain([this, FNAME, &key, &value] {
- ++(shard_stats.repeat_io_num);
-
- return transaction_manager->with_transaction_intr(
- Transaction::src_t::MUTATE,
- "write_meta",
- [this, FNAME, &key, &value](auto& t)
- {
- DEBUGT("Have transaction, key: {}; value: {}", t, key, value);
- return transaction_manager->update_root_meta(
- t, key, value
- ).si_then([this, &t] {
- return transaction_manager->submit_transaction(t);
- });
- });
- });
- }
- ).handle_error(
- crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
+ return repeat_eagain([this, &key, &value] {
+ ++(shard_stats.repeat_io_num);
+
+ return transaction_manager->with_transaction_intr(
+ Transaction::src_t::MUTATE,
+ "write_meta",
+ [this, &key, &value](auto& t)
+ {
+ LOG_PREFIX(SeaStoreS::write_meta);
+ DEBUGT("key={} value={} ...", t, key, value);
+ return transaction_manager->update_root_meta(
+ t, key, value
+ ).si_then([this, &t] {
+ return transaction_manager->submit_transaction(t);
+ });
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{"Invalid error in SeaStoreS::write_meta"}
).finally([this] {
assert(shard_stats.pending_io_num);
--(shard_stats.pending_io_num);
@@ -2535,13 +2701,17 @@ seastar::future<> SeaStore::Shard::write_meta(
seastar::future<std::tuple<int, std::string>>
SeaStore::read_meta(const std::string& key)
{
- ceph_assert(seastar::this_shard_id() == primary_core);
LOG_PREFIX(SeaStore::read_meta);
- DEBUG("key: {}", key);
- return mdstore->read_meta(key).safe_then([](auto v) {
+ DEBUG("key={} ...", key);
+
+ ceph_assert(seastar::this_shard_id() == primary_core);
+ return mdstore->read_meta(key
+ ).safe_then([key, FNAME](auto v) {
if (v) {
+ DEBUG("key={}, value={}", key, *v);
return std::make_tuple(0, std::move(*v));
} else {
+ ERROR("key={} failed", key);
return std::make_tuple(-1, std::string(""));
}
}).handle_error(
@@ -2598,7 +2768,7 @@ shard_stats_t SeaStore::Shard::get_io_stats(
ret.minus(last_shard_stats);
if (report_detail && seconds != 0) {
- LOG_PREFIX(SeaStore::get_io_stats);
+ LOG_PREFIX(SeaStoreS::get_io_stats);
auto calc_conflicts = [](uint64_t ios, uint64_t repeats) {
return (double)(repeats-ios)/ios;
};
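
The new DEBUGT/ERRORT lines in seastore.cc above standardize on hex output for offsets, lengths and flags via "0x{:x}" placeholders and mark the start of an operation with a trailing "...". A minimal, self-contained sketch of that formatting convention, assuming only fmt-style formatting (which the "{:x}" placeholders suggest); the oid/off/len values are illustrative, not taken from the patch.

#include <cstdint>
#include <iostream>
#include <fmt/format.h>

int main() {
  std::uint64_t off = 4096, len = 8192;
  std::uint32_t fadvise_flags = 0x20;
  // Same convention as the new DEBUGT lines: hex for sizes/offsets/flags,
  // "off~len" for ranges, and a trailing "..." marking the start of an op.
  std::cout << fmt::format("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...",
                           "example_oid", off, len, fadvise_flags)
            << std::endl;
  return 0;
}

Printing consistently in hex keeps the store-level logs aligned with the extent and address printers, which also moved to hex in this series.
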
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index fb495a422f6..185072744f2 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -35,14 +35,14 @@ using OnodeRef = boost::intrusive_ptr<Onode>;
class TransactionManager;
enum class op_type_t : uint8_t {
- TRANSACTION = 0,
+ DO_TRANSACTION = 0,
READ,
WRITE,
GET_ATTR,
GET_ATTRS,
STAT,
OMAP_GET_VALUES,
- OMAP_LIST,
+ OMAP_GET_VALUES2,
MAX
};
@@ -71,20 +71,19 @@ struct col_obj_ranges_t {
class SeaStore final : public FuturizedStore {
public:
+ using base_ertr = TransactionManager::base_ertr;
+ using base_iertr = TransactionManager::base_iertr;
+
class MDStore {
public:
- using base_iertr = crimson::errorator<
- crimson::ct_error::input_output_error
- >;
-
- using write_meta_ertr = base_iertr;
+ using write_meta_ertr = base_ertr;
using write_meta_ret = write_meta_ertr::future<>;
virtual write_meta_ret write_meta(
const std::string &key,
const std::string &val
) = 0;
- using read_meta_ertr = base_iertr;
+ using read_meta_ertr = base_ertr;
using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>;
virtual read_meta_ret read_meta(const std::string &key) = 0;
@@ -136,10 +135,7 @@ public:
const omap_keys_t& keys) final;
/// Retrieves paged set of values > start (if present)
- using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>;
- using omap_get_values_ret_t = read_errorator::future<
- omap_get_values_ret_bare_t>;
- omap_get_values_ret_t omap_get_values(
+ read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
const std::optional<std::string> &start ///< [in] start, empty for begin
@@ -170,7 +166,7 @@ public:
* stages and locks as do_transaction. */
seastar::future<> flush(CollectionRef ch) final;
- read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ read_errorator::future<fiemap_ret_t> fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
@@ -190,7 +186,6 @@ public:
secondaries.emplace_back(&sec_dev);
}
- using coll_core_t = FuturizedStore::coll_core_t;
seastar::future<std::vector<coll_core_t>> list_collections();
seastar::future<> write_meta(const std::string& key,
@@ -305,18 +300,21 @@ public:
auto begin_time = std::chrono::steady_clock::now();
return seastar::do_with(
oid, Ret{}, std::forward<F>(f),
- [this, src, op_type, begin_time, tname
+ [this, ch, src, op_type, begin_time, tname
](auto &oid, auto &ret, auto &f)
{
- return repeat_eagain([&, this, src, tname] {
+ return repeat_eagain([&, this, ch, src, tname] {
assert(src == Transaction::src_t::READ);
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
src,
tname,
- [&, this](auto& t)
+ [&, this, ch, tname](auto& t)
{
+ LOG_PREFIX(SeaStoreS::repeat_with_onode);
+ SUBDEBUGT(seastore, "{} cid={} oid={} ...",
+ t, tname, ch->get_cid(), oid);
return onode_manager->get_onode(t, oid
).si_then([&](auto onode) {
return seastar::do_with(std::move(onode), [&](auto& onode) {
@@ -334,14 +332,16 @@ public:
});
}
- using _fiemap_ret = ObjectDataHandler::fiemap_ret;
- _fiemap_ret _fiemap(
- Transaction &t,
- Onode &onode,
- uint64_t off,
- uint64_t len) const;
+ using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
+ using omap_list_ret = OMapManager::omap_list_ret;
+ omap_list_ret omap_list(
+ Onode& onode,
+ const omap_root_le_t& omap_root,
+ Transaction& t,
+ const std::optional<std::string>& start,
+ OMapManager::omap_list_config_t config) const;
- using _omap_get_value_iertr = OMapManager::base_iertr::extend<
+ using _omap_get_value_iertr = base_iertr::extend<
crimson::ct_error::enodata
>;
using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>;
@@ -350,25 +350,51 @@ public:
omap_root_t &&root,
std::string_view key) const;
- using _omap_get_values_iertr = OMapManager::base_iertr;
- using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>;
- _omap_get_values_ret _omap_get_values(
+ base_iertr::future<omap_values_t> _omap_get_values(
Transaction &t,
omap_root_t &&root,
const omap_keys_t &keys) const;
friend class SeaStoreOmapIterator;
- using omap_list_bare_ret = OMapManager::omap_list_bare_ret;
- using omap_list_ret = OMapManager::omap_list_ret;
- omap_list_ret omap_list(
- Onode &onode,
- const omap_root_le_t& omap_root,
+ base_iertr::future<ceph::bufferlist> _read(
Transaction& t,
- const std::optional<std::string>& start,
- OMapManager::omap_list_config_t config) const;
+ Onode& onode,
+ uint64_t offset,
+ std::size_t len,
+ uint32_t op_flags);
+
+ _omap_get_value_ret _get_attr(
+ Transaction& t,
+ Onode& onode,
+ std::string_view name) const;
+
+ base_iertr::future<attrs_t> _get_attrs(
+ Transaction& t,
+ Onode& onode);
+
+ seastar::future<struct stat> _stat(
+ Transaction& t,
+ Onode& onode,
+ const ghobject_t& oid);
+
+ base_iertr::future<omap_values_t> do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const omap_keys_t& keys);
- using tm_iertr = TransactionManager::base_iertr;
+ base_iertr::future<omap_values_paged_t> do_omap_get_values(
+ Transaction& t,
+ Onode& onode,
+ const std::optional<std::string>& start);
+
+ base_iertr::future<fiemap_ret_t> _fiemap(
+ Transaction &t,
+ Onode &onode,
+ uint64_t off,
+ uint64_t len) const;
+
+ using tm_iertr = base_iertr;
using tm_ret = tm_iertr::future<>;
tm_ret _do_transaction_step(
internal_context_t &ctx,
@@ -535,17 +561,7 @@ public:
return shard_stores.local().get_fsid();
}
- seastar::future<> write_meta(
- const std::string& key,
- const std::string& value) final {
- ceph_assert(seastar::this_shard_id() == primary_core);
- return shard_stores.local().write_meta(
- key, value).then([this, key, value] {
- return mdstore->write_meta(key, value);
- }).handle_error(
- crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"}
- );
- }
+ seastar::future<> write_meta(const std::string& key, const std::string& value) final;
seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index e1430b30019..f379dd0117c 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -54,7 +54,9 @@ std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id)
} else if (_id == DEVICE_ID_ROOT) {
return out << "Dev(ROOT)";
} else {
- return out << "Dev(" << (unsigned)_id << ")";
+ return out << "Dev(0x"
+ << std::hex << (unsigned)_id << std::dec
+ << ")";
}
}
@@ -64,7 +66,7 @@ std::ostream &operator<<(std::ostream &out, const segment_id_t &segment)
return out << "Seg[NULL]";
} else {
return out << "Seg[" << device_id_printer_t{segment.device_id()}
- << "," << segment.device_segment_id()
+ << ",0x" << std::hex << segment.device_segment_id() << std::dec
<< "]";
}
}
@@ -93,12 +95,12 @@ std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq)
}
std::ostream &operator<<(std::ostream &out, const laddr_t &laddr) {
- return out << 'L' << std::hex << laddr.value << std::dec;
+ return out << "L0x" << std::hex << laddr.value << std::dec;
}
std::ostream &operator<<(std::ostream &out, const laddr_offset_t &laddr_offset) {
return out << laddr_offset.get_aligned_laddr()
- << "+" << std::hex << laddr_offset.get_offset() << std::dec;
+ << "+0x" << std::hex << laddr_offset.get_offset() << std::dec;
}
std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr)
@@ -123,18 +125,18 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs)
} else if (has_device_off(id)) {
auto &s = rhs.as_res_paddr();
out << device_id_printer_t{id}
- << ","
- << s.get_device_off();
+ << ",0x"
+ << std::hex << s.get_device_off() << std::dec;
} else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) {
auto &s = rhs.as_seg_paddr();
out << s.get_segment_id()
- << ","
- << s.get_segment_off();
+ << ",0x"
+ << std::hex << s.get_segment_off() << std::dec;
} else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
auto &s = rhs.as_blk_paddr();
out << device_id_printer_t{s.get_device_id()}
- << ","
- << s.get_device_off();
+ << ",0x"
+ << std::hex << s.get_device_off() << std::dec;
} else {
out << "INVALID!";
}
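
The operator<< changes above switch device ids, segment offsets and logical addresses to hex with an explicit "0x" prefix. A small stand-alone sketch of the stream-manipulator pattern those printers use, with made-up values; the point is only the std::hex/std::dec bracketing that restores decimal output for later fields.

#include <cstdint>
#include <iostream>

int main() {
  unsigned device_id = 2;
  std::uint64_t device_off = 0x3f000;
  // Mirror of the new printers: prefix "0x", switch the stream to hex for
  // the value, then restore decimal so subsequent output is unaffected.
  std::cout << "Dev(0x" << std::hex << device_id << std::dec << ")"
            << ",0x" << std::hex << device_off << std::dec
            << std::endl;
  return 0;
}
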
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 52515937a9e..5d8ad00ba22 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -80,6 +80,11 @@ struct rewrite_stats_t {
}
};
+struct rbm_pending_ool_t {
+ bool is_conflicted = false;
+ std::list<CachedExtentRef> pending_extents;
+};
+
/**
* Transaction
*
@@ -554,6 +559,18 @@ public:
return static_cast<T&>(*view);
}
+ void set_pending_ool(seastar::lw_shared_ptr<rbm_pending_ool_t> ptr) {
+ pending_ool = ptr;
+ }
+
+ seastar::lw_shared_ptr<rbm_pending_ool_t> get_pending_ool() {
+ return pending_ool;
+ }
+
+ const auto& get_pre_alloc_list() {
+ return pre_alloc_list;
+ }
+
private:
friend class Cache;
friend Ref make_test_transaction();
@@ -650,6 +667,8 @@ private:
const src_t src;
transaction_id_t trans_id = TRANS_ID_NULL;
+
+ seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool;
};
using TransactionRef = Transaction::Ref;
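
The rbm_pending_ool_t addition lets a Transaction carry a shared, mutable record of pending out-of-line (OOL) extents and a conflict flag. A rough sketch of that ownership pattern, with std::shared_ptr standing in for seastar::lw_shared_ptr and PendingOol/Txn as invented stand-ins for the real types; it only shows that the writer path and the transaction observe the same state.

#include <iostream>
#include <list>
#include <memory>
#include <string>

// Stand-in for rbm_pending_ool_t: a conflict flag plus pending extents.
struct PendingOol {
  bool is_conflicted = false;
  std::list<std::string> pending_extents;  // CachedExtentRef in the real code
};

// Stand-in for Transaction: holds a shared pointer so whoever set it and the
// transaction itself see the same pending-OOL state.
struct Txn {
  void set_pending_ool(std::shared_ptr<PendingOol> p) { pending_ool = std::move(p); }
  std::shared_ptr<PendingOol> get_pending_ool() const { return pending_ool; }
private:
  std::shared_ptr<PendingOol> pending_ool;
};

int main() {
  Txn t;
  auto ool = std::make_shared<PendingOol>();
  t.set_pending_ool(ool);
  ool->pending_extents.push_back("extent-A");  // writer records an OOL extent
  std::cout << t.get_pending_ool()->pending_extents.size() << std::endl;  // prints 1
  return 0;
}
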
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index a76b7fbe0c9..f4e3b0858f2 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -48,7 +48,7 @@ TransactionManager::TransactionManager(
TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
{
LOG_PREFIX(TransactionManager::mkfs);
- INFO("enter");
+ INFO("...");
return epm->mount(
).safe_then([this] {
return journal->open_for_mkfs();
@@ -94,14 +94,15 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
}).safe_then([this] {
return close();
}).safe_then([FNAME] {
- INFO("completed");
+ INFO("done");
});
}
-TransactionManager::mount_ertr::future<> TransactionManager::mount()
+TransactionManager::mount_ertr::future<>
+TransactionManager::mount()
{
LOG_PREFIX(TransactionManager::mount);
- INFO("enter");
+ INFO("...");
cache->init();
return epm->mount(
).safe_then([this] {
@@ -168,16 +169,17 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount()
return epm->open_for_write();
}).safe_then([FNAME, this] {
epm->start_background();
- INFO("completed");
+ INFO("done");
}).handle_error(
mount_ertr::pass_further{},
crimson::ct_error::assert_all{"unhandled error"}
);
}
-TransactionManager::close_ertr::future<> TransactionManager::close() {
+TransactionManager::close_ertr::future<>
+TransactionManager::close() {
LOG_PREFIX(TransactionManager::close);
- INFO("enter");
+ INFO("...");
return epm->stop_background(
).then([this] {
return cache->close();
@@ -187,7 +189,7 @@ TransactionManager::close_ertr::future<> TransactionManager::close() {
}).safe_then([this] {
return epm->close();
}).safe_then([FNAME] {
- INFO("completed");
+ INFO("done");
return seastar::now();
});
}
@@ -229,28 +231,26 @@ TransactionManager::ref_ret TransactionManager::remove(
LogicalCachedExtentRef &ref)
{
LOG_PREFIX(TransactionManager::remove);
- TRACET("{}", t, *ref);
+ DEBUGT("{} ...", t, *ref);
return lba_manager->decref_extent(t, ref->get_laddr()
).si_then([this, FNAME, &t, ref](auto result) {
- DEBUGT("extent refcount is decremented to {} -- {}",
- t, result.refcount, *ref);
if (result.refcount == 0) {
cache->retire_extent(t, ref);
}
+ DEBUGT("removed {}~0x{:x} refcount={} -- {}",
+ t, result.addr, result.length, result.refcount, *ref);
return result.refcount;
});
}
-TransactionManager::ref_ret TransactionManager::_dec_ref(
+TransactionManager::ref_ret TransactionManager::remove(
Transaction &t,
laddr_t offset)
{
- LOG_PREFIX(TransactionManager::_dec_ref);
- TRACET("{}", t, offset);
+ LOG_PREFIX(TransactionManager::remove);
+ DEBUGT("{} ...", t, offset);
return lba_manager->decref_extent(t, offset
).si_then([this, FNAME, offset, &t](auto result) -> ref_ret {
- DEBUGT("extent refcount is decremented to {} -- {}~{}, {}",
- t, result.refcount, offset, result.length, result.addr);
auto fut = ref_iertr::now();
if (result.refcount == 0) {
if (result.addr.is_paddr() &&
@@ -259,8 +259,9 @@ TransactionManager::ref_ret TransactionManager::_dec_ref(
t, result.addr.get_paddr(), result.length);
}
}
-
- return fut.si_then([result=std::move(result)] {
+ return fut.si_then([result=std::move(result), offset, &t, FNAME] {
+ DEBUGT("removed {}~0x{:x} refcount={} -- offset={}",
+ t, result.addr, result.length, result.refcount, offset);
return result.refcount;
});
});
@@ -271,19 +272,21 @@ TransactionManager::refs_ret TransactionManager::remove(
std::vector<laddr_t> offsets)
{
LOG_PREFIX(TransactionManager::remove);
- DEBUG("{} offsets", offsets.size());
+ DEBUGT("{} offsets ...", t, offsets.size());
return seastar::do_with(std::move(offsets), std::vector<unsigned>(),
- [this, &t] (auto &&offsets, auto &refcnt) {
- return trans_intr::do_for_each(offsets.begin(), offsets.end(),
- [this, &t, &refcnt] (auto &laddr) {
- return this->remove(t, laddr).si_then([&refcnt] (auto ref) {
- refcnt.push_back(ref);
- return ref_iertr::now();
- });
- }).si_then([&refcnt] {
- return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnt));
+ [this, &t, FNAME](auto &&offsets, auto &refcnts) {
+ return trans_intr::do_for_each(offsets.begin(), offsets.end(),
+ [this, &t, &refcnts](auto &laddr) {
+ return this->remove(t, laddr
+ ).si_then([&refcnts](auto ref) {
+ refcnts.push_back(ref);
+ return ref_iertr::now();
});
+ }).si_then([&refcnts, &t, FNAME] {
+ DEBUGT("removed {} offsets", t, refcnts.size());
+ return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnts));
});
+ });
}
TransactionManager::submit_transaction_iertr::future<>
@@ -340,6 +343,7 @@ TransactionManager::update_lba_mappings(
return;
}
if (extent->is_logical()) {
+ assert(is_logical_type(extent->get_type()));
// for rewritten extents, last_committed_crc should have been set
// because the crc of the original extent may be reused.
// also see rewrite_logical_extent()
@@ -359,6 +363,7 @@ TransactionManager::update_lba_mappings(
#endif
lextents.emplace_back(extent->template cast<LogicalCachedExtent>());
} else {
+ assert(is_physical_type(extent->get_type()));
pextents.emplace_back(extent);
}
};
@@ -515,7 +520,6 @@ TransactionManager::rewrite_logical_extent(
ERRORT("extent has been invalidated -- {}", t, *extent);
ceph_abort();
}
- TRACET("rewriting extent -- {}", t, *extent);
auto lextent = extent->cast<LogicalCachedExtent>();
cache->retire_extent(t, extent);
@@ -529,7 +533,7 @@ TransactionManager::rewrite_logical_extent(
lextent->get_rewrite_generation())->cast<LogicalCachedExtent>();
nlextent->rewrite(t, *lextent, 0);
- DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent);
#ifndef NDEBUG
if (get_checksum_needed(lextent->get_paddr())) {
@@ -566,16 +570,16 @@ TransactionManager::rewrite_logical_extent(
0,
lextent->get_length(),
extent_ref_count_t(0),
- [this, lextent, &t](auto &extents, auto &off, auto &left, auto &refcount) {
+ [this, FNAME, lextent, &t]
+ (auto &extents, auto &off, auto &left, auto &refcount) {
return trans_intr::do_for_each(
extents,
- [lextent, this, &t, &off, &left, &refcount](auto &nextent) {
- LOG_PREFIX(TransactionManager::rewrite_logical_extent);
+ [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) {
bool first_extent = (off == 0);
ceph_assert(left >= nextent->get_length());
auto nlextent = nextent->template cast<LogicalCachedExtent>();
nlextent->rewrite(t, *lextent, off);
- DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent);
/* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
* extents since we're going to do it again once we either do the ool write
@@ -629,10 +633,18 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
{
auto updated = cache->update_extent_from_transaction(t, extent);
if (!updated) {
- DEBUGT("extent is already retired, skipping -- {}", t, *extent);
+ DEBUGT("target={} {} already retired, skipping -- {}", t,
+ rewrite_gen_printer_t{target_generation},
+ sea_time_point_printer_t{modify_time},
+ *extent);
return rewrite_extent_iertr::now();
}
+
extent = updated;
+ DEBUGT("target={} {} -- {} ...", t,
+ rewrite_gen_printer_t{target_generation},
+ sea_time_point_printer_t{modify_time},
+ *extent);
ceph_assert(!extent->is_pending_io());
}
@@ -650,9 +662,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
// FIXME: is_dirty() is true for mutation pending extents
// which shouldn't do inplace rewrite because a pending transaction
// may fail.
- DEBUGT("delta overwriting extent -- {}", t, *extent);
t.add_inplace_rewrite_extent(extent);
extent->set_inplace_rewrite_generation();
+ DEBUGT("rewritten as inplace rewrite -- {}", t, *extent);
return rewrite_extent_iertr::now();
}
extent->set_target_rewrite_generation(INIT_GENERATION);
@@ -665,23 +677,25 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
t.get_rewrite_stats().account_n_dirty();
}
- if (is_backref_node(extent->get_type())) {
- DEBUGT("rewriting backref extent -- {}", t, *extent);
- return backref_manager->rewrite_extent(t, extent);
- }
-
if (is_root_type(extent->get_type())) {
- DEBUGT("rewriting root extent -- {}", t, *extent);
cache->duplicate_for_write(t, extent);
+ DEBUGT("rewritten root {}", t, *extent);
return rewrite_extent_iertr::now();
}
+ auto fut = rewrite_extent_iertr::now();
if (extent->is_logical()) {
- return rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>());
+ assert(is_logical_type(extent->get_type()));
+ fut = rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>());
+ } else if (is_backref_node(extent->get_type())) {
+ fut = backref_manager->rewrite_extent(t, extent);
} else {
- DEBUGT("rewriting physical extent -- {}", t, *extent);
- return lba_manager->rewrite_extent(t, extent);
+ assert(is_lba_node(extent->get_type()));
+ fut = lba_manager->rewrite_extent(t, extent);
}
+ return fut.si_then([FNAME, &t] {
+ DEBUGT("rewritten", t);
+ });
}
TransactionManager::get_extents_if_live_ret
@@ -693,7 +707,7 @@ TransactionManager::get_extents_if_live(
extent_len_t len)
{
LOG_PREFIX(TransactionManager::get_extents_if_live);
- TRACET("{} {}~{} {}", t, type, laddr, len, paddr);
+ DEBUGT("{} {}~0x{:x} {} ...", t, type, laddr, len, paddr);
// This only works with segments to check if alive,
// as parallel transactions may split the extent at the same time.
@@ -703,7 +717,7 @@ TransactionManager::get_extents_if_live(
).si_then([=, this, &t](auto extent)
-> get_extents_if_live_ret {
if (extent && extent->get_length() == len) {
- DEBUGT("{} {}~{} {} is live in cache -- {}",
+ DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}",
t, type, laddr, len, paddr, *extent);
std::list<CachedExtentRef> res;
res.emplace_back(std::move(extent));
@@ -757,7 +771,9 @@ TransactionManager::get_extents_if_live(
list.emplace_back(std::move(ret));
return seastar::now();
});
- }).si_then([&list] {
+ }).si_then([&list, &t, FNAME, type, laddr, len, paddr] {
+ DEBUGT("{} {}~0x{:x} {} is alive as {} extents",
+ t, type, laddr, len, paddr, list.size());
return get_extents_if_live_ret(
interruptible::ready_future_marker{},
std::move(list));
@@ -778,11 +794,11 @@ TransactionManager::get_extents_if_live(
).si_then([=, &t](auto ret) {
std::list<CachedExtentRef> res;
if (ret) {
- DEBUGT("{} {}~{} {} is live as physical extent -- {}",
+ DEBUGT("{} {}~0x{:x} {} is absent and alive as physical extent -- {}",
t, type, laddr, len, paddr, *ret);
res.emplace_back(std::move(ret));
} else {
- DEBUGT("{} {}~{} {} is not live as physical extent",
+ DEBUGT("{} {}~0x{:x} {} is not alive as physical extent",
t, type, laddr, len, paddr);
}
return get_extents_if_live_ret(
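
The remove()/_dec_ref() rework in transaction_manager.cc keeps the underlying rule intact: decrement the mapping's refcount and retire the extent only once the count reaches zero, logging after the decrement rather than before. A simplified, synchronous sketch of that rule with invented names (RefMap, remove_mapping); the real code goes through LBAManager::decref_extent and interruptible futures.

#include <iostream>
#include <map>
#include <string>

using RefMap = std::map<std::string, unsigned>;  // laddr -> refcount (toy model)

// Decrement the refcount for laddr; "retire" the extent once it hits zero.
unsigned remove_mapping(RefMap &refs, const std::string &laddr) {
  auto it = refs.find(laddr);
  unsigned refcount = --(it->second);
  if (refcount == 0) {
    refs.erase(it);  // analogous to cache->retire_extent(...)
  }
  std::cout << "removed " << laddr << " refcount=" << refcount << std::endl;
  return refcount;
}

int main() {
  RefMap refs{{"L0x1000", 2}};
  remove_mapping(refs, "L0x1000");  // refcount=1, extent kept
  remove_mapping(refs, "L0x1000");  // refcount=0, extent retired
  return 0;
}
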
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index 828b8a25592..c7a94a9ef11 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -106,8 +106,12 @@ public:
Transaction &t,
laddr_t offset) {
LOG_PREFIX(TransactionManager::get_pin);
- SUBTRACET(seastore_tm, "{}", t, offset);
- return lba_manager->get_mapping(t, offset);
+ SUBDEBUGT(seastore_tm, "{} ...", t, offset);
+ return lba_manager->get_mapping(t, offset
+ ).si_then([FNAME, &t](LBAMappingRef pin) {
+ SUBDEBUGT(seastore_tm, "got {}", t, *pin);
+ return pin;
+ });
}
/**
@@ -122,9 +126,13 @@ public:
laddr_t offset,
extent_len_t length) {
LOG_PREFIX(TransactionManager::get_pins);
- SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, offset, length);
return lba_manager->get_mappings(
- t, offset, length);
+ t, offset, length
+ ).si_then([FNAME, &t](lba_pin_list_t pins) {
+ SUBDEBUGT(seastore_tm, "got {} pins", t, pins.size());
+ return pins;
+ });
}
/**
@@ -142,15 +150,15 @@ public:
laddr_t offset,
extent_len_t length) {
LOG_PREFIX(TransactionManager::read_extent);
- SUBTRACET(seastore_tm, "{}~{}", t, offset, length);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} {} ...",
+ t, offset, length, T::TYPE);
return get_pin(
t, offset
).si_then([this, FNAME, &t, offset, length] (auto pin)
-> read_extent_ret<T> {
if (length != pin->get_length() || !pin->get_val().is_real()) {
- SUBERRORT(seastore_tm,
- "offset {} len {} got wrong pin {}",
- t, offset, length, *pin);
+ SUBERRORT(seastore_tm, "{}~0x{:x} {} got wrong {}",
+ t, offset, length, T::TYPE, *pin);
ceph_assert(0 == "Should be impossible");
}
return this->read_pin<T>(t, std::move(pin));
@@ -167,15 +175,15 @@ public:
Transaction &t,
laddr_t offset) {
LOG_PREFIX(TransactionManager::read_extent);
- SUBTRACET(seastore_tm, "{}", t, offset);
+ SUBDEBUGT(seastore_tm, "{} {} ...",
+ t, offset, T::TYPE);
return get_pin(
t, offset
).si_then([this, FNAME, &t, offset] (auto pin)
-> read_extent_ret<T> {
if (!pin->get_val().is_real()) {
- SUBERRORT(seastore_tm,
- "offset {} got wrong pin {}",
- t, offset, *pin);
+ SUBERRORT(seastore_tm, "{} {} got wrong {}",
+ t, offset, T::TYPE, *pin);
ceph_assert(0 == "Should be impossible");
}
return this->read_pin<T>(t, std::move(pin));
@@ -187,6 +195,8 @@ public:
Transaction &t,
LBAMappingRef pin)
{
+ LOG_PREFIX(TransactionManager::read_pin);
+ SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin);
auto fut = base_iertr::make_ready_future<LBAMappingRef>();
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
@@ -212,52 +222,12 @@ public:
} else {
return this->pin_to_extent<T>(t, std::move(std::get<0>(ret)));
}
+ }).si_then([FNAME, &t](TCachedExtentRef<T> ext) {
+ SUBDEBUGT(seastore_tm, "got {}", t, *ext);
+ return ext;
});
}
- template <typename T>
- std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
- get_extent_if_linked(
- Transaction &t,
- LBAMappingRef pin)
- {
- ceph_assert(pin->is_parent_viewable());
- // checking the lba child must be atomic with creating
- // and linking the absent child
- auto v = pin->get_logical_extent(t);
- if (v.has_child()) {
- return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
-#ifndef NDEBUG
- auto lextent = extent->template cast<LogicalCachedExtent>();
- auto pin_laddr = pin->get_key();
- if (pin->is_indirect()) {
- pin_laddr = pin->get_intermediate_base();
- }
- assert(lextent->get_laddr() == pin_laddr);
-#endif
- return extent->template cast<T>();
- });
- } else {
- return pin;
- }
- }
-
- base_iertr::future<LogicalCachedExtentRef> read_pin_by_type(
- Transaction &t,
- LBAMappingRef pin,
- extent_types_t type)
- {
- ceph_assert(!pin->parent_modified());
- auto v = pin->get_logical_extent(t);
- // checking the lba child must be atomic with creating
- // and linking the absent child
- if (v.has_child()) {
- return std::move(v.get_child_fut());
- } else {
- return pin_to_extent_by_type(t, std::move(pin), type);
- }
- }
-
/// Obtain mutable copy of extent
LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
LOG_PREFIX(TransactionManager::get_mutable_extent);
@@ -265,24 +235,15 @@ public:
t,
ref)->cast<LogicalCachedExtent>();
if (!ret->has_laddr()) {
- SUBDEBUGT(seastore_tm,
- "duplicating extent for write -- {} -> {}",
- t,
- *ref,
- *ret);
+ SUBDEBUGT(seastore_tm, "duplicate from {}", t, *ref);
ret->set_laddr(ref->get_laddr());
} else {
- SUBTRACET(seastore_tm,
- "extent is already duplicated -- {}",
- t,
- *ref);
assert(ref->is_mutable());
assert(&*ref == &*ret);
}
return ret;
}
-
using ref_iertr = LBAManager::ref_iertr;
using ref_ret = ref_iertr::future<extent_ref_count_t>;
@@ -302,26 +263,15 @@ public:
* remove
*
* Remove the extent and the corresponding lba mapping,
- * users must make sure that lba mapping's refcount is 1
+ * users must make sure that lba mapping's refcount > 1
*/
ref_ret remove(
Transaction &t,
LogicalCachedExtentRef &ref);
- /**
- * remove
- *
- * 1. Remove the indirect mapping(s), and if refcount drops to 0,
- * also remove the direct mapping and retire the extent.
- *
- * 2. Remove the direct mapping(s) and retire the extent if
- * refcount drops to 0.
- */
ref_ret remove(
Transaction &t,
- laddr_t offset) {
- return _dec_ref(t, offset);
- }
+ laddr_t offset);
/// remove refcount for list of offset
using refs_ret = ref_iertr::future<std::vector<unsigned>>;
@@ -346,23 +296,23 @@ public:
extent_len_t len,
placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_non_data_extent);
- SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
- t, T::TYPE, len, placement_hint, laddr_hint);
+ SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...",
+ t, T::TYPE, laddr_hint, len, placement_hint);
auto ext = cache->alloc_new_non_data_extent<T>(
t,
len,
placement_hint,
INIT_GENERATION);
if (!ext) {
+ SUBERRORT(seastore_tm, "insufficient space!", t);
return crimson::ct_error::enospc::make();
}
return lba_manager->alloc_extent(
t,
laddr_hint,
*ext
- ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable {
- LOG_PREFIX(TransactionManager::alloc_non_data_extent);
- SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint);
+ ).si_then([ext=std::move(ext), &t, FNAME](auto &&) mutable {
+ SUBDEBUGT(seastore_tm, "allocated {}", t, *ext);
return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
});
@@ -385,14 +335,15 @@ public:
extent_len_t len,
placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_data_extents);
- SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
- t, T::TYPE, len, placement_hint, laddr_hint);
+ SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...",
+ t, T::TYPE, laddr_hint, len, placement_hint);
auto exts = cache->alloc_new_data_extents<T>(
t,
len,
placement_hint,
INIT_GENERATION);
if (exts.empty()) {
+ SUBERRORT(seastore_tm, "insufficient space!", t);
return crimson::ct_error::enospc::make();
}
return lba_manager->alloc_extents(
@@ -403,7 +354,7 @@ public:
EXTENT_DEFAULT_REF_COUNT
).si_then([exts=std::move(exts), &t, FNAME](auto &&) mutable {
for (auto &ext : exts) {
- SUBDEBUGT(seastore_tm, "new extent: {}", t, *ext);
+ SUBDEBUGT(seastore_tm, "allocated {}", t, *ext);
}
return alloc_extent_iertr::make_ready_future<
std::vector<TCachedExtentRef<T>>>(std::move(exts));
@@ -411,15 +362,21 @@ public:
}
template <typename T>
- read_extent_ret<T> get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) {
+ read_extent_ret<T> get_mutable_extent_by_laddr(
+ Transaction &t,
+ laddr_t laddr,
+ extent_len_t len) {
+ LOG_PREFIX(TransactionManager::get_mutable_extent_by_laddr);
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, laddr, len);
return get_pin(t, laddr
).si_then([this, &t, len](auto pin) {
ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved());
ceph_assert(!pin->is_clone());
ceph_assert(pin->get_length() == len);
return this->read_pin<T>(t, std::move(pin));
- }).si_then([this, &t](auto extent) {
+ }).si_then([this, &t, FNAME](auto extent) {
auto ext = get_mutable_extent(t, extent)->template cast<T>();
+ SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext);
return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
});
@@ -476,10 +433,8 @@ public:
extent_len_t original_len = pin->get_length();
paddr_t original_paddr = pin->get_val();
LOG_PREFIX(TransactionManager::remap_pin);
- SUBDEBUGT(seastore_tm,
- "original laddr: {}, original paddr: {}, original length: {},"
- " remap to {} extents",
- t, original_laddr, original_paddr, original_len, remaps.size());
+ SUBDEBUGT(seastore_tm, "{}~0x{:x} {} into {} remaps ... {}",
+ t, original_laddr, original_len, original_paddr, remaps.size(), *pin);
// The according extent might be stable or pending.
auto fut = base_iertr::now();
if (!pin->is_indirect()) {
@@ -536,14 +491,13 @@ public:
auto remap_len = remap.len;
auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr();
auto remap_paddr = original_paddr.add_offset(remap_offset);
+ SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...",
+ t, remap_laddr, remap_len, remap_paddr);
ceph_assert(remap_len < original_len);
ceph_assert(remap_offset + remap_len <= original_len);
ceph_assert(remap_len != 0);
ceph_assert(remap_offset % cache->get_block_size() == 0);
ceph_assert(remap_len % cache->get_block_size() == 0);
- SUBDEBUGT(seastore_tm,
- "remap laddr: {}, remap paddr: {}, remap length: {}", t,
- remap_laddr, remap_paddr, remap_len);
auto extent = cache->alloc_remapped_extent<T>(
t,
remap_laddr,
@@ -555,13 +509,15 @@ public:
}
});
}
- return fut.si_then([this, &t, &pin, &remaps, &extents] {
+ return fut.si_then([this, &t, &pin, &remaps, &extents, FNAME] {
return lba_manager->remap_mappings(
t,
std::move(pin),
std::vector<remap_entry>(remaps.begin(), remaps.end()),
std::move(extents)
- ).si_then([](auto ret) {
+ ).si_then([FNAME, &t](auto ret) {
+ SUBDEBUGT(seastore_tm, "remapped {} pins",
+ t, ret.remapped_mappings.size());
return Cache::retire_extent_iertr::make_ready_future<
std::vector<LBAMappingRef>>(std::move(ret.remapped_mappings));
});
@@ -581,11 +537,15 @@ public:
laddr_t hint,
extent_len_t len) {
LOG_PREFIX(TransactionManager::reserve_region);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint);
+ SUBDEBUGT(seastore_tm, "hint {}~0x{:x} ...", t, hint, len);
return lba_manager->reserve_region(
t,
hint,
- len);
+ len
+ ).si_then([FNAME, &t](auto pin) {
+ SUBDEBUGT(seastore_tm, "reserved {}", t, *pin);
+ return pin;
+ });
}
/*
@@ -612,15 +572,17 @@ public:
: mapping.get_key();
LOG_PREFIX(TransactionManager::clone_pin);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}",
- t, mapping.get_length(), hint, intermediate_key);
+ SUBDEBUGT(seastore_tm, "{} clone to hint {} ...", t, mapping, hint);
return lba_manager->clone_mapping(
t,
hint,
mapping.get_length(),
intermediate_key,
intermediate_base
- );
+ ).si_then([FNAME, &t](auto pin) {
+ SUBDEBUGT(seastore_tm, "cloned as {}", t, *pin);
+ return pin;
+ });
}
/* alloc_extents
@@ -635,10 +597,10 @@ public:
extent_len_t len,
int num) {
LOG_PREFIX(TransactionManager::alloc_extents);
- SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}",
- t, len, hint, num);
+ SUBDEBUGT(seastore_tm, "hint {}~({} * 0x{:x}) ...",
+ t, hint, num, len);
return seastar::do_with(std::vector<TCachedExtentRef<T>>(),
- [this, &t, hint, len, num] (auto &extents) {
+ [this, &t, hint, len, num, FNAME](auto &extents) {
return trans_intr::do_for_each(
boost::make_counting_iterator(0),
boost::make_counting_iterator(num),
@@ -647,7 +609,8 @@ public:
[&extents](auto &&node) {
extents.push_back(node);
});
- }).si_then([&extents] {
+ }).si_then([&extents, &t, FNAME] {
+ SUBDEBUGT(seastore_tm, "allocated {} extents", t, extents.size());
return alloc_extents_iertr::make_ready_future
<std::vector<TCachedExtentRef<T>>>(std::move(extents));
});
@@ -753,7 +716,7 @@ public:
const std::string& key,
const std::string& value) {
LOG_PREFIX(TransactionManager::update_root_meta);
- SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value);
+ SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
return cache->get_root(
t
).si_then([this, &t, &key, &value](RootBlockRef root) {
@@ -808,7 +771,7 @@ public:
return cache->get_root(t).si_then([&t](auto croot) {
LOG_PREFIX(TransactionManager::read_collection_root);
auto ret = croot->get_root().collection_root.get();
- SUBTRACET(seastore_tm, "{}~{}",
+ SUBTRACET(seastore_tm, "{}~0x{:x}",
t, ret.get_location(), ret.get_size());
return ret;
});
@@ -821,7 +784,7 @@ public:
*/
void write_collection_root(Transaction &t, coll_root_t cmroot) {
LOG_PREFIX(TransactionManager::write_collection_root);
- SUBDEBUGT(seastore_tm, "{}~{}",
+ SUBDEBUGT(seastore_tm, "{}~0x{:x}",
t, cmroot.get_location(), cmroot.get_size());
auto croot = cache->get_root_fast(t);
croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>();
@@ -853,6 +816,49 @@ private:
shard_stats_t& shard_stats;
+ template <typename T>
+ std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
+ get_extent_if_linked(
+ Transaction &t,
+ LBAMappingRef pin)
+ {
+ ceph_assert(pin->is_parent_viewable());
+ // checking the lba child must be atomic with creating
+ // and linking the absent child
+ auto v = pin->get_logical_extent(t);
+ if (v.has_child()) {
+ return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
+#ifndef NDEBUG
+ auto lextent = extent->template cast<LogicalCachedExtent>();
+ auto pin_laddr = pin->get_key();
+ if (pin->is_indirect()) {
+ pin_laddr = pin->get_intermediate_base();
+ }
+ assert(lextent->get_laddr() == pin_laddr);
+#endif
+ return extent->template cast<T>();
+ });
+ } else {
+ return pin;
+ }
+ }
+
+ base_iertr::future<LogicalCachedExtentRef> read_pin_by_type(
+ Transaction &t,
+ LBAMappingRef pin,
+ extent_types_t type)
+ {
+ ceph_assert(!pin->parent_modified());
+ auto v = pin->get_logical_extent(t);
+ // checking the lba child must be atomic with creating
+ // and linking the absent child
+ if (v.has_child()) {
+ return std::move(v.get_child_fut());
+ } else {
+ return pin_to_extent_by_type(t, std::move(pin), type);
+ }
+ }
+
rewrite_extent_ret rewrite_logical_extent(
Transaction& t,
LogicalCachedExtentRef extent);
@@ -862,11 +868,6 @@ private:
ExtentPlacementManager::dispatch_result_t dispatch_result,
std::optional<journal_seq_t> seq_to_trim = std::nullopt);
- /// Remove refcount for offset
- ref_ret _dec_ref(
- Transaction &t,
- laddr_t offset);
-
using update_lba_mappings_ret = LBAManager::update_mappings_ret;
update_lba_mappings_ret update_lba_mappings(
Transaction &t,
@@ -886,7 +887,7 @@ private:
Transaction &t,
LBAMappingRef pin) {
LOG_PREFIX(TransactionManager::pin_to_extent);
- SUBTRACET(seastore_tm, "getting extent {}", t, *pin);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin);
static_assert(is_logical_type(T::TYPE));
using ret = pin_to_extent_ret<T>;
auto &pref = *pin;
@@ -950,7 +951,8 @@ private:
extent_types_t type)
{
LOG_PREFIX(TransactionManager::pin_to_extent_by_type);
- SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {} type {} ...",
+ t, *pin, type);
assert(is_logical_type(type));
auto &pref = *pin;
return cache->get_absent_extent_by_type(
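
Several helpers in this header (get_pin, get_pins, read_pin, reserve_region, clone_pin) now log the request with a trailing "..." and log the result in a chained continuation before handing it back unchanged. A plain, synchronous sketch of that convention; Pin and get_mapping are hypothetical stand-ins, and the real code chains .si_then() on interruptible futures instead of returning directly.

#include <iostream>
#include <string>

struct Pin { std::string repr; };  // stand-in for LBAMappingRef

Pin get_mapping(const std::string &laddr) { return Pin{"pin@" + laddr}; }

// Log "request ...", forward the call, log "got <result>", return unchanged.
Pin get_pin(const std::string &laddr) {
  std::cout << "get_pin: " << laddr << " ..." << std::endl;
  Pin pin = get_mapping(laddr);
  std::cout << "get_pin: got " << pin.repr << std::endl;
  return pin;
}

int main() {
  get_pin("L0x2000");
  return 0;
}
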
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index 683dc6ea649..522a93a1ddc 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -52,6 +52,12 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
return peering_state.is_backfilling();
}
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) override {
+ return peering_state.prepare_backfill_for_missing(soid, v, peers);
+ }
PeeringFacade(PeeringState& peering_state)
: peering_state(peering_state) {
}
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 70c43f49faf..018e58b68f8 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -225,7 +225,7 @@ bool BackfillState::Enqueuing::should_rescan_primary(
const BackfillInterval& backfill_info) const
{
return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
- !backfill_info.extends_to_end();
+ !backfill_info.extends_to_end() && backfill_info.empty();
}
void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
@@ -266,6 +266,7 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
logger().debug("{}: check={}", __func__, check);
const auto& primary_bi = backfill_state().backfill_info;
result_t result { {}, primary_bi.begin };
+ std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
for (const auto& bt : peering_state().get_backfill_targets()) {
const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
@@ -273,9 +274,13 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
// Find all check peers that have the wrong version
if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
check == primary_bi.begin && check == peer_bi.begin) {
- if(peer_bi.objects.begin()->second != obj_v &&
- backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
- backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ if (peer_bi.objects.begin()->second != obj_v) {
+ std::ignore = backfill_state().progress_tracker->enqueue_push(
+ primary_bi.begin);
+ auto &[v, peers] = backfills[primary_bi.begin];
+ assert(v == obj_v || v == eversion_t());
+ v = obj_v;
+ peers.push_back(bt);
} else {
// it's fine, keep it! OR already recovering
}
@@ -284,12 +289,22 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
// Only include peers that we've caught up to their backfill line
// otherwise, they only appear to be missing this object
// because their peer_bi.begin > backfill_info.begin.
- if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) &&
- backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
- backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt)) {
+ std::ignore = backfill_state().progress_tracker->enqueue_push(
+ primary_bi.begin);
+ auto &[v, peers] = backfills[primary_bi.begin];
+ assert(v == obj_v || v == eversion_t());
+ v = obj_v;
+ peers.push_back(bt);
}
}
}
+ for (auto &backfill : backfills) {
+ auto &soid = backfill.first;
+ auto &obj_v = backfill.second.first;
+ auto &peers = backfill.second.second;
+ backfill_listener().enqueue_push(soid, obj_v, peers);
+ }
return result;
}
@@ -327,16 +342,29 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
}
trim_backfill_infos();
- while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+    // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ logger().debug("{}: reached end for current local chunk", __func__);
+ post_event(RequestPrimaryScanning{});
+ return;
+ }
+
+ do {
if (!backfill_listener().budget_available()) {
post_event(RequestWaiting{});
return;
} else if (should_rescan_replicas(backfill_state().peer_backfill_info,
- primary_bi)) {
+ primary_bi)) {
// Count simultaneous scans as a single op and let those complete
post_event(RequestReplicasScanning{});
return;
}
+
+ if (all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ break;
+ }
// Get object within set of peers to operate on and the set of targets
// for which that object applies.
if (const hobject_t check = \
@@ -355,30 +383,23 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- primary_bi.pop_front();
+ if (!primary_bi.empty()) {
+ primary_bi.pop_front();
+ }
}
backfill_listener().maybe_flush();
- }
+ } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info));
- if (should_rescan_primary(backfill_state().peer_backfill_info,
- primary_bi)) {
- // need to grab one another chunk of the object namespace and restart
- // the queueing.
- logger().debug("{}: reached end for current local chunk",
- __func__);
- post_event(RequestPrimaryScanning{});
- } else {
- if (backfill_state().progress_tracker->tracked_objects_completed()
- && Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- backfill_state().last_backfill_started = hobject_t::get_max();
- backfill_listener().update_peers_last_backfill(hobject_t::get_max());
- }
- logger().debug("{}: reached end for both local and all peers "
- "but still has in-flight operations", __func__);
- post_event(RequestWaiting{});
+ if (backfill_state().progress_tracker->tracked_objects_completed()
+ && Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
}
+ logger().debug("{}: reached end for both local and all peers "
+ "but still has in-flight operations", __func__);
+ post_event(RequestWaiting{});
}
// -- PrimaryScanning
@@ -403,7 +424,7 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt)
{
logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -480,7 +501,7 @@ BackfillState::ReplicasScanning::react(ObjectPushed evt)
{
logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -496,16 +517,8 @@ BackfillState::Waiting::react(ObjectPushed evt)
{
logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
evt.object);
- backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
- if (!Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- return transit<Enqueuing>();
- } else {
- // we still have something to wait on
- logger().debug("Waiting::react() on ObjectPushed; still waiting");
- return discard_event();
- }
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
+  return transit<Enqueuing>();
}
// -- Done
@@ -559,7 +572,8 @@ void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj)
void BackfillState::ProgressTracker::complete_to(
const hobject_t& obj,
- const pg_stat_t& stats)
+ const pg_stat_t& stats,
+ bool may_push_to_max)
{
logger().debug("{}: obj={}",
__func__, obj);
@@ -570,6 +584,7 @@ void BackfillState::ProgressTracker::complete_to(
} else {
ceph_abort_msg("completing untracked object shall not happen");
}
+ auto new_last_backfill = peering_state().earliest_backfill();
for (auto it = std::begin(registry);
it != std::end(registry) &&
it->second.stage != op_stage_t::enqueued_push;
@@ -579,15 +594,18 @@ void BackfillState::ProgressTracker::complete_to(
peering_state().update_complete_backfill_object_stats(
soid,
*item.stats);
+ assert(soid > new_last_backfill);
+ new_last_backfill = soid;
}
- if (Enqueuing::all_enqueued(peering_state(),
+ if (may_push_to_max &&
+ Enqueuing::all_enqueued(peering_state(),
backfill_state().backfill_info,
backfill_state().peer_backfill_info) &&
tracked_objects_completed()) {
backfill_state().last_backfill_started = hobject_t::get_max();
backfill_listener().update_peers_last_backfill(hobject_t::get_max());
} else {
- backfill_listener().update_peers_last_backfill(obj);
+ backfill_listener().update_peers_last_backfill(new_last_backfill);
}
}
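
update_on_peers() above now batches recovery pushes: instead of calling enqueue_push once per backfill target, it first aggregates the object version and the list of peers that need it, then issues a single push per object. A compact sketch of that aggregation step using hypothetical stand-ins (ObjectId, Version, Shard, enqueue_push) for the hobject_t/eversion_t/pg_shard_t machinery.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using ObjectId = std::string;
using Version  = unsigned;
using Shard    = std::string;

// Hypothetical sink standing in for backfill_listener().enqueue_push().
void enqueue_push(const ObjectId &oid, Version v, const std::vector<Shard> &peers) {
  std::cout << "push " << oid << " v" << v << " to " << peers.size() << " peer(s)\n";
}

int main() {
  // Per-peer findings: which peer is missing which object at which version.
  std::vector<std::pair<Shard, std::pair<ObjectId, Version>>> needs = {
    {"osd.1", {"obj_a", 5}}, {"osd.2", {"obj_a", 5}}, {"osd.2", {"obj_b", 7}},
  };
  // Aggregate per object, as the reworked update_on_peers() does.
  std::map<ObjectId, std::pair<Version, std::vector<Shard>>> backfills;
  for (const auto &[peer, need] : needs) {
    auto &[v, peers] = backfills[need.first];
    v = need.second;          // versions for the same object are expected to match
    peers.push_back(peer);
  }
  for (const auto &[oid, vp] : backfills) {
    enqueue_push(oid, vp.first, vp.second);  // one push per object, all targets
  }
  return 0;
}
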
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
index 6c36db81813..ddc0cbf7355 100644
--- a/src/crimson/osd/backfill_state.h
+++ b/src/crimson/osd/backfill_state.h
@@ -336,7 +336,8 @@ struct BackfillState::BackfillListener {
virtual void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) = 0;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) = 0;
virtual void enqueue_drop(
const pg_shard_t& target,
@@ -375,6 +376,10 @@ struct BackfillState::PeeringFacade {
virtual void update_complete_backfill_object_stats(const hobject_t &hoid,
const pg_stat_t &stats) = 0;
virtual bool is_backfilling() const = 0;
+ virtual void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) = 0;
virtual ~PeeringFacade() {}
};
@@ -421,7 +426,7 @@ public:
bool enqueue_push(const hobject_t&);
void enqueue_drop(const hobject_t&);
- void complete_to(const hobject_t&, const pg_stat_t&);
+ void complete_to(const hobject_t&, const pg_stat_t&, bool may_push_to_max);
};
} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
index df4f73d4077..9bf60140374 100644
--- a/src/crimson/osd/ops_executer.cc
+++ b/src/crimson/osd/ops_executer.cc
@@ -504,7 +504,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
auto p = ss.clone_snaps.find(clone);
if (p == ss.clone_snaps.end()) {
logger().error(
- "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "OpsExecuter::do_list_snaps: {} has inconsistent "
"clone_snaps, missing clone {}",
os.oi.soid,
clone);
@@ -518,7 +518,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
auto p = ss.clone_overlap.find(clone);
if (p == ss.clone_overlap.end()) {
logger().error(
- "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "OpsExecuter::do_list_snaps: {} has inconsistent "
"clone_overlap, missing clone {}",
os.oi.soid,
clone);
@@ -532,7 +532,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
auto p = ss.clone_size.find(clone);
if (p == ss.clone_size.end()) {
logger().error(
- "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "OpsExecuter::do_list_snaps: {} has inconsistent "
"clone_size, missing clone {}",
os.oi.soid,
clone);
@@ -551,7 +551,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
}
resp.seq = ss.seq;
logger().error(
- "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}",
+ "OpsExecuter::do_list_snaps: {}, resp.clones.size(): {}",
os.oi.soid,
resp.clones.size());
resp.encode(osd_op.outdata);
@@ -678,16 +678,32 @@ OpsExecuter::do_execute_op(OSDOp& osd_op)
whiteout = true;
}
return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) {
- int num_bytes = 0;
- // Calculate num_bytes to be removed
- if (obc->obs.oi.soid.is_snap()) {
- ceph_assert(obc->ssc->snapset.clone_overlap.count(obc->obs.oi.soid.snap));
- num_bytes = obc->ssc->snapset.get_clone_bytes(obc->obs.oi.soid.snap);
- } else {
- num_bytes = obc->obs.oi.size;
- }
- return backend.remove(os, txn, *osd_op_params,
- delta_stats, whiteout, num_bytes);
+ struct emptyctx_t {};
+ return with_effect_on_obc(
+ emptyctx_t{},
+ [&](auto &ctx) {
+ int num_bytes = 0;
+ // Calculate num_bytes to be removed
+ if (obc->obs.oi.soid.is_snap()) {
+ ceph_assert(obc->ssc->snapset.clone_overlap.count(
+ obc->obs.oi.soid.snap));
+ num_bytes = obc->ssc->snapset.get_clone_bytes(
+ obc->obs.oi.soid.snap);
+ } else {
+ num_bytes = obc->obs.oi.size;
+ }
+ return backend.remove(os, txn, *osd_op_params,
+ delta_stats, whiteout, num_bytes);
+ },
+ [](auto &&ctx, ObjectContextRef obc, Ref<PG>) {
+ return seastar::do_for_each(
+ obc->watchers,
+ [](auto &p) { return p.second->remove(); }
+ ).then([obc] {
+ obc->watchers.clear();
+ return seastar::now();
+ });
+ });
});
}
case CEPH_OSD_OP_CALL:
@@ -957,7 +973,7 @@ void OpsExecuter::CloningContext::apply_to(
processed_obc.ssc->snapset = std::move(new_snapset);
}
-OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>>
+std::vector<pg_log_entry_t>
OpsExecuter::flush_clone_metadata(
std::vector<pg_log_entry_t>&& log_entries,
SnapMapper& snap_mapper,
@@ -965,7 +981,6 @@ OpsExecuter::flush_clone_metadata(
ceph::os::Transaction& txn)
{
assert(!txn.empty());
- auto maybe_snap_mapped = interruptor::now();
update_clone_overlap();
if (cloning_ctx) {
std::move(*cloning_ctx).apply_to(log_entries, *obc);
@@ -977,12 +992,7 @@ OpsExecuter::flush_clone_metadata(
}
logger().debug("{} done, initial snapset={}, new snapset={}",
__func__, obc->obs.oi.soid, obc->ssc->snapset);
- return std::move(
- maybe_snap_mapped
- ).then_interruptible([log_entries=std::move(log_entries)]() mutable {
- return interruptor::make_ready_future<std::vector<pg_log_entry_t>>(
- std::move(log_entries));
- });
+ return std::move(log_entries);
}
ObjectContextRef OpsExecuter::prepare_clone(
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
index 0dea7d0515e..e770e825b32 100644
--- a/src/crimson/osd/ops_executer.h
+++ b/src/crimson/osd/ops_executer.h
@@ -40,7 +40,7 @@ namespace crimson::osd {
class PG;
// OpsExecuter -- a class for executing ops targeting a certain object.
-class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> {
+class OpsExecuter {
friend class SnapTrimObjSubEvent;
using call_errorator = crimson::errorator<
@@ -170,16 +170,12 @@ public:
object_stat_sum_t delta_stats;
private:
- // an operation can be divided into two stages: main and effect-exposing
- // one. The former is performed immediately on call to `do_osd_op()` while
- // the later on `submit_changes()` – after successfully processing main
- // stages of all involved operations. When any stage fails, none of all
- // scheduled effect-exposing stages will be executed.
- // when operation requires this division, some variant of `with_effect()`
- // should be used.
+ // with_effect can be used to schedule operations to be performed
+ // at commit time. Effects will be discarded if the operation does
+ // not commit.
struct effect_t {
// an effect can affect PG, i.e. create a watch timeout
- virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0;
+ virtual seastar::future<> execute(Ref<PG> pg) = 0;
virtual ~effect_t() = default;
};
@@ -213,10 +209,10 @@ private:
* execute_clone
*
* If snapc contains a snap which occurred logically after the last write
- * seen by this object (see OpsExecutor::should_clone()), we first need
+ * seen by this object (see OpsExecuter::should_clone()), we first need
* make a clone of the object at its current state. execute_clone primes
* txn with that clone operation and returns an
- * OpsExecutor::CloningContext which will allow us to fill in the corresponding
+ * OpsExecuter::CloningContext which will allow us to fill in the corresponding
* metadata and log_entries once the operations have been processed.
*
* Note that this strategy differs from classic, which instead performs this
@@ -267,7 +263,7 @@ private:
*/
void update_clone_overlap();
- interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata(
+ std::vector<pg_log_entry_t> flush_clone_metadata(
std::vector<pg_log_entry_t>&& log_entries,
SnapMapper& snap_mapper,
OSDriver& osdriver,
@@ -400,7 +396,7 @@ public:
execute_op(OSDOp& osd_op);
using rep_op_fut_tuple =
- std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>;
+ std::tuple<interruptible_future<>, interruptible_future<>>;
using rep_op_fut_t =
interruptible_future<rep_op_fut_tuple>;
template <typename MutFunc>
@@ -475,7 +471,7 @@ auto OpsExecuter::with_effect_on_obc(
effect_func(std::move(effect_func)),
obc(std::move(obc)) {
}
- osd_op_errorator::future<> execute(Ref<PG> pg) final {
+ seastar::future<> execute(Ref<PG> pg) final {
return std::move(effect_func)(std::move(ctx),
std::move(obc),
std::move(pg));
@@ -502,15 +498,14 @@ OpsExecuter::flush_changes_n_do_ops_effects(
assert(obc);
auto submitted = interruptor::now();
- auto all_completed =
- interruptor::make_interruptible(osd_op_errorator::now());
+ auto all_completed = interruptor::now();
if (cloning_ctx) {
ceph_assert(want_mutate);
}
if (want_mutate) {
- auto log_entries = co_await flush_clone_metadata(
+ auto log_entries = flush_clone_metadata(
prepare_transaction(ops),
snap_mapper,
osdriver,
@@ -536,7 +531,7 @@ OpsExecuter::flush_changes_n_do_ops_effects(
// need extra ref pg due to apply_stats() which can be executed after
// informing snap mapper
all_completed =
- std::move(all_completed).safe_then_interruptible([this, pg=this->pg] {
+ std::move(all_completed).then_interruptible([this, pg=this->pg] {
// let's do the cleaning of `op_effects` in destructor
return interruptor::do_for_each(op_effects,
[pg=std::move(pg)](auto& op_effect) {
@@ -552,21 +547,19 @@ OpsExecuter::flush_changes_n_do_ops_effects(
template <class Func>
struct OpsExecuter::RollbackHelper {
- void rollback_obc_if_modified(const std::error_code& e);
- seastar::lw_shared_ptr<OpsExecuter> ox;
+ void rollback_obc_if_modified();
+ OpsExecuter *ox;
Func func;
};
template <class Func>
inline OpsExecuter::RollbackHelper<Func>
OpsExecuter::create_rollbacker(Func&& func) {
- return {shared_from_this(), std::forward<Func>(func)};
+ return {this, std::forward<Func>(func)};
}
-
template <class Func>
-void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified(
- const std::error_code& e)
+void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified()
{
// Oops, an operation had failed. do_osd_ops() altogether with
// OpsExecuter already dropped the ObjectStore::Transaction if
@@ -584,10 +577,9 @@ void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified(
assert(ox);
const auto need_rollback = ox->has_seen_write();
crimson::get_logger(ceph_subsys_osd).debug(
- "{}: object {} got error {}, need_rollback={}",
+ "{}: object {} got error, need_rollback={}",
__func__,
ox->obc->get_oid(),
- e,
need_rollback);
if (need_rollback) {
func(ox->obc);
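
The updated comment in ops_executer.h above describes the with_effect mechanism: side effects scheduled during op execution run only if the transaction commits, and are dropped otherwise. A minimal sketch of that pattern, assuming nothing from crimson (effect_queue and its members are hypothetical):

#include <functional>
#include <iostream>
#include <vector>

// Effects queued during op execution run only on commit; an abort
// simply discards them.
class effect_queue {
  std::vector<std::function<void()>> effects;
public:
  void with_effect(std::function<void()> fn) {
    effects.push_back(std::move(fn));
  }
  void commit() {                    // run all scheduled effects in order
    for (auto& fn : effects) fn();
    effects.clear();
  }
  void abort() { effects.clear(); }  // discard without running
};

int main() {
  effect_queue q;
  q.with_effect([] { std::cout << "notify watchers\n"; });
  q.commit();   // prints; calling q.abort() instead would print nothing
}

In the real effect_t above, execute() additionally receives the PG reference and returns a seastar::future<>; this sketch keeps only the commit/discard semantics.
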
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
index 8d2d10fbd7c..34ad97ceb06 100644
--- a/src/crimson/osd/osd.cc
+++ b/src/crimson/osd/osd.cc
@@ -23,6 +23,7 @@
#include "messages/MOSDOp.h"
#include "messages/MOSDPeeringOp.h"
#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MOSDRepOpReply.h"
@@ -863,6 +864,8 @@ OSD::do_ms_dispatch(
[[fallthrough]];
case MSG_OSD_PG_LOG:
return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m));
+ case MSG_OSD_PG_REMOVE:
+ return handle_pg_remove(conn, boost::static_pointer_cast<MOSDPGRemove>(m));
case MSG_OSD_REPOP:
return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m));
case MSG_OSD_REPOPREPLY:
@@ -1555,6 +1558,27 @@ seastar::future<> OSD::handle_peering_op(
std::move(*evt)).second;
}
+seastar::future<> OSD::handle_pg_remove(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGRemove> m)
+{
+ LOG_PREFIX(OSD::handle_pg_remove);
+ const int from = m->get_source().num();
+ std::vector<seastar::future<>> futs;
+ for (auto &pg : m->pg_list) {
+ DEBUG("{} from {}", pg, from);
+ futs.emplace_back(
+ pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t{from, pg.shard},
+ pg,
+ m->get_epoch(),
+ m->get_epoch(),
+ PeeringState::DeleteStart()).second);
+ }
+ return seastar::when_all_succeed(std::move(futs));
+}
+
seastar::future<> OSD::check_osdmap_features()
{
LOG_PREFIX(OSD::check_osdmap_features);
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
index de39d808274..d7d54d5d2c3 100644
--- a/src/crimson/osd/osd.h
+++ b/src/crimson/osd/osd.h
@@ -208,6 +208,8 @@ private:
Ref<MOSDRepOpReply> m);
seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn,
Ref<MOSDPeeringOp> m);
+ seastar::future<> handle_pg_remove(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGRemove> m);
seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn,
Ref<MOSDFastDispatchOp> m);
seastar::future<> handle_scrub_command(crimson::net::ConnectionRef conn,
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
index fb0432edb8f..fd8b049c0bf 100644
--- a/src/crimson/osd/osd_operation.h
+++ b/src/crimson/osd/osd_operation.h
@@ -40,6 +40,37 @@ struct PerShardPipeline {
} create_or_wait_pg;
};
+struct PGPeeringPipeline {
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map";
+ } await_map;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::process";
+ } process;
+};
+
+struct CommonPGPipeline {
+ struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> {
+ static constexpr auto type_name = "CommonPGPipeline:::wait_for_active";
+ } wait_for_active;
+ struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> {
+ static constexpr auto type_name = "CommonPGPipeline::recover_missing";
+ } recover_missing;
+ struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> {
+ static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc";
+ } check_already_complete_get_obc;
+ struct LockOBC : OrderedConcurrentPhaseT<LockOBC> {
+ static constexpr auto type_name = "CommonPGPipeline::lock_obc";
+ } lock_obc;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "CommonPGPipeline::process";
+ } process;
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop";
+ } wait_repop;
+};
+
+
enum class OperationTypeCode {
client_request = 0,
peering_event,
diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h
index 530732ba710..d2786a95e4d 100644
--- a/src/crimson/osd/osd_operation_external_tracking.h
+++ b/src/crimson/osd/osd_operation_external_tracking.h
@@ -36,7 +36,6 @@ struct LttngBackend
ClientRequest::PGPipeline::RecoverMissing::
BlockingEvent::ExitBarrierEvent::Backend,
ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend,
- ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend,
ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend,
ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
@@ -117,10 +116,6 @@ struct LttngBackend
const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override {
}
- void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::GetOBC& blocker) override {
- }
void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev,
const Operation& op,
@@ -171,7 +166,6 @@ struct HistoricBackend
ClientRequest::PGPipeline::RecoverMissing::
BlockingEvent::ExitBarrierEvent::Backend,
ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend,
- ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend,
ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend,
ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
@@ -252,11 +246,6 @@ struct HistoricBackend
const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override {
}
- void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::GetOBC& blocker) override {
- }
-
void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev,
const Operation& op,
const ClientRequest::PGPipeline::LockOBC& blocker) override {
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
index 8e9a7c4d749..a89fb2c84bc 100644
--- a/src/crimson/osd/osd_operations/client_request.cc
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -403,11 +403,6 @@ ClientRequest::process_op(
*pg, *this, this_instance_id);
return do_process(
ihref, pg, obc, this_instance_id
- ).handle_error_interruptible(
- crimson::ct_error::eagain::handle(
- [this, pg, this_instance_id, &ihref]() mutable {
- return process_op(ihref, pg, this_instance_id);
- })
);
}
);
@@ -437,7 +432,7 @@ ClientRequest::process_op(
co_await std::move(process);
}
-ClientRequest::do_process_iertr::future<>
+ClientRequest::interruptible_future<>
ClientRequest::do_process(
instance_handle_t &ihref,
Ref<PG> pg, crimson::osd::ObjectContextRef obc,
@@ -507,22 +502,128 @@ ClientRequest::do_process(
co_return;
}
- auto [submitted, all_completed] = co_await pg->do_osd_ops(
- m, r_conn, obc, op_info, snapc
+ OpsExecuter ox(pg, obc, op_info, *m, r_conn, snapc);
+ auto ret = co_await pg->run_executer(
+ ox, obc, op_info, m->ops
+ ).si_then([]() -> std::optional<std::error_code> {
+ return std::nullopt;
+ }).handle_error_interruptible(crimson::ct_error::all_same_way(
+ [](auto e) -> std::optional<std::error_code> {
+ return e;
+ })
);
- co_await std::move(submitted);
- co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+ auto should_log_error = [](std::error_code e) -> bool {
+ switch (e.value()) {
+ case EDQUOT:
+ case ENOSPC:
+ case EAGAIN:
+ return false;
+ default:
+ return true;
+ }
+ };
- auto reply = co_await std::move(all_completed);
+ if (ret && !should_log_error(*ret)) {
+ co_await reply_op_error(pg, -ret->value());
+ co_return;
+ }
+
+ {
+ auto all_completed = interruptor::now();
+ if (ret) {
+ assert(should_log_error(*ret));
+ if (op_info.may_write()) {
+ auto rep_tid = pg->shard_services.get_tid();
+ auto version = co_await pg->submit_error_log(
+ m, op_info, obc, *ret, rep_tid);
+
+ all_completed = pg->complete_error_log(
+ rep_tid, version);
+ }
+ // simply return the error below, leaving all_completed alone
+ } else {
+ auto submitted = interruptor::now();
+ std::tie(submitted, all_completed) = co_await pg->submit_executer(
+ std::move(ox), m->ops);
+ co_await std::move(submitted);
+ }
+ co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+
+ co_await std::move(all_completed);
+ }
co_await ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this);
- DEBUGDPP("{}.{}: sending response",
- *pg, *this, this_instance_id);
- // TODO: gate the crosscore sending
- co_await interruptor::make_interruptible(
- get_foreign_connection().send_with_throttling(std::move(reply))
- );
+
+ if (ret) {
+ int err = -ret->value();
+ DEBUGDPP("{}: replying with error {}", *pg, *this, err);
+
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), err, pg->get_osdmap_epoch(), 0, false);
+
+ if (!m->ops.empty() && m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) {
+ reply->set_result(0);
+ }
+
+ // For all ops except for CMPEXT, the correct error value is encoded
+ // in err. For CMPEXT, osdop.rval has the actual error value.
+ if (err == -ct_error::cmp_fail_error_value) {
+ assert(!m->ops.empty());
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0) {
+ reply->set_result(osdop.rval);
+ break;
+ }
+ }
+ }
+
+ reply->set_enoent_reply_versions(
+ pg->peering_state.get_info().last_update,
+ pg->peering_state.get_info().last_user_version);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+
+ // TODO: gate the crosscore sending
+ co_await interruptor::make_interruptible(
+ get_foreign_connection().send_with_throttling(std::move(reply)));
+ } else {
+ int result = m->ops.empty() ? 0 : m->ops.back().rval.code;
+ if (op_info.may_read() && result >= 0) {
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = osdop.rval.code;
+ break;
+ }
+ }
+ } else if (result > 0 && op_info.may_write() && !op_info.allows_returnvec()) {
+ result = 0;
+ } else if (result < 0 &&
+ (m->ops.empty() ?
+ 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = 0;
+ }
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(),
+ result,
+ pg->get_osdmap_epoch(),
+ 0,
+ false);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ if (obc->obs.exists) {
+ reply->set_reply_versions(pg->peering_state.get_info().last_update,
+ obc->obs.oi.user_version);
+ } else {
+ reply->set_reply_versions(pg->peering_state.get_info().last_update,
+ pg->peering_state.get_info().last_user_version);
+ }
+
+ DEBUGDPP("{}.{}: sending response {}",
+ *pg, *this, this_instance_id, *m);
+ // TODO: gate the crosscore sending
+ co_await interruptor::make_interruptible(
+ get_foreign_connection().send_with_throttling(std::move(reply))
+ );
+ }
}
bool ClientRequest::is_misdirected(const PG& pg) const
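
A small self-contained sketch of the success-path result selection added to do_process() above (op_result and final_result are hypothetical stand-ins for the OSDOp/MOSDOpReply handling): the last op's rval seeds the result, reads surface the first failing op without FAILOK, writes only return positive codes when returnvec is allowed, and FAILOK on the final op masks a failure.

#include <cassert>
#include <vector>

struct op_result { int rval; bool failok; };

int final_result(const std::vector<op_result>& ops,
                 bool may_read, bool may_write, bool allows_returnvec)
{
  int result = ops.empty() ? 0 : ops.back().rval;
  if (may_read && result >= 0) {
    // first failing op without FAILOK wins
    for (const auto& op : ops) {
      if (op.rval < 0 && !op.failok) { result = op.rval; break; }
    }
  } else if (result > 0 && may_write && !allows_returnvec) {
    result = 0;   // writes don't report positive codes without returnvec
  } else if (result < 0 && !ops.empty() && ops.back().failok) {
    result = 0;   // client asked to ignore the failure
  }
  return result;
}

int main() {
  std::vector<op_result> ops{{-2, false}, {0, false}};
  assert(final_result(ops, true, false, false) == -2);
}
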
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
index ea7aade22ac..6ee57e9874c 100644
--- a/src/crimson/osd/osd_operations/client_request.h
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -14,7 +14,6 @@
#include "crimson/osd/osdmap_gate.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/osd/osd_operations/client_request_common.h"
-#include "crimson/osd/osd_operations/common/pg_pipeline.h"
#include "crimson/osd/pg_activation_blocker.h"
#include "crimson/osd/pg_map.h"
#include "crimson/osd/scrub/pg_scrubber.h"
@@ -104,7 +103,6 @@ public:
PGPipeline::RecoverMissing::BlockingEvent,
scrub::PGScrubber::BlockingEvent,
PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
- PGPipeline::GetOBC::BlockingEvent,
PGPipeline::LockOBC::BlockingEvent,
PGPipeline::Process::BlockingEvent,
PGPipeline::WaitRepop::BlockingEvent,
@@ -276,12 +274,7 @@ private:
interruptible_future<> with_sequencer(FuncT&& func);
interruptible_future<> reply_op_error(const Ref<PG>& pg, int err);
-
- using do_process_iertr =
- ::crimson::interruptible::interruptible_errorator<
- ::crimson::osd::IOInterruptCondition,
- ::crimson::errorator<crimson::ct_error::eagain>>;
- do_process_iertr::future<> do_process(
+ interruptible_future<> do_process(
instance_handle_t &ihref,
Ref<PG> pg,
crimson::osd::ObjectContextRef obc,
diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h
deleted file mode 100644
index 2b2d03ae4b3..00000000000
--- a/src/crimson/osd/osd_operations/common/pg_pipeline.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include "osd/osd_op_util.h"
-#include "crimson/osd/osd_operation.h"
-
-namespace crimson::osd {
-
-class CommonPGPipeline {
-protected:
- friend class InternalClientRequest;
- friend class SnapTrimEvent;
- friend class SnapTrimObjSubEvent;
-
- struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> {
- static constexpr auto type_name = "CommonPGPipeline:::wait_for_active";
- } wait_for_active;
- struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> {
- static constexpr auto type_name = "CommonPGPipeline::recover_missing";
- } recover_missing;
- struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> {
- static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc";
- } check_already_complete_get_obc;
- struct GetOBC : OrderedExclusivePhaseT<GetOBC> {
- static constexpr auto type_name = "CommonPGPipeline::get_obc";
- } get_obc;
- struct LockOBC : OrderedConcurrentPhaseT<LockOBC> {
- static constexpr auto type_name = "CommonPGPipeline::lock_obc";
- } lock_obc;
- struct Process : OrderedExclusivePhaseT<Process> {
- static constexpr auto type_name = "CommonPGPipeline::process";
- } process;
- struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
- static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop";
- } wait_repop;
-};
-
-} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
index a19bb0826f0..9e5867caf80 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.cc
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -50,91 +50,107 @@ CommonPGPipeline& InternalClientRequest::client_pp()
return pg->request_pg_pipeline;
}
+InternalClientRequest::interruptible_future<>
+InternalClientRequest::do_process(
+ crimson::osd::ObjectContextRef obc,
+ std::vector<OSDOp> &osd_ops)
+{
+ LOG_PREFIX(InternalClientRequest::do_process);
+ auto params = get_do_osd_ops_params();
+ OpsExecuter ox(
+ pg, obc, op_info, params, params.get_connection(), SnapContext{});
+ co_await pg->run_executer(
+ ox, obc, op_info, osd_ops
+ ).handle_error_interruptible(
+ crimson::ct_error::all_same_way(
+ [this, FNAME](auto e) {
+ ERRORDPPI("{}: got unexpected error {}", *pg, *this, e);
+ ceph_assert(0 == "should not return an error");
+ return interruptor::now();
+ })
+ );
+
+ auto [submitted, completed] = co_await pg->submit_executer(
+ std::move(ox), osd_ops);
+
+ co_await std::move(submitted);
+ co_await std::move(completed);
+}
+
+InternalClientRequest::interruptible_future<>
+InternalClientRequest::with_interruption()
+{
+ LOG_PREFIX(InternalClientRequest::with_interruption);
+ co_await enter_stage<interruptor>(
+ client_pp().wait_for_active
+ );
+
+ co_await with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+
+ co_await enter_stage<interruptor>(client_pp().recover_missing);
+
+ bool unfound = co_await do_recover_missing(
+ pg, get_target_oid(), osd_reqid_t());
+
+ if (unfound) {
+ throw std::system_error(
+ std::make_error_code(std::errc::operation_canceled),
+ fmt::format("{} is unfound, drop it!", get_target_oid()));
+ }
+ co_await enter_stage<interruptor>(
+ client_pp().check_already_complete_get_obc);
+
+ DEBUGI("{}: getting obc lock", *this);
+
+ auto osd_ops = create_osd_ops();
+
+ DEBUGI("InternalClientRequest: got {} OSDOps to execute",
+ std::size(osd_ops));
+ [[maybe_unused]] const int ret = op_info.set_from_op(
+ std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
+ assert(ret == 0);
+ // call with_locked_obc() in order, but wait concurrently for loading.
+ enter_stage_sync(client_pp().lock_obc);
+
+ auto fut = pg->with_locked_obc(
+ get_target_oid(), op_info,
+ [&osd_ops, this](auto, auto obc) {
+ return enter_stage<interruptor>(client_pp().process
+ ).then_interruptible(
+ [obc=std::move(obc), &osd_ops, this]() mutable {
+ return do_process(std::move(obc), osd_ops);
+ });
+ }).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+ co_await std::move(fut);
+
+ logger().debug("{}: complete", *this);
+ co_await interruptor::make_interruptible(handle.complete());
+ co_return;
+}
+
seastar::future<> InternalClientRequest::start()
{
track_event<StartEvent>();
- return crimson::common::handle_system_shutdown([this] {
- LOG_PREFIX(InternalClientRequest::start);
- DEBUGI("{}: in repeat", *this);
-
- return interruptor::with_interruption([this]() mutable {
- return enter_stage<interruptor>(
- client_pp().wait_for_active
- ).then_interruptible([this] {
- return with_blocking_event<PGActivationBlocker::BlockingEvent,
- interruptor>([this] (auto&& trigger) {
- return pg->wait_for_active_blocker.wait(std::move(trigger));
- });
- }).then_interruptible([this] {
- return enter_stage<interruptor>(
- client_pp().recover_missing);
- }).then_interruptible([this] {
- return do_recover_missing(pg, get_target_oid(), osd_reqid_t());
- }).then_interruptible([this](bool unfound) {
- if (unfound) {
- throw std::system_error(
- std::make_error_code(std::errc::operation_canceled),
- fmt::format("{} is unfound, drop it!", get_target_oid()));
- }
- return enter_stage<interruptor>(
- client_pp().get_obc);
- }).then_interruptible([this] () -> PG::load_obc_iertr::future<> {
- LOG_PREFIX(InternalClientRequest::start);
- DEBUGI("{}: getting obc lock", *this);
- return seastar::do_with(create_osd_ops(),
- [this](auto& osd_ops) mutable {
- LOG_PREFIX(InternalClientRequest::start);
- DEBUGI("InternalClientRequest: got {} OSDOps to execute",
- std::size(osd_ops));
- [[maybe_unused]] const int ret = op_info.set_from_op(
- std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
- assert(ret == 0);
- // call with_locked_obc() in order, but wait concurrently for loading.
- enter_stage_sync(client_pp().lock_obc);
- return pg->with_locked_obc(get_target_oid(), op_info,
- [&osd_ops, this](auto, auto obc) {
- return enter_stage<interruptor>(client_pp().process
- ).then_interruptible(
- [obc=std::move(obc), &osd_ops, this] {
- return pg->do_osd_ops(
- std::move(obc),
- osd_ops,
- std::as_const(op_info),
- get_do_osd_ops_params()
- ).safe_then_unpack_interruptible(
- [](auto submitted, auto all_completed) {
- return all_completed.handle_error_interruptible(
- crimson::ct_error::eagain::handle([] {
- return seastar::now();
- }));
- }, crimson::ct_error::eagain::handle([] {
- return interruptor::now();
- })
- );
- });
- });
- });
- }).si_then([this] {
- logger().debug("{}: complete", *this);
- return handle.complete();
- }).handle_error_interruptible(
- PG::load_obc_ertr::all_same_way([] {
- return seastar::now();
- })
- );
- }, [](std::exception_ptr eptr) {
- return seastar::now();
- }, pg, start_epoch
-
- ).then([this] {
- track_event<CompletionEvent>();
- }).handle_exception_type([](std::system_error &error) {
- logger().debug("error {}, message: {}", error.code(), error.what());
- return seastar::now();
- }).finally([this] {
- logger().debug("{}: exit", *this);
- handle.exit();
- });
+ LOG_PREFIX(InternalClientRequest::start);
+ DEBUGI("{}: in repeat", *this);
+
+ return interruptor::with_interruption([this]() mutable {
+ return with_interruption();
+ }, [](std::exception_ptr eptr) {
+ return seastar::now();
+ }, pg, start_epoch).then([this] {
+ track_event<CompletionEvent>();
+ }).handle_exception_type([](std::system_error &error) {
+ logger().debug("error {}, message: {}", error.code(), error.what());
+ return seastar::now();
+ }).finally([this] {
+ logger().debug("{}: exit", *this);
+ handle.exit();
});
}
diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h
index f198e584643..6023db0a8db 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.h
+++ b/src/crimson/osd/osd_operations/internal_client_request.h
@@ -6,7 +6,6 @@
#include "crimson/common/type_helpers.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/osd/osd_operations/client_request_common.h"
-#include "crimson/osd/osd_operations/common/pg_pipeline.h"
#include "crimson/osd/pg.h"
#include "crimson/osd/pg_activation_blocker.h"
@@ -41,6 +40,11 @@ private:
CommonPGPipeline& client_pp();
+ InternalClientRequest::interruptible_future<> with_interruption();
+ InternalClientRequest::interruptible_future<> do_process(
+ crimson::osd::ObjectContextRef obc,
+ std::vector<OSDOp> &osd_ops);
+
seastar::future<> do_process();
Ref<PG> pg;
@@ -56,7 +60,7 @@ public:
CommonPGPipeline::WaitForActive::BlockingEvent,
PGActivationBlocker::BlockingEvent,
CommonPGPipeline::RecoverMissing::BlockingEvent,
- CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
CommonPGPipeline::LockOBC::BlockingEvent,
CommonPGPipeline::Process::BlockingEvent,
CompletionEvent
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
index 1e6bd957289..85de5c711d6 100644
--- a/src/crimson/osd/osd_operations/peering_event.h
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -23,15 +23,6 @@ class ShardServices;
class PG;
class BackfillRecovery;
- struct PGPeeringPipeline {
- struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
- static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map";
- } await_map;
- struct Process : OrderedExclusivePhaseT<Process> {
- static constexpr auto type_name = "PeeringEvent::PGPipeline::process";
- } process;
- };
-
template <class T>
class PeeringEvent : public PhasedOperationT<T> {
T* that() {
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
index 7512b3d108d..9ed0b73cfb4 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.cc
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -396,7 +396,7 @@ SnapTrimObjSubEvent::start()
});
co_await enter_stage<interruptor>(
- client_pp().get_obc);
+ client_pp().check_already_complete_get_obc);
logger().debug("{}: getting obc for {}", *this, coid);
// end of commonality
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h
index 06d8f43c2f3..1164b3169d2 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.h
+++ b/src/crimson/osd/osd_operations/snaptrim_event.h
@@ -9,7 +9,6 @@
#include "crimson/osd/osdmap_gate.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/common/subop_blocker.h"
-#include "crimson/osd/osd_operations/common/pg_pipeline.h"
#include "crimson/osd/pg.h"
#include "crimson/osd/pg_activation_blocker.h"
#include "osd/osd_types.h"
@@ -170,7 +169,7 @@ public:
std::tuple<
StartEvent,
- CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
CommonPGPipeline::Process::BlockingEvent,
CommonPGPipeline::WaitRepop::BlockingEvent,
CompletionEvent
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index d210773ca30..744a1dbc02b 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -13,6 +13,9 @@
#include <boost/range/numeric.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h>
+
+#include <seastar/util/defer.hh>
+
#include "include/utime_fmt.h"
#include "common/hobject.h"
@@ -481,6 +484,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
auto [objs_to_rm, next] = fut.get();
if (objs_to_rm.empty()) {
logger().info("all objs removed, removing coll for {}", pgid);
+ t.remove(coll_ref->get_cid(), pgid.make_snapmapper_oid());
t.remove(coll_ref->get_cid(), pgmeta_oid);
t.remove_collection(coll_ref->get_cid());
(void) shard_services.get_store().do_transaction(
@@ -490,7 +494,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
return {next, false};
} else {
for (auto &obj : objs_to_rm) {
- if (obj == pgmeta_oid) {
+ if (obj == pgmeta_oid || obj.is_internal_pg_local()) {
continue;
}
logger().trace("pg {}, removing obj {}", pgid, obj);
@@ -517,7 +521,8 @@ Context *PG::on_clean()
{
recovery_handler->on_pg_clean();
scrubber.on_primary_active_clean();
- return nullptr;
+ recovery_finisher = new C_PG_FinishRecovery(*this);
+ return recovery_finisher;
}
seastar::future<> PG::clear_temp_objects()
@@ -973,150 +978,6 @@ ObjectContextRef duplicate_obc(const ObjectContextRef &obc) {
return object_context;
}
-template <class Ret, class SuccessFunc, class FailureFunc>
-PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>>
-PG::do_osd_ops_execute(
- seastar::lw_shared_ptr<OpsExecuter> ox,
- ObjectContextRef obc,
- const OpInfo &op_info,
- Ref<MOSDOp> m,
- std::vector<OSDOp>& ops,
- SuccessFunc&& success_func,
- FailureFunc&& failure_func)
-{
- assert(ox);
- auto rollbacker = ox->create_rollbacker(
- [object_context=duplicate_obc(obc)] (auto& obc) mutable {
- obc->update_from(*object_context);
- });
- auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func));
- return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) {
- logger().debug(
- "do_osd_ops_execute: object {} - handling op {}",
- ox->get_target(),
- ceph_osd_op_name(osd_op.op.op));
- return ox->execute_op(osd_op);
- }).safe_then_interruptible([this, ox, &ops] {
- logger().debug(
- "do_osd_ops_execute: object {} all operations successful",
- ox->get_target());
- // check for full
- if ((ox->delta_stats.num_bytes > 0 ||
- ox->delta_stats.num_objects > 0) &&
- get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) {
- const auto& m = ox->get_message();
- if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now
- m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
- logger().info(" full, but proceeding due to FULL_FORCE or MDS");
- } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
- // they tried, they failed.
- logger().info(" full, replying to FULL_TRY op");
- if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA))
- return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
- seastar::now(),
- OpsExecuter::osd_op_ierrorator::future<>(
- crimson::ct_error::edquot::make()));
- else
- return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
- seastar::now(),
- OpsExecuter::osd_op_ierrorator::future<>(
- crimson::ct_error::enospc::make()));
- } else {
- // drop request
- logger().info(" full, dropping request (bad client)");
- return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
- seastar::now(),
- OpsExecuter::osd_op_ierrorator::future<>(
- crimson::ct_error::eagain::make()));
- }
- }
- return std::move(*ox).flush_changes_n_do_ops_effects(
- ops,
- snap_mapper,
- osdriver,
- [this] (auto&& txn,
- auto&& obc,
- auto&& osd_op_p,
- auto&& log_entries) {
- logger().debug(
- "do_osd_ops_execute: object {} submitting txn",
- obc->get_oid());
- mutate_object(obc, txn, osd_op_p);
- return submit_transaction(
- std::move(obc),
- std::move(txn),
- std::move(osd_op_p),
- std::move(log_entries));
- });
- }).safe_then_unpack_interruptible(
- [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc]
- (auto submitted_fut, auto _all_completed_fut) mutable {
-
- auto all_completed_fut = _all_completed_fut.safe_then_interruptible_tuple(
- std::move(success_func),
- crimson::ct_error::object_corrupted::handle(
- [rollbacker, this, obc] (const std::error_code& e) mutable {
- // this is a path for EIO. it's special because we want to fix the obejct
- // and try again. that is, the layer above `PG::do_osd_ops` is supposed to
- // restart the execution.
- rollbacker.rollback_obc_if_modified(e);
- return repair_object(obc->obs.oi.soid,
- obc->obs.oi.version
- ).then_interruptible([] {
- return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()};
- });
- }), OpsExecuter::osd_op_errorator::all_same_way(
- [rollbacker, failure_func_ptr]
- (const std::error_code& e) mutable {
- // handle non-fatal errors only
- ceph_assert(e.value() == EDQUOT ||
- e.value() == ENOSPC ||
- e.value() == EAGAIN);
- rollbacker.rollback_obc_if_modified(e);
- return (*failure_func_ptr)(e);
- }));
-
- return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
- std::move(submitted_fut),
- std::move(all_completed_fut)
- );
- }, OpsExecuter::osd_op_errorator::all_same_way(
- [this, op_info, m, obc,
- rollbacker, failure_func_ptr]
- (const std::error_code& e) mutable {
- ceph_tid_t rep_tid = shard_services.get_tid();
- rollbacker.rollback_obc_if_modified(e);
- // record error log
- auto maybe_submit_error_log =
- interruptor::make_ready_future<std::optional<eversion_t>>(std::nullopt);
- // call submit_error_log only for non-internal clients
- if constexpr (!std::is_same_v<Ret, void>) {
- if(op_info.may_write()) {
- maybe_submit_error_log =
- submit_error_log(m, op_info, obc, e, rep_tid);
- }
- }
- return maybe_submit_error_log.then_interruptible(
- [this, failure_func_ptr, e, rep_tid] (auto version) {
- auto all_completed =
- [this, failure_func_ptr, e, rep_tid, version] {
- if (version.has_value()) {
- return complete_error_log(rep_tid, version.value()
- ).then_interruptible([failure_func_ptr, e] {
- return (*failure_func_ptr)(e);
- });
- } else {
- return (*failure_func_ptr)(e);
- }
- };
- return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
- std::move(seastar::now()),
- std::move(all_completed())
- );
- });
- }));
-}
-
PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid,
const eversion_t& version)
{
@@ -1146,7 +1007,7 @@ PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid,
return result;
}
-PG::interruptible_future<std::optional<eversion_t>> PG::submit_error_log(
+PG::interruptible_future<eversion_t> PG::submit_error_log(
Ref<MOSDOp> m,
const OpInfo &op_info,
ObjectContextRef obc,
@@ -1212,142 +1073,84 @@ PG::interruptible_future<std::optional<eversion_t>> PG::submit_error_log(
get_collection_ref(), std::move(t)
).then([this] {
peering_state.update_trim_to();
- return seastar::make_ready_future<std::optional<eversion_t>>(projected_last_update);
+ return seastar::make_ready_future<eversion_t>(projected_last_update);
});
});
});
}
-PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>>
-PG::do_osd_ops(
- Ref<MOSDOp> m,
- crimson::net::ConnectionXcoreRef conn,
+PG::run_executer_fut PG::run_executer(
+ OpsExecuter &ox,
ObjectContextRef obc,
const OpInfo &op_info,
- const SnapContext& snapc)
+ std::vector<OSDOp>& ops)
{
- if (__builtin_expect(stopping, false)) {
- throw crimson::common::system_shutdown_exception();
- }
- return do_osd_ops_execute<MURef<MOSDOpReply>>(
- seastar::make_lw_shared<OpsExecuter>(
- Ref<PG>{this}, obc, op_info, *m, conn, snapc),
- obc,
- op_info,
- m,
- m->ops,
- // success_func
- [this, m, obc, may_write = op_info.may_write(),
- may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] {
- // TODO: should stop at the first op which returns a negative retval,
- // cmpext uses it for returning the index of first unmatched byte
- int result = m->ops.empty() ? 0 : m->ops.back().rval.code;
- if (may_read && result >= 0) {
- for (auto &osdop : m->ops) {
- if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
- result = osdop.rval.code;
- break;
- }
- }
- } else if (result > 0 && may_write && !rvec) {
- result = 0;
- } else if (result < 0 && (m->ops.empty() ?
- 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
- result = 0;
- }
- auto reply = crimson::make_message<MOSDOpReply>(m.get(),
- result,
- get_osdmap_epoch(),
- 0,
- false);
- reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
- logger().debug(
- "do_osd_ops: {} - object {} sending reply",
- *m,
- m->get_hobj());
- if (obc->obs.exists) {
- reply->set_reply_versions(peering_state.get_info().last_update,
- obc->obs.oi.user_version);
- } else {
- reply->set_reply_versions(peering_state.get_info().last_update,
- peering_state.get_info().last_user_version);
- }
- return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
- std::move(reply));
- },
- // failure_func
- [m, this]
- (const std::error_code& e) {
- logger().error("do_osd_ops_execute::failure_func {} got error: {}",
- *m, e);
- return log_reply(m, e);
+ LOG_PREFIX(PG::run_executer);
+ auto rollbacker = ox.create_rollbacker(
+ [stored_obc=duplicate_obc(obc)](auto &obc) mutable {
+ obc->update_from(*stored_obc);
+ });
+ auto rollback_on_error = seastar::defer([&rollbacker] {
+ rollbacker.rollback_obc_if_modified();
});
-}
-PG::do_osd_ops_iertr::future<MURef<MOSDOpReply>>
-PG::log_reply(
- Ref<MOSDOp> m,
- const std::error_code& e)
-{
- auto reply = crimson::make_message<MOSDOpReply>(
- m.get(), -e.value(), get_osdmap_epoch(), 0, false);
- if (m->ops.empty() ? 0 :
- m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) {
- reply->set_result(0);
- }
- // For all ops except for CMPEXT, the correct error value is encoded
- // in e.value(). For CMPEXT, osdop.rval has the actual error value.
- if (e.value() == ct_error::cmp_fail_error_value) {
- assert(!m->ops.empty());
- for (auto &osdop : m->ops) {
- if (osdop.rval < 0) {
- reply->set_result(osdop.rval);
- break;
+ for (auto &op: ops) {
+ DEBUGDPP("object {} handle op {}", *this, ox.get_target(), op);
+ co_await ox.execute_op(op);
+ }
+ DEBUGDPP("object {} all operations successful", *this, ox.get_target());
+
+ // check for full
+ if ((ox.delta_stats.num_bytes > 0 ||
+ ox.delta_stats.num_objects > 0) &&
+ get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) {
+ const auto& m = ox.get_message();
+ if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now
+ m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+ INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this);
+ } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ // they tried, they failed.
+ INFODPP("full, replying to FULL_TRY op", *this);
+ if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ co_await run_executer_fut(
+ crimson::ct_error::edquot::make());
+ } else {
+ co_await run_executer_fut(
+ crimson::ct_error::enospc::make());
}
+ } else {
+ // drop request
+ INFODPP("full, dropping request (bad client)", *this);
+ co_await run_executer_fut(
+ crimson::ct_error::eagain::make());
}
}
- reply->set_enoent_reply_versions(
- peering_state.get_info().last_update,
- peering_state.get_info().last_user_version);
- reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
- return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
- std::move(reply));
-}
-
-PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>>
-PG::do_osd_ops(
- ObjectContextRef obc,
- std::vector<OSDOp>& ops,
- const OpInfo &op_info,
- const do_osd_ops_params_t &&msg_params)
-{
- // This overload is generally used for internal client requests,
- // use an empty SnapContext.
- return seastar::do_with(
- std::move(msg_params),
- [=, this, &ops, &op_info](auto &msg_params) {
- return do_osd_ops_execute<void>(
- seastar::make_lw_shared<OpsExecuter>(
- Ref<PG>{this},
- obc,
- op_info,
- msg_params,
- msg_params.get_connection(),
- SnapContext{}
- ),
- obc,
- op_info,
- Ref<MOSDOp>(),
- ops,
- // success_func
- [] {
- return do_osd_ops_iertr::now();
- },
- // failure_func
- [] (const std::error_code& e) {
- return do_osd_ops_iertr::now();
- });
- });
+ rollback_on_error.cancel();
+}
+
+PG::submit_executer_fut PG::submit_executer(
+ OpsExecuter &&ox,
+ const std::vector<OSDOp>& ops) {
+ LOG_PREFIX(PG::submit_executer);
+ // transaction must commit at this point
+ return std::move(
+ ox
+ ).flush_changes_n_do_ops_effects(
+ ops,
+ snap_mapper,
+ osdriver,
+ [FNAME, this](auto&& txn,
+ auto&& obc,
+ auto&& osd_op_p,
+ auto&& log_entries) {
+ DEBUGDPP("object {} submitting txn", *this, obc->get_oid());
+ mutate_object(obc, txn, osd_op_p);
+ return submit_transaction(
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries));
+ });
}
PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m)
@@ -1885,4 +1688,19 @@ void PG::cancel_pglog_based_recovery_op() {
pglog_based_recovery_op->cancel();
reset_pglog_based_recovery_op();
}
+
+void PG::C_PG_FinishRecovery::finish(int r) {
+ LOG_PREFIX(PG::C_PG_FinishRecovery::finish);
+ auto &peering_state = pg.get_peering_state();
+ if (peering_state.is_deleting() || !peering_state.is_clean()) {
+ DEBUGDPP("raced with delete or repair", pg);
+ return;
+ }
+ if (this == pg.recovery_finisher) {
+ peering_state.purge_strays();
+ pg.recovery_finisher = nullptr;
+ } else {
+ DEBUGDPP("stale recovery finsher", pg);
+ }
+}
}
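
A sketch of the pool-full decision made in run_executer() above, using plain C++ stand-ins (full_check_in and check_full are hypothetical): MDS and FULL_FORCE traffic proceed, FULL_TRY gets EDQUOT or ENOSPC depending on the quota flag, and everything else maps to EAGAIN.

#include <cassert>
#include <cerrno>
#include <optional>

// A returned error code aborts the op; nullopt means proceed.
struct full_check_in {
  bool wrote_something;   // delta_stats added bytes or objects
  bool pool_full;
  bool from_mds;
  bool full_force;
  bool full_try;
  bool pool_full_quota;
};

std::optional<int> check_full(const full_check_in& in) {
  if (!in.wrote_something || !in.pool_full)
    return std::nullopt;                          // nothing to enforce
  if (in.from_mds || in.full_force)
    return std::nullopt;                          // proceed anyway
  if (in.full_try)
    return in.pool_full_quota ? EDQUOT : ENOSPC;  // "they tried, they failed"
  return EAGAIN;   // mirrors the "drop request (bad client)" branch
}

int main() {
  assert(check_full({true, true, false, false, true, true}) == EDQUOT);
  assert(!check_full({true, true, false, true, false, false}));
}
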
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index 93279a18c56..604f49005ff 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -375,7 +375,7 @@ public:
}
void check_blocklisted_watchers() final;
void clear_primary_state() final {
- // Not needed yet
+ recovery_finisher = nullptr;
}
void queue_check_readable(epoch_t last_peering_reset,
@@ -394,7 +394,7 @@ public:
void on_replica_activate() final;
void on_activate_complete() final;
void on_new_interval() final {
- // Not needed yet
+ recovery_finisher = nullptr;
}
Context *on_clean() final;
void on_activate_committed() final {
@@ -621,7 +621,7 @@ public:
void dump_primary(Formatter*);
interruptible_future<> complete_error_log(const ceph_tid_t& rep_tid,
const eversion_t& version);
- interruptible_future<std::optional<eversion_t>> submit_error_log(
+ interruptible_future<eversion_t> submit_error_log(
Ref<MOSDOp> m,
const OpInfo &op_info,
ObjectContextRef obc,
@@ -645,41 +645,35 @@ private:
}
} background_process_lock;
- using do_osd_ops_ertr = crimson::errorator<
- crimson::ct_error::eagain>;
- using do_osd_ops_iertr =
- ::crimson::interruptible::interruptible_errorator<
- ::crimson::osd::IOInterruptCondition,
- ::crimson::errorator<crimson::ct_error::eagain>>;
- template <typename Ret = void>
- using pg_rep_op_fut_t =
- std::tuple<interruptible_future<>,
- do_osd_ops_iertr::future<Ret>>;
- do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops(
- Ref<MOSDOp> m,
- crimson::net::ConnectionXcoreRef conn,
+ using run_executer_ertr = crimson::compound_errorator_t<
+ OpsExecuter::osd_op_errorator,
+ crimson::errorator<
+ crimson::ct_error::edquot,
+ crimson::ct_error::eagain,
+ crimson::ct_error::enospc
+ >
+ >;
+ using run_executer_iertr = crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ run_executer_ertr>;
+ using run_executer_fut = run_executer_iertr::future<>;
+ run_executer_fut run_executer(
+ OpsExecuter &ox,
ObjectContextRef obc,
const OpInfo &op_info,
- const SnapContext& snapc);
+ std::vector<OSDOp>& ops);
+
+ using submit_executer_ret = std::tuple<
+ interruptible_future<>,
+ interruptible_future<>>;
+ using submit_executer_fut = interruptible_future<
+ submit_executer_ret>;
+ submit_executer_fut submit_executer(
+ OpsExecuter &&ox,
+ const std::vector<OSDOp>& ops);
struct do_osd_ops_params_t;
- do_osd_ops_iertr::future<MURef<MOSDOpReply>> log_reply(
- Ref<MOSDOp> m,
- const std::error_code& e);
- do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops(
- ObjectContextRef obc,
- std::vector<OSDOp>& ops,
- const OpInfo &op_info,
- const do_osd_ops_params_t &&params);
- template <class Ret, class SuccessFunc, class FailureFunc>
- do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute(
- seastar::lw_shared_ptr<OpsExecuter> ox,
- ObjectContextRef obc,
- const OpInfo &op_info,
- Ref<MOSDOp> m,
- std::vector<OSDOp>& ops,
- SuccessFunc&& success_func,
- FailureFunc&& failure_func);
+
interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
interruptible_future<
std::tuple<interruptible_future<>, interruptible_future<>>>
@@ -712,9 +706,17 @@ public:
}
seastar::future<> stop();
private:
+ class C_PG_FinishRecovery : public Context {
+ public:
+ explicit C_PG_FinishRecovery(PG &pg) : pg(pg) {}
+ void finish(int r) override;
+ private:
+ PG& pg;
+ };
std::unique_ptr<PGBackend> backend;
std::unique_ptr<RecoveryBackend> recovery_backend;
std::unique_ptr<PGRecovery> recovery_handler;
+ C_PG_FinishRecovery *recovery_finisher;
PeeringState peering_state;
eversion_t projected_last_update;
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
index fa8201b61c2..24a381b4cf7 100644
--- a/src/crimson/osd/pg_backend.cc
+++ b/src/crimson/osd/pg_backend.cc
@@ -1289,7 +1289,7 @@ void PGBackend::clone(
const ObjectState& d_os,
ceph::os::Transaction& txn)
{
- // See OpsExecutor::execute_clone documentation
+ // See OpsExecuter::execute_clone documentation
txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid});
{
ceph::bufferlist bv;
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index 4f874d526b3..ec3af0d2b00 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -528,10 +528,12 @@ void PGRecovery::request_primary_scan(
void PGRecovery::enqueue_push(
const hobject_t& obj,
- const eversion_t& v)
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers)
{
- logger().info("{}: obj={} v={}",
- __func__, obj, v);
+ logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers);
+ auto &peering_state = pg->get_peering_state();
+ peering_state.prepare_backfill_for_missing(obj, v, peers);
auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj);
if (!added)
return;
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index 6cd29c3dc52..705b3176b97 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -110,7 +110,8 @@ private:
const hobject_t& begin) final;
void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) final;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) final;
void enqueue_drop(
const pg_shard_t& target,
const hobject_t& obj,
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index 5f7c4a62447..a053d9d5044 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -767,20 +767,26 @@ seastar::future<> ShardServices::dispatch_context_transaction(
LOG_PREFIX(OSDSingletonState::dispatch_context_transaction);
if (ctx.transaction.empty()) {
DEBUG("empty transaction");
- return seastar::now();
+ co_await get_store().flush(col);
+ Context* on_commit(
+ ceph::os::Transaction::collect_all_contexts(ctx.transaction));
+ if (on_commit) {
+ on_commit->complete(0);
+ }
+ co_return;
}
DEBUG("do_transaction ...");
- auto ret = get_store().do_transaction(
+ co_await get_store().do_transaction(
col,
ctx.transaction.claim_and_reset());
- return ret;
+ co_return;
}
seastar::future<> ShardServices::dispatch_context_messages(
BufferedRecoveryMessages &&ctx)
{
- LOG_PREFIX(OSDSingletonState::dispatch_context_transaction);
+ LOG_PREFIX(OSDSingletonState::dispatch_context_messages);
auto ret = seastar::parallel_for_each(std::move(ctx.message_map),
[FNAME, this](auto& osd_messages) {
auto& [peer, messages] = osd_messages;
diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc
index d4930ea35c0..4b8a8131bcf 100644
--- a/src/exporter/DaemonMetricCollector.cc
+++ b/src/exporter/DaemonMetricCollector.cc
@@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter
if (sockClientsPing) {
bool ok;
sock_client.ping(&ok);
+ std::string ceph_daemon_socket_up_desc(
+ "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy).");
+ labels_t ceph_daemon_socket_up_labels;
+ ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname());
+ ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name);
+ add_metric(builder, static_cast<int>(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc,
+ "gauge", ceph_daemon_socket_up_labels);
if (!ok) {
failures++;
continue;
- }
+ }
}
std::string counter_dump_response = dump_response.size() > 0 ? dump_response :
asok_request(sock_client, "counter dump", daemon_name);
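
For reference, a sketch of roughly what the new ceph_daemon_socket_up gauge looks like once exposed; socket_up_line is a hypothetical helper, not exporter code, and the exact label ordering is an assumption.

#include <iostream>
#include <string>

// 1 when the daemon answers on its admin socket, 0 otherwise.
std::string socket_up_line(const std::string& host,
                           const std::string& daemon, bool ok) {
  return "ceph_daemon_socket_up{hostname=\"" + host +
         "\",ceph_daemon=\"" + daemon + "\"} " + (ok ? "1" : "0");
}

int main() {
  std::cout << socket_up_line("node1", "osd.0", true) << '\n';
  // ceph_daemon_socket_up{hostname="node1",ceph_daemon="osd.0"} 1
}
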
diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h
index d2e929b4d67..3302e95df91 100644
--- a/src/exporter/DaemonMetricCollector.h
+++ b/src/exporter/DaemonMetricCollector.h
@@ -42,11 +42,11 @@ public:
std::map<std::string, AdminSocketClient> clients;
std::string metrics;
std::pair<labels_t, std::string> add_fixed_name_metrics(std::string metric_name);
+ void update_sockets();
private:
std::mutex metrics_mutex;
std::unique_ptr<MetricsBuilder> builder;
- void update_sockets();
void request_loop(boost::asio::steady_timer &timer);
void dump_asok_metric(boost::json::object perf_info,
diff --git a/src/log/Entry.h b/src/log/Entry.h
index 3677c8eb951..db39eca0ef3 100644
--- a/src/log/Entry.h
+++ b/src/log/Entry.h
@@ -4,9 +4,12 @@
#ifndef __CEPH_LOG_ENTRY_H
#define __CEPH_LOG_ENTRY_H
+#include "include/compat.h"
+
#include "log/LogClock.h"
#include "common/StackStringStream.h"
+#include "common/Thread.h"
#include "boost/container/small_vector.hpp"
@@ -14,6 +17,7 @@
#include <string_view>
+
namespace ceph {
namespace logging {
@@ -27,7 +31,10 @@ public:
m_thread(pthread_self()),
m_prio(pr),
m_subsys(sub)
- {}
+ {
+ strncpy(m_thread_name, Thread::get_thread_name().data(), 16);
+ m_thread_name[15] = '\0';
+ }
Entry(const Entry &) = default;
Entry& operator=(const Entry &) = default;
Entry(Entry &&e) = default;
@@ -40,6 +47,7 @@ public:
time m_stamp;
pthread_t m_thread;
short m_prio, m_subsys;
+ char m_thread_name[16];
static log_clock& clock() {
static log_clock clock;
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 69f6df82ecb..49dd03c06c0 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -493,13 +493,13 @@ void Log::dump_recent()
_flush(m_flush, false);
_log_message("--- begin dump of recent events ---", true);
- std::set<pthread_t> recent_pthread_ids;
+ std::set<std::pair<pthread_t, const char *>> recent_pthread_ids;
{
EntryVector t;
t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end()));
m_recent.clear();
for (const auto& e : t) {
- recent_pthread_ids.emplace(e.m_thread);
+ recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name));
}
_flush(t, true);
}
@@ -515,14 +515,11 @@ void Log::dump_recent()
m_stderr_log, m_stderr_crash), true);
_log_message("--- pthread ID / name mapping for recent threads ---", true);
- for (const auto pthread_id : recent_pthread_ids)
+ for (auto& [pthread_id, pthread_name] : recent_pthread_ids)
{
- char pthread_name[16] = {0}; //limited by 16B include terminating null byte.
- ceph_pthread_getname(pthread_id, pthread_name, sizeof(pthread_name));
// we want the ID to be printed in the same format as we use for a log entry.
// The reason is easier grepping.
- _log_message(fmt::format(" {:x} / {}",
- tid_to_int(pthread_id), pthread_name), true);
+ _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true);
}
_log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true);
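
The Entry.h and Log.cc hunks above capture the thread name into a fixed 16-byte buffer when the entry is constructed, so dump_recent() no longer calls pthread_getname per thread. A minimal sketch of that capture, assuming a hypothetical log_entry type:

#include <cstring>
#include <iostream>
#include <string>

// Copy the (at most 15-character) thread name at construction time.
struct log_entry {
  char thread_name[16];
  explicit log_entry(const std::string& name) {
    std::strncpy(thread_name, name.c_str(), sizeof(thread_name));
    thread_name[sizeof(thread_name) - 1] = '\0';  // strncpy may not terminate
  }
};

int main() {
  log_entry e("msgr-worker-0");
  std::cout << e.thread_name << '\n';
}
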
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 059b540feb0..642d3428a27 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -17,6 +17,7 @@
#include "common/likely.h"
#include "common/HeartbeatMap.h"
+#include "include/compat.h" // for ceph_pthread_setname()
#include "include/stringify.h"
#include "include/util.h"
@@ -73,6 +74,7 @@ void Beacon::init(const MDSMap &mdsmap)
_notify_mdsmap(mdsmap);
sender = std::thread([this]() {
+ ceph_pthread_setname(pthread_self(), "beacon");
std::unique_lock<std::mutex> lock(mutex);
bool sent;
while (!finished) {
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index f000da7928a..76e9fee68f8 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1319,7 +1319,7 @@ void CDir::take_dentry_waiting(std::string_view dname, snapid_t first, snapid_t
<< it->first.snapid
<< " on " << *this << dendl;
std::copy(it->second.begin(), it->second.end(), std::back_inserter(ls));
- waiting_on_dentry.erase(it++);
+ it = waiting_on_dentry.erase(it);
}
if (waiting_on_dentry.empty())
@@ -2823,8 +2823,6 @@ void CDir::_committed(int r, version_t v)
auto it = waiting_for_commit.begin();
while (it != waiting_for_commit.end()) {
- auto _it = it;
- ++_it;
if (it->first > committed_version) {
dout(10) << " there are waiters for " << it->first << ", committing again" << dendl;
_commit(it->first, -1);
@@ -2834,8 +2832,7 @@ void CDir::_committed(int r, version_t v)
for (const auto &waiter : it->second)
t.push_back(waiter);
mdcache->mds->queue_waiters(t);
- waiting_for_commit.erase(it);
- it = _it;
+ it = waiting_for_commit.erase(it);
if (!(++count % mdcache->mds->heartbeat_reset_grace()))
mdcache->mds->heartbeat_reset();
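
The CDir.cc hunks above (and the MDCache.cc ones further below) switch from the erase(it++) / saved-iterator idioms to using the iterator returned by erase(). A tiny standalone sketch of that pattern (erase_even_keys is a hypothetical example, not MDS code):

#include <cassert>
#include <map>

int erase_even_keys(std::map<int, int>& m) {
  int erased = 0;
  for (auto it = m.begin(); it != m.end(); ) {
    if (it->first % 2 == 0) {
      it = m.erase(it);   // erase() returns the next valid iterator
      ++erased;
    } else {
      ++it;
    }
  }
  return erased;
}

int main() {
  std::map<int, int> m{{1, 1}, {2, 2}, {3, 3}, {4, 4}};
  assert(erase_even_keys(m) == 2 && m.size() == 2);
}
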
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 0e9b6996ad2..dfad411d323 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -4589,8 +4589,11 @@ void InodeStoreBase::dump(Formatter *f) const
for (const auto& [key, val] : *xattrs) {
f->open_object_section("xattr");
f->dump_string("key", key);
- std::string v(val.c_str(), val.length());
- f->dump_string("val", v);
+ if (val.length()) {
+ f->dump_string("val", std::string(val.c_str(), val.length()));
+ } else {
+ f->dump_string("val", "");
+ }
f->close_section();
}
}
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index c433c77b453..eb2b529dcfa 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -960,17 +960,15 @@ void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_p
dir->set_dir_auth(auth);
// move items nested beneath me, under me.
- set<CDir*>::iterator p = subtrees[root].begin();
+ auto p = subtrees[root].begin();
while (p != subtrees[root].end()) {
- set<CDir*>::iterator next = p;
- ++next;
if (get_subtree_root((*p)->get_parent_dir()) == dir) {
// move under me
dout(10) << " claiming child bound " << **p << dendl;
subtrees[dir].insert(*p);
- subtrees[root].erase(p);
- }
- p = next;
+ p = subtrees[root].erase(p);
+ } else
+ ++p;
}
// i am a bound of the parent subtree.
@@ -1113,17 +1111,15 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, m
dir->set_dir_auth(auth);
// move items nested beneath me, under me.
- set<CDir*>::iterator p = subtrees[root].begin();
+ auto p = subtrees[root].begin();
while (p != subtrees[root].end()) {
- set<CDir*>::iterator next = p;
- ++next;
if (get_subtree_root((*p)->get_parent_dir()) == dir) {
// move under me
dout(10) << " claiming child bound " << **p << dendl;
subtrees[dir].insert(*p);
- subtrees[root].erase(p);
- }
- p = next;
+ p = subtrees[root].erase(p);
+ } else
+ ++p;
}
// i am a bound of the parent subtree.
@@ -1172,8 +1168,8 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, m
}
// merge stray bounds?
while (!subtrees[dir].empty()) {
- set<CDir*> copy = subtrees[dir];
- for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
+ const auto copy = subtrees[dir];
+ for (auto p = copy.begin(); p != copy.end(); ++p) {
if (bounds.count(*p) == 0) {
CDir *stray = *p;
dout(10) << " swallowing extra subtree at " << *stray << dendl;
@@ -1214,7 +1210,7 @@ void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir
}
dout(10) << " by ino: " << byino << dendl;
- for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
+ for (auto p = byino.begin(); p != byino.end(); ++p) {
p->second.simplify();
CInode *diri = get_inode(p->first);
if (!diri)
@@ -1222,7 +1218,7 @@ void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir
dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
fragtree_t tmpdft;
- for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
+ for (auto q = p->second.begin(); q != p->second.end(); ++q)
tmpdft.force_to_leaf(g_ceph_context, *q);
for (const auto& fg : p->second) {
@@ -1267,7 +1263,7 @@ void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
ino_fragset[df.ino].insert_raw(df.frag);
}
// get frags
- for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
+ for (auto p = ino_fragset.begin();
p != ino_fragset.end();
++p) {
p->second.simplify();
@@ -1347,7 +1343,7 @@ void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
} else {
// find them
CDir *root = get_subtree_root(dir);
- for (set<CDir*>::iterator p = subtrees[root].begin();
+ for (auto p = subtrees[root].begin();
p != subtrees[root].end();
++p) {
CDir *t = *p;
@@ -1415,7 +1411,7 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
CDir *newdir = diri->get_parent_dir();
if (pop) {
- map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
+ auto p = projected_subtree_renames.find(diri);
ceph_assert(p != projected_subtree_renames.end());
ceph_assert(!p->second.empty());
ceph_assert(p->second.front().first == olddir);
@@ -1815,7 +1811,7 @@ void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
if (cur->last != CEPH_NOSNAP) {
ceph_assert(cur->dirty_old_rstats.empty());
- set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
+ auto q = snaps.lower_bound(std::max(first, floor));
if (q == snaps.end() || *q > cur->last)
return;
}
@@ -2487,7 +2483,7 @@ void MDCache::logged_leader_update(metareqid_t reqid)
*/
void MDCache::finish_committed_leaders()
{
- for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
+ for (auto p = uncommitted_leaders.begin();
p != uncommitted_leaders.end();
++p) {
p->second.recovering = false;
@@ -2536,16 +2532,16 @@ void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag
map<dirfrag_t,vector<dirfrag_t> >& subtrees)
{
if (subtrees.count(oldparent)) {
- vector<dirfrag_t>& v = subtrees[oldparent];
+ auto& v = subtrees[oldparent];
dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
- for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
+ for (auto it = v.begin(); it != v.end(); ++it)
if (*it == df) {
v.erase(it);
break;
}
}
if (subtrees.count(newparent)) {
- vector<dirfrag_t>& v = subtrees[newparent];
+ auto& v = subtrees[newparent];
dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
v.push_back(df);
}
@@ -2766,7 +2762,7 @@ void MDCache::send_peer_resolves()
map<mds_rank_t, ref_t<MMDSResolve>> resolves;
if (mds->is_resolve()) {
- for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
+ for (auto p = uncommitted_peers.begin();
p != uncommitted_peers.end();
++p) {
mds_rank_t leader = p->second.leader;
@@ -2777,7 +2773,7 @@ void MDCache::send_peer_resolves()
} else {
set<mds_rank_t> resolve_set;
mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
- for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ for (auto p = active_requests.begin();
p != active_requests.end();
++p) {
MDRequestRef& mdr = p->second;
@@ -2828,7 +2824,7 @@ void MDCache::send_subtree_resolves()
}
map<mds_rank_t, ref_t<MMDSResolve>> resolves;
- for (set<mds_rank_t>::iterator p = recovery_set.begin();
+ for (auto p = recovery_set.begin();
p != recovery_set.end();
++p) {
if (*p == mds->get_nodeid())
@@ -2841,7 +2837,7 @@ void MDCache::send_subtree_resolves()
map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
// known
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
CDir *dir = p->first;
@@ -2858,7 +2854,7 @@ void MDCache::send_subtree_resolves()
set<CDir*> bounds;
get_subtree_bounds(dir, bounds);
vector<dirfrag_t> dfls;
- for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
+ for (auto q = bounds.begin(); q != bounds.end(); ++q)
dfls.push_back((*q)->dirfrag());
my_ambig_imports[dir->dirfrag()] = dfls;
@@ -2870,7 +2866,7 @@ void MDCache::send_subtree_resolves()
}
// bounds too
vector<dirfrag_t> dfls;
- for (set<CDir*>::iterator q = subtrees[dir].begin();
+ for (auto q = subtrees[dir].begin();
q != subtrees[dir].end();
++q) {
CDir *bound = *q;
@@ -2883,7 +2879,7 @@ void MDCache::send_subtree_resolves()
}
// ambiguous
- for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
+ for (auto p = my_ambiguous_imports.begin();
p != my_ambiguous_imports.end();
++p) {
my_ambig_imports[p->first] = p->second;
@@ -2896,9 +2892,9 @@ void MDCache::send_subtree_resolves()
while (i < p->second.size()) {
dirfrag_t b = p->second[i];
if (my_subtrees.count(b)) {
- vector<dirfrag_t>& bb = my_subtrees[b];
+ auto& bb = my_subtrees[b];
dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
- for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
+ for (auto r = bb.begin(); r != bb.end(); ++r)
p->second.push_back(*r);
my_subtrees.erase(b);
p->second.erase(p->second.begin() + i);
@@ -2963,7 +2959,7 @@ void MDCache::handle_mds_failure(mds_rank_t who)
// clean up any requests peer to/from this node
list<MDRequestRef> finish;
- for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ for (auto p = active_requests.begin();
p != active_requests.end();
++p) {
MDRequestRef& mdr = p->second;
@@ -3061,7 +3057,7 @@ void MDCache::handle_mds_failure(mds_rank_t who)
}
}
- for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
+ for (auto p = uncommitted_leaders.begin();
p != uncommitted_leaders.end();
++p) {
// The failed MDS may have already committed the peer update
@@ -3080,7 +3076,7 @@ void MDCache::handle_mds_failure(mds_rank_t who)
kick_find_ino_peers(who);
kick_open_ino_peers(who);
- for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+ for (auto p = fragments.begin();
p != fragments.end(); ) {
dirfrag_t df = p->first;
fragment_info_t& info = p->second;
@@ -3089,18 +3085,17 @@ void MDCache::handle_mds_failure(mds_rank_t who)
if (info.notify_ack_waiting.erase(who) &&
info.notify_ack_waiting.empty()) {
fragment_drop_locks(info);
- fragment_maybe_finish(p++);
+ p = fragment_maybe_finish(p);
} else {
++p;
}
continue;
}
- ++p;
dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
std::vector<CDir*> dirs;
info.dirs.swap(dirs);
- fragments.erase(df);
+ p = fragments.erase(p);
fragment_unmark_unfreeze_dirs(dirs);
}
@@ -3126,7 +3121,7 @@ void MDCache::handle_mds_recovery(mds_rank_t who)
MDSContext::vec waiters;
// wake up any waiters in their subtrees
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
CDir *dir = p->first;
@@ -3241,7 +3236,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
ceph_assert(get_inode(ino));
- for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
+ for (auto q = cap_exports.begin();
q != cap_exports.end();
++q) {
Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
@@ -3283,10 +3278,8 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
survivor = true;
// check for any import success/failure (from this node)
- map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
+ auto p = my_ambiguous_imports.begin();
while (p != my_ambiguous_imports.end()) {
- map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
- ++next;
CDir *dir = get_dirfrag(p->first);
ceph_assert(dir);
dout(10) << "checking ambiguous import " << *dir << dendl;
@@ -3305,7 +3298,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
bool inside = true;
set<CDir*> bounds;
get_force_dirfrag_bound_set(q.second, bounds);
- for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
+ for (auto p = bounds.begin(); p != bounds.end(); ++p) {
CDir *bound = *p;
if (bound->contains(dir)) {
inside = false; // nope, bound is dir or parent of dir, not inside.
@@ -3316,7 +3309,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
claimed_by_sender = true;
}
- my_ambiguous_imports.erase(p); // no longer ambiguous.
+ p = my_ambiguous_imports.erase(p); // no longer ambiguous.
if (claimed_by_sender) {
dout(7) << "ambiguous import failed on " << *dir << dendl;
migrator->import_reverse(dir);
@@ -3324,8 +3317,8 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
dout(7) << "ambiguous import succeeded on " << *dir << dendl;
migrator->import_finish(dir, true);
}
- }
- p = next;
+ } else
+ ++p;
}
}
@@ -3507,9 +3500,9 @@ void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t
if (su == nullptr) {
return;
}
- for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
+ for(auto p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
uncommitted_peer_rename_olddir[*p]++;
- for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
+ for(auto p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
uncommitted_peer_unlink[*p]++;
}
@@ -3533,9 +3526,9 @@ void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
return;
}
// discard the non-auth subtree we renamed out of
- for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
+ for(auto p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
CInode *diri = *p;
- map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);
+ auto it = uncommitted_peer_rename_olddir.find(diri);
ceph_assert(it != uncommitted_peer_rename_olddir.end());
it->second--;
if (it->second == 0) {
@@ -3553,9 +3546,9 @@ void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
ceph_assert(it->second > 0);
}
// removed the inodes that were unlinked by peer update
- for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
+ for(auto p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
CInode *in = *p;
- map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);
+ auto it = uncommitted_peer_unlink.find(in);
ceph_assert(it != uncommitted_peer_unlink.end());
it->second--;
if (it->second == 0) {
@@ -3598,13 +3591,13 @@ void MDCache::disambiguate_other_imports()
bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
// other nodes' ambiguous imports
- for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
+ for (auto p = other_ambiguous_imports.begin();
p != other_ambiguous_imports.end();
++p) {
mds_rank_t who = p->first;
dout(10) << "ambiguous imports for mds." << who << dendl;
- for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
+ for (auto q = p->second.begin();
q != p->second.end();
++q) {
dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
@@ -3639,7 +3632,7 @@ void MDCache::disambiguate_my_imports()
// my ambiguous imports
mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
while (!my_ambiguous_imports.empty()) {
- map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
+ auto q = my_ambiguous_imports.begin();
CDir *dir = get_dirfrag(q->first);
ceph_assert(dir);
@@ -3667,7 +3660,7 @@ void MDCache::disambiguate_my_imports()
mds->mdlog->flush();
// verify all my subtrees are unambiguous!
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
CDir *dir = p->first;
@@ -3692,7 +3685,7 @@ void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
{
// make a list
vector<dirfrag_t> binos;
- for (set<CDir*>::iterator p = bounds.begin();
+ for (auto p = bounds.begin();
p != bounds.end();
++p)
binos.push_back((*p)->dirfrag());
@@ -3849,14 +3842,14 @@ void MDCache::recalc_auth_bits(bool replay)
}
set<CInode*> subtree_inodes;
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
if (p->first->dir_auth.first == mds->get_nodeid())
subtree_inodes.insert(p->first->inode);
}
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
if (p->first->inode->is_mdsdir()) {
@@ -4079,7 +4072,7 @@ void MDCache::rejoin_send_rejoins()
++q;
} else {
// remove reconnect with no session
- p.second.second.erase(q++);
+ q = p.second.second.erase(q);
}
}
rejoins[target]->cap_exports[p.first] = p.second.second;
@@ -4096,7 +4089,7 @@ void MDCache::rejoin_send_rejoins()
// check all subtrees
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
CDir *dir = p->first;
@@ -4166,7 +4159,7 @@ void MDCache::rejoin_send_rejoins()
if (!mds->is_rejoin()) {
// i am survivor. send strong rejoin.
// note request remote_auth_pins, xlocks
- for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ for (auto p = active_requests.begin();
p != active_requests.end();
++p) {
MDRequestRef& mdr = p->second;
@@ -4582,7 +4575,7 @@ void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
}
// weak base inodes? (root, stray, etc.)
- for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
+ for (auto p = weak->weak_inodes.begin();
p != weak->weak_inodes.end();
++p) {
CInode *in = get_inode(*p);
@@ -4616,7 +4609,7 @@ void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
mds->send_message(ack, weak->get_connection());
- for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+ for (auto p = gather_locks.begin(); p != gather_locks.end(); ++p) {
if (!(*p)->is_stable())
mds->locker->eval_gather(*p);
}
@@ -5184,12 +5177,12 @@ void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
auto bp = ack->imported_caps.cbegin();
decode(peer_imported, bp);
- for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
+ for (auto p = peer_imported.begin();
p != peer_imported.end();
++p) {
auto& ex = cap_exports.at(p->first);
ceph_assert(ex.first == from);
- for (map<client_t,Capability::Import>::iterator q = p->second.begin();
+ for (auto q = p->second.begin();
q != p->second.end();
++q) {
auto r = ex.second.find(q->first);
@@ -5271,7 +5264,7 @@ void MDCache::rejoin_trim_undef_inodes()
dout(10) << "rejoin_trim_undef_inodes" << dendl;
while (!rejoin_undef_inodes.empty()) {
- set<CInode*>::iterator p = rejoin_undef_inodes.begin();
+ auto p = rejoin_undef_inodes.begin();
CInode *in = *p;
rejoin_undef_inodes.erase(p);
@@ -5496,12 +5489,12 @@ bool MDCache::process_imported_caps()
}
// process caps that were exported by peer rename
- for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
+ for (auto p = rejoin_peer_exports.begin();
p != rejoin_peer_exports.end();
++p) {
CInode *in = get_inode(p->first);
ceph_assert(in);
- for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
+ for (auto q = p->second.second.begin();
q != p->second.second.end();
++q) {
auto r = rejoin_session_map.find(q->first);
@@ -5568,7 +5561,7 @@ bool MDCache::process_imported_caps()
}
}
}
- cap_imports.erase(p++); // remove and move on
+ p = cap_imports.erase(p); // remove and move on
}
} else {
trim_non_auth();
@@ -5690,7 +5683,7 @@ void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p)
split_inos.push_back((*p)->ino());
- for (set<SnapRealm*>::iterator p = realm->open_children.begin();
+ for (auto p = realm->open_children.begin();
p != realm->open_children.end();
++p)
split_realms.push_back((*p)->inode->ino());
@@ -5737,12 +5730,12 @@ void MDCache::clean_open_file_lists()
{
dout(10) << "clean_open_file_lists" << dendl;
- for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
+ for (auto p = mds->mdlog->segments.begin();
p != mds->mdlog->segments.end();
++p) {
LogSegment *ls = p->second;
- elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
+ auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
while (!q.end()) {
CInode *in = *q;
++q;
@@ -5828,7 +5821,7 @@ void MDCache::export_remaining_imported_caps()
mds->heartbeat_reset();
}
- for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
+ for (auto p = cap_reconnect_waiters.begin();
p != cap_reconnect_waiters.end();
++p)
mds->queue_waiters(p->second);
@@ -5869,7 +5862,7 @@ Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
dout(15) << " chose lock states on " << *in << dendl;
}
- map<inodeno_t, MDSContext::vec >::iterator it =
+ auto it =
cap_reconnect_waiters.find(in->ino());
if (it != cap_reconnect_waiters.end()) {
mds->queue_waiters(it->second);
@@ -5956,7 +5949,7 @@ void MDCache::open_snaprealms()
}
}
- rejoin_pending_snaprealms.erase(it++);
+ it = rejoin_pending_snaprealms.erase(it);
in->put(CInode::PIN_OPENINGSNAPPARENTS);
send_snaps(splits);
@@ -6094,10 +6087,10 @@ void MDCache::rejoin_send_acks()
dout(7) << "rejoin_send_acks" << dendl;
// replicate stray
- for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
+ for (auto p = rejoin_unlinked_inodes.begin();
p != rejoin_unlinked_inodes.end();
++p) {
- for (set<CInode*>::iterator q = p->second.begin();
+ for (auto q = p->second.begin();
q != p->second.end();
++q) {
CInode *in = *q;
@@ -6127,7 +6120,7 @@ void MDCache::rejoin_send_acks()
// send acks to everyone in the recovery set
map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
- for (set<mds_rank_t>::iterator p = recovery_set.begin();
+ for (auto p = recovery_set.begin();
p != recovery_set.end();
++p) {
if (rejoin_ack_sent.count(*p))
@@ -6138,7 +6131,7 @@ void MDCache::rejoin_send_acks()
rejoin_ack_sent = recovery_set;
// walk subtrees
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
CDir *dir = p->first;
@@ -6236,7 +6229,7 @@ void MDCache::rejoin_send_acks()
}
// include inode base for any inodes whose scatterlocks may have updated
- for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
+ for (auto p = rejoin_potential_updated_scatterlocks.begin();
p != rejoin_potential_updated_scatterlocks.end();
++p) {
CInode *in = *p;
@@ -6663,7 +6656,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
dout(10) << "truncate_inode_finish " << *in << dendl;
- set<CInode*>::iterator p = ls->truncating_inodes.find(in);
+ auto p = ls->truncating_inodes.find(in);
ceph_assert(p != ls->truncating_inodes.end());
ls->truncating_inodes.erase(p);
@@ -6719,7 +6712,7 @@ void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
dout(20) << "remove_recovered_truncate " << *in << " in log segment "
<< ls->seq << "/" << ls->offset << dendl;
// if we have the logseg the truncate started in, it must be in our list.
- set<CInode*>::iterator p = ls->truncating_inodes.find(in);
+ auto p = ls->truncating_inodes.find(in);
ceph_assert(p != ls->truncating_inodes.end());
ls->truncating_inodes.erase(p);
in->put(CInode::PIN_TRUNCATING);
@@ -6728,11 +6721,11 @@ void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
void MDCache::start_recovered_truncates()
{
dout(10) << "start_recovered_truncates" << dendl;
- for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
+ for (auto p = mds->mdlog->segments.begin();
p != mds->mdlog->segments.end();
++p) {
LogSegment *ls = p->second;
- for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
+ for (auto q = ls->truncating_inodes.begin();
q != ls->truncating_inodes.end();
++q) {
CInode *in = *q;
@@ -7006,7 +6999,7 @@ std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
// Other rank's base inodes (when I'm stopping)
if (mds->is_stopping()) {
- for (set<CInode*>::iterator p = base_inodes.begin();
+ for (auto p = base_inodes.begin();
p != base_inodes.end();) {
CInode *base_in = *p;
++p;
@@ -7278,7 +7271,7 @@ void MDCache::trim_non_auth()
dout(7) << "trim_non_auth" << dendl;
// temporarily pin all subtree roots
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p)
p->first->get(CDir::PIN_SUBTREETEMP);
@@ -7349,7 +7342,7 @@ void MDCache::trim_non_auth()
lru.lru_touch_entire_pintail();
// unpin all subtrees
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p)
p->first->put(CDir::PIN_SUBTREETEMP);
@@ -7461,7 +7454,7 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)
// can we now trim child subtrees?
set<CDir*> bounds;
get_subtree_bounds(dir, bounds);
- for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
+ for (auto p = bounds.begin(); p != bounds.end(); ++p) {
CDir *bd = *p;
if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
bd->get_num_any() == 0 && // and empty
@@ -7746,7 +7739,7 @@ void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
}
}
- for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+ for (auto p = gather_locks.begin(); p != gather_locks.end(); ++p) {
if (!(*p)->is_stable())
mds->locker->eval_gather(*p);
}
@@ -9292,7 +9285,7 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
info.auth_hint = MDS_RANK_NONE;
}
} else {
- for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
+ for (auto p = active.begin(); p != active.end(); ++p)
if (*p != whoami && info.checked.count(*p) == 0) {
peer = *p;
break;
@@ -9405,7 +9398,7 @@ void MDCache::kick_open_ino_peers(mds_rank_t who)
{
dout(10) << "kick_open_ino_peers mds." << who << dendl;
- for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
+ for (auto p = opening_inodes.begin();
p != opening_inodes.end();
++p) {
open_ino_info_t& info = p->second;
@@ -9546,7 +9539,7 @@ void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
m = fip.hint;
fip.hint = MDS_RANK_NONE;
} else {
- for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
+ for (auto p = active.begin(); p != active.end(); ++p)
if (*p != mds->get_nodeid() &&
fip.checked.count(*p) == 0) {
m = *p;
@@ -9645,7 +9638,7 @@ void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
void MDCache::kick_find_ino_peers(mds_rank_t who)
{
// find_ino_peers requests we should move on from
- for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
+ for (auto p = find_ino_peer.begin();
p != find_ino_peer.end();
++p) {
find_ino_peer_info_t& fip = p->second;
@@ -9665,7 +9658,7 @@ void MDCache::kick_find_ino_peers(mds_rank_t who)
int MDCache::get_num_client_requests()
{
int count = 0;
- for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ for (auto p = active_requests.begin();
p != active_requests.end();
++p) {
MDRequestRef& mdr = p->second;
@@ -9766,7 +9759,7 @@ MDRequestRef MDCache::request_start_internal(int op)
MDRequestRef MDCache::request_get(metareqid_t rid)
{
- ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
+ auto p = active_requests.find(rid);
ceph_assert(p != active_requests.end());
dout(7) << "request_get " << rid << " " << *p->second << dendl;
return p->second;
@@ -10435,7 +10428,7 @@ void MDCache::discover_path(CDir *base,
void MDCache::kick_discovers(mds_rank_t who)
{
- for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
+ for (auto p = discovers.begin();
p != discovers.end();
++p) {
if (p->second.mds != who)
@@ -10772,7 +10765,7 @@ void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
// decrement discover counters
if (m->get_tid()) {
- map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
+ auto p = discovers.find(m->get_tid());
if (p != discovers.end()) {
dout(10) << " found tid " << m->get_tid() << dendl;
discovers.erase(p);
@@ -11178,7 +11171,7 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast)
}
mds_rank_t whoami = mds->get_nodeid();
- for (set<mds_rank_t>::iterator it = who.begin();
+ for (auto it = who.begin();
it != who.end();
++it) {
if (*it == whoami) continue;
@@ -11351,7 +11344,7 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, const MDRequestR
CInode *strayin = straydn->get_linkage()->get_inode();
strayin->encode_snap_blob(snapbl);
}
- for (set<mds_rank_t>::iterator it = replicas.begin();
+ for (auto it = replicas.begin();
it != replicas.end();
++it) {
// don't tell (rmdir) witnesses; they already know
@@ -11588,7 +11581,7 @@ void MDCache::adjust_dir_fragments(CInode *diri,
set<CDir*> bounds;
bounds.swap(subtrees[dir]);
subtrees.erase(dir);
- for (set<CDir*>::iterator p = bounds.begin();
+ for (auto p = bounds.begin();
p != bounds.end();
++p) {
CDir *frag = get_subtree_root((*p)->get_parent_dir());
@@ -11627,11 +11620,11 @@ void MDCache::adjust_dir_fragments(CInode *diri,
for (const auto& dir : srcfrags) {
ceph_assert(dir->is_subtree_root());
dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
- map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
- set<CDir*>::iterator r = q->second.begin();
+ auto q = subtrees.find(dir);
+ auto r = q->second.begin();
while (r != subtrees[dir].end()) {
new_bounds.insert(*r);
- subtrees[dir].erase(r++);
+ r = subtrees[dir].erase(r);
}
subtrees.erase(q);
@@ -11835,7 +11828,7 @@ public:
void MDCache::fragment_mark_and_complete(const MDRequestRef& mdr)
{
dirfrag_t basedirfrag = mdr->more()->fragment_base;
- map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+ auto it = fragments.find(basedirfrag);
if (it == fragments.end() || it->second.mdr != mdr) {
dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
request_finish(mdr);
@@ -11938,8 +11931,7 @@ void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
bool MDCache::fragment_are_all_frozen(CDir *dir)
{
ceph_assert(dir->is_frozen_dir());
- map<dirfrag_t,fragment_info_t>::iterator p;
- for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+ for (auto p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
p != fragments.end() && p->first.ino == dir->ino();
++p) {
if (p->first.frag.contains(dir->get_frag()))
@@ -11951,8 +11943,7 @@ bool MDCache::fragment_are_all_frozen(CDir *dir)
void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
{
- map<dirfrag_t,fragment_info_t>::iterator p;
- for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+ for (auto p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
p != fragments.end() && p->first.ino == dir->ino();
++p) {
if (p->first.frag.contains(dir->get_frag())) {
@@ -11971,7 +11962,7 @@ void MDCache::find_stale_fragment_freeze()
utime_t cutoff = now;
cutoff -= g_conf()->mds_freeze_tree_timeout;
- for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+ for (auto p = fragments.begin();
p != fragments.end(); ) {
dirfrag_t df = p->first;
fragment_info_t& info = p->second;
@@ -12060,7 +12051,7 @@ public:
void MDCache::fragment_frozen(const MDRequestRef& mdr, int r)
{
dirfrag_t basedirfrag = mdr->more()->fragment_base;
- map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+ auto it = fragments.find(basedirfrag);
if (it == fragments.end() || it->second.mdr != mdr || r < 0) {
dout(7) << "fragment_frozen " << basedirfrag << " must have aborted; rc=" << r << dendl;
request_finish(mdr);
@@ -12079,7 +12070,7 @@ void MDCache::fragment_frozen(const MDRequestRef& mdr, int r)
void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing)
{
dirfrag_t basedirfrag = mdr->more()->fragment_base;
- map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+ auto it = fragments.find(basedirfrag);
if (it == fragments.end() || it->second.mdr != mdr) {
dout(7) << __func__ << ": " << basedirfrag << " must have aborted" << dendl;
request_finish(mdr);
@@ -12402,12 +12393,12 @@ void MDCache::fragment_drop_locks(fragment_info_t& info)
//info.mdr.reset();
}
-void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
+MDCache::fragment_info_iterator MDCache::fragment_maybe_finish(const fragment_info_iterator it)
{
ceph_assert(kill_dirfrag_at != dirfrag_killpoint::FRAGMENT_MAYBE_FINISH);
if (!it->second.finishing)
- return;
+ return it;
// unmark & auth_unpin
for (const auto &dir : it->second.resultfrags) {
@@ -12421,7 +12412,7 @@ void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
mds->balancer->maybe_fragment(dir, false);
}
- fragments.erase(it);
+ return fragments.erase(it);
}
@@ -12522,7 +12513,7 @@ void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
{
dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
<< " op " << EFragment::op_name(op) << dendl;
- map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
+ auto it = uncommitted_fragments.find(basedirfrag);
if (it != uncommitted_fragments.end()) {
ufragment& uf = it->second;
if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
@@ -12539,7 +12530,7 @@ void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&&
{
dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
<< " old_frags (" << old_frags << ")" << dendl;
- map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
+ auto it = uncommitted_fragments.find(basedirfrag);
if (it != uncommitted_fragments.end()) {
ufragment& uf = it->second;
if (!uf.old_frags.empty()) {
@@ -12575,7 +12566,7 @@ struct C_MDC_FragmentRollback : public MDCacheLogContext {
void MDCache::rollback_uncommitted_fragments()
{
dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
- for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
+ for (auto p = uncommitted_fragments.begin();
p != uncommitted_fragments.end();
++p) {
ufragment &uf = p->second;
@@ -12722,7 +12713,7 @@ void MDCache::show_subtrees(int dbl, bool force_print)
// root frags
std::vector<CDir*> basefrags;
- for (set<CInode*>::iterator p = base_inodes.begin();
+ for (auto p = base_inodes.begin();
p != base_inodes.end();
++p)
(*p)->get_dirfrags(basefrags);
@@ -12760,13 +12751,11 @@ void MDCache::show_subtrees(int dbl, bool force_print)
seen.insert(dir);
// nested items?
- if (!subtrees[dir].empty()) {
- for (set<CDir*>::iterator p = subtrees[dir].begin();
- p != subtrees[dir].end();
- ++p) {
- //dout(25) << " saw sub " << **p << dendl;
- q.push_front(pair<CDir*,int>(*p, d+1));
- }
+ for (auto p = subtrees[dir].begin();
+ p != subtrees[dir].end();
+ ++p) {
+ //dout(25) << " saw sub " << **p << dendl;
+ q.push_front(pair<CDir*,int>(*p, d+1));
}
}
@@ -12831,7 +12820,7 @@ void MDCache::show_subtrees(int dbl, bool force_print)
else
indent += " ";
- for (set<CDir*>::iterator p = subtrees[dir].begin();
+ for (auto p = subtrees[dir].begin();
p != subtrees[dir].end();
++p)
q.push_front(pair<CDir*,int>(*p, d+2));
@@ -12840,7 +12829,7 @@ void MDCache::show_subtrees(int dbl, bool force_print)
// verify there isn't stray crap in subtree map
int lost = 0;
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ for (auto p = subtrees.begin();
p != subtrees.end();
++p) {
if (subtrees_seen.count(p->first)) continue;
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 0b01c9ab859..3c5d7e5e4f4 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -1485,7 +1485,7 @@ private:
void fragment_frozen(const MDRequestRef& mdr, int r);
void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
void fragment_drop_locks(fragment_info_t &info);
- void fragment_maybe_finish(const fragment_info_iterator& it);
+ fragment_info_iterator fragment_maybe_finish(const fragment_info_iterator it);
void dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing=false);
void _fragment_logged(const MDRequestRef& mdr);
void _fragment_stored(const MDRequestRef& mdr);
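The header change above makes fragment_maybe_finish() return an iterator instead of void, so callers walking the fragments map can continue from wherever erase() (if any) left them. A simplified sketch of that shape with illustrative names; for brevity the "not finishing" case advances inside the helper here, whereas the real code leaves that step to the caller:

#include <iostream>
#include <iterator>
#include <map>

using FragMap = std::map<int, bool>;   // key: frag id, value: "finishing?"

// A helper that may erase the element it is given hands back the follow-on
// iterator so the caller can resume the walk without a stale iterator.
FragMap::iterator maybe_finish(FragMap& frags, FragMap::iterator it) {
  if (!it->second)
    return std::next(it);    // nothing to finish: just step past it
  return frags.erase(it);    // erase() already yields the next valid iterator
}

int main() {
  FragMap frags{{1, true}, {2, false}, {3, true}};
  for (auto it = frags.begin(); it != frags.end(); )
    it = maybe_finish(frags, it);
  std::cout << frags.size() << " fragment(s) left\n";   // prints "1 fragment(s) left"
}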
diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h
index 187fb68f846..8538f6e236a 100644
--- a/src/mgr/ActivePyModule.h
+++ b/src/mgr/ActivePyModule.h
@@ -27,6 +27,8 @@
#include "PyModuleRunner.h"
#include "PyModule.h"
+#include <fmt/core.h>
+
#include <vector>
#include <string>
@@ -46,7 +48,6 @@ private:
std::string m_command_perms;
const MgrSession* m_session = nullptr;
- std::string fin_thread_name;
public:
Finisher finisher; // per active module finisher to execute commands
@@ -54,8 +55,7 @@ public:
ActivePyModule(const PyModuleRef &py_module_,
LogChannelRef clog_)
: PyModuleRunner(py_module_, clog_),
- fin_thread_name(std::string("m-fin-" + py_module->get_name()).substr(0,15)),
- finisher(g_ceph_context, thread_name, fin_thread_name)
+ finisher(g_ceph_context, thread_name, fmt::format("m-fin-{}", py_module->get_name()).substr(0,15))
{
}
@@ -97,14 +97,14 @@ public:
uri = str;
}
- std::string get_uri() const
+ std::string_view get_uri() const
{
return uri;
}
- std::string get_fin_thread_name() const
+ std::string_view get_fin_thread_name() const
{
- return fin_thread_name;
+ return finisher.get_thread_name();
}
bool is_authorized(const std::map<std::string, std::string>& arguments) const;
diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc
index 17bb3951142..aebbb5d8c9a 100644
--- a/src/mgr/ActivePyModules.cc
+++ b/src/mgr/ActivePyModules.cc
@@ -770,9 +770,9 @@ std::map<std::string, std::string> ActivePyModules::get_services() const
std::map<std::string, std::string> result;
std::lock_guard l(lock);
for (const auto& [name, module] : modules) {
- std::string svc_str = module->get_uri();
+ const std::string_view svc_str = module->get_uri();
if (!svc_str.empty()) {
- result[name] = svc_str;
+ result.emplace(name, svc_str);
}
}
diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc
index 86528c1dedf..1444103f460 100644
--- a/src/mon/ConfigMap.cc
+++ b/src/mon/ConfigMap.cc
@@ -266,7 +266,7 @@ int ConfigMap::add_option(
ldout(cct, 10) << __func__ << " unrecognized option '" << name << "'" << dendl;
stray_options.push_back(
std::unique_ptr<Option>(
- new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN)));
+ new Option(std::string{name}, Option::TYPE_STR, Option::LEVEL_UNKNOWN)));
opt = stray_options.back().get();
}
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
index 62d37574ded..b935ace4aff 100644
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -385,6 +385,17 @@ public:
return -EINVAL;
}
+ bool confirm = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+ if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) {
+ ss << "One or more file system health warnings are present. Modifying "
+ << "the file system setting variable \"max_mds\" may not help "
+ << "troubleshoot or recover from these warnings and may further "
+ << "destabilize the system. If you really wish to proceed, run "
+ << "again with --yes-i-really-mean-it";
+ return -EPERM;
+ }
+
return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val);
}
};
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 76a57ac443d..d8cca4ceb61 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -1557,6 +1557,13 @@ bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
return false;
}
+bool MDSMonitor::has_any_health_warning()
+{
+ return std::any_of(
+ pending_daemon_health.begin(), pending_daemon_health.end(),
+ [](auto& it) { return !it.second.metrics.empty() ? true : false; });
+}
+
int MDSMonitor::filesystem_command(
FSMap &fsmap,
MonOpRequestRef op,
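has_any_health_warning() is a straightforward std::any_of over the pending per-daemon health map. A standalone sketch of the same query on simplified, illustrative types:

#include <algorithm>
#include <iostream>
#include <map>
#include <vector>

struct DaemonHealth {
  std::vector<int> metrics;   // stand-in for the per-daemon health metrics
};

// Same std::any_of shape as the patch: "does any daemon report a metric?"
bool has_any_health_warning(const std::map<int, DaemonHealth>& daemons) {
  return std::any_of(daemons.begin(), daemons.end(),
                     [](const auto& kv) { return !kv.second.metrics.empty(); });
}

int main() {
  std::map<int, DaemonHealth> daemons;
  daemons[0];                          // daemon 0: no metrics
  daemons[1].metrics.push_back(42);    // daemon 1: one health metric
  std::cout << std::boolalpha << has_any_health_warning(daemons) << '\n';   // true
}

Since !metrics.empty() is already a bool, the "? true : false" in the patch's lambda is redundant, though harmless.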
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index b0f88cd3130..dd2a269009d 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -53,6 +53,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
bool prepare_update(MonOpRequestRef op) override;
bool should_propose(double& delay) override;
bool has_health_warnings(std::vector<mds_metric_t> warnings);
+ bool has_any_health_warning();
bool should_print_status() const {
auto& fs = get_fsmap();
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
index c235d9e6219..30382b97052 100644
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -215,7 +215,7 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap)
string name = string("mgr/") + i.name + "/" + j.second.name;
auto p = mgr_module_options.emplace(
name,
- Option(name, static_cast<Option::type_t>(j.second.type),
+ Option(std::string{name}, static_cast<Option::type_t>(j.second.type),
static_cast<Option::level_t>(j.second.level)));
Option& opt = p.first->second;
opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 7b1bc9b8e56..c01ea9e7103 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -254,7 +254,7 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
}
}
-int NVMeofGwMap::process_gw_map_gw_no_subsystems(
+int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending)
{
int rc = 0;
@@ -424,7 +424,6 @@ void NVMeofGwMap::find_failback_gw(
auto& gws_states = created_gws[group_key];
auto& gw_state = created_gws[group_key][gw_id];
bool do_failback = false;
-
dout(10) << "Find failback GW for GW " << gw_id << dendl;
for (auto& gw_state_it: gws_states) {
auto& st = gw_state_it.second;
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 29710371742..267d85b10f9 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -54,7 +54,7 @@ public:
int process_gw_map_gw_down(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
bool &propose_pending);
- int process_gw_map_gw_no_subsystems(
+ int process_gw_map_gw_no_subsys_no_listeners(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
bool &propose_pending);
void update_active_timers(bool &propose_pending);
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 734e90defd9..d9e936e27df 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -367,6 +367,13 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
std::stringstream sstrm1;
sstrm1 << state.availability;
f->dump_string("Availability", sstrm1.str());
+ uint32_t num_listeners = 0;
+ if (state.availability == gw_availability_t::GW_AVAILABLE) {
+ for (auto &subs: state.subsystems) {
+ num_listeners += subs.listeners.size();
+ }
+ f->dump_unsigned("num-listeners", num_listeners);
+ }
sstrm1.str("");
for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
sstrm1 << " " << state_itr.first + 1 << ": "
@@ -476,7 +483,7 @@ void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id,
if (avail == gw_availability_t::GW_UNAVAILABLE) {
pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending);
} else {
- pending_map.process_gw_map_gw_no_subsystems(gw_id, group_key, propose_pending);
+ pending_map.process_gw_map_gw_no_subsys_no_listeners(gw_id, group_key, propose_pending);
}
}
@@ -600,7 +607,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
if (sub.size() == 0) {
avail = gw_availability_t::GW_CREATED;
- }
+ } else {
+ bool listener_found = false;
+ for (auto &subs: sub) {
+ if (subs.listeners.size()) {
+ listener_found = true;
+ break;
+ }
+ }
+ if (!listener_found) {
+ avail = gw_availability_t::GW_CREATED;
+ }
+ } // for HA, no-subsystems and no-listeners are the same use case
if (pending_map.created_gws[group_key][gw_id].subsystems != sub) {
dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl;
pending_map.created_gws[group_key][gw_id].subsystems = sub;
diff --git a/src/mrgw.sh b/src/mrgw.sh
index 05739bf015e..86bef336867 100755
--- a/src/mrgw.sh
+++ b/src/mrgw.sh
@@ -1,5 +1,7 @@
#!/usr/bin/env bash
+# Start/restart a radosgw instance on the given mstart.sh cluster.
+
set -e
rgw_frontend=${RGW_FRONTEND:-"beast"}
diff --git a/src/mrun b/src/mrun
index a8522180021..df7e3542b93 100755
--- a/src/mrun
+++ b/src/mrun
@@ -1,5 +1,7 @@
#!/usr/bin/env bash
+# Run a ceph command against the given mstart.sh cluster.
+
[ $# -lt 2 ] && echo "usage: $0 <name> <command> [params...]" && exit 1
root=`dirname $0`
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 3e5c58ec376..bb67ff3eef5 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -262,8 +262,8 @@ public:
#endif
protected:
- ceph_msg_header header; // headerelope
- ceph_msg_footer footer;
+ ceph_msg_header header{}; // headerelope
+ ceph_msg_footer footer{};
ceph::buffer::list payload; // "front" unaligned blob
ceph::buffer::list middle; // "middle" unaligned blob
ceph::buffer::list data; // data payload (page-alignment will be preserved where possible)
@@ -332,16 +332,11 @@ protected:
friend class Messenger;
public:
- Message() {
- memset(&header, 0, sizeof(header));
- memset(&footer, 0, sizeof(footer));
- }
+ Message() = default;
Message(int t, int version=1, int compat_version=0) {
- memset(&header, 0, sizeof(header));
header.type = t;
header.version = version;
header.compat_version = compat_version;
- memset(&footer, 0, sizeof(footer));
}
Message *get() {
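The Message changes (and the similar Event.h and frames_v2.cc hunks) drop constructor-time memset() calls in favour of default member initializers, which value-initialize the wire structs and therefore zero every scalar field. A minimal sketch of the idiom with an illustrative POD struct (not the real ceph_msg_header):

#include <cstdint>
#include <iostream>

struct wire_header {          // stand-in for a POD wire struct
  uint16_t type;
  uint16_t version;
  uint64_t seq;
};

struct Message {
  wire_header header{};       // value-initialized: every field starts at zero
  explicit Message(uint16_t t) { header.type = t; }   // only set what differs
};

int main() {
  Message m(42);
  std::cout << m.header.type << ' ' << m.header.version << ' '
            << m.header.seq << '\n';   // prints 42 0 0
}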
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
index 683be086efa..ab3d454748e 100644
--- a/src/msg/async/AsyncConnection.cc
+++ b/src/msg/async/AsyncConnection.cc
@@ -310,7 +310,7 @@ ssize_t AsyncConnection::write(ceph::buffer::list &bl,
outgoing_bl.claim_append(bl);
ssize_t r = _try_send(more);
if (r > 0) {
- writeCallback = callback;
+ writeCallback = std::move(callback);
}
return r;
}
@@ -621,7 +621,7 @@ void AsyncConnection::fault()
}
void AsyncConnection::_stop() {
- writeCallback.reset();
+ writeCallback = {};
dispatch_queue->discard_queue(conn_id);
async_msgr->unregister_conn(this);
worker->release_worker();
@@ -737,8 +737,7 @@ void AsyncConnection::handle_write_callback() {
recv_start_time = ceph::mono_clock::now();
write_lock.lock();
if (writeCallback) {
- auto callback = *writeCallback;
- writeCallback.reset();
+ auto callback = std::move(writeCallback);
write_lock.unlock();
callback(0);
return;
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
index 78a590f8ca3..a4f18e2c4fb 100644
--- a/src/msg/async/AsyncConnection.h
+++ b/src/msg/async/AsyncConnection.h
@@ -223,7 +223,7 @@ private:
std::unique_ptr<Protocol> protocol;
- std::optional<std::function<void(ssize_t)>> writeCallback;
+ std::function<void(ssize_t)> writeCallback;
std::function<void(char *, ssize_t)> readCallback;
std::optional<unsigned> pendingReadLen;
char *read_buffer;
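writeCallback no longer needs std::optional because std::function already models "no callback": it converts to false when empty, "= {}" clears it, and std::move() transfers the target (the moved-from object is left in a valid but unspecified state, which is why the code still clears it explicitly where emptiness matters). A short sketch of that contract, with illustrative names:

#include <functional>
#include <iostream>

int main() {
  std::function<void(long)> write_cb;              // empty: holds no target
  std::cout << std::boolalpha << bool(write_cb) << '\n';   // false

  write_cb = [](long r) { std::cout << "wrote " << r << " bytes\n"; };
  if (write_cb) {
    auto cb = std::move(write_cb);   // take ownership of the callback
    write_cb = {};                   // clear explicitly; moved-from state is unspecified
    cb(0);
  }
  std::cout << bool(write_cb) << '\n';             // false again
}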
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
index a595667e447..6acd6275738 100644
--- a/src/msg/async/Event.h
+++ b/src/msg/async/Event.h
@@ -97,11 +97,7 @@ class EventCenter {
using clock_type = ceph::coarse_mono_clock;
struct AssociatedCenters {
- EventCenter *centers[MAX_EVENTCENTER];
- AssociatedCenters() {
- // FIPS zeroization audit 20191115: this memset is not security related.
- memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*));
- }
+ EventCenter *centers[MAX_EVENTCENTER]{};
};
struct FileEvent {
diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc
index b14de7b1e56..a53f6389c31 100644
--- a/src/msg/async/ProtocolV1.cc
+++ b/src/msg/async/ProtocolV1.cc
@@ -90,9 +90,8 @@ void ProtocolV1::connect() {
// reset connect state variables
authorizer_buf.clear();
- // FIPS zeroization audit 20191115: these memsets are not security related.
- memset(&connect_msg, 0, sizeof(connect_msg));
- memset(&connect_reply, 0, sizeof(connect_reply));
+ connect_msg = {};
+ connect_reply = {};
global_seq = messenger->get_global_seq();
}
@@ -820,7 +819,7 @@ CtPtr ProtocolV1::read_message_data_prepare() {
#if 0
// rx_buffers is broken by design... see
// http://tracker.ceph.com/issues/22480
- map<ceph_tid_t, pair<ceph::buffer::list, int> >::iterator p =
+ const auto p =
connection->rx_buffers.find(current_header.tid);
if (p != connection->rx_buffers.end()) {
ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second
@@ -1205,7 +1204,7 @@ void ProtocolV1::requeue_sent() {
return;
}
- list<out_q_entry_t> &rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+ auto &rq = out_q[CEPH_MSG_PRIO_HIGHEST];
out_seq -= sent.size();
while (!sent.empty()) {
Message *m = sent.back();
@@ -1220,10 +1219,11 @@ void ProtocolV1::requeue_sent() {
uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
ldout(cct, 10) << __func__ << " " << seq << dendl;
std::lock_guard<std::mutex> l(connection->write_lock);
- if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) {
+ const auto it = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+ if (it == out_q.end()) {
return seq;
}
- list<out_q_entry_t> &rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+ auto &rq = it->second;
uint64_t count = out_seq;
while (!rq.empty()) {
Message* const m = rq.front().m;
@@ -1235,7 +1235,7 @@ uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
rq.pop_front();
count++;
}
- if (rq.empty()) out_q.erase(CEPH_MSG_PRIO_HIGHEST);
+ if (rq.empty()) out_q.erase(it);
return count;
}
@@ -1246,18 +1246,16 @@ uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
void ProtocolV1::discard_out_queue() {
ldout(cct, 10) << __func__ << " started" << dendl;
- for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) {
- ldout(cct, 20) << __func__ << " discard " << *p << dendl;
- (*p)->put();
+ for (Message *msg : sent) {
+ ldout(cct, 20) << __func__ << " discard " << msg << dendl;
+ msg->put();
}
sent.clear();
- for (map<int, list<out_q_entry_t>>::iterator p =
- out_q.begin();
- p != out_q.end(); ++p) {
- for (list<out_q_entry_t>::iterator r = p->second.begin();
- r != p->second.end(); ++r) {
- ldout(cct, 20) << __func__ << " discard " << r->m << dendl;
- r->m->put();
+ for (auto& [ prio, entries ] : out_q) {
+ static_cast<void>(prio);
+ for (auto& entry : entries) {
+ ldout(cct, 20) << __func__ << " discard " << entry.m << dendl;
+ entry.m->put();
}
}
out_q.clear();
@@ -1296,7 +1294,7 @@ void ProtocolV1::reset_recv_state()
// clean read and write callbacks
connection->pendingReadLen.reset();
- connection->writeCallback.reset();
+ connection->writeCallback = {};
if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH &&
connection->policy.throttler_messages) {
@@ -1328,14 +1326,12 @@ void ProtocolV1::reset_recv_state()
ProtocolV1::out_q_entry_t ProtocolV1::_get_next_outgoing() {
out_q_entry_t out_entry;
- if (!out_q.empty()) {
- map<int, list<out_q_entry_t>>::reverse_iterator it =
- out_q.rbegin();
+ if (const auto it = out_q.begin(); it != out_q.end()) {
ceph_assert(!it->second.empty());
- list<out_q_entry_t>::iterator p = it->second.begin();
+ const auto p = it->second.begin();
out_entry = *p;
it->second.erase(p);
- if (it->second.empty()) out_q.erase(it->first);
+ if (it->second.empty()) out_q.erase(it);
}
return out_entry;
}
@@ -1572,8 +1568,7 @@ CtPtr ProtocolV1::handle_connect_message_write(int r) {
CtPtr ProtocolV1::wait_connect_reply() {
ldout(cct, 20) << __func__ << dendl;
- // FIPS zeroization audit 20191115: this memset is not security related.
- memset(&connect_reply, 0, sizeof(connect_reply));
+ connect_reply = {};
return READ(sizeof(connect_reply), handle_connect_reply_1);
}
@@ -1923,8 +1918,7 @@ CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) {
CtPtr ProtocolV1::wait_connect_message() {
ldout(cct, 20) << __func__ << dendl;
- // FIPS zeroization audit 20191115: this memset is not security related.
- memset(&connect_msg, 0, sizeof(connect_msg));
+ connect_msg = {};
return READ(sizeof(connect_msg), handle_connect_message_1);
}
@@ -1988,8 +1982,7 @@ CtPtr ProtocolV1::handle_connect_message_2() {
ceph_msg_connect_reply reply;
ceph::buffer::list authorizer_reply;
- // FIPS zeroization audit 20191115: this memset is not security related.
- memset(&reply, 0, sizeof(reply));
+ reply = {};
reply.protocol_version =
messenger->get_proto_version(connection->peer_type, false);
@@ -2616,8 +2609,7 @@ CtPtr ProtocolV1::server_ready() {
<< dendl;
ldout(cct, 20) << __func__ << " accept done" << dendl;
- // FIPS zeroization audit 20191115: this memset is not security related.
- memset(&connect_msg, 0, sizeof(connect_msg));
+ connect_msg = {};
if (connection->delay_state) {
ceph_assert(connection->delay_state->ready());
diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h
index 1b7c1d2b5f8..63bc1cd0946 100644
--- a/src/msg/async/ProtocolV1.h
+++ b/src/msg/async/ProtocolV1.h
@@ -112,7 +112,12 @@ protected:
bool is_prepared {false};
};
// priority queue for outbound msgs
- std::map<int, std::list<out_q_entry_t>> out_q;
+
+ /**
+ * A queue for each priority value, highest priority first.
+ */
+ std::map<int, std::list<out_q_entry_t>, std::greater<int>> out_q;
+
bool keepalive;
bool write_in_progress = false;
diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc
index c4cfa76d16f..ed6f93cdd48 100644
--- a/src/msg/async/ProtocolV2.cc
+++ b/src/msg/async/ProtocolV2.cc
@@ -127,9 +127,9 @@ bool ProtocolV2::is_connected() { return can_write; }
void ProtocolV2::discard_out_queue() {
ldout(cct, 10) << __func__ << " started" << dendl;
- for (auto p = sent.begin(); p != sent.end(); ++p) {
- ldout(cct, 20) << __func__ << " discard " << *p << dendl;
- (*p)->put();
+ for (Message *msg : sent) {
+ ldout(cct, 20) << __func__ << " discard " << msg << dendl;
+ msg->put();
}
sent.clear();
for (auto& [ prio, entries ] : out_queue) {
@@ -211,10 +211,11 @@ void ProtocolV2::requeue_sent() {
uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
ldout(cct, 10) << __func__ << " " << seq << dendl;
std::lock_guard<std::mutex> l(connection->write_lock);
- if (out_queue.count(CEPH_MSG_PRIO_HIGHEST) == 0) {
+ const auto it = out_queue.find(CEPH_MSG_PRIO_HIGHEST);
+ if (it == out_queue.end()) {
return seq;
}
- auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST];
+ auto& rq = it->second;
uint64_t count = out_seq;
while (!rq.empty()) {
Message* const m = rq.front().m;
@@ -226,7 +227,7 @@ uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
rq.pop_front();
count++;
}
- if (rq.empty()) out_queue.erase(CEPH_MSG_PRIO_HIGHEST);
+ if (rq.empty()) out_queue.erase(it);
return count;
}
@@ -265,7 +266,7 @@ void ProtocolV2::reset_recv_state() {
// clean read and write callbacks
connection->pendingReadLen.reset();
- connection->writeCallback.reset();
+ connection->writeCallback = {};
next_tag = static_cast<Tag>(0);
@@ -507,14 +508,13 @@ void ProtocolV2::read_event() {
ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() {
out_queue_entry_t out_entry;
- if (!out_queue.empty()) {
- auto it = out_queue.rbegin();
+ if (const auto it = out_queue.begin(); it != out_queue.end()) {
auto& entries = it->second;
ceph_assert(!entries.empty());
out_entry = entries.front();
entries.pop_front();
if (entries.empty()) {
- out_queue.erase(it->first);
+ out_queue.erase(it);
}
}
return out_entry;
diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h
index 918003a21ce..1ee258c4975 100644
--- a/src/msg/async/ProtocolV2.h
+++ b/src/msg/async/ProtocolV2.h
@@ -93,7 +93,12 @@ private:
bool is_prepared {false};
Message* m {nullptr};
};
- std::map<int, std::list<out_queue_entry_t>> out_queue;
+
+ /**
+ * A queue for each priority value, highest priority first.
+ */
+ std::map<int, std::list<out_queue_entry_t>, std::greater<int>> out_queue;
+
std::list<Message *> sent;
std::atomic<uint64_t> out_seq{0};
std::atomic<uint64_t> in_seq{0};
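With std::greater<int> as the map comparator, begin() now points at the highest-priority bucket, so both protocol versions can replace the old rbegin() lookup plus erase-by-key with a single iterator that is reused for the erase. A sketch of the ordering with illustrative priorities and payloads:

#include <functional>
#include <iostream>
#include <list>
#include <map>

int main() {
  // Keyed by priority; std::greater<int> puts the highest priority first.
  std::map<int, std::list<const char*>, std::greater<int>> out_q;
  out_q[127].push_back("highest-priority msg");
  out_q[63].push_back("normal msg");

  auto it = out_q.begin();              // highest-priority bucket
  std::cout << it->first << ": " << it->second.front() << '\n';   // 127: ...
  it->second.pop_front();
  if (it->second.empty())
    out_q.erase(it);                    // erase by iterator, no second lookup
  std::cout << out_q.begin()->first << '\n';   // 63
}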
diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc
index ef4a6ddabfb..a9b03c74d4d 100644
--- a/src/msg/async/frames_v2.cc
+++ b/src/msg/async/frames_v2.cc
@@ -63,9 +63,7 @@ static bool check_epilogue_late_status(__u8 late_status) {
void FrameAssembler::fill_preamble(Tag tag,
preamble_block_t& preamble) const {
- // FIPS zeroization audit 20191115: this memset is not security related.
- ::memset(&preamble, 0, sizeof(preamble));
-
+ preamble = {};
preamble.tag = static_cast<__u8>(tag);
for (size_t i = 0; i < m_descs.size(); i++) {
preamble.segments[i].length = m_descs[i].logical_len;
@@ -100,9 +98,7 @@ uint64_t FrameAssembler::get_frame_onwire_len() const {
bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble,
bufferlist segment_bls[]) const {
- epilogue_crc_rev0_block_t epilogue;
- // FIPS zeroization audit 20191115: this memset is not security related.
- ::memset(&epilogue, 0, sizeof(epilogue));
+ epilogue_crc_rev0_block_t epilogue{};
bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue));
frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
@@ -123,9 +119,7 @@ bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,
preamble_bl.append(reinterpret_cast<const char*>(&preamble),
sizeof(preamble));
- epilogue_secure_rev0_block_t epilogue;
- // FIPS zeroization audit 20191115: this memset is not security related.
- ::memset(&epilogue, 0, sizeof(epilogue));
+ epilogue_secure_rev0_block_t epilogue{};
bufferlist epilogue_bl(sizeof(epilogue));
epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
sizeof(epilogue));
@@ -151,9 +145,7 @@ bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,
bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble,
bufferlist segment_bls[]) const {
- epilogue_crc_rev1_block_t epilogue;
- // FIPS zeroization audit 20191115: this memset is not security related.
- ::memset(&epilogue, 0, sizeof(epilogue));
+ epilogue_crc_rev1_block_t epilogue{};
epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue));
@@ -215,9 +207,7 @@ bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble,
return frame_bl; // no epilogue if only one segment
}
- epilogue_secure_rev1_block_t epilogue;
- // FIPS zeroization audit 20191115: this memset is not security related.
- ::memset(&epilogue, 0, sizeof(epilogue));
+ epilogue_secure_rev1_block_t epilogue{};
epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
bufferlist epilogue_bl(sizeof(epilogue));
epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
diff --git a/src/mstart.sh b/src/mstart.sh
index 34b57e17611..0c512ca9eb8 100755
--- a/src/mstart.sh
+++ b/src/mstart.sh
@@ -1,5 +1,33 @@
#!/bin/sh
+# Deploy a vstart.sh cluster in a named subdirectory. This makes it possible to
+# start multiple clusters in different subdirectories. See mstop.sh for cleanup.
+#
+# Example:
+#
+# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c1 -n -d
+# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c2 -n -d
+#
+# ~/ceph/build $ ls run
+# c1 c2
+# ~/ceph/build $ ls run/c1
+# asok ceph.conf dev keyring out
+#
+# ~/ceph/build $ ../src/mrun c1 radosgw-admin user list
+# [
+# "56789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01234",
+# "testx$9876543210abcdef0123456789abcdef0123456789abcdef0123456789abcdef",
+# "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",
+# "testacct1user",
+# "test",
+# "testacct2root",
+# "testacct1root",
+# "testid"
+# ]
+#
+# ~/ceph/build $ ../src/mstop.sh c1
+# ~/ceph/build $ ../src/mstop.sh c2
+
usage="usage: $0 <name> [vstart options]..\n"
usage_exit() {
diff --git a/src/mstop.sh b/src/mstop.sh
index 702d1765941..eec0ca02e42 100755
--- a/src/mstop.sh
+++ b/src/mstop.sh
@@ -1,5 +1,7 @@
#!/usr/bin/env bash
+# Stop a named cluster started by mstart.sh
+
set -e
script_root=`dirname $0`
diff --git a/src/mypy-constrains.txt b/src/mypy-constrains.txt
index 7810870804e..0a79b8ef4f1 100644
--- a/src/mypy-constrains.txt
+++ b/src/mypy-constrains.txt
@@ -2,7 +2,7 @@
# Unfortunately this means we have to manually update those
# packages regularly.
-mypy==1.1.1
+mypy==1.9
# global
types-python-dateutil==0.1.3
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 3dcd96830c4..5f4f1a4d48a 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -3760,15 +3760,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
{
auto t0 = mono_clock::now();
std::lock_guard hl(h->lock);
+ auto& fnode = h->file->fnode;
dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
- << " file " << h->file->fnode << dendl;
+ << " file " << fnode << dendl;
if (h->file->deleted) {
dout(10) << __func__ << " deleted, no-op" << dendl;
return 0;
}
// we never truncate internal log files
- ceph_assert(h->file->fnode.ino > 1);
+ ceph_assert(fnode.ino > 1);
// truncate off unflushed data?
if (h->pos < offset &&
@@ -3782,20 +3783,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
if (r < 0)
return r;
}
- if (offset == h->file->fnode.size) {
- return 0; // no-op!
- }
- if (offset > h->file->fnode.size) {
+ if (offset > fnode.size) {
ceph_abort_msg("truncate up not supported");
}
- ceph_assert(h->file->fnode.size >= offset);
+ ceph_assert(offset <= fnode.size);
_flush_bdev(h);
-
- std::lock_guard ll(log.lock);
- vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset);
- h->file->fnode.size = offset;
- h->file->is_dirty = true;
- log.t.op_file_update_inc(h->file->fnode);
+ {
+ std::lock_guard ll(log.lock);
+ std::lock_guard dl(dirty.lock);
+ bool changed_extents = false;
+ vselector->sub_usage(h->file->vselector_hint, fnode);
+ uint64_t x_off = 0;
+ auto p = fnode.seek(offset, &x_off);
+ uint64_t cut_off =
+ (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]);
+ uint64_t new_allocated;
+ if (0 == cut_off) {
+ // whole pextent to remove
+ changed_extents = true;
+ new_allocated = offset;
+ } else if (cut_off < p->length) {
+ dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off);
+ new_allocated = (offset - x_off) + cut_off;
+ p->length = cut_off;
+ changed_extents = true;
+ ++p;
+ } else {
+ ceph_assert(cut_off >= p->length);
+ new_allocated = (offset - x_off) + p->length;
+ // just leave it here
+ ++p;
+ }
+ while (p != fnode.extents.end()) {
+ dirty.pending_release[p->bdev].insert(p->offset, p->length);
+ p = fnode.extents.erase(p);
+ changed_extents = true;
+ }
+ if (changed_extents) {
+ fnode.size = offset;
+ fnode.allocated = new_allocated;
+ fnode.reset_delta();
+ log.t.op_file_update(fnode);
+ // sad, but is_dirty must be set to signal flushing of the log
+ h->file->is_dirty = true;
+ } else {
+ if (offset != fnode.size) {
+ fnode.size = offset;
+ //skipping log.t.op_file_update_inc, it will be done by flush()
+ h->file->is_dirty = true;
+ }
+ }
+ vselector->add_usage(h->file->vselector_hint, fnode);
+ }
logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0);
return 0;
}
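(Editorial note on the BlueFS::truncate() change above: the extent containing the new size is cut back to an allocation-unit boundary and everything past it is queued for release. A rough Python sketch of that bookkeeping follows; `truncate_extents`, the tuple layout and the sample numbers are invented for illustration and do not mirror BlueFS internals exactly.)

```
def p2roundup(x, align):
    # round x up to a multiple of align
    return (x + align - 1) // align * align

def truncate_extents(extents, offset, alloc_size):
    """extents: list of (dev_offset, length) tuples; returns (kept, released)."""
    kept, released = [], []
    pos = 0
    i = 0
    # extents entirely below the new size stay untouched
    while i < len(extents) and pos + extents[i][1] <= offset:
        kept.append(extents[i])
        pos += extents[i][1]
        i += 1
    if i < len(extents):
        dev_off, length = extents[i]
        x_off = offset - pos                 # byte offset inside this extent
        cut = p2roundup(x_off, alloc_size)   # keep whole allocation units
        if cut == 0:
            released.append((dev_off, length))          # whole pextent removed
        elif cut < length:
            kept.append((dev_off, cut))
            released.append((dev_off + cut, length - cut))
        else:
            kept.append((dev_off, length))              # nothing to give back here
        i += 1
    # every later extent is released in full
    released.extend(extents[i:])
    return kept, released

# e.g. a 1 MiB extent truncated to 100 KiB with 64 KiB allocation units:
kept, released = truncate_extents([(0, 1 << 20)], offset=100 << 10, alloc_size=64 << 10)
assert kept == [(0, 128 << 10)] and released == [(128 << 10, 896 << 10)]
```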
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
index 68040af4282..7cbe0a1d121 100644
--- a/src/os/bluestore/BlueRocksEnv.cc
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile {
}
rocksdb::Status Close() override {
- fs->fsync(h);
- // mimic posix env, here. shrug.
- size_t block_size;
- size_t last_allocated_block;
- GetPreallocationStatus(&block_size, &last_allocated_block);
- if (last_allocated_block > 0) {
- int r = fs->truncate(h, h->pos);
- if (r < 0)
- return err_to_status(r);
+ int r = fs->truncate(h, h->pos);
+ if (r < 0) {
+ return err_to_status(r);
}
-
+ fs->fsync(h);
return rocksdb::Status::OK();
}
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 44a171873c0..535cf166f0a 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -6794,9 +6794,8 @@ void BlueStore::_main_bdev_label_try_reserve()
vector<uint64_t> candidate_positions;
vector<uint64_t> accepted_positions;
uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
- for (size_t i = 1; i < bdev_label_positions.size(); i++) {
- uint64_t location = bdev_label_positions[i];
- if (location + lsize <= bdev->get_size()) {
+ for (uint64_t location : bdev_label_valid_locations) {
+ if (location != BDEV_FIRST_LABEL_POSITION) {
candidate_positions.push_back(location);
}
}
@@ -11497,9 +11496,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
string p = path + "/block";
_write_bdev_label(cct, bdev, p, bdev_label, bdev_labels_in_repair);
for (uint64_t pos : bdev_labels_in_repair) {
- if (pos != BDEV_FIRST_LABEL_POSITION) {
- bdev_label_valid_locations.push_back(pos);
- }
+ bdev_label_valid_locations.push_back(pos);
}
repaired += bdev_labels_in_repair.size();
}
@@ -20572,6 +20569,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
if (ret < 0) {
return ret;
}
+ if (bdev_label_multi) {
+ uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
+ for (uint64_t p : bdev_label_valid_locations) {
+ if (p != BDEV_FIRST_LABEL_POSITION) {
+ allocator->init_rm_free(p, lsize);
+ }
+ }
+ }
duration = ceph_clock_now() - start;
stats.insert_count = 0;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 76256df49b8..71b9b713385 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1137,46 +1137,10 @@ void PG::update_snap_map(
const vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction &t)
{
- for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) {
+ for (const auto& entry : log_entries) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- if (i->soid.snap < CEPH_MAXSNAP) {
- if (i->is_delete()) {
- int r = snap_mapper.remove_oid(
- i->soid,
- &_t);
- if (r)
- derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
- // On removal tolerate missing key corruption
- ceph_assert(r == 0 || r == -ENOENT);
- } else if (i->is_update()) {
- ceph_assert(i->snaps.length() > 0);
- vector<snapid_t> snaps;
- bufferlist snapbl = i->snaps;
- auto p = snapbl.cbegin();
- try {
- decode(snaps, p);
- } catch (...) {
- derr << __func__ << " decode snaps failure on " << *i << dendl;
- snaps.clear();
- }
- set<snapid_t> _snaps(snaps.begin(), snaps.end());
-
- if (i->is_clone() || i->is_promote()) {
- snap_mapper.add_oid(
- i->soid,
- _snaps,
- &_t);
- } else if (i->is_modify()) {
- int r = snap_mapper.update_snaps(
- i->soid,
- _snaps,
- 0,
- &_t);
- ceph_assert(r == 0);
- } else {
- ceph_assert(i->is_clean());
- }
- }
+ if (entry.soid.snap < CEPH_MAXSNAP) {
+ snap_mapper.update_snap_map(entry, &_t);
}
}
}
diff --git a/src/osd/osd_types_fmt.h b/src/osd/osd_types_fmt.h
index 04f4d46ee51..100ce6e4646 100644
--- a/src/osd/osd_types_fmt.h
+++ b/src/osd/osd_types_fmt.h
@@ -392,4 +392,6 @@ inline std::ostream &operator<<(std::ostream &lhs, const object_stat_sum_t &sum)
#if FMT_VERSION >= 90000
template <bool TrackChanges> struct fmt::formatter<pg_missing_set<TrackChanges>> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<pool_opts_t> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<store_statfs_t> : fmt::ostream_formatter {};
#endif
diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc
index a00ab2caece..7f28ca2d642 100644
--- a/src/osd/scrubber/ScrubStore.cc
+++ b/src/osd/scrubber/ScrubStore.cc
@@ -1,11 +1,13 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "ScrubStore.h"
+#include "./ScrubStore.h"
#include "osd/osd_types.h"
#include "common/scrub_types.h"
#include "include/rados/rados_types.hpp"
+#include "pg_scrubber.h"
+
using std::ostringstream;
using std::string;
using std::vector;
@@ -13,21 +15,9 @@ using std::vector;
using ceph::bufferlist;
namespace {
-ghobject_t make_scrub_object(const spg_t& pgid)
-{
- ostringstream ss;
- ss << "scrub_" << pgid;
- return pgid.make_temp_ghobject(ss.str());
-}
-
string first_object_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0x00000000,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, "");
hoid.build_hash_cache();
return "SCRUB_OBJ_" + hoid.to_str();
}
@@ -47,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid)
string last_object_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0xffffffff,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, "");
hoid.build_hash_cache();
return "SCRUB_OBJ_" + hoid.to_str();
}
@@ -60,14 +45,9 @@ string last_object_key(int64_t pool)
string first_snap_key(int64_t pool)
{
// scrub object is per spg_t object, so we can misuse the hash (pg.seed) for
- // the representing the minimal and maximum keys. and this relies on how
+ // representing the minimal and maximum keys. This relies on how
// hobject_t::to_str() works: hex(pool).hex(revhash).
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0x00000000,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, "");
hoid.build_hash_cache();
return "SCRUB_SS_" + hoid.to_str();
}
@@ -86,123 +66,447 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid)
string last_snap_key(int64_t pool)
{
- auto hoid = hobject_t(object_t(),
- "",
- 0,
- 0xffffffff,
- pool,
- "");
+ auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, "");
hoid.build_hash_cache();
return "SCRUB_SS_" + hoid.to_str();
}
+
+} // namespace
+
+#undef dout_context
+#define dout_context (m_scrubber.get_pg_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix_fn(_dout, this, __func__)
+
+template <class T>
+static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "")
+{
+ return t->gen_prefix(*_dout, fn);
}
namespace Scrub {
-Store*
-Store::create(ObjectStore* store,
- ObjectStore::Transaction* t,
- const spg_t& pgid,
- const coll_t& coll)
+Store::Store(
+ PgScrubber& scrubber,
+ ObjectStore& osd_store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll)
+ : m_scrubber{scrubber}
+ , object_store{osd_store}
+ , coll{coll}
{
- ceph_assert(store);
ceph_assert(t);
- ghobject_t oid = make_scrub_object(pgid);
- t->touch(coll, oid);
- return new Store{coll, oid, store};
+
+ // shallow errors DB object
+ const auto sh_err_obj =
+ pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid));
+ t->touch(coll, sh_err_obj);
+ shallow_db.emplace(
+ pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj});
+
+ // and the DB for deep errors
+ const auto dp_err_obj =
+ pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid));
+ t->touch(coll, dp_err_obj);
+ deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj});
+
+ dout(20) << fmt::format(
+ "created Scrub::Store for pg[{}], shallow: {}, deep: {}",
+ pgid, sh_err_obj, dp_err_obj)
+ << dendl;
}
-Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
- : coll(coll),
- hoid(oid),
- driver(store, coll, hoid),
- backend(&driver)
-{}
Store::~Store()
{
- ceph_assert(results.empty());
+ ceph_assert(!shallow_db || shallow_db->results.empty());
+ ceph_assert(!deep_db || deep_db->results.empty());
}
+
+std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const
+{
+ if (fn.starts_with("operator")) {
+ // it's a lambda, and __func__ is not available
+ return m_scrubber.gen_prefix(out) << "Store::";
+ } else {
+ return m_scrubber.gen_prefix(out) << "Store::" << fn << ": ";
+ }
+}
+
+
void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e)
{
add_object_error(pool, e);
}
+
void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
{
- bufferlist bl;
- e.encode(bl);
- results[to_object_key(pool, e.object)] = bl;
+ using librados::obj_err_t;
+ const auto key = to_object_key(pool, e.object);
+ dout(20) << fmt::format(
+ "{}: adding error for object {} ({}). Errors: {} ({}/{}) "
+ "unfiltered:{}",
+ (current_level == scrub_level_t::deep ? "deep" : "shallow"),
+ e.object, key, obj_err_t{e.errors},
+ obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS},
+ obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e)
+ << dendl;
+
+ if (current_level == scrub_level_t::deep) {
+ // not overriding the deep errors DB during shallow scrubs
+ deep_db->results[key] = e.encode();
+ }
+
+ // only shallow errors are stored in the shallow DB
+ auto e_copy = e;
+ e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS;
+ e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS;
+ shallow_db->results[key] = e_copy.encode();
}
+
void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e)
{
add_snap_error(pool, e);
}
+
void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
{
- bufferlist bl;
- e.encode(bl);
- results[to_snap_key(pool, e.object)] = bl;
+ // note: snap errors are only placed in the shallow store
+ shallow_db->results[to_snap_key(pool, e.object)] = e.encode();
}
-bool Store::empty() const
+
+bool Store::is_empty() const
{
- return results.empty();
+ return (!shallow_db || shallow_db->results.empty()) &&
+ (!deep_db || deep_db->results.empty());
}
+
void Store::flush(ObjectStore::Transaction* t)
{
if (t) {
- OSDriver::OSTransaction txn = driver.get_transaction(t);
- backend.set_keys(results, &txn);
+ auto txn = shallow_db->driver.get_transaction(t);
+ shallow_db->backend.set_keys(shallow_db->results, &txn);
+ txn = deep_db->driver.get_transaction(t);
+ deep_db->backend.set_keys(deep_db->results, &txn);
+ }
+
+ shallow_db->results.clear();
+ deep_db->results.clear();
+}
+
+
+void Store::clear_level_db(
+ ObjectStore::Transaction* t,
+ at_level_t& db,
+ std::string_view db_name)
+{
+ dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name)
+ << dendl;
+ // easiest way to guarantee that the object representing the DB exists
+ t->touch(coll, db.errors_hoid);
+
+ // remove all the keys in the DB
+ t->omap_clear(coll, db.errors_hoid);
+
+ // restart the 'in progress' part of the MapCacher
+ db.backend.reset();
+}
+
+
+void Store::reinit(
+ ObjectStore::Transaction* t,
+ scrub_level_t level)
+{
+ // Note: only one caller, and it creates the transaction passed to reinit().
+ // No need to assert on 't'
+ dout(20) << fmt::format(
+ "re-initializing the Scrub::Store (for {} scrub)",
+ (level == scrub_level_t::deep ? "deep" : "shallow"))
+ << dendl;
+
+ current_level = level;
+
+ // always clear the known shallow errors DB (as both shallow and deep scrubs
+ // would recreate it)
+ if (shallow_db) {
+ clear_level_db(t, *shallow_db, "shallow");
+ }
+ // only a deep scrub recreates the deep errors DB
+ if (level == scrub_level_t::deep && deep_db) {
+ clear_level_db(t, *deep_db, "deep");
}
- results.clear();
}
+
void Store::cleanup(ObjectStore::Transaction* t)
{
- t->remove(coll, hoid);
+ dout(20) << "discarding error DBs" << dendl;
+ ceph_assert(t);
+ if (shallow_db)
+ t->remove(coll, shallow_db->errors_hoid);
+ if (deep_db)
+ t->remove(coll, deep_db->errors_hoid);
}
-std::vector<bufferlist>
-Store::get_snap_errors(int64_t pool,
- const librados::object_id_t& start,
- uint64_t max_return) const
+
+std::vector<bufferlist> Store::get_snap_errors(
+ int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
{
- const string begin = (start.name.empty() ?
- first_snap_key(pool) : to_snap_key(pool, start));
+ vector<bufferlist> errors;
+ const string begin =
+ (start.name.empty() ? first_snap_key(pool) : to_snap_key(pool, start));
const string end = last_snap_key(pool);
- return get_errors(begin, end, max_return);
+
+ // the snap errors are stored only in the shallow store
+ ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin);
+
+ while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) {
+ errors.push_back(latest_sh->data);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ }
+
+ return errors;
}
-std::vector<bufferlist>
-Store::get_object_errors(int64_t pool,
- const librados::object_id_t& start,
- uint64_t max_return) const
+
+std::vector<bufferlist> Store::get_object_errors(
+ int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
{
- const string begin = (start.name.empty() ?
- first_object_key(pool) : to_object_key(pool, start));
+ const string begin =
+ (start.name.empty() ? first_object_key(pool)
+ : to_object_key(pool, start));
const string end = last_object_key(pool);
+ dout(20) << fmt::format("fetching errors, from {} to {}", begin, end)
+ << dendl;
return get_errors(begin, end, max_return);
}
-std::vector<bufferlist>
-Store::get_errors(const string& begin,
- const string& end,
- uint64_t max_return) const
+
+inline void decode(
+ librados::inconsistent_obj_t& obj,
+ ceph::buffer::list::const_iterator& bp)
{
+ reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);
+}
+
+
+inconsistent_obj_wrapper decode_wrapper(
+ hobject_t obj,
+ ceph::buffer::list::const_iterator bp)
+{
+ inconsistent_obj_wrapper iow{obj};
+ iow.decode(bp);
+ return iow;
+}
+
+
+void Store::collect_specific_store(
+ MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+ Store::ExpCacherPosData& latest,
+ std::vector<bufferlist>& errors,
+ std::string_view end_key,
+ uint64_t max_return) const
+{
+ while (max_return-- && latest.has_value() &&
+ latest.value().last_key < end_key) {
+ errors.push_back(latest->data);
+ latest = backend.get_1st_after_key(latest->last_key);
+ }
+}
+
+
+/*
+ * Implementation notes:
+ * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f
+ * for why we encode the shard_info_t in the store.
+ * - to maintain known shard_info-s created during a deep scrub (but only when
+ * needed), we use our knowledge of the level of the last scrub performed
+ * (current_level), and the object user version as encoded in the error
+ * structure.
+ */
+bufferlist Store::merge_encoded_error_wrappers(
+ hobject_t obj,
+ ExpCacherPosData& latest_sh,
+ ExpCacherPosData& latest_dp) const
+{
+ // decode both error wrappers
+ auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin());
+ auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin());
+
+ // note: the '20' level is just until we're sure the merging works as
+ // expected
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ dout(20) << fmt::format(
+ "merging errors {}. Deep: {:#x}-({})", sh_wrap.object,
+ dp_wrap.errors, dp_wrap)
+ << dendl;
+ dout(20) << fmt::format(
+ "merging errors {}. Shallow: {:#x}-({})", sh_wrap.object,
+ sh_wrap.errors, sh_wrap)
+ << dendl;
+ // dev: list the attributes:
+ for (const auto& [shard, si] : sh_wrap.shards) {
+ for (const auto& [attr, bl] : si.attrs) {
+ dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr)
+ << dendl;
+ }
+ }
+ for (const auto& [shard, si] : dp_wrap.shards) {
+ for (const auto& [attr, bl] : si.attrs) {
+ dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr)
+ << dendl;
+ }
+ }
+ }
+
+ // Actual merging of the shard map entries is only performed if the
+ // latest version is from the shallow scrub.
+ // Otherwise the deep scrub results, which (for the shards info) contain
+ // all the data, are used, and the shallow scrub is ignored.
+ if (current_level == scrub_level_t::shallow) {
+ // is the object data related to the same object version?
+ if (sh_wrap.version == dp_wrap.version) {
+ // combine the error information
+ dp_wrap.errors |= sh_wrap.errors;
+ for (const auto& [shard, si] : sh_wrap.shards) {
+ if (dp_wrap.shards.contains(shard)) {
+ dout(20) << fmt::format(
+ "-----> {}-{} combining: sh-errors: {} dp-errors:{}",
+ sh_wrap.object, shard, si, dp_wrap.shards[shard])
+ << dendl;
+ const auto saved_er = dp_wrap.shards[shard].errors;
+ dp_wrap.shards[shard].selected_oi = si.selected_oi;
+ dp_wrap.shards[shard].primary = si.primary;
+ dp_wrap.shards[shard].errors |= saved_er;
+
+ // the attributes:
+ for (const auto& [attr, bl] : si.attrs) {
+ if (!dp_wrap.shards[shard].attrs.contains(attr)) {
+ dout(20) << fmt::format(
+ "-----> {}-{} copying shallow attr: attr: {}",
+ sh_wrap.object, shard, attr)
+ << dendl;
+ dp_wrap.shards[shard].attrs[attr] = bl;
+ }
+ // otherwise - we'll ignore the shallow attr buffer
+ }
+ } else {
+ // the deep scrub data for this shard is missing. We take the shallow
+ // scrub data.
+ dp_wrap.shards[shard] = si;
+ }
+ }
+ } else if (sh_wrap.version > dp_wrap.version) {
+ if (false && dp_wrap.version == 0) {
+ // there was a read error in the deep scrub. The deep version
+ // shows as '0'. That's severe enough for us to ignore the shallow.
+ dout(10) << fmt::format("{} ignoring deep after read failure",
+ sh_wrap.object)
+ << dendl;
+ } else {
+ // There is a new shallow version of the object results.
+ // The deep data is for an older version of that object.
+ // There are multiple possibilities here, but for now we ignore the
+ // deep data.
+ dp_wrap = sh_wrap;
+ }
+ }
+ }
+
+ return dp_wrap.encode();
+}
+
+
+// a better way to implement get_errors(): use two generators, one for each
+// store, and sort-merge the results. Almost like a merge-sort, but with equal
+// keys combined. 'todo' once 'ranges' are really working.
+
+std::vector<bufferlist> Store::get_errors(
+ const std::string& from_key,
+ const std::string& end_key,
+ uint64_t max_return) const
+{
+ // merge the input from the two sorted DBs into 'errors' (until
+ // enough errors are collected)
vector<bufferlist> errors;
- auto next = std::make_pair(begin, bufferlist{});
- while (max_return && !backend.get_next(next.first, &next)) {
- if (next.first >= end)
+ dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key)
+ << dendl;
+
+ ceph_assert(shallow_db);
+ ceph_assert(deep_db);
+ ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key);
+ ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key);
+
+ while (max_return) {
+ dout(20) << fmt::format(
+ "n:{} latest_sh: {}, latest_dp: {}", max_return,
+ (latest_sh ? latest_sh->last_key : "(none)"),
+ (latest_dp ? latest_dp->last_key : "(none)"))
+ << dendl;
+
+ // keys not smaller than end_key are not interesting
+ if (latest_sh.has_value() && latest_sh->last_key >= end_key) {
+ latest_sh = tl::unexpected(-EINVAL);
+ }
+ if (latest_dp.has_value() && latest_dp->last_key >= end_key) {
+ latest_dp = tl::unexpected(-EINVAL);
+ }
+
+ if (!latest_sh && !latest_dp) {
+ // both stores are exhausted
+ break;
+ }
+ if (!latest_sh.has_value()) {
+ // continue with the deep store
+ dout(10) << fmt::format("collecting from deep store") << dendl;
+ collect_specific_store(
+ deep_db->backend, latest_dp, errors, end_key, max_return);
break;
- errors.push_back(next.second);
+ }
+ if (!latest_dp.has_value()) {
+ // continue with the shallow store
+ dout(10) << fmt::format("collecting from shallow store") << dendl;
+ collect_specific_store(
+ shallow_db->backend, latest_sh, errors, end_key, max_return);
+ break;
+ }
+
+ // we have results from both stores. Select the one with a lower key.
+ // If the keys are equal, combine the errors.
+ if (latest_sh->last_key == latest_dp->last_key) {
+ auto bl = merge_encoded_error_wrappers(
+ shallow_db->errors_hoid.hobj, latest_sh, latest_dp);
+ errors.push_back(bl);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
+
+ } else if (latest_sh->last_key < latest_dp->last_key) {
+ dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key)
+ << dendl;
+ errors.push_back(latest_sh->data);
+ latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key);
+ } else {
+ dout(20) << fmt::format("deep store element ({})", latest_dp->last_key)
+ << dendl;
+ errors.push_back(latest_dp->data);
+ latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key);
+ }
max_return--;
}
+
+ dout(10) << fmt::format("{} errors reported", errors.size()) << dendl;
return errors;
}
-
-} // namespace Scrub
+} // namespace Scrub
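(Editorial note: the new Store::get_errors() above walks the shallow and deep DBs in key order, combining entries that refer to the same object. A small Python sketch of that sort-merge follows; `merge_error_stores` and the plain sorted lists are a stand-in for the MapCacher-backed omap stores, not Ceph code.)

```
def merge_error_stores(shallow, deep, from_key, end_key, max_return, combine):
    """shallow/deep: key-sorted lists of (key, value); combine() merges equal keys."""
    out = []
    i = j = 0
    # emulate get_1st_after_key(from_key): start strictly after the lower bound
    while i < len(shallow) and shallow[i][0] <= from_key:
        i += 1
    while j < len(deep) and deep[j][0] <= from_key:
        j += 1
    while max_return:
        sh = shallow[i] if i < len(shallow) and shallow[i][0] < end_key else None
        dp = deep[j] if j < len(deep) and deep[j][0] < end_key else None
        if sh is None and dp is None:
            break                              # both stores exhausted
        if sh and dp and sh[0] == dp[0]:
            out.append(combine(sh[1], dp[1]))  # same object in both DBs: merge
            i += 1
            j += 1
        elif dp is None or (sh is not None and sh[0] < dp[0]):
            out.append(sh[1])
            i += 1
        else:
            out.append(dp[1])
            j += 1
        max_return -= 1
    return out

# shallow-only, deep-only and shared keys come out in key order, merged once:
merged = merge_error_stores(
    [("obj_a", "sh_a"), ("obj_c", "sh_c")],
    [("obj_a", "dp_a"), ("obj_b", "dp_b")],
    from_key="", end_key="zzz", max_return=10,
    combine=lambda s, d: f"{s}+{d}")
assert merged == ["sh_a+dp_a", "dp_b", "sh_c"]
```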
diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h
index 567badf608b..0955654d78e 100644
--- a/src/osd/scrubber/ScrubStore.h
+++ b/src/osd/scrubber/ScrubStore.h
@@ -1,10 +1,9 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-
-#ifndef CEPH_SCRUB_RESULT_H
-#define CEPH_SCRUB_RESULT_H
+#pragma once
#include "common/map_cacher.hpp"
+#include "osd/osd_types_fmt.h"
#include "osd/SnapMapper.h" // for OSDriver
namespace librados {
@@ -13,27 +12,71 @@ struct object_id_t;
struct inconsistent_obj_wrapper;
struct inconsistent_snapset_wrapper;
+class PgScrubber;
namespace Scrub {
+/**
+ * Storing errors detected during scrubbing.
+ *
+ * From both functional and internal perspectives, the store is a pair of key-value
+ * databases: one maps objects to shallow errors detected during their scrubbing,
+ * and the other stores deep errors.
+ * Note that the first store is updated in both shallow and deep scrubs. The
+ * second - only while deep scrubbing.
+ *
+ * The DBs can be consulted by the operator, when trying to list 'errors known
+ * at this point in time'. Whenever a scrub starts - the relevant entries in the
+ * DBs are removed. Specifically - the shallow errors DB is recreated each scrub,
+ * while the deep errors DB is recreated only when a deep scrub starts.
+ *
+ * When queried - the data from both DBs is merged for each named object, and
+ * returned to the operator.
+ *
+ * Implementation:
+ * Each of the two DBs is implemented as OMAP entries of a single, uniquely named,
+ * object. Both DBs are cached using the general KV Cache mechanism.
+ */
+
class Store {
public:
~Store();
- static Store* create(ObjectStore* store,
- ObjectStore::Transaction* t,
- const spg_t& pgid,
- const coll_t& coll);
+
+ Store(
+ PgScrubber& scrubber,
+ ObjectStore& osd_store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll);
+
+
+ /// mark down detected errors, either shallow or deep
void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+
void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
// and a variant-friendly interface:
void add_error(int64_t pool, const inconsistent_obj_wrapper& e);
void add_error(int64_t pool, const inconsistent_snapset_wrapper& e);
- bool empty() const;
+ [[nodiscard]] bool is_empty() const;
void flush(ObjectStore::Transaction*);
+
+ /// remove both shallow and deep errors DBs. Called on interval.
void cleanup(ObjectStore::Transaction*);
+ /**
+ * prepare the Store object for a new scrub session.
+ * This involves clearing one or both of the errors DBs, and resetting
+ * the cache.
+ *
+ * @param level: the scrub level to prepare for. Whenever a deep scrub
+ * is requested, both the shallow and deep errors DBs are cleared.
+ * If, on the other hand, a shallow scrub is requested, only the shallow
+ * errors DB is cleared.
+ */
+ void reinit(ObjectStore::Transaction* t, scrub_level_t level);
+
std::vector<ceph::buffer::list> get_snap_errors(
int64_t pool,
const librados::object_id_t& start,
@@ -44,20 +87,89 @@ class Store {
const librados::object_id_t& start,
uint64_t max_return) const;
+ std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const;
+
private:
- Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
- std::vector<ceph::buffer::list> get_errors(const std::string& start,
- const std::string& end,
- uint64_t max_return) const;
- private:
+ /**
+ * at_level_t
+ *
+ * The machinery for caching and storing errors at a specific scrub level.
+ */
+ struct at_level_t {
+ at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr)
+ : errors_hoid{err_obj}
+ , driver{std::move(drvr)}
+ , backend{&driver}
+ {}
+
+ /// the object in the PG store, where the errors are stored
+ ghobject_t errors_hoid;
+
+ /// abstracted key fetching
+ OSDriver driver;
+
+ /// a K,V cache for the errors that are detected during the scrub
+ /// session. The errors marked for a specific object are stored as
+ /// an OMap entry with the object's name as the key.
+ MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+
+ /// a temp object mapping seq-id to inconsistencies
+ std::map<std::string, ceph::buffer::list> results;
+ };
+
+ using CacherPosData =
+ MapCacher::MapCacher<std::string, ceph::buffer::list>::PosAndData;
+ using ExpCacherPosData = tl::expected<CacherPosData, int>;
+
+ /// access to the owning Scrubber object, for logging mostly
+ PgScrubber& m_scrubber;
+
+ /// the OSD's storage backend
+ ObjectStore& object_store;
+
+ /// the collection (i.e. - the PG store) in which the errors are stored
const coll_t coll;
- const ghobject_t hoid;
- // a temp object holding mappings from seq-id to inconsistencies found in
- // scrubbing
- OSDriver driver;
- mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
- std::map<std::string, ceph::buffer::list> results;
+
+ scrub_level_t current_level;
+
+ /**
+ * the machinery (backend details, cache, etc.) for storing both levels
+ * of errors (note: 'optional' to allow delayed creation w/o dynamic
+ * allocations; and 'mutable', as the caching mechanism is used in const
+ * methods)
+ */
+ mutable std::optional<at_level_t> shallow_db;
+ mutable std::optional<at_level_t> deep_db;
+
+ std::vector<ceph::buffer::list> get_errors(
+ const std::string& start,
+ const std::string& end,
+ uint64_t max_return) const;
+
+ void collect_specific_store(
+ MapCacher::MapCacher<std::string, ceph::buffer::list>& backend,
+ ExpCacherPosData& latest,
+ std::vector<bufferlist>& errors,
+ std::string_view end_key,
+ uint64_t max_return) const;
+
+ /**
+ * Clear the DB of errors at a specific scrub level by performing an
+ * omap_clear() on the DB object, and resetting the MapCacher.
+ */
+ void clear_level_db(
+ ObjectStore::Transaction* t,
+ at_level_t& db,
+ std::string_view db_name);
+
+ /**
+ * merge the two error wrappers - fetched from both DBs for the same object.
+ * Specifically, the object errors are or'ed, and so are the per-shard
+ * entries.
+ */
+ bufferlist merge_encoded_error_wrappers(
+ hobject_t obj,
+ ExpCacherPosData& latest_sh,
+ ExpCacherPosData& latest_dp) const;
};
} // namespace Scrub
-
-#endif // CEPH_SCRUB_RESULT_H
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 555d13ba72b..594ffb15e2b 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica,
m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
}
+// only called on interval change. Both DBs are to be removed.
void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
{
if (!m_store)
@@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
ceph_assert(!m_store);
}
+
+void PgScrubber::reinit_scrub_store()
+{
+ // On entry, anywhere from 0 to 3 of the following objects(*) may exist:
+ // ((*)'objects' here: both code objects (the ScrubStore object) and
+ // actual Object Store objects).
+ // 1. The ScrubStore object itself.
+ // 2,3. The two special hobjects in the coll (the PG data) holding the last
+ // scrub's results.
+ //
+ // The Store object can be deleted and recreated, as a way to guarantee
+ // no junk is left. We won't do it here, but we will clear the at_level_t
+ // structures.
+ // The hobjects: possibly. The shallow DB object is always cleared. The
+ // deep one - only if running a deep scrub.
+ ObjectStore::Transaction t;
+ if (m_store) {
+ dout(10) << __func__ << " reusing existing store" << dendl;
+ m_store->flush(&t);
+ } else {
+ dout(10) << __func__ << " creating new store" << dendl;
+ m_store = std::make_unique<Scrub::Store>(
+ *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll);
+ }
+
+ // regardless of whether the ScrubStore object was recreated or reused, we need to
+ // (possibly) clear the actual DB objects in the Object Store.
+ m_store->reinit(&t, m_active_target->level());
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+}
+
+
void PgScrubber::on_init()
{
// going upwards from 'inactive'
@@ -1217,14 +1250,8 @@ void PgScrubber::on_init()
m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow,
m_pg->get_actingset());
- // create a new store
- {
- ObjectStore::Transaction t;
- cleanup_store(&t);
- m_store.reset(
- Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
- m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
- }
+ // create or reuse the 'known errors' store
+ reinit_scrub_store();
m_start = m_pg->info.pgid.pgid.get_hobj_start();
m_active = true;
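(Editorial note summarizing the policy introduced by PgScrubber::reinit_scrub_store() and Store::reinit()/add_object_error() above: the shallow DB is cleared on every scrub and only ever keeps shallow error bits, while the deep DB is cleared, and fully written, only by deep scrubs. The toy Python model below is an illustration only; the class, method names and flag values are made up.)

```
SHALLOW_ERR, DEEP_ERR = 0x1, 0x2             # made-up flag values, for illustration

class TwoLevelErrorStore:
    """Toy model of the shallow/deep error DBs handled by the ScrubStore."""
    def __init__(self):
        self.shallow, self.deep = {}, {}
        self.current_level = "shallow"

    def reinit(self, level):
        self.current_level = level
        self.shallow.clear()                 # always recreated
        if level == "deep":
            self.deep.clear()                # deep DB survives shallow scrubs

    def add_object_error(self, key, errors):
        if self.current_level == "deep":
            self.deep[key] = errors          # full error set kept in the deep DB
        self.shallow[key] = errors & SHALLOW_ERR   # shallow DB keeps shallow bits only

store = TwoLevelErrorStore()
store.reinit("deep")
store.add_object_error("obj1", SHALLOW_ERR | DEEP_ERR)
store.reinit("shallow")                      # deep results for obj1 are preserved
assert "obj1" in store.deep and "obj1" not in store.shallow
```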
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
index ff8c98d387e..1a5813bd923 100644
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -771,6 +771,16 @@ class PgScrubber : public ScrubPgIF,
std::unique_ptr<Scrub::Store> m_store;
+ /**
+ * the ScrubStore sub-object caches and manages the database of known
+ * scrub errors. reinit_scrub_store() clears the database and re-initializes
+ * the ScrubStore object.
+ *
+ * in the next iteration - reinit_..() potentially deletes only the
+ * shallow errors part of the database.
+ */
+ void reinit_scrub_store();
+
int num_digest_updates_pending{0};
hobject_t m_start, m_end; ///< note: half-closed: [start,end)
diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h
index 4a574ed66d9..d15862c08ba 100644
--- a/src/osdc/Journaler.h
+++ b/src/osdc/Journaler.h
@@ -529,43 +529,35 @@ public:
// ===================
Header get_last_committed() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return last_committed;
}
Header get_last_written() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return last_written;
}
uint64_t get_layout_period() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return layout.get_period();
}
file_layout_t get_layout() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return layout;
}
bool is_active() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return state == STATE_ACTIVE;
}
bool is_stopping() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return state == STATE_STOPPING;
}
int get_error() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return error;
}
bool is_readonly() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return readonly;
}
@@ -573,32 +565,26 @@ public:
bool _is_readable();
bool try_read_entry(bufferlist& bl);
uint64_t get_write_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return write_pos;
}
uint64_t get_write_safe_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return safe_pos;
}
uint64_t get_read_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return read_pos;
}
uint64_t get_expire_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return expire_pos;
}
uint64_t get_trimmed_pos() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return trimmed_pos;
}
size_t get_journal_envelope_size() const {
- ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock));
lock_guard l(lock);
return journal_stream.get_envelope_size();
}
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 68bd76268ae..927c7e41329 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -48,7 +48,6 @@
#include "include/function2.hpp"
#include "include/neorados/RADOS_Decodable.hpp"
-#include "common/async/completion.h"
#include "common/admin_socket.h"
#include "common/ceph_time.h"
#include "common/ceph_mutex.h"
@@ -1968,30 +1967,6 @@ public:
}
}
- boost::asio::any_completion_handler<void(boost::system::error_code)>
- OpCompletionVert(std::unique_ptr<ceph::async::Completion<
- void(boost::system::error_code)>> c) {
- if (c)
- return [c = std::move(c)](boost::system::error_code ec) mutable {
- c->dispatch(std::move(c), ec);
- };
- else
- return nullptr;
- }
-
- template<typename T>
- boost::asio::any_completion_handler<void(boost::system::error_code, T)>
- OpCompletionVert(std::unique_ptr<ceph::async::Completion<
- void(boost::system::error_code, T)>> c) {
- if (c) {
- return [c = std::move(c)](boost::system::error_code ec, T t) mutable {
- c->dispatch(std::move(c), ec, std::move(t));
- };
- } else {
- return nullptr;
- }
- }
-
struct Op : public RefCountedObject {
OSDSession *session = nullptr;
int incarnation = 0;
@@ -3268,18 +3243,6 @@ public:
return linger_watch(info, op, snapc, mtime, inbl,
OpContextVert<ceph::buffer::list>(onfinish, nullptr), objver);
}
- ceph_tid_t linger_watch(LingerOp *info,
- ObjectOperation& op,
- const SnapContext& snapc, ceph::real_time mtime,
- ceph::buffer::list& inbl,
- std::unique_ptr<ceph::async::Completion<
- void(boost::system::error_code,
- ceph::buffer::list)>> onfinish,
- version_t *objver) {
- return linger_watch(info, op, snapc, mtime, inbl,
- OpCompletionVert<ceph::buffer::list>(
- std::move(onfinish)), objver);
- }
ceph_tid_t linger_notify(LingerOp *info,
ObjectOperation& op,
snapid_t snap, ceph::buffer::list& inbl,
@@ -3295,17 +3258,6 @@ public:
OpContextVert(onack, poutbl),
objver);
}
- ceph_tid_t linger_notify(LingerOp *info,
- ObjectOperation& op,
- snapid_t snap, ceph::buffer::list& inbl,
- std::unique_ptr<ceph::async::Completion<
- void(boost::system::error_code,
- ceph::buffer::list)>> onack,
- version_t *objver) {
- return linger_notify(info, op, snap, inbl,
- OpCompletionVert<ceph::buffer::list>(
- std::move(onack)), objver);
- }
tl::expected<ceph::timespan,
boost::system::error_code> linger_check(LingerOp *info);
void linger_cancel(LingerOp *info); // releases a reference
@@ -3886,12 +3838,6 @@ public:
create_pool_snap(pool, snapName,
OpContextVert<ceph::buffer::list>(c, nullptr));
}
- void create_pool_snap(
- int64_t pool, std::string_view snapName,
- std::unique_ptr<ceph::async::Completion<PoolOp::OpSig>> c) {
- create_pool_snap(pool, snapName,
- OpCompletionVert<ceph::buffer::list>(std::move(c)));
- }
void allocate_selfmanaged_snap(int64_t pool,
boost::asio::any_completion_handler<
void(boost::system::error_code,
@@ -3901,12 +3847,6 @@ public:
allocate_selfmanaged_snap(pool,
OpContextVert(c, psnapid));
}
- void allocate_selfmanaged_snap(int64_t pool,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, snapid_t)>> c) {
- allocate_selfmanaged_snap(pool,
- OpCompletionVert<snapid_t>(std::move(c)));
- }
void delete_pool_snap(int64_t pool, std::string_view snapName,
decltype(PoolOp::onfinish)&& onfinish);
void delete_pool_snap(int64_t pool, std::string_view snapName,
@@ -3914,12 +3854,6 @@ public:
delete_pool_snap(pool, snapName,
OpContextVert<ceph::buffer::list>(c, nullptr));
}
- void delete_pool_snap(int64_t pool, std::string_view snapName,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, ceph::buffer::list)>> c) {
- delete_pool_snap(pool, snapName,
- OpCompletionVert<ceph::buffer::list>(std::move(c)));
- }
void delete_selfmanaged_snap(int64_t pool, snapid_t snap,
decltype(PoolOp::onfinish)&& onfinish);
@@ -3928,12 +3862,6 @@ public:
delete_selfmanaged_snap(pool, snap,
OpContextVert<ceph::buffer::list>(c, nullptr));
}
- void delete_selfmanaged_snap(int64_t pool, snapid_t snap,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, ceph::buffer::list)>> c) {
- delete_selfmanaged_snap(pool, snap,
- OpCompletionVert<ceph::buffer::list>(std::move(c)));
- }
void create_pool(std::string_view name,
@@ -3945,25 +3873,12 @@ public:
OpContextVert<ceph::buffer::list>(onfinish, nullptr),
crush_rule);
}
- void create_pool(std::string_view name,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, ceph::buffer::list)>> c,
- int crush_rule=-1) {
- create_pool(name,
- OpCompletionVert<ceph::buffer::list>(std::move(c)),
- crush_rule);
- }
void delete_pool(int64_t pool,
decltype(PoolOp::onfinish)&& onfinish);
void delete_pool(int64_t pool,
Context* onfinish) {
delete_pool(pool, OpContextVert<ceph::buffer::list>(onfinish, nullptr));
}
- void delete_pool(int64_t pool,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, ceph::buffer::list)>> c) {
- delete_pool(pool, OpCompletionVert<ceph::buffer::list>(std::move(c)));
- }
void delete_pool(std::string_view name,
decltype(PoolOp::onfinish)&& onfinish);
@@ -3972,11 +3887,6 @@ public:
Context* onfinish) {
delete_pool(name, OpContextVert<ceph::buffer::list>(onfinish, nullptr));
}
- void delete_pool(std::string_view name,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, ceph::buffer::list)>> c) {
- delete_pool(name, OpCompletionVert<ceph::buffer::list>(std::move(c)));
- }
void handle_pool_op_reply(MPoolOpReply *m);
int pool_op_cancel(ceph_tid_t tid, int r);
@@ -4026,11 +3936,6 @@ public:
Context *onfinish) {
get_fs_stats_(poolid, OpContextVert(onfinish, result));
}
- void get_fs_stats(std::optional<int64_t> poolid,
- std::unique_ptr<ceph::async::Completion<void(
- boost::system::error_code, struct ceph_statfs)>> c) {
- get_fs_stats_(poolid, OpCompletionVert<struct ceph_statfs>(std::move(c)));
- }
int statfs_op_cancel(ceph_tid_t tid, int r);
void _finish_statfs_op(StatfsOp *op, int r);
diff --git a/src/pybind/mgr/cephadm/ceph_volume.py b/src/pybind/mgr/cephadm/ceph_volume.py
new file mode 100644
index 00000000000..a270bb7028f
--- /dev/null
+++ b/src/pybind/mgr/cephadm/ceph_volume.py
@@ -0,0 +1,430 @@
+from cephadm.serve import CephadmServe
+from typing import List, TYPE_CHECKING, Any, Dict, Set, Tuple
+if TYPE_CHECKING:
+ from cephadm import CephadmOrchestrator
+
+
+class CephVolume:
+ def __init__(self, mgr: "CephadmOrchestrator", _inheritance: bool = False) -> None:
+ self.mgr: "CephadmOrchestrator" = mgr
+ if not _inheritance:
+ self.lvm_list: "CephVolumeLvmList" = CephVolumeLvmList(mgr)
+
+ def run_json(self, hostname: str, command: List[str]) -> Dict[str, Any]:
+ """Execute a JSON command on the specified hostname and return the result.
+
+ This method wraps the asynchronous execution of a JSON command on the
+ specified hostname, waiting for the command to complete. It utilizes the
+ `_run_json` method to perform the actual execution.
+
+ Args:
+ hostname (str): The hostname of the target node where the JSON command
+ will be executed.
+ command (List[str]): A list of command arguments to be passed to the
+ JSON command.
+
+ Returns:
+ Dict[str, Any]: A dictionary containing the JSON response from the
+ executed command, which may include various data
+ based on the command executed.
+ """
+ return self.mgr.wait_async(self._run_json(hostname, command))
+
+ def run(self, hostname: str, command: List[str], **kw: Any) -> Tuple[List[str], List[str], int]:
+ """Execute a command on the specified hostname and return the result.
+
+ This method wraps the asynchronous execution of a command on the
+ specified hostname, waiting for the command to complete. It utilizes the
+ `_run` method to perform the actual execution.
+
+ Args:
+ hostname (str): The hostname of the target node where the command
+ will be executed.
+ command (List[str]): A list of command arguments to be passed to the
+ command.
+ **kw (Any): Additional keyword arguments to customize the command
+ execution.
+
+ Returns:
+ Tuple[List[str], List[str], int]: A tuple containing:
+ - A list of strings representing the standard output of the command.
+ - A list of strings representing the standard error output of the command.
+ - An integer representing the return code of the command execution.
+ """
+ return self.mgr.wait_async(self._run(hostname, command, **kw))
+
+ async def _run(self,
+ hostname: str,
+ command: List[str],
+ **kw: Any) -> Tuple[List[str], List[str], int]:
+ """Execute a ceph-volume command on the specified hostname and return the result.
+
+ This asynchronous method constructs a ceph-volume command and then executes
+ it on the specified host.
+ The result of the command is returned in JSON format.
+
+ Args:
+ hostname (str): The hostname of the target node where the command will be executed.
+ command (List[str]): A list of command arguments to be passed to the Ceph command.
+ **kw (Any): Additional keyword arguments to customize the command execution.
+
+ Returns:
+ Tuple[List[str], List[str], int]: A tuple containing:
+ - A list of strings representing the standard output of the command.
+ - A list of strings representing the standard error output of the command.
+ - An integer representing the return code of the command execution.
+ """
+ cmd: List[str] = ['--']
+ cmd.extend(command)
+ result = await CephadmServe(self.mgr)._run_cephadm(
+ hostname, 'osd', 'ceph-volume',
+ cmd,
+ **kw)
+ return result
+
+ async def _run_json(self,
+ hostname: str,
+ command: List[str]) -> Dict[str, Any]:
+ """Execute a ceph-volume command on a specified hostname.
+
+ This asynchronous method constructs a ceph-volume command and then executes
+ it on the specified host.
+ The result of the command is returned in JSON format.
+
+ Args:
+ hostname (str): The hostname of the target node where the command will be executed.
+ command (List[str]): A list of command arguments to be passed to the Ceph command.
+
+ Returns:
+ Dict[str, Any]: The result of the command execution as a dictionary parsed from
+ the JSON output.
+ """
+ cmd: List[str] = ['--']
+ cmd.extend(command)
+ result = await CephadmServe(self.mgr)._run_cephadm_json(
+ hostname, 'osd', 'ceph-volume',
+ cmd)
+ return result
+
+ def clear_replace_header(self, hostname: str, device: str) -> str:
+ """Clear the replacement header on a specified device for a given hostname.
+
+ This method checks if a replacement header exists on the specified device
+ and clears it if found. After clearing, it invalidates the cached device
+ information for the specified hostname and kicks the serve loop.
+
+ Args:
+ hostname (str): The hostname of the node holding the device on which the
+ replacement header will be cleared. This is used to identify
+ the specific device within the manager's context.
+ device (str): The path to the device (e.g., '/dev/sda') from which the
+ replacement header will be cleared.
+
+ Returns:
+ str: A message indicating the result of the operation. It will either confirm
+ that the replacement header was cleared or state that no replacement header
+ was detected on the device.
+ """
+ output: str = ''
+ result = self.run(hostname, ['lvm',
+ 'zap',
+ '--clear-replace-header',
+ device],
+ error_ok=True)
+ out, err, rc = result
+ if not rc:
+ output = f'Replacement header cleared on {device}'
+ self.mgr.cache.invalidate_host_devices(hostname)
+ self.mgr._kick_serve_loop()
+ else:
+ plain_out: str = '\n'.join(out)
+ plain_err: str = '\n'.join(err)
+ output = f'No replacement header could be cleared on {device}.\n{plain_out}\n{plain_err}'
+ return output
+
+
+class CephVolumeLvmList(CephVolume):
+ def __init__(self, mgr: "CephadmOrchestrator") -> None:
+ super().__init__(mgr, True)
+ self.data: Dict[str, Any] = {}
+
+ def get_data(self, hostname: str) -> None:
+ """Execute the `ceph-volume lvm list` command to list LVM-based OSDs.
+
+ This method interacts with the Ceph manager to retrieve information
+ about the Logical Volume Manager (LVM) devices associated with the
+ OSDs on the given host. It calls the `ceph-volume lvm list` command
+ in JSON format to gather the relevant data.
+
+ Args:
+ hostname (str): The hostname of the node to query.
+
+ Returns:
+ None: This method does not return a value. The retrieved data is
+ stored in the `self.data` attribute for further processing.
+ """
+ self.data = self.run_json(hostname,
+ ['lvm', 'list', '--format', 'json'])
+
+ def devices_by_type(self, device_type: str) -> List[str]:
+ """Retrieve a list of devices of a specified type across all OSDs.
+
+ This method iterates through all OSDs and collects devices that match
+ the specified type (e.g., 'block', 'db', 'wal'). The resulting list
+ contains unique device paths.
+
+ Args:
+ device_type (str): The type of devices to retrieve. This should
+ be one of the recognized device types such as
+ 'block', 'db', or 'wal'.
+
+ Returns:
+ List[str]: A list of unique device paths of the specified type
+ found across all OSDs. If no devices of the specified
+ type are found, an empty list is returned.
+ """
+ result: Set[str] = set()
+ for osd in self.osd_ids():
+ for lv in self.data.get(osd, []):
+ if lv.get('type') == device_type:
+ result.update(lv.get('devices', []))
+ return list(result)
+
+ def block_devices(self) -> List[str]:
+ """List all block devices used by OSDs.
+
+ This method returns a list of devices that are used as 'block' devices
+ for storing the main OSD data.
+
+ Returns:
+ List[str]: A list of device paths (strings) that are used as 'block' devices.
+ """
+ return self.devices_by_type('block')
+
+ def db_devices(self) -> List[str]:
+ """List all database (DB) devices used by OSDs.
+
+ This method returns a list of devices that are used as 'db' devices
+ for storing the database files associated with OSDs.
+
+ Returns:
+ List[str]: A list of device paths (strings) that are used as 'db' devices.
+ """
+ return self.devices_by_type('db')
+
+ def wal_devices(self) -> List[str]:
+ """List all write-ahead log (WAL) devices used by OSDs.
+
+ This method returns a list of devices that are used as 'wal' devices
+ for storing write-ahead log data associated with OSDs.
+
+ Returns:
+ List[str]: A list of device paths (strings) that are used as 'wal' devices.
+ """
+ return self.devices_by_type('wal')
+
+ def all_devices(self) -> List[str]:
+ """List all devices used by OSDs for 'block', 'db', or 'wal' purposes.
+
+ This method aggregates all devices that are currently used by the OSDs
+ in the system for the following device types:
+ - 'block' devices: Used to store the OSD's data.
+ - 'db' devices: Used for database purposes.
+ - 'wal' devices: Used for Write-Ahead Logging.
+
+ The returned list combines devices from all these categories.
+
+ Returns:
+ List[str]: A list of device paths (strings) that are used as 'block', 'db', or 'wal' devices.
+ """
+ return self.block_devices() + self.db_devices() + self.wal_devices()
+
+ def device_osd_mapping(self, device_type: str = '') -> Dict[str, Dict[str, List[str]]]:
+ """Create a mapping of devices to their corresponding OSD IDs based on device type.
+
+ This method serves as a 'proxy' function, designed to be called by the *_device_osd_mapping() methods.
+
+ This method iterates over the OSDs and their logical volumes to build a
+ dictionary that maps each device of the specified type to the list of
+ OSD IDs that use it. The resulting dictionary can be used to determine
+ which OSDs share a specific device.
+
+ Args:
+ device_type (str): The type of the device to filter by (e.g., 'block', 'db', or 'wal').
+ If an empty string is provided, devices of all types will be included.
+
+ Returns:
+ Dict[str, Dict[str, List[str]]]: A dictionary where the keys are device
+ names and the values are dictionaries containing a list of OSD IDs
+ that use the corresponding device.
+
+ eg:
+ ```
+ {
+ '/dev/vda': {'osd_ids': ['0', '1']},
+ '/dev/vdb': {'osd_ids': ['2']}
+ }
+ ```
+
+ """
+ result: Dict[str, Dict[str, List[str]]] = {}
+ for osd in self.osd_ids():
+ for lv in self.data.get(osd, []):
+ if lv.get('type') == device_type or not device_type:
+ for device in lv.get('devices', []):
+ if device not in result:
+ result[device] = {'osd_ids': []}
+ result[device]['osd_ids'].append(osd)
+ return result
+
+ def block_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+ """Get a dictionnary with all block devices and their corresponding
+ osd(s) id(s).
+
+ eg:
+ ```
+ {'/dev/vdb': {'osd_ids': ['0']},
+ '/dev/vdc': {'osd_ids': ['1']},
+ '/dev/vdf': {'osd_ids': ['2']},
+ '/dev/vde': {'osd_ids': ['3', '4']}}
+ ```
+
+ Returns:
+ Dict[str, Dict[str, List[str]]]: A dict including all block devices with their corresponding
+ osd id(s).
+ """
+ return self.device_osd_mapping('block')
+
+ def db_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+ """Get a dictionnary with all db devices and their corresponding
+ osd(s) id(s).
+
+ eg:
+ ```
+ {'/dev/vdv': {'osd_ids': ['0', '1', '2', '3']},
+ '/dev/vdx': {'osd_ids': ['4']}}
+ ```
+
+ Returns:
+ Dict[str, Dict[str, List[str]]]: A dict including all db devices with their corresponding
+ osd id(s).
+ """
+ return self.device_osd_mapping('db')
+
+ def wal_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]:
+ """Get a dictionnary with all wal devices and their corresponding
+ osd(s) id(s).
+
+ eg:
+ ```
+ {'/dev/vdy': {'osd_ids': ['0', '1', '2', '3']},
+ '/dev/vdz': {'osd_ids': ['4']}}
+ ```
+
+ Returns:
+ Dict[str, Dict[str, List[str]]]: A dict including all wal devices with their corresponding
+ osd id(s).
+ """
+ return self.device_osd_mapping('wal')
+
+ def is_shared_device(self, device: str) -> bool:
+ """Determines if a device is shared between multiple OSDs.
+
+ This method checks whether a given device is used by more than one OSD,
+ regardless of its role ('block', 'db', or 'wal'). If the device is
+ associated with more than one OSD, it is considered shared.
+
+ Args:
+ device (str): The device path to check (e.g., '/dev/sda').
+
+ Raises:
+ RuntimeError: If the device is not valid or not found in the shared devices mapping.
+
+ Returns:
+ bool: True if the device is shared by more than one OSD, False otherwise.
+ """
+ device_osd_mapping = self.device_osd_mapping()
+ if not device or device not in device_osd_mapping:
+ raise RuntimeError('Not a valid device path.')
+ return len(device_osd_mapping[device]['osd_ids']) > 1
+
+ def is_block_device(self, device: str) -> bool:
+ """Check if a specified device is a block device.
+
+ This method checks if the specified device is included in the
+ list of block devices used by OSDs.
+
+ Args:
+ device (str): The path of the device to check.
+
+ Returns:
+ bool: True if the device is a block device,
+ False otherwise.
+ """
+ return device in self.block_devices()
+
+ def is_db_device(self, device: str) -> bool:
+ """Check if a specified device is a DB device.
+
+ This method checks if the specified device is included in the
+ list of DB devices used by OSDs.
+
+ Args:
+ device (str): The path of the device to check.
+
+ Returns:
+ bool: True if the device is a DB device,
+ False otherwise.
+ """
+ return device in self.db_devices()
+
+ def is_wal_device(self, device: str) -> bool:
+ """Check if a specified device is a WAL device.
+
+ This method checks if the specified device is included in the
+ list of WAL devices used by OSDs.
+
+ Args:
+ device (str): The path of the device to check.
+
+ Returns:
+ bool: True if the device is a WAL device,
+ False otherwise.
+ """
+ return device in self.wal_devices()
+
+ def get_block_devices_from_osd_id(self, osd_id: str) -> List[str]:
+ """Retrieve the list of block devices associated with a given OSD ID.
+
+ This method looks up the specified OSD ID in the `data` attribute
+ and returns a list of devices that are of type 'block'. If there are
+ no devices of type 'block' for the specified OSD ID, an empty list is returned.
+
+ Args:
+ osd_id (str): The OSD ID for which to retrieve block devices.
+
+ Returns:
+ List[str]: A list of block device paths associated with the
+ specified OSD ID. If no block devices are found,
+ an empty list is returned.
+ """
+ result: List[str] = []
+ for lv in self.data.get(osd_id, []):
+ if lv.get('type') == 'block':
+ result = lv.get('devices', [])
+ return result
+
+ def osd_ids(self) -> List[str]:
+ """Retrieve the list of OSD IDs.
+
+ This method returns a list of OSD IDs by extracting the keys
+ from the `data` attribute, which is expected to contain
+ information about OSDs. If there is no data available, an
+ empty list is returned.
+
+ Returns:
+ List[str]: A list of OSD IDs. If no data is present,
+ an empty list is returned.
+ """
+ result: List[str] = []
+ if self.data:
+ result = list(self.data.keys())
+ return result
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 5216c489064..7bf65b532fa 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -101,6 +101,7 @@ from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall
from .configchecks import CephadmConfigChecks
from .offline_watcher import OfflineHostWatcher
from .tuned_profiles import TunedProfileUtils
+from .ceph_volume import CephVolume
try:
import asyncssh
@@ -135,13 +136,13 @@ DEFAULT_IMAGE = 'quay.io/ceph/ceph'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'
DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17'
-DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0'
-DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0'
+DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0'
+DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4'
-DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
+DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1'
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
@@ -446,7 +447,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
Option(
'default_registry',
type='str',
- default='docker.io',
+ default='quay.io',
desc='Search-registry to which we should normalize unqualified image names. '
'This is not the default registry',
),
@@ -764,6 +765,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi'])
self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof'])
self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy'])
+ self.rgw_service: RgwService = cast(RgwService, self.cephadm_services['rgw'])
self.scheduled_async_actions: List[Callable] = []
@@ -791,6 +793,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
# as part of the handling of stray daemons
self.recently_altered_daemons: Dict[str, datetime.datetime] = {}
+ self.ceph_volume: CephVolume = CephVolume(self)
+
def shutdown(self) -> None:
self.log.debug('shutdown')
self._worker_pool.close()
@@ -3609,7 +3613,12 @@ Then run the following:
return "Scheduled %s update..." % spec.service_name()
@handle_orch_error
- def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]:
+ def apply(
+ self,
+ specs: Sequence[GenericSpec],
+ no_overwrite: bool = False,
+ continue_on_error: bool = True
+ ) -> List[str]:
results = []
for spec in specs:
if no_overwrite:
@@ -3621,7 +3630,14 @@ Then run the following:
results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag'
% (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name()))
continue
- results.append(self._apply(spec))
+ try:
+ res = self._apply(spec)
+ results.append(res)
+ except Exception as e:
+ if continue_on_error:
+ results.append(f'Failed to apply spec for {spec}: {str(e)}')
+ else:
+ raise e
return results
@handle_orch_error
@@ -3828,8 +3844,55 @@ Then run the following:
return self.upgrade.upgrade_stop()
@handle_orch_error
+ def replace_device(self,
+ hostname: str,
+ device: str,
+ clear: bool = False,
+ yes_i_really_mean_it: bool = False) -> Any:
+ output: str = ''
+
+ self.ceph_volume.lvm_list.get_data(hostname=hostname)
+
+ if clear:
+ output = self.ceph_volume.clear_replace_header(hostname, device)
+ else:
+ osds_to_zap: List[str] = []
+ if hostname not in list(self.inventory.keys()):
+ raise OrchestratorError(f'{hostname} invalid host.')
+
+ if device not in self.ceph_volume.lvm_list.all_devices():
+ raise OrchestratorError(f"{device} doesn't appear to be used for an OSD, not a valid device in {hostname}.")
+
+ device_osd_mapping = self.ceph_volume.lvm_list.device_osd_mapping()
+ osds_to_zap = device_osd_mapping[device]['osd_ids']
+
+ if self.ceph_volume.lvm_list.is_shared_device(device):
+ if not yes_i_really_mean_it:
+ raise OrchestratorError(f'{device} is a shared device.\n'
+ f'Replacing {device} implies destroying OSD(s): {osds_to_zap}.\n'
+ 'Please, *be very careful*, this can be a very dangerous operation.\n'
+ 'If you know what you are doing, pass --yes-i-really-mean-it')
+ if not self.to_remove_osds.rm_util.safe_to_destroy([int(osd_id) for osd_id in osds_to_zap]):
+ raise OrchestratorError(f"Destroying OSD(s) {osds_to_zap} would cause some PGs to be undersized/degraded.\n"
+ 'Refusing to proceed.')
+ replace_block: bool = self.ceph_volume.lvm_list.is_block_device(device)
+ replace_db: bool = self.ceph_volume.lvm_list.is_db_device(device)
+ replace_wal: bool = self.ceph_volume.lvm_list.is_wal_device(device)
+
+ self.remove_osds(list(osds_to_zap),
+ replace_block=replace_block,
+ replace_db=replace_db,
+ replace_wal=replace_wal)
+
+ output = f'Scheduled to destroy osds: {osds_to_zap} and mark {device} as being replaced.'
+ return output
+
+ @handle_orch_error
def remove_osds(self, osd_ids: List[str],
replace: bool = False,
+ replace_block: bool = False,
+ replace_db: bool = False,
+ replace_wal: bool = False,
force: bool = False,
zap: bool = False,
no_destroy: bool = False) -> str:
@@ -3852,6 +3915,9 @@ Then run the following:
try:
self.to_remove_osds.enqueue(OSD(osd_id=int(daemon.daemon_id),
replace=replace,
+ replace_block=replace_block,
+ replace_db=replace_db,
+ replace_wal=replace_wal,
force=force,
zap=zap,
no_destroy=no_destroy,
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index c6212c9efb8..4a7959ae045 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -96,7 +96,10 @@ class CephadmServe:
if not self.mgr.paused:
self._run_async_actions()
- self.mgr.to_remove_osds.process_removal_queue()
+ removal_queue_result = self.mgr.to_remove_osds.process_removal_queue()
+ self.log.debug(f'process_removal_queue() returned = {removal_queue_result}')
+ if removal_queue_result:
+ continue
self.mgr.migration.migrate()
if self.mgr.migration.is_migration_ongoing():
@@ -950,6 +953,10 @@ class CephadmServe:
)
continue
+ # set multisite config before deploying the rgw daemon
+ if service_type == 'rgw':
+ self.mgr.rgw_service.set_realm_zg_zone(cast(RGWSpec, spec))
+
# deploy new daemon
daemon_id = slot.name
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index eb9a1c838a6..9043577bc5a 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -984,10 +984,9 @@ class RgwService(CephService):
def allow_colo(self) -> bool:
return True
- def config(self, spec: RGWSpec) -> None: # type: ignore
+ def set_realm_zg_zone(self, spec: RGWSpec) -> None:
assert self.TYPE == spec.service_type
- # set rgw_realm rgw_zonegroup and rgw_zone, if present
if spec.rgw_realm:
ret, out, err = self.mgr.check_mon_command({
'prefix': 'config set',
@@ -1010,6 +1009,12 @@ class RgwService(CephService):
'value': spec.rgw_zone,
})
+ def config(self, spec: RGWSpec) -> None: # type: ignore
+ assert self.TYPE == spec.service_type
+
+ # set rgw_realm rgw_zonegroup and rgw_zone, if present
+ self.set_realm_zg_zone(spec)
+
if spec.generate_cert and not spec.rgw_frontend_ssl_certificate:
# generate a self-signed cert for the rgw service
cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames)
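
The RgwService change is a plain extract-method refactor: the realm/zonegroup/zone configuration moves into `set_realm_zg_zone()`, `config()` keeps calling it, and the serve loop can now also call it on its own just before deploying an rgw daemon. A minimal sketch of that call pattern, with stub classes and print statements standing in for the real `config set` mon commands:

```
class RgwServiceSketch:
    """Stub illustrating the extracted method; not the real cephadm service class."""

    def set_realm_zg_zone(self, spec: dict) -> None:
        # In the real service these become `ceph config set` calls for the rgw section.
        for key in ('rgw_realm', 'rgw_zonegroup', 'rgw_zone'):
            if spec.get(key):
                print(f'config set client.rgw.{spec["service_id"]} {key} {spec[key]}')

    def config(self, spec: dict) -> None:
        # config() still applies realm/zonegroup/zone first, then the rest (certs, ports, ...).
        self.set_realm_zg_zone(spec)


svc = RgwServiceSketch()
spec = {'service_id': 'myrgw', 'rgw_realm': 'myrealm', 'rgw_zonegroup': 'zg1', 'rgw_zone': 'zone1'}
svc.set_realm_zg_zone(spec)   # now callable from the serve loop before the daemon is deployed
svc.config(spec)              # still applied when the service itself is (re)configured
```
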
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index 8b15aace373..162815da24c 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -123,10 +123,9 @@ class NvmeofService(CephService):
gateways = json.loads(out)['gateways']
cmd_dicts = []
- spec = cast(NvmeofServiceSpec,
- self.mgr.spec_store.all_specs.get(daemon_descrs[0].service_name(), None))
-
for dd in daemon_descrs:
+ spec = cast(NvmeofServiceSpec,
+ self.mgr.spec_store.all_specs.get(dd.service_name(), None))
service_name = dd.service_name()
if dd.hostname is None:
err_msg = ('Trying to config_dashboard nvmeof but no hostname is defined')
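
The nvmeof change is a one-line scoping fix: the service spec is now resolved per daemon descriptor rather than once from the first descriptor, so gateways that belong to different nvmeof services each pick up their own spec. Illustrated with plain dicts standing in for specs and descriptors:

```
# Before: one spec fetched from daemon_descrs[0] was reused for every daemon.
# After: each daemon resolves the spec of *its own* service, as in the patch.
all_specs = {'nvmeof.a': {'pool': 'rbd-a'}, 'nvmeof.b': {'pool': 'rbd-b'}}
daemon_descrs = [{'service_name': 'nvmeof.a'}, {'service_name': 'nvmeof.b'}]

for dd in daemon_descrs:
    spec = all_specs.get(dd['service_name'])   # per-daemon lookup
    print(dd['service_name'], spec)
```
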
diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py
index 9b09b8c9f49..80bf92772c4 100644
--- a/src/pybind/mgr/cephadm/services/osd.py
+++ b/src/pybind/mgr/cephadm/services/osd.py
@@ -551,6 +551,12 @@ class RemoveUtil(object):
"Zaps all devices that are associated with an OSD"
if osd.hostname is not None:
cmd = ['--', 'lvm', 'zap', '--osd-id', str(osd.osd_id)]
+ if osd.replace_block:
+ cmd.append('--replace-block')
+ if osd.replace_db:
+ cmd.append('--replace-db')
+ if osd.replace_wal:
+ cmd.append('--replace-wal')
if not osd.no_destroy:
cmd.append('--destroy')
with self.mgr.async_timeout_handler(osd.hostname, f'cephadm ceph-volume {" ".join(cmd)}'):
@@ -618,6 +624,9 @@ class OSD:
started: bool = False,
stopped: bool = False,
replace: bool = False,
+ replace_block: bool = False,
+ replace_db: bool = False,
+ replace_wal: bool = False,
force: bool = False,
hostname: Optional[str] = None,
zap: bool = False,
@@ -649,6 +658,12 @@ class OSD:
# If this is a replace or remove operation
self.replace = replace
+ # If this is a block device replacement
+ self.replace_block = replace_block
+ # If this is a db device replacement
+ self.replace_db = replace_db
+ # If this is a wal device replacement
+ self.replace_wal = replace_wal
# If we wait for the osd to be drained
self.force = force
# The name of the node
@@ -676,7 +691,7 @@ class OSD:
if self.stopped:
logger.debug(f"Won't start draining {self}. OSD draining is stopped.")
return False
- if self.replace:
+ if self.any_replace_params:
self.rm_util.set_osd_flag([self], 'out')
else:
self.rm_util.reweight_osd(self, 0.0)
@@ -686,7 +701,7 @@ class OSD:
return True
def stop_draining(self) -> bool:
- if self.replace:
+ if self.any_replace_params:
self.rm_util.set_osd_flag([self], 'in')
else:
if self.original_weight:
@@ -764,6 +779,9 @@ class OSD:
out['draining'] = self.draining
out['stopped'] = self.stopped
out['replace'] = self.replace
+ out['replace_block'] = self.replace_block
+ out['replace_db'] = self.replace_db
+ out['replace_wal'] = self.replace_wal
out['force'] = self.force
out['zap'] = self.zap
out['hostname'] = self.hostname # type: ignore
@@ -789,6 +807,13 @@ class OSD:
inp['hostname'] = hostname
return cls(**inp)
+ @property
+ def any_replace_params(self) -> bool:
+ return any([self.replace,
+ self.replace_block,
+ self.replace_db,
+ self.replace_wal])
+
def __hash__(self) -> int:
return hash(self.osd_id)
@@ -812,7 +837,7 @@ class OSDRemovalQueue(object):
# network calls, like mon commands.
self.lock = Lock()
- def process_removal_queue(self) -> None:
+ def process_removal_queue(self) -> bool:
"""
Performs actions in the _serve() loop to remove an OSD
when criteria is met.
@@ -820,6 +845,8 @@ class OSDRemovalQueue(object):
we can't hold self.lock, as we're calling _remove_daemon in the loop
"""
+ result: bool = False
+
# make sure that we don't run on OSDs that are not in the cluster anymore.
self.cleanup()
@@ -863,16 +890,23 @@ class OSDRemovalQueue(object):
if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'):
CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname)
logger.info(f"Successfully removed {osd} on {osd.hostname}")
+ result = True
else:
logger.info(f"Daemon {osd} on {osd.hostname} was already removed")
- if osd.replace:
+ any_replace_params: bool = any([osd.replace,
+ osd.replace_block,
+ osd.replace_db,
+ osd.replace_wal])
+ if any_replace_params:
# mark destroyed in osdmap
if not osd.destroy():
raise orchestrator.OrchestratorError(
f"Could not destroy {osd}")
logger.info(
f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement")
+ if any_replace_params:
+ osd.zap = True
else:
# purge from osdmap
if not osd.purge():
@@ -884,7 +918,7 @@ class OSDRemovalQueue(object):
logger.info(f"Zapping devices for {osd} on {osd.hostname}")
osd.do_zap()
logger.info(f"Successfully zapped devices for {osd} on {osd.hostname}")
-
+ self.mgr.cache.invalidate_host_devices(osd.hostname)
logger.debug(f"Removing {osd} from the queue.")
# self could change while this is processing (osds get added from the CLI)
@@ -893,6 +927,7 @@ class OSDRemovalQueue(object):
with self.lock:
self.osds.intersection_update(new_queue)
self._save_to_store()
+ return result
def cleanup(self) -> None:
# OSDs can always be cleaned up manually. This ensures that we run on existing OSDs
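
Taken together, the osd.py changes thread three new flags (`replace_block`, `replace_db`, `replace_wal`) from the `OSD` object through the removal queue: draining treats any replace flag like `replace`, an OSD destroyed for replacement is force-zapped, the matching `--replace-*` options are appended to the `ceph-volume lvm zap` call, and `process_removal_queue()` now reports whether a daemon was removed so the serve loop can restart its iteration. A compact sketch of the flag handling, using a simplified dataclass rather than the real `OSD` class:

```
from dataclasses import dataclass
from typing import List


@dataclass
class OSDSketch:
    osd_id: int
    replace: bool = False
    replace_block: bool = False
    replace_db: bool = False
    replace_wal: bool = False
    no_destroy: bool = False
    zap: bool = False

    @property
    def any_replace_params(self) -> bool:
        return any([self.replace, self.replace_block, self.replace_db, self.replace_wal])

    def zap_cmd(self) -> List[str]:
        # Mirrors RemoveUtil.zap_osd(): pass the replace flags through to ceph-volume.
        cmd = ['--', 'lvm', 'zap', '--osd-id', str(self.osd_id)]
        if self.replace_block:
            cmd.append('--replace-block')
        if self.replace_db:
            cmd.append('--replace-db')
        if self.replace_wal:
            cmd.append('--replace-wal')
        if not self.no_destroy:
            cmd.append('--destroy')
        return cmd


osd = OSDSketch(osd_id=0, replace_block=True)
if osd.any_replace_params:
    osd.zap = True   # replacement implies zapping, as in process_removal_queue()
print(osd.zap_cmd())  # ['--', 'lvm', 'zap', '--osd-id', '0', '--replace-block', '--destroy']
```
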
diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py
index dabc202a024..e322acb0e3e 100644
--- a/src/pybind/mgr/cephadm/services/smb.py
+++ b/src/pybind/mgr/cephadm/services/smb.py
@@ -1,6 +1,9 @@
+import errno
import logging
from typing import Any, Dict, List, Tuple, cast, Optional
+from mgr_module import HandleCommandResult
+
from ceph.deployment.service_spec import ServiceSpec, SMBSpec
from orchestrator import DaemonDescription
@@ -117,6 +120,23 @@ class SMBService(CephService):
return True
return False
+ def ok_to_stop(
+ self, daemon_ids: List[str], force: bool = False, known: Optional[List[str]] = None
+ ) -> HandleCommandResult:
+ # if stopping would leave no smb daemon running, alert the user (this cannot be bypassed with --force)
+ warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, "SMB", 1, True)
+ if warn:
+ return HandleCommandResult(-errno.EBUSY, "", warn_message)
+
+ # if reached here, there is > 1 smb daemon.
+ if force:
+ return HandleCommandResult(0, warn_message, "")
+
+ # if reached here, > 1 smb daemon and no force flag.
+ # Provide warning
+ warn_message = "WARNING: Removing SMB daemons can cause clients to lose connectivity. "
+ return HandleCommandResult(-errno.EBUSY, "", warn_message)
+
def _allow_config_key_command(self, name: str) -> str:
# permit the samba container config access to the mon config key store
# with keys like smb/config/<cluster_id>/*.
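
The new `SMBService.ok_to_stop()` follows the pattern used by other cephadm services: refuse outright to stop the last smb daemon, allow the stop with `--force` when more than one daemon would remain, and otherwise return a busy error with a warning. A reduced sketch of that gate, with a hypothetical `Result` tuple standing in for `HandleCommandResult`:

```
import errno
from typing import List, NamedTuple


class Result(NamedTuple):
    retval: int
    stdout: str
    stderr: str


def ok_to_stop_smb(daemon_ids: List[str], running: int, force: bool = False) -> Result:
    """Simplified stand-in for SMBService.ok_to_stop()."""
    # Stopping these daemons would leave fewer than one running: never allowed.
    if running - len(daemon_ids) < 1:
        return Result(-errno.EBUSY, '', 'Cannot stop the only remaining SMB daemon')
    if force:
        return Result(0, '', '')
    return Result(-errno.EBUSY, '',
                  'WARNING: Removing SMB daemons can cause clients to lose connectivity.')


print(ok_to_stop_smb(['smb.a'], running=1))               # busy: last daemon
print(ok_to_stop_smb(['smb.a'], running=2, force=True))   # ok with --force
print(ok_to_stop_smb(['smb.a'], running=2))               # busy + warning without --force
```
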
diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
index 972ef22e7b5..967f1355af1 100644
--- a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
+++ b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2
@@ -15,7 +15,8 @@
http_port = {{ http_port }}
http_addr = {{ http_addr }}
{% if mgmt_gw_enabled %}
- root_url = %(protocol)s://%(domain)s/grafana/
+ root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/
+ serve_from_sub_path = true
{% endif %}
[snapshots]
external_enabled = false
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
index f33bc6c8dfd..594582e7ee4 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2
@@ -115,11 +115,11 @@ server {
{% if grafana_endpoints %}
location /grafana {
- rewrite ^/grafana/(.*) /$1 break;
proxy_pass {{ grafana_scheme }}://grafana_servers;
# clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser
# will send this header if Grafana is running on the same node as one of those services
proxy_set_header Authorization "";
+ proxy_buffering off;
{% if oauth2_proxy_url %}
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
index 829c0757589..9148ddc4a14 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2
@@ -1,5 +1,8 @@
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen {{ internal_port }} ssl;
listen [::]:{{ internal_port }} ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
index ded403169c9..03ff8a32ca2 100644
--- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2
@@ -4,6 +4,7 @@ NFS_CORE_PARAM {
Enable_RQUOTA = false;
Protocols = 4;
NFS_Port = {{ port }};
+ allow_set_io_flusher_fail = true;
{% if bind_addr %}
Bind_addr = {{ bind_addr }};
{% endif %}
diff --git a/src/pybind/mgr/cephadm/tests/ceph_volume_data.py b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py
new file mode 100644
index 00000000000..afd6d89d39e
--- /dev/null
+++ b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py
@@ -0,0 +1 @@
+data = '{"0":[{"devices":["/dev/vdb"],"lv_name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668"},{"devices":["/dev/vdk"],"lv_name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","v
g_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"1":[{"devices":["/dev/vdc"],"lv_name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb"},{"devices":["/dev/vdk"],"lv_name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.
vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"2":[{"devices":["/dev/vdf"],"lv_name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.block_uuid=adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.osd_id=2,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","tags":{"ceph.block_device":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.block_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.osd_id":"2","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-3ba7a728-709b-408c-a043-9e48704b5ffb"}],"3":[{"devices":["/dev/vde"],"lv_name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.block_uuid=GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.osd_id=3,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","tags":{"ceph.block_device":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.block_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.osd_id":"3","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b"}],"4":[{"devices":["/dev/vdg"],"lv_name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_de
vice_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-20acdce8-5548-4707-a38e-b8e925485bc5"},{"devices":["/dev/vdj"],"lv_name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdi"],"lv_name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_path":"/dev/ceph-8da158be-4d0
d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}],"5":[{"devices":["/dev/vdj"],"lv_name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","cep
h.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdh"],"lv_name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351"},{"devices":["/dev/vdi"],"lv_name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","path":"/dev/ceph-8da158be-4d0d-41bd-86
ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}]}'
diff --git a/src/pybind/mgr/cephadm/tests/conftest.py b/src/pybind/mgr/cephadm/tests/conftest.py
index e8add2c7b83..5cc2fabaf49 100644
--- a/src/pybind/mgr/cephadm/tests/conftest.py
+++ b/src/pybind/mgr/cephadm/tests/conftest.py
@@ -1,13 +1,14 @@
import pytest
from cephadm.services.osd import RemoveUtil, OSD
-from tests import mock
-
+from mock import mock
from .fixtures import with_cephadm_module
+from cephadm import CephadmOrchestrator
+from typing import Generator
@pytest.fixture()
-def cephadm_module():
+def cephadm_module() -> Generator[CephadmOrchestrator, None, None]:
with with_cephadm_module({}) as m:
yield m
diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py
index dd858c6c7da..dda0c6720ac 100644
--- a/src/pybind/mgr/cephadm/tests/fixtures.py
+++ b/src/pybind/mgr/cephadm/tests/fixtures.py
@@ -35,11 +35,11 @@ def get_module_option_ex(_, module, key, default=None):
return None
-def _run_cephadm(ret):
+def _run_cephadm(ret, rc: int = 0):
async def foo(s, host, entity, cmd, e, **kwargs):
if cmd == 'gather-facts':
return '{}', '', 0
- return [ret], '', 0
+ return [ret], '', rc
return foo
diff --git a/src/pybind/mgr/cephadm/tests/test_ceph_volume.py b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py
new file mode 100644
index 00000000000..cc1378a7575
--- /dev/null
+++ b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py
@@ -0,0 +1,231 @@
+import json
+import pytest
+from .ceph_volume_data import data
+from cephadm.serve import CephadmServe
+from cephadm import CephadmOrchestrator
+from mock import patch
+from .fixtures import _run_cephadm, with_host
+
+
+class TestCephVolume:
+ def test_run(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)):
+ c = cephadm_module.ceph_volume.run('test', ['/bin/foo'])
+ assert c == (['fake-output'], '', 0)
+
+ def test_run_json(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('{"this-is-a-fake-key": "this-is-a-fake-value"}', 0)):
+ c = cephadm_module.ceph_volume.run_json('test', ['/bin/foo'])
+ assert c == {"this-is-a-fake-key": "this-is-a-fake-value"}
+
+ def test_clear_replace_header_ok(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)):
+ c = cephadm_module.ceph_volume.clear_replace_header('test', '/dev/foo')
+ assert c == 'Replacement header cleared on /dev/foo'
+
+ def test_clear_replace_header_nok(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('', 1)):
+ c = cephadm_module.ceph_volume.clear_replace_header('fake-output', '/dev/foo')
+ assert c.strip() == 'No replacement header could be cleared on /dev/foo.'
+
+
+class TestCephVolumeList:
+ def test_get_data(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.data == json.loads(data)
+
+ def test_devices_by_type_block(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('block')) == set(['/dev/vdb',
+ '/dev/vdc',
+ '/dev/vdg',
+ '/dev/vde',
+ '/dev/vdf',
+ '/dev/vdh'])
+
+ def test_devices_by_type_db(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('db')) == set(['/dev/vdi',
+ '/dev/vdk'])
+
+ def test_devices_by_type_wal(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.devices_by_type('wal') == ['/dev/vdj']
+
+ def test_block_devices(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.block_devices()) == set(['/dev/vdb',
+ '/dev/vdc',
+ '/dev/vdg',
+ '/dev/vde',
+ '/dev/vdf',
+ '/dev/vdh'])
+
+ def test_db_devices(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.db_devices()) == set(['/dev/vdk',
+ '/dev/vdi'])
+
+ def test_wal_devices(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.wal_devices()) == set(['/dev/vdj'])
+
+ def test_all_devices(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.all_devices()) == set(['/dev/vdg',
+ '/dev/vdj',
+ '/dev/vdh',
+ '/dev/vdi',
+ '/dev/vdc',
+ '/dev/vde',
+ '/dev/vdf',
+ '/dev/vdb',
+ '/dev/vdk'])
+
+ def test_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']},
+ '/dev/vdk': {'osd_ids': ['0', '1']},
+ '/dev/vdc': {'osd_ids': ['1']},
+ '/dev/vdf': {'osd_ids': ['2']},
+ '/dev/vde': {'osd_ids': ['3']},
+ '/dev/vdg': {'osd_ids': ['4']},
+ '/dev/vdj': {'osd_ids': ['4', '5']},
+ '/dev/vdi': {'osd_ids': ['4', '5']},
+ '/dev/vdh': {'osd_ids': ['5']}}
+
+ def test_block_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.block_device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']},
+ '/dev/vdc': {'osd_ids': ['1']},
+ '/dev/vdf': {'osd_ids': ['2']},
+ '/dev/vde': {'osd_ids': ['3']},
+ '/dev/vdg': {'osd_ids': ['4']},
+ '/dev/vdh': {'osd_ids': ['5']}}
+
+ def test_db_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.db_device_osd_mapping() == {'/dev/vdk': {'osd_ids': ['0', '1']},
+ '/dev/vdi': {'osd_ids': ['4', '5']}}
+
+ def test_wal_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.wal_device_osd_mapping() == {'/dev/vdj': {'osd_ids': ['4', '5']}}
+
+ def test_is_shared_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/vdj')
+
+ def test_is_shared_device_with_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ with pytest.raises(RuntimeError) as e:
+ assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/invalid-device')
+ assert str(e.value) == 'Not a valid device path.'
+
+ def test_is_block_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.is_block_device('/dev/vdb')
+
+ def test_is_db_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.is_db_device('/dev/vdk')
+
+ def test_is_wal_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.is_wal_device('/dev/vdj')
+
+ def test_get_block_devices_from_osd_id(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert cephadm_module.ceph_volume.lvm_list.get_block_devices_from_osd_id('0') == ['/dev/vdb']
+
+ def test_osd_ids(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ cephadm_module.ceph_volume.lvm_list.get_data('test')
+ assert set(cephadm_module.ceph_volume.lvm_list.osd_ids()) == set(['0', '1', '2', '3', '4', '5'])
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 5a485f98be3..975c125225d 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -2040,7 +2040,7 @@ class TestCephadm(object):
), CephadmOrchestrator.apply_iscsi),
(CustomContainerSpec(
service_id='hello-world',
- image='docker.io/library/hello-world:latest',
+ image='quay.io/hello-world/hello-world:latest',
uid=65534,
gid=65534,
dirs=['foo/bar'],
diff --git a/src/pybind/mgr/cephadm/tests/test_replace_device.py b/src/pybind/mgr/cephadm/tests/test_replace_device.py
new file mode 100644
index 00000000000..b4a2c81ad9a
--- /dev/null
+++ b/src/pybind/mgr/cephadm/tests/test_replace_device.py
@@ -0,0 +1,53 @@
+import pytest
+from mock import patch
+from .fixtures import _run_cephadm, with_host, wait
+from .ceph_volume_data import data
+from cephadm.serve import CephadmServe
+from cephadm import CephadmOrchestrator
+from orchestrator import OrchestratorError
+
+
+class TestReplaceDevice:
+ def test_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ with pytest.raises(OrchestratorError) as e:
+ cephadm_module.replace_device('test', '/dev/invalid-device')
+ assert "/dev/invalid-device doesn't appear to be used for an OSD, not a valid device in test." in str(e.value)
+
+ def test_invalid_hostname(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ with pytest.raises(OrchestratorError):
+ cephadm_module.replace_device('invalid-hostname', '/dev/vdb')
+
+ def test_block_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ c = cephadm_module.replace_device('test', '/dev/vdb')
+ result = wait(cephadm_module, c)
+ assert result == "Scheduled to destroy osds: ['0'] and mark /dev/vdb as being replaced."
+
+ def test_shared_db_device_no_ireallymeanit_flag(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ with pytest.raises(OrchestratorError) as e:
+ cephadm_module.replace_device('test', '/dev/vdk')
+ assert "/dev/vdk is a shared device.\nReplacing /dev/vdk implies destroying OSD(s): ['0', '1'].\nPlease, *be very careful*, this can be a very dangerous operation.\nIf you know what you are doing, pass --yes-i-really-mean-it" in str(e.value)
+
+ def test_shared_db_device(self, cephadm_module: CephadmOrchestrator) -> None:
+ with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._refresh_host_daemons('test')
+ with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)):
+ c = cephadm_module.replace_device('test', '/dev/vdk', yes_i_really_mean_it=True)
+ result = wait(cephadm_module, c)
+ assert result == "Scheduled to destroy osds: ['0', '1'] and mark /dev/vdk as being replaced."
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index a9b7da624a0..4b11a588ad3 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -49,9 +49,9 @@ from typing import Dict, List
cephadm_root_ca = """-----BEGIN CERTIFICATE-----\\nMIIE7DCCAtSgAwIBAgIUE8b2zZ64geu2ns3Zfn3/4L+Cf6MwDQYJKoZIhvcNAQEL\\nBQAwFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MB4XDTI0MDYyNjE0NDA1M1oXDTM0\\nMDYyNzE0NDA1M1owFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MIICIjANBgkqhkiG\\n9w0BAQEFAAOCAg8AMIICCgKCAgEAsZRJsdtTr9GLG1lWFql5SGc46ldFanNJd1Gl\\nqXq5vgZVKRDTmNgAb/XFuNEEmbDAXYIRZolZeYKMHfn0pouPRSel0OsC6/02ZUOW\\nIuN89Wgo3IYleCFpkVIumD8URP3hwdu85plRxYZTtlruBaTRH38lssyCqxaOdEt7\\nAUhvYhcMPJThB17eOSQ73mb8JEC83vB47fosI7IhZuvXvRSuZwUW30rJanWNhyZq\\neS2B8qw2RSO0+77H6gA4ftBnitfsE1Y8/F9Z/f92JOZuSMQXUB07msznPbRJia3f\\nueO8gOc32vxd1A1/Qzp14uX34yEGY9ko2lW226cZO29IVUtXOX+LueQttwtdlpz8\\ne6Npm09pXhXAHxV/OW3M28MdXmobIqT/m9MfkeAErt5guUeC5y8doz6/3VQRjFEn\\nRpN0WkblgnNAQ3DONPc+Qd9Fi/wZV2X7bXoYpNdoWDsEOiE/eLmhG1A2GqU/mneP\\nzQ6u79nbdwTYpwqHpa+PvusXeLfKauzI8lLUJotdXy9EK8iHUofibB61OljYye6B\\nG3b8C4QfGsw8cDb4APZd/6AZYyMx/V3cGZ+GcOV7WvsC8k7yx5Uqasm/kiGQ3EZo\\nuNenNEYoGYrjb8D/8QzqNUTwlEh27/ps80tO7l2GGTvWVZL0PRZbmLDvO77amtOf\\nOiRXMoUCAwEAAaMwMC4wGwYDVR0RBBQwEocQAAAAAAAAAAAAAAAAAAAAATAPBgNV\\nHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4ICAQAxwzX5AhYEWhTV4VUwUj5+\\nqPdl4Q2tIxRokqyE+cDxoSd+6JfGUefUbNyBxDt0HaBq8obDqqrbcytxnn7mpnDu\\nhtiauY+I4Amt7hqFOiFA4cCLi2mfok6g2vL53tvhd9IrsfflAU2wy7hL76Ejm5El\\nA+nXlkJwps01Whl9pBkUvIbOn3pXX50LT4hb5zN0PSu957rjd2xb4HdfuySm6nW4\\n4GxtVWfmGA6zbC4XMEwvkuhZ7kD2qjkAguGDF01uMglkrkCJT3OROlNBuSTSBGqt\\ntntp5VytHvb7KTF7GttM3ha8/EU2KYaHM6WImQQTrOfiImAktOk4B3lzUZX3HYIx\\n+sByO4P4dCvAoGz1nlWYB2AvCOGbKf0Tgrh4t4jkiF8FHTXGdfvWmjgi1pddCNAy\\nn65WOCmVmLZPERAHOk1oBwqyReSvgoCFo8FxbZcNxJdlhM0Z6hzKggm3O3Dl88Xl\\n5euqJjh2STkBW8Xuowkg1TOs5XyWvKoDFAUzyzeLOL8YSG+gXV22gPTUaPSVAqdb\\nwd0Fx2kjConuC5bgTzQHs8XWA930U3XWZraj21Vaa8UxlBLH4fUro8H5lMSYlZNE\\nJHRNW8BkznAClaFSDG3dybLsrzrBFAu/Qb5zVkT1xyq0YkepGB7leXwq6vjWA5Pw\\nmZbKSphWfh0qipoqxqhfkw==\\n-----END CERTIFICATE-----\\n"""
-ceph_generated_cert = """-----BEGIN CERTIFICATE-----\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\n-----END CERTIFICATE-----\n"""
+ceph_generated_cert = """-----BEGIN CERTIFICATE-----\\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\\n-----END CERTIFICATE-----\\n"""
-ceph_generated_key = """-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\n39gnaegswnz9KMQAvzKFdg==\n-----END PRIVATE KEY-----\n"""
+ceph_generated_key = """-----BEGIN PRIVATE KEY-----\\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\\n39gnaegswnz9KMQAvzKFdg==\\n-----END PRIVATE KEY-----\\n"""
class FakeInventory:
@@ -608,6 +608,101 @@ class TestMonitoring:
@patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash')
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert')
@patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: ('mycert', 'mykey'))
+ def test_alertmanager_config_when_mgmt_gw_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
+
+ fqdn = 'host1.test'
+ _get_fqdn.return_value = fqdn
+
+ with with_host(cephadm_module, 'test'):
+ cephadm_module.secure_monitoring_stack = True
+ cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user')
+ cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password')
+ with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \
+ with_service(cephadm_module, AlertManagerSpec()):
+
+ y = dedent("""
+ # This file is generated by cephadm.
+ # See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+ global:
+ resolve_timeout: 5m
+ http_config:
+ tls_config:
+ ca_file: root_cert.pem
+
+ route:
+ receiver: 'default'
+ routes:
+ - group_by: ['alertname']
+ group_wait: 10s
+ group_interval: 10s
+ repeat_interval: 1h
+ receiver: 'ceph-dashboard'
+
+ receivers:
+ - name: 'default'
+ webhook_configs:
+ - name: 'ceph-dashboard'
+ webhook_configs:
+ - url: 'https://host_fqdn:29443/internal/dashboard/api/prometheus_receiver'
+ """).lstrip()
+
+ web_config = dedent("""
+ tls_server_config:
+ cert_file: alertmanager.crt
+ key_file: alertmanager.key
+ client_auth_type: RequireAndVerifyClientCert
+ client_ca_file: root_cert.pem
+ basic_auth_users:
+ alertmanager_user: alertmanager_password_hash
+ """).lstrip()
+
+ _run_cephadm.assert_called_with(
+ 'test',
+ "alertmanager.test",
+ ['_orch', 'deploy'],
+ [],
+ stdin=json.dumps({
+ "fsid": "fsid",
+ "name": 'alertmanager.test',
+ "image": '',
+ "deploy_arguments": [],
+ "params": {
+ 'tcp_ports': [9093, 9094],
+ },
+ "meta": {
+ 'service_name': 'alertmanager',
+ 'ports': [9093, 9094],
+ 'ip': None,
+ 'deployed_by': [],
+ 'rank': None,
+ 'rank_generation': None,
+ 'extra_container_args': None,
+ 'extra_entrypoint_args': None,
+ },
+ "config_blobs": {
+ "files": {
+ "alertmanager.yml": y,
+ 'alertmanager.crt': 'mycert',
+ 'alertmanager.key': 'mykey',
+ 'web.yml': web_config,
+ 'root_cert.pem': 'cephadm_root_cert'
+ },
+ 'peers': [],
+ 'web_config': '/etc/alertmanager/web.yml',
+ "use_url_prefix": True,
+ }
+ }),
+ use_current_daemon_image=False,
+ )
+
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("socket.getfqdn")
+ @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
+ @patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash')
+ @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert')
+ @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: ('mycert', 'mykey'))
def test_alertmanager_config_security_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator):
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
@@ -739,6 +834,110 @@ class TestMonitoring:
use_current_daemon_image=False)
@patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("mgr_module.MgrModule.get")
+ @patch("socket.getfqdn")
+ def test_node_exporter_config_without_mgmt_gw(
+ self,
+ mock_getfqdn,
+ mock_get,
+ _run_cephadm,
+ cephadm_module: CephadmOrchestrator,
+ ):
+ _run_cephadm.side_effect = async_side_effect(("{}", "", 0))
+ fqdn = 'host1.test'
+ mock_getfqdn.return_value = fqdn
+
+ with with_host(cephadm_module, "test"):
+ with with_service(cephadm_module, MonitoringSpec('node-exporter')):
+ _run_cephadm.assert_called_with(
+ 'test',
+ "node-exporter.test",
+ ['_orch', 'deploy'],
+ [],
+ stdin=json.dumps({
+ "fsid": "fsid",
+ "name": 'node-exporter.test',
+ "image": '',
+ "deploy_arguments": [],
+ "params": {
+ 'tcp_ports': [9100],
+ },
+ "meta": {
+ 'service_name': 'node-exporter',
+ 'ports': [9100],
+ 'ip': None,
+ 'deployed_by': [],
+ 'rank': None,
+ 'rank_generation': None,
+ 'extra_container_args': None,
+ 'extra_entrypoint_args': None,
+ },
+ "config_blobs": {}
+ }),
+ use_current_daemon_image=False,
+ )
+
+ @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: (ceph_generated_cert, ceph_generated_key))
+ @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("socket.getfqdn")
+ def test_node_exporter_config_with_mgmt_gw(
+ self,
+ mock_getfqdn,
+ _run_cephadm,
+ cephadm_module: CephadmOrchestrator,
+ ):
+ _run_cephadm.side_effect = async_side_effect(("{}", "", 0))
+ mock_getfqdn.return_value = 'host1.test'
+
+ y = dedent("""
+ tls_server_config:
+ cert_file: node_exporter.crt
+ key_file: node_exporter.key
+ client_auth_type: RequireAndVerifyClientCert
+ client_ca_file: root_cert.pem
+ """).lstrip()
+
+ with with_host(cephadm_module, "test"):
+ with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \
+ with_service(cephadm_module, MonitoringSpec('node-exporter')):
+ _run_cephadm.assert_called_with(
+ 'test',
+ "node-exporter.test",
+ ['_orch', 'deploy'],
+ [],
+ stdin=json.dumps({
+ "fsid": "fsid",
+ "name": 'node-exporter.test',
+ "image": '',
+ "deploy_arguments": [],
+ "params": {
+ 'tcp_ports': [9100],
+ },
+ "meta": {
+ 'service_name': 'node-exporter',
+ 'ports': [9100],
+ 'ip': None,
+ 'deployed_by': [],
+ 'rank': None,
+ 'rank_generation': None,
+ 'extra_container_args': None,
+ 'extra_entrypoint_args': None,
+ },
+ "config_blobs": {
+ "files": {
+ "web.yml": y,
+ 'root_cert.pem': f"{cephadm_root_ca}",
+ 'node_exporter.crt': f"{ceph_generated_cert}",
+ 'node_exporter.key': f"{ceph_generated_key}",
+ },
+ 'web_config': '/etc/node-exporter/web.yml',
+ }
+ }),
+ use_current_daemon_image=False,
+ )
+
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
_run_cephadm.side_effect = async_side_effect(('{}', '', 0))
@@ -1244,6 +1443,286 @@ class TestMonitoring:
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4')
@patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn')
@patch("cephadm.services.monitoring.verify_tls", lambda *_: None)
+ @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
+ def test_grafana_config_with_mgmt_gw_and_oauth2_proxy(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ _run_cephadm.side_effect = async_side_effect(("{}", "", 0))
+
+ y = dedent(f"""
+ # This file is generated by cephadm.
+ apiVersion: 1
+
+ deleteDatasources:
+ - name: 'Dashboard1'
+ orgId: 1
+
+ datasources:
+ - name: 'Dashboard1'
+ type: 'prometheus'
+ access: 'proxy'
+ orgId: 1
+ url: 'https://host_fqdn:29443/internal/prometheus'
+ basicAuth: true
+ isDefault: true
+ editable: false
+ basicAuthUser: admin
+ jsonData:
+ graphiteVersion: "1.1"
+ tlsAuth: false
+ tlsAuthWithCACert: true
+ tlsSkipVerify: false
+ secureJsonData:
+ basicAuthPassword: admin
+ tlsCACert: "{cephadm_root_ca}"
+ tlsClientCert: "{ceph_generated_cert}"
+ tlsClientKey: "{ceph_generated_key}"
+
+ - name: 'Loki'
+ type: 'loki'
+ access: 'proxy'
+ url: ''
+ basicAuth: false
+ isDefault: false
+ editable: false""").lstrip()
+
+ oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider',
+ client_id='my_client_id',
+ client_secret='my_client_secret',
+ oidc_issuer_url='http://192.168.10.10:8888/dex',
+ cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=',
+ ssl_certificate=ceph_generated_cert,
+ ssl_certificate_key=ceph_generated_key)
+
+ with with_host(cephadm_module, "test"):
+ cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test')
+ cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test')
+ with with_service(cephadm_module, PrometheusSpec("prometheus")) as _, \
+ with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \
+ with_service(cephadm_module, oauth2_spec) as _, \
+ with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service(
+ cephadm_module, GrafanaSpec("grafana")
+ ) as _:
+ files = {
+ 'grafana.ini': dedent("""
+ # This file is generated by cephadm.
+ [users]
+ default_theme = light
+ [auth.anonymous]
+ enabled = true
+ org_name = 'Main Org.'
+ org_role = 'Viewer'
+ [server]
+ domain = 'host_fqdn'
+ protocol = https
+ cert_file = /etc/grafana/certs/cert_file
+ cert_key = /etc/grafana/certs/cert_key
+ http_port = 3000
+ http_addr =
+ root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/
+ serve_from_sub_path = true
+ [snapshots]
+ external_enabled = false
+ [security]
+ disable_initial_admin_creation = true
+ cookie_secure = true
+ cookie_samesite = none
+ allow_embedding = true
+ [auth]
+ disable_login_form = true
+ [auth.proxy]
+ enabled = true
+ header_name = X-WEBAUTH-USER
+ header_property = username
+ auto_sign_up = true
+ sync_ttl = 15
+ whitelist = 1::4
+ headers_encoded = false
+ enable_login_token = false
+ headers = Role:X-WEBAUTH-ROLE\n""").lstrip(), # noqa: W291
+ "provisioning/datasources/ceph-dashboard.yml": y,
+ 'certs/cert_file': dedent(f"""
+ # generated by cephadm\n{ceph_generated_cert}""").lstrip(),
+ 'certs/cert_key': dedent(f"""
+ # generated by cephadm\n{ceph_generated_key}""").lstrip(),
+ 'provisioning/dashboards/default.yml': dedent("""
+ # This file is generated by cephadm.
+ apiVersion: 1
+
+ providers:
+ - name: 'Ceph Dashboard'
+ orgId: 1
+ folder: ''
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 3
+ editable: false
+ options:
+ path: '/etc/grafana/provisioning/dashboards'""").lstrip(),
+ }
+
+ _run_cephadm.assert_called_with(
+ 'test',
+ "grafana.test",
+ ['_orch', 'deploy'],
+ [],
+ stdin=json.dumps({
+ "fsid": "fsid",
+ "name": 'grafana.test',
+ "image": '',
+ "deploy_arguments": [],
+ "params": {
+ 'tcp_ports': [3000],
+ },
+ "meta": {
+ 'service_name': 'grafana',
+ 'ports': [3000],
+ 'ip': None,
+ 'deployed_by': [],
+ 'rank': None,
+ 'rank_generation': None,
+ 'extra_container_args': None,
+ 'extra_entrypoint_args': None,
+ },
+ "config_blobs": {
+ "files": files,
+ },
+ }),
+ use_current_daemon_image=False,
+ )
+
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4')
+ @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn')
+ @patch("cephadm.services.monitoring.verify_tls", lambda *_: None)
+ @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
+ def test_grafana_config_with_mgmt_gw(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ _run_cephadm.side_effect = async_side_effect(("{}", "", 0))
+
+ y = dedent(f"""
+ # This file is generated by cephadm.
+ apiVersion: 1
+
+ deleteDatasources:
+ - name: 'Dashboard1'
+ orgId: 1
+
+ datasources:
+ - name: 'Dashboard1'
+ type: 'prometheus'
+ access: 'proxy'
+ orgId: 1
+ url: 'https://host_fqdn:29443/internal/prometheus'
+ basicAuth: true
+ isDefault: true
+ editable: false
+ basicAuthUser: admin
+ jsonData:
+ graphiteVersion: "1.1"
+ tlsAuth: false
+ tlsAuthWithCACert: true
+ tlsSkipVerify: false
+ secureJsonData:
+ basicAuthPassword: admin
+ tlsCACert: "{cephadm_root_ca}"
+ tlsClientCert: "{ceph_generated_cert}"
+ tlsClientKey: "{ceph_generated_key}"
+
+ - name: 'Loki'
+ type: 'loki'
+ access: 'proxy'
+ url: ''
+ basicAuth: false
+ isDefault: false
+ editable: false""").lstrip()
+
+ with with_host(cephadm_module, "test"):
+ cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test')
+ cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test')
+ with with_service(
+ cephadm_module, PrometheusSpec("prometheus")
+ ) as _, with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \
+ with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service(
+ cephadm_module, GrafanaSpec("grafana")
+ ) as _:
+ files = {
+ 'grafana.ini': dedent("""
+ # This file is generated by cephadm.
+ [users]
+ default_theme = light
+ [auth.anonymous]
+ enabled = true
+ org_name = 'Main Org.'
+ org_role = 'Viewer'
+ [server]
+ domain = 'host_fqdn'
+ protocol = https
+ cert_file = /etc/grafana/certs/cert_file
+ cert_key = /etc/grafana/certs/cert_key
+ http_port = 3000
+ http_addr =
+ root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/
+ serve_from_sub_path = true
+ [snapshots]
+ external_enabled = false
+ [security]
+ disable_initial_admin_creation = true
+ cookie_secure = true
+ cookie_samesite = none
+ allow_embedding = true\n""").lstrip(), # noqa: W291
+ "provisioning/datasources/ceph-dashboard.yml": y,
+ 'certs/cert_file': dedent(f"""
+ # generated by cephadm\n{ceph_generated_cert}""").lstrip(),
+ 'certs/cert_key': dedent(f"""
+ # generated by cephadm\n{ceph_generated_key}""").lstrip(),
+ 'provisioning/dashboards/default.yml': dedent("""
+ # This file is generated by cephadm.
+ apiVersion: 1
+
+ providers:
+ - name: 'Ceph Dashboard'
+ orgId: 1
+ folder: ''
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 3
+ editable: false
+ options:
+ path: '/etc/grafana/provisioning/dashboards'""").lstrip(),
+ }
+
+ _run_cephadm.assert_called_with(
+ 'test',
+ "grafana.test",
+ ['_orch', 'deploy'],
+ [],
+ stdin=json.dumps({
+ "fsid": "fsid",
+ "name": 'grafana.test',
+ "image": '',
+ "deploy_arguments": [],
+ "params": {
+ 'tcp_ports': [3000],
+ },
+ "meta": {
+ 'service_name': 'grafana',
+ 'ports': [3000],
+ 'ip': None,
+ 'deployed_by': [],
+ 'rank': None,
+ 'rank_generation': None,
+ 'extra_container_args': None,
+ 'extra_entrypoint_args': None,
+ },
+ "config_blobs": {
+ "files": files,
+ },
+ }),
+ use_current_daemon_image=False,
+ )
+
+ @patch("cephadm.serve.CephadmServe._run_cephadm")
+ @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4')
+ @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn')
+ @patch("cephadm.services.monitoring.verify_tls", lambda *_: None)
def test_grafana_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
_run_cephadm.side_effect = async_side_effect(("{}", "", 0))
@@ -2710,6 +3189,7 @@ class TestIngressService:
' Enable_RQUOTA = false;\n'
' Protocols = 4;\n'
' NFS_Port = 2049;\n'
+ ' allow_set_io_flusher_fail = true;\n'
' HAProxy_Hosts = 192.168.122.111, 10.10.2.20, 192.168.122.222;\n'
'}\n'
'\n'
@@ -3296,7 +3776,7 @@ class TestMgmtGateway:
@patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1')
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
@patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
- def test_mgmt_gateway_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
+ def test_mgmt_gw_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
def get_services_endpoints(name):
if name == 'prometheus':
@@ -3417,11 +3897,11 @@ class TestMgmtGateway:
}
location /grafana {
- rewrite ^/grafana/(.*) /$1 break;
proxy_pass https://grafana_servers;
# clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser
# will send this header if Grafana is running on the same node as one of those services
proxy_set_header Authorization "";
+ proxy_buffering off;
}
location /prometheus {
@@ -3446,6 +3926,9 @@ class TestMgmtGateway:
}"""),
"nginx_internal_server.conf": dedent("""
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen 29443 ssl;
listen [::]:29443 ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
@@ -3518,7 +4001,7 @@ class TestMgmtGateway:
@patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca)
@patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https"))
@patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_oauth2_service_url", lambda _: "https://192.168.100.102:4180")
- def test_mgmt_gateway_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
+ def test_mgmt_gw_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator):
def get_services_endpoints(name):
if name == 'prometheus':
@@ -3689,11 +4172,11 @@ class TestMgmtGateway:
}
location /grafana {
- rewrite ^/grafana/(.*) /$1 break;
proxy_pass https://grafana_servers;
# clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser
# will send this header if Grafana is running on the same node as one of those services
proxy_set_header Authorization "";
+ proxy_buffering off;
auth_request /oauth2/auth;
error_page 401 = /oauth2/sign_in;
@@ -3760,6 +4243,9 @@ class TestMgmtGateway:
}"""),
"nginx_internal_server.conf": dedent("""
server {
+ ssl_client_certificate /etc/nginx/ssl/ca.crt;
+ ssl_verify_client on;
+
listen 29443 ssl;
listen [::]:29443 ssl;
ssl_certificate /etc/nginx/ssl/nginx_internal.crt;
diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py
index 78a2d73118f..42e590945cd 100644
--- a/src/pybind/mgr/cephadm/tests/test_spec.py
+++ b/src/pybind/mgr/cephadm/tests/test_spec.py
@@ -130,7 +130,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "d94d7969094d",
"container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0",
- "container_image_name": "docker.io/prom/alertmanager:latest",
+ "container_image_name": "quay.io/prometheus/alertmanager:latest",
"daemon_id": "ceph-001",
"daemon_type": "alertmanager",
"version": "0.20.0",
@@ -145,7 +145,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "c4b036202241",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001",
"daemon_type": "crash",
"version": "15.2.0",
@@ -160,7 +160,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "5b7b94b48f31",
"container_image_id": "87a51ecf0b1c9a7b187b21c1b071425dafea0d765a96d5bc371c791169b3d7f4",
- "container_image_name": "docker.io/ceph/ceph-grafana:latest",
+ "container_image_name": "quay.io/ceph/ceph-grafana:latest",
"daemon_id": "ceph-001",
"daemon_type": "grafana",
"version": "6.6.2",
@@ -175,7 +175,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "9ca007280456",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001.gkjwqp",
"daemon_type": "mgr",
"version": "15.2.0",
@@ -190,7 +190,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "3d1ba9a2b697",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "ceph-001",
"daemon_type": "mon",
"version": "15.2.0",
@@ -205,7 +205,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "36d026c68ba1",
"container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87",
- "container_image_name": "docker.io/prom/node-exporter:latest",
+ "container_image_name": "quay.io/prometheus/node-exporter:latest",
"daemon_id": "ceph-001",
"daemon_type": "node-exporter",
"version": "0.18.1",
@@ -220,7 +220,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "faf76193cbfe",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "0",
"daemon_type": "osd",
"version": "15.2.0",
@@ -235,7 +235,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "f82505bae0f1",
"container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1",
- "container_image_name": "docker.io/ceph/ceph:v15",
+ "container_image_name": "quay.io/ceph/ceph:v15",
"daemon_id": "1",
"daemon_type": "osd",
"version": "15.2.0",
@@ -250,7 +250,7 @@ def test_spec_octopus(spec_json):
"hostname": "ceph-001",
"container_id": "2708d84cd484",
"container_image_id": "358a0d2395fe711bb8258e8fb4b2d7865c0a9a6463969bcd1452ee8869ea6653",
- "container_image_name": "docker.io/prom/prometheus:latest",
+ "container_image_name": "quay.io/prom/prometheus:latest",
"daemon_id": "ceph-001",
"daemon_type": "prometheus",
"version": "2.17.1",
@@ -569,7 +569,7 @@ def test_dd_octopus(dd_json):
CustomContainerSpec(
service_type='container',
service_id='hello-world',
- image='docker.io/library/hello-world:latest',
+ image='quay.io/hello-world/hello-world:latest',
),
DaemonDescription(
daemon_type='container',
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index d8ffab2da51..ed3d26807e5 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -29,17 +29,17 @@ CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
def normalize_image_digest(digest: str, default_registry: str) -> str:
"""
Normal case:
- >>> normalize_image_digest('ceph/ceph', 'docker.io')
- 'docker.io/ceph/ceph'
+ >>> normalize_image_digest('ceph/ceph', 'quay.io')
+ 'quay.io/ceph/ceph'
No change:
- >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io')
+ >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io')
'quay.ceph.io/ceph/ceph'
- >>> normalize_image_digest('docker.io/ubuntu', 'docker.io')
- 'docker.io/ubuntu'
+ >>> normalize_image_digest('quay.io/centos', 'quay.io')
+ 'quay.io/centos'
- >>> normalize_image_digest('localhost/ceph', 'docker.io')
+ >>> normalize_image_digest('localhost/ceph', 'quay.io')
'localhost/ceph'
"""
known_shortnames = [
diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py
index 757b9e8ac02..519c310a98b 100644
--- a/src/pybind/mgr/dashboard/controllers/nvmeof.py
+++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py
@@ -63,7 +63,10 @@ else:
@EndpointDoc(
"Get information from a specific NVMeoF subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_model(model.Subsystem, first="subsystems")
@handle_nvmeof_error
@@ -78,6 +81,7 @@ else:
"nqn": Param(str, "NVMeoF subsystem NQN"),
"max_namespaces": Param(int, "Maximum number of namespaces", True, 1024),
"enable_ha": Param(bool, "Enable high availability"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -95,6 +99,7 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"force": Param(bool, "Force delete", "false"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -111,12 +116,15 @@ else:
class NVMeoFListener(RESTController):
@EndpointDoc(
"List all NVMeoF listeners",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Listener, pick="listeners")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_listeners(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_listeners(
NVMeoFClient.pb2.list_listeners_req(subsystem=nqn)
)
@@ -128,6 +136,7 @@ else:
"traddr": Param(str, "NVMeoF transport address"),
"trsvcid": Param(int, "NVMeoF transport service port", True, 4420),
"adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -138,9 +147,10 @@ else:
host_name: str,
traddr: str,
trsvcid: int = 4420,
- adrfam: int = 0, # IPv4
+ adrfam: int = 0, # IPv4
+ gw_group: Optional[str] = None
):
- return NVMeoFClient(traddr=traddr).stub.create_listener(
+ return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.create_listener(
NVMeoFClient.pb2.create_listener_req(
nqn=nqn,
host_name=host_name,
@@ -158,6 +168,7 @@ else:
"traddr": Param(str, "NVMeoF transport address"),
"trsvcid": Param(int, "NVMeoF transport service port", True, 4420),
"adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -170,8 +181,9 @@ else:
trsvcid: int = 4420,
adrfam: int = 0, # IPv4
force: bool = False,
+ gw_group: Optional[str] = None
):
- return NVMeoFClient().stub.delete_listener(
+ return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.delete_listener(
NVMeoFClient.pb2.delete_listener_req(
nqn=nqn,
host_name=host_name,
@@ -187,12 +199,15 @@ else:
class NVMeoFNamespace(RESTController):
@EndpointDoc(
"List all NVMeoF namespaces in a subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Namespace, pick="namespaces")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_namespaces(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_namespaces(
NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn)
)
@@ -201,12 +216,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.Namespace, first="namespaces")
@handle_nvmeof_error
- def get(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.list_namespaces(
+ def get(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_namespaces(
NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn, nsid=int(nsid))
)
@@ -217,12 +233,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.NamespaceIOStats)
@handle_nvmeof_error
- def io_stats(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.namespace_get_io_stats(
+ def io_stats(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_get_io_stats(
NVMeoFClient.pb2.namespace_get_io_stats_req(
subsystem_nqn=nqn, nsid=int(nsid))
)
@@ -237,6 +254,7 @@ else:
"size": Param(int, "RBD image size"),
"block_size": Param(int, "NVMeoF namespace block size"),
"load_balancing_group": Param(int, "Load balancing group"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@map_model(model.NamespaceCreation)
@@ -250,8 +268,9 @@ else:
size: Optional[int] = 1024,
block_size: int = 512,
load_balancing_group: Optional[int] = None,
+ gw_group: Optional[str] = None,
):
- return NVMeoFClient().stub.namespace_add(
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_add(
NVMeoFClient.pb2.namespace_add_req(
subsystem_nqn=nqn,
rbd_image_name=rbd_image_name,
@@ -274,6 +293,7 @@ else:
"rw_mbytes_per_second": Param(int, "Read/Write MB/s"),
"r_mbytes_per_second": Param(int, "Read MB/s"),
"w_mbytes_per_second": Param(int, "Write MB/s"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@@ -288,12 +308,13 @@ else:
rw_mbytes_per_second: Optional[int] = None,
r_mbytes_per_second: Optional[int] = None,
w_mbytes_per_second: Optional[int] = None,
+ gw_group: Optional[str] = None
):
if rbd_image_size:
mib = 1024 * 1024
new_size_mib = int((rbd_image_size + mib - 1) / mib)
- response = NVMeoFClient().stub.namespace_resize(
+ response = NVMeoFClient(gw_group=gw_group).stub.namespace_resize(
NVMeoFClient.pb2.namespace_resize_req(
subsystem_nqn=nqn, nsid=int(nsid), new_size=new_size_mib
)
@@ -336,12 +357,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"nsid": Param(str, "NVMeoF Namespace ID"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def delete(self, nqn: str, nsid: str):
- return NVMeoFClient().stub.namespace_delete(
+ def delete(self, nqn: str, nsid: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.namespace_delete(
NVMeoFClient.pb2.namespace_delete_req(subsystem_nqn=nqn, nsid=int(nsid))
)
@@ -351,7 +373,10 @@ else:
class NVMeoFHost(RESTController):
@EndpointDoc(
"List all allowed hosts for an NVMeoF subsystem",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(
model.Host,
@@ -362,8 +387,8 @@ else:
else o,
)
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_hosts(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_hosts(
NVMeoFClient.pb2.list_hosts_req(subsystem=nqn)
)
@@ -372,12 +397,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to allow any host.'),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def create(self, nqn: str, host_nqn: str):
- return NVMeoFClient().stub.add_host(
+ def create(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.add_host(
NVMeoFClient.pb2.add_host_req(subsystem_nqn=nqn, host_nqn=host_nqn)
)
@@ -386,12 +412,13 @@ else:
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
"host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to disallow any host.'),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
},
)
@empty_response
@handle_nvmeof_error
- def delete(self, nqn: str, host_nqn: str):
- return NVMeoFClient().stub.remove_host(
+ def delete(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.remove_host(
NVMeoFClient.pb2.remove_host_req(subsystem_nqn=nqn, host_nqn=host_nqn)
)
@@ -400,12 +427,15 @@ else:
class NVMeoFConnection(RESTController):
@EndpointDoc(
"List all NVMeoF Subsystem Connections",
- parameters={"nqn": Param(str, "NVMeoF subsystem NQN")},
+ parameters={
+ "nqn": Param(str, "NVMeoF subsystem NQN"),
+ "gw_group": Param(str, "NVMeoF gateway group", True, None),
+ },
)
@map_collection(model.Connection, pick="connections")
@handle_nvmeof_error
- def list(self, nqn: str):
- return NVMeoFClient().stub.list_connections(
+ def list(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_connections(
NVMeoFClient.pb2.list_connections_req(subsystem=nqn)
)
@@ -433,16 +463,17 @@ else:
parameters={
'subsystem_nqn': (str, 'Subsystem NQN'),
"host_nqn": Param(str, 'Comma separated list of NVMeoF host NQNs'),
+ "gw_group": Param(str, "NVMeoF gateway group")
})
@empty_response
@handle_nvmeof_error
@CreatePermission
- def add(self, subsystem_nqn: str, host_nqn: str = ""):
+ def add(self, subsystem_nqn: str, gw_group: str, host_nqn: str = ""):
response = None
all_host_nqns = host_nqn.split(',')
for nqn in all_host_nqns:
- response = NVMeoFClient().stub.add_host(
+ response = NVMeoFClient(gw_group=gw_group).stub.add_host(
NVMeoFClient.pb2.add_host_req(subsystem_nqn=subsystem_nqn, host_nqn=nqn)
)
if response.status != 0:
@@ -454,16 +485,17 @@ else:
parameters={
"subsystem_nqn": Param(str, "NVMeoF subsystem NQN"),
"host_nqn": Param(str, 'Comma separated list of NVMeoF host NQN.'),
+ "gw_group": Param(str, "NVMeoF gateway group")
})
@empty_response
@handle_nvmeof_error
@DeletePermission
- def remove(self, subsystem_nqn: str, host_nqn: str):
+ def remove(self, subsystem_nqn: str, host_nqn: str, gw_group: str):
response = None
to_delete_nqns = host_nqn.split(',')
for del_nqn in to_delete_nqns:
- response = NVMeoFClient().stub.remove_host(
+ response = NVMeoFClient(gw_group=gw_group).stub.remove_host(
NVMeoFClient.pb2.remove_host_req(subsystem_nqn=subsystem_nqn, host_nqn=del_nqn)
)
if response.status != 0:
diff --git a/src/pybind/mgr/dashboard/controllers/osd.py b/src/pybind/mgr/dashboard/controllers/osd.py
index c9d14177200..07d8db7755b 100644
--- a/src/pybind/mgr/dashboard/controllers/osd.py
+++ b/src/pybind/mgr/dashboard/controllers/osd.py
@@ -5,12 +5,14 @@ import logging
import time
from typing import Any, Dict, List, Optional, Union
+import cherrypy
from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError # type: ignore
from mgr_util import get_most_recent_rate
from .. import mgr
from ..exceptions import DashboardException
from ..security import Scope
+from ..services._paginate import ListPaginator
from ..services.ceph_service import CephService, SendCommandError
from ..services.exception import handle_orchestrator_error, handle_send_command_error
from ..services.orchestrator import OrchClient, OrchFeature
@@ -121,8 +123,30 @@ def osd_task(name, metadata, wait_for=2.0):
@APIRouter('/osd', Scope.OSD)
@APIDoc('OSD management API', 'OSD')
class Osd(RESTController):
- def list(self):
- osds = self.get_osd_map()
+ @RESTController.MethodMap(version=APIVersion(1, 1))
+ def list(self, offset: int = 0, limit: int = 10,
+ search: str = '', sort: str = ''):
+ all_osds = self.get_osd_map()
+
+ paginator = ListPaginator(int(offset), int(limit), sort, search,
+ input_list=all_osds.values(),
+ searchable_params=['id'],
+ sortable_params=['id'],
+ default_sort='+id')
+
+ cherrypy.response.headers['X-Total-Count'] = paginator.get_count()
+
+ paginated_osds_list = list(paginator.list())
+ # creating a dictionary to have faster lookups
+ paginated_osds_by_id = {osd['id']: osd for osd in paginated_osds_list}
+ try:
+ osds = {
+ key: paginated_osds_by_id[int(key)]
+ for key in all_osds.keys()
+ if int(key) in paginated_osds_by_id
+ }
+ except ValueError as e:
+ raise DashboardException(e, component='osd', http_status_code=400)
# Extending by osd stats information
for stat in mgr.get('osd_stats')['osd_stats']:
diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py
index 8667d469060..2e6e466f97b 100755
--- a/src/pybind/mgr/dashboard/controllers/rgw.py
+++ b/src/pybind/mgr/dashboard/controllers/rgw.py
@@ -162,9 +162,9 @@ class RgwMultisiteController(RESTController):
@ReadPermission
@allow_empty_body
# pylint: disable=W0102,W0613
- def get_sync_status(self):
+ def get_sync_status(self, daemon_name=None):
multisite_instance = RgwMultisite()
- result = multisite_instance.get_multisite_sync_status()
+ result = multisite_instance.get_multisite_sync_status(daemon_name)
return result
@Endpoint(path='/sync-policy')
@@ -176,6 +176,15 @@ class RgwMultisiteController(RESTController):
if all_policy:
sync_policy_list = []
buckets = json.loads(RgwBucket().list(stats=False))
+ zonegroups_info = RgwMultisite().get_all_zonegroups_info()
+ default_zonegroup = ''
+ if 'zonegroups' in zonegroups_info and 'default_zonegroup' in zonegroups_info:
+ default_zonegroup = next(
+ (zonegroup['name'] for zonegroup in zonegroups_info['zonegroups']
+ if 'id' in zonegroup and 'name' in zonegroup
+ and zonegroup['id'] == zonegroups_info['default_zonegroup']),
+ ''
+ )
for bucket in buckets:
sync_policy = multisite_instance.get_sync_policy(bucket, zonegroup_name)
for policy in sync_policy['groups']:
@@ -183,6 +192,7 @@ class RgwMultisiteController(RESTController):
sync_policy_list.append(policy)
other_sync_policy = multisite_instance.get_sync_policy(bucket_name, zonegroup_name)
for policy in other_sync_policy['groups']:
+ policy['zonegroup'] = default_zonegroup
sync_policy_list.append(policy)
return sync_policy_list
return multisite_instance.get_sync_policy(bucket_name, zonegroup_name)
@@ -244,11 +254,13 @@ class RgwMultisiteController(RESTController):
source_zones: Dict[str, Any],
destination_zones: Dict[str, Any],
source_bucket: str = '',
- destination_bucket: str = '', bucket_name: str = ''):
+ destination_bucket: str = '', bucket_name: str = '',
+ user: str = '', mode: str = ''):
multisite_instance = RgwMultisite()
return multisite_instance.create_sync_pipe(group_id, pipe_id, source_zones,
destination_zones, source_bucket,
- destination_bucket, bucket_name, True)
+ destination_bucket, bucket_name, True,
+ user, mode)
@Endpoint(method='DELETE', path='/sync-pipe')
@EndpointDoc("Remove the sync pipe")
@@ -256,12 +268,10 @@ class RgwMultisiteController(RESTController):
def remove_sync_pipe(self, group_id: str, pipe_id: str,
source_zones: Optional[List[str]] = None,
destination_zones: Optional[List[str]] = None,
- destination_bucket: str = '',
bucket_name: str = ''):
multisite_instance = RgwMultisite()
return multisite_instance.remove_sync_pipe(group_id, pipe_id, source_zones,
- destination_zones, destination_bucket,
- bucket_name, True)
+ destination_zones, bucket_name, True)
@APIRouter('/rgw/daemon', Scope.RGW)
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts
index 3a143a1a8df..32f7c76a362 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts
@@ -10,7 +10,7 @@ import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { FinishedTask } from '~/app/shared/models/finished-task';
import { ActivatedRoute, Router } from '@angular/router';
-import { NvmeofService } from '~/app/shared/api/nvmeof.service';
+import { InitiatorRequest, NvmeofService } from '~/app/shared/api/nvmeof.service';
@Component({
selector: 'cd-nvmeof-initiators-form',
@@ -26,6 +26,7 @@ export class NvmeofInitiatorsFormComponent implements OnInit {
remove: boolean = false;
subsystemNQN: string;
removeHosts: { name: string; value: boolean; id: number }[] = [];
+ group: string;
constructor(
private authStorageService: AuthStorageService,
@@ -52,6 +53,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit {
);
ngOnInit() {
+ this.route.queryParams.subscribe((params) => {
+ this.group = params?.['group'];
+ });
this.createForm();
this.action = this.actionLabels.ADD;
this.route.params.subscribe((params: { subsystem_nqn: string }) => {
@@ -108,8 +112,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit {
const hosts: string[] = this.addedHosts.value;
let taskUrl = `nvmeof/initiator/${URLVerbs.ADD}`;
- const request = {
- host_nqn: hosts.join(',')
+ const request: InitiatorRequest = {
+ host_nqn: hosts.join(','),
+ gw_group: this.group
};
if (allowAnyHost) {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts
index fff38e6985a..a5575a9c926 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts
@@ -1,4 +1,4 @@
-import { Component, Input, OnChanges, OnInit, TemplateRef, ViewChild } from '@angular/core';
+import { Component, Input, OnInit, TemplateRef, ViewChild } from '@angular/core';
import { Router } from '@angular/router';
import { NvmeofService } from '~/app/shared/api/nvmeof.service';
import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component';
@@ -20,9 +20,11 @@ const BASE_URL = 'block/nvmeof/subsystems';
templateUrl: './nvmeof-initiators-list.component.html',
styleUrls: ['./nvmeof-initiators-list.component.scss']
})
-export class NvmeofInitiatorsListComponent implements OnInit, OnChanges {
+export class NvmeofInitiatorsListComponent implements OnInit {
@Input()
subsystemNQN: string;
+ @Input()
+ group: string;
@ViewChild('hostTpl', { static: true })
hostTpl: TemplateRef<any>;
@@ -58,10 +60,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges {
permission: 'create',
icon: Icons.add,
click: () =>
- this.router.navigate([
- BASE_URL,
- { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } }
- ]),
+ this.router.navigate(
+ [BASE_URL, { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } }],
+ { queryParams: { group: this.group } }
+ ),
canBePrimary: (selection: CdTableSelection) => !selection.hasSelection
},
{
@@ -79,17 +81,13 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges {
return this.selection.selected.findIndex((selected) => selected.nqn === '*');
}
- ngOnChanges() {
- this.listInitiators();
- }
-
updateSelection(selection: CdTableSelection) {
this.selection = selection;
}
listInitiators() {
this.nvmeofService
- .getInitiators(this.subsystemNQN)
+ .getInitiators(this.subsystemNQN, this.group)
.subscribe((initiators: NvmeofSubsystemInitiator[]) => {
this.initiators = initiators;
});
@@ -118,7 +116,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges {
nqn: this.subsystemNQN,
plural: itemNames.length > 1
}),
- call: this.nvmeofService.removeInitiators(this.subsystemNQN, { host_nqn })
+ call: this.nvmeofService.removeInitiators(this.subsystemNQN, {
+ host_nqn,
+ gw_group: this.group
+ })
})
});
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
index cd362bf8abe..8310e65d203 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
@@ -103,7 +103,8 @@ export class NvmeofListenersFormComponent implements OnInit {
const host = this.listenerForm.getValue('host');
let trsvcid = Number(this.listenerForm.getValue('trsvcid'));
if (!trsvcid) trsvcid = 4420;
- const request = {
+ const request: ListenerRequest = {
+ gw_group: this.group,
host_name: host.hostname,
traddr: host.addr,
trsvcid
@@ -128,9 +129,7 @@ export class NvmeofListenersFormComponent implements OnInit {
component.listenerForm.setErrors({ cdSubmitButton: true });
},
complete: () => {
- this.router.navigate([this.pageURL, { outlets: { modal: null } }], {
- queryParams: { group: this.group }
- });
+ this.router.navigate([this.pageURL, { outlets: { modal: null } }]);
}
});
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
index f88442e1bd6..b49adda7c1b 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts
@@ -1,4 +1,4 @@
-import { Component, Input, OnChanges, OnInit } from '@angular/core';
+import { Component, Input, OnInit } from '@angular/core';
import { Router } from '@angular/router';
import { NvmeofService } from '~/app/shared/api/nvmeof.service';
import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component';
@@ -21,7 +21,7 @@ const BASE_URL = 'block/nvmeof/subsystems';
templateUrl: './nvmeof-listeners-list.component.html',
styleUrls: ['./nvmeof-listeners-list.component.scss']
})
-export class NvmeofListenersListComponent implements OnInit, OnChanges {
+export class NvmeofListenersListComponent implements OnInit {
@Input()
subsystemNQN: string;
@Input()
@@ -76,22 +76,18 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges {
name: this.actionLabels.DELETE,
permission: 'delete',
icon: Icons.destroy,
- click: () => this.deleteSubsystemModal()
+ click: () => this.deleteListenerModal()
}
];
}
- ngOnChanges() {
- this.listListeners();
- }
-
updateSelection(selection: CdTableSelection) {
this.selection = selection;
}
listListeners() {
this.nvmeofService
- .listListeners(this.subsystemNQN)
+ .listListeners(this.subsystemNQN, this.group)
.subscribe((listResponse: NvmeofListener[]) => {
this.listeners = listResponse.map((listener, index) => {
listener['id'] = index;
@@ -101,7 +97,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges {
});
}
- deleteSubsystemModal() {
+ deleteListenerModal() {
const listener = this.selection.first();
this.modalService.show(CriticalConfirmationModalComponent, {
itemDescription: 'Listener',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts
index f5721e11ab6..b65ad62bdb4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts
@@ -41,6 +41,7 @@ export class NvmeofNamespacesFormComponent implements OnInit {
nsid: string;
currentBytes: number;
invalidSizeError: boolean;
+ group: string;
constructor(
public actionLabels: ActionLabelsI18n,
@@ -62,6 +63,9 @@ export class NvmeofNamespacesFormComponent implements OnInit {
}
init() {
+ this.route.queryParams.subscribe((params) => {
+ this.group = params?.['group'];
+ });
this.createForm();
this.action = this.actionLabels.CREATE;
this.route.params.subscribe((params: { subsystem_nqn: string; nsid: string }) => {
@@ -74,7 +78,7 @@ export class NvmeofNamespacesFormComponent implements OnInit {
this.edit = true;
this.action = this.actionLabels.EDIT;
this.nvmeofService
- .getNamespace(this.subsystemNQN, this.nsid)
+ .getNamespace(this.subsystemNQN, this.nsid, this.group)
.subscribe((res: NvmeofSubsystemNamespace) => {
const convertedSize = this.dimlessBinaryPipe.transform(res.rbd_image_size).split(' ');
this.currentBytes = res.rbd_image_size;
@@ -120,6 +124,7 @@ export class NvmeofNamespacesFormComponent implements OnInit {
const image_size = this.nsForm.getValue('image_size');
const image_size_unit = this.nsForm.getValue('unit');
const request = {} as NamespaceCreateRequest | NamespaceEditRequest;
+ request['gw_group'] = this.group;
if (image_size) {
const key: string = this.edit ? 'rbd_image_size' : 'size';
const value: number = this.formatterService.toBytes(image_size + image_size_unit);
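Note: the pattern introduced in this form component (the gateway group arrives via the route's query parameters and is then attached to the request payload as gw_group) can be reduced to a small standalone sketch. The helper name buildNamespaceRequest and the literal field values below are illustrative only, not part of this change.

    // Illustrative sketch: attach the gateway group to a namespace request.
    // `gw_group` mirrors the field added to NamespaceCreateRequest below;
    // every other value here is a hypothetical example payload.
    interface NamespaceCreateRequestSketch {
      rbd_image_name: string;
      rbd_pool: string;
      size: number;
      gw_group: string;
    }

    function buildNamespaceRequest(group: string): NamespaceCreateRequestSketch {
      return {
        rbd_image_name: 'nvme_img_01',   // hypothetical image name
        rbd_pool: 'rbd',                 // hypothetical pool
        size: 1024 * 1024 * 1024,        // 1 GiB, for illustration
        gw_group: group                  // group read from the route's queryParams
      };
    }

    // Example usage:
    // const request = buildNamespaceRequest('gateway-group-1');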
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts
index c40b538c820..8f8f6eb8d05 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts
@@ -1,4 +1,4 @@
-import { Component, Input, OnChanges, OnInit } from '@angular/core';
+import { Component, Input, OnInit } from '@angular/core';
import { Router } from '@angular/router';
import { NvmeofService } from '~/app/shared/api/nvmeof.service';
import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component';
@@ -23,9 +23,11 @@ const BASE_URL = 'block/nvmeof/subsystems';
templateUrl: './nvmeof-namespaces-list.component.html',
styleUrls: ['./nvmeof-namespaces-list.component.scss']
})
-export class NvmeofNamespacesListComponent implements OnInit, OnChanges {
+export class NvmeofNamespacesListComponent implements OnInit {
@Input()
subsystemNQN: string;
+ @Input()
+ group: string;
namespacesColumns: any;
tableActions: CdTableAction[];
@@ -117,10 +119,10 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges {
permission: 'create',
icon: Icons.add,
click: () =>
- this.router.navigate([
- BASE_URL,
- { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } }
- ]),
+ this.router.navigate(
+ [BASE_URL, { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } }],
+ { queryParams: { group: this.group } }
+ ),
canBePrimary: (selection: CdTableSelection) => !selection.hasSelection
},
{
@@ -128,41 +130,45 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges {
permission: 'update',
icon: Icons.edit,
click: () =>
- this.router.navigate([
- BASE_URL,
- {
- outlets: {
- modal: [URLVerbs.EDIT, this.subsystemNQN, 'namespace', this.selection.first().nsid]
+ this.router.navigate(
+ [
+ BASE_URL,
+ {
+ outlets: {
+ modal: [
+ URLVerbs.EDIT,
+ this.subsystemNQN,
+ 'namespace',
+ this.selection.first().nsid
+ ]
+ }
}
- }
- ])
+ ],
+ { queryParams: { group: this.group } }
+ )
},
{
name: this.actionLabels.DELETE,
permission: 'delete',
icon: Icons.destroy,
- click: () => this.deleteSubsystemModal()
+ click: () => this.deleteNamespaceModal()
}
];
}
- ngOnChanges() {
- this.listNamespaces();
- }
-
updateSelection(selection: CdTableSelection) {
this.selection = selection;
}
listNamespaces() {
this.nvmeofService
- .listNamespaces(this.subsystemNQN)
+ .listNamespaces(this.subsystemNQN, this.group)
.subscribe((res: NvmeofSubsystemNamespace[]) => {
this.namespaces = res;
});
}
- deleteSubsystemModal() {
+ deleteNamespaceModal() {
const namespace = this.selection.first();
this.modalService.show(CriticalConfirmationModalComponent, {
itemDescription: 'Namespace',
@@ -174,7 +180,7 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges {
nqn: this.subsystemNQN,
nsid: namespace.nsid
}),
- call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid)
+ call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid, this.group)
})
});
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html
index 7f15a1360ad..58a1e01a525 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html
@@ -24,14 +24,18 @@
<a ngbNavLink
i18n>Namespaces</a>
<ng-template ngbNavContent>
- <cd-nvmeof-namespaces-list [subsystemNQN]="subsystemNQN"></cd-nvmeof-namespaces-list>
+ <cd-nvmeof-namespaces-list [subsystemNQN]="subsystemNQN"
+ [group]="group">
+ </cd-nvmeof-namespaces-list>
</ng-template>
</ng-container>
<ng-container ngbNavItem="initiators">
<a ngbNavLink
i18n>Initiators</a>
<ng-template ngbNavContent>
- <cd-nvmeof-initiators-list [subsystemNQN]="subsystemNQN"></cd-nvmeof-initiators-list>
+ <cd-nvmeof-initiators-list [subsystemNQN]="subsystemNQN"
+ [group]="group">
+ </cd-nvmeof-initiators-list>
</ng-template>
</ng-container>
</nav>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
index f7b35a2d645..7e5b064f379 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
@@ -118,9 +118,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit {
component.subsystemForm.setErrors({ cdSubmitButton: true });
},
complete: () => {
- this.router.navigate([this.pageURL, { outlets: { modal: null } }], {
- queryParams: { group: this.group }
- });
+ this.router.navigate([this.pageURL, { outlets: { modal: null } }]);
}
});
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts
index da1a3f355c7..d79a4a6ccad 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts
@@ -19,6 +19,7 @@ import { CephfsSnapshotScheduleService } from '~/app/shared/api/cephfs-snapshot-
import { CephfsSubvolumeService } from '~/app/shared/api/cephfs-subvolume.service';
import { DirectoryStoreService } from '~/app/shared/api/directory-store.service';
import { ActionLabelsI18n, URLVerbs } from '~/app/shared/constants/app.constants';
+import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant';
import { Icons } from '~/app/shared/enum/icons.enum';
import { RepeatFrequency } from '~/app/shared/enum/repeat-frequency.enum';
import { RetentionFrequency } from '~/app/shared/enum/retention-frequency.enum';
@@ -35,7 +36,6 @@ import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
const VALIDATON_TIMER = 300;
const DEBOUNCE_TIMER = 300;
-const DEFAULT_SUBVOLUME_GROUP = '_nogroup';
@Component({
selector: 'cd-cephfs-snapshotschedule-form',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts
index b5eb7a88681..be7b81940df 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts
@@ -34,8 +34,7 @@ import { CephfsMountDetailsComponent } from '../cephfs-mount-details/cephfs-moun
import { HealthService } from '~/app/shared/api/health.service';
import _ from 'lodash';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
-
-const DEFAULT_SUBVOLUME_GROUP = '_nogroup';
+import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant';
@Component({
selector: 'cd-cephfs-subvolume-list',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts
index 92f337f85a6..0087ffd66cd 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts
@@ -26,7 +26,7 @@ import moment from 'moment';
import { Validators } from '@angular/forms';
import { CdValidators } from '~/app/shared/forms/cd-validators';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
-import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs';
+import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant';
@Component({
selector: 'cd-cephfs-subvolume-snapshots-list',
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
index 5f5f91dd0ed..a56877512f9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html
@@ -6,13 +6,15 @@
i18n>OSDs List</a>
<ng-template ngbNavContent>
<cd-table [data]="osds"
- (fetchData)="getOsdList()"
+ (fetchData)="getOsdList($event)"
[columns]="columns"
selectionType="multiClick"
[hasDetails]="true"
(setExpandedRow)="setExpandedRow($event)"
(updateSelection)="updateSelection($event)"
- [updateSelectionOnRefresh]="'never'">
+ [updateSelectionOnRefresh]="'never'"
+ [serverSide]="true"
+ [count]="count">
<div class="table-actions">
<cd-table-actions [permission]="permissions.osd"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
index 77facfe3f85..85ea9240414 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts
@@ -33,6 +33,8 @@ import {
import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component';
import { OsdListComponent } from './osd-list.component';
import { ResizeObserver as ResizeObserverPolyfill } from '@juggle/resize-observer';
+import { PaginateObservable } from '~/app/shared/api/paginate.model';
+import { Osd } from '~/app/shared/models/osd.model';
describe('OsdListComponent', () => {
let component: OsdListComponent;
@@ -141,38 +143,42 @@ describe('OsdListComponent', () => {
});
describe('getOsdList', () => {
- let osds: any[];
+ let osds: Osd[];
let flagsSpy: jasmine.Spy;
- const createOsd = (n: number) =>
- <Record<string, any>>{
- in: 'in',
- up: 'up',
- tree: {
- device_class: 'ssd'
- },
- stats_history: {
- op_out_bytes: [
- [n, n],
- [n * 2, n * 2]
- ],
- op_in_bytes: [
- [n * 3, n * 3],
- [n * 4, n * 4]
- ]
- },
- stats: {
- stat_bytes_used: n * n,
- stat_bytes: n * n * n
- },
- state: []
- };
+ const createOsd = (n: number): Osd => ({
+ id: n,
+ host: {
+ id: 0,
+ name: 'test_host'
+ },
+ in: 1,
+ up: 1,
+ tree: {
+ device_class: 'ssd'
+ },
+ stats_history: {
+ op_out_bytes: [
+ [n, n],
+ [n * 2, n * 2]
+ ],
+ op_in_bytes: [
+ [n * 3, n * 3],
+ [n * 4, n * 4]
+ ]
+ },
+ stats: {
+ stat_bytes_used: n * n,
+ stat_bytes: n * n * n
+ },
+ state: []
+ });
const expectAttributeOnEveryOsd = (attr: string) =>
expect(component.osds.every((osd) => Boolean(_.get(osd, attr)))).toBeTruthy();
beforeEach(() => {
- spyOn(osdService, 'getList').and.callFake(() => of(osds));
+ spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(osds)));
flagsSpy = spyOn(osdService, 'getFlags').and.callFake(() => of([]));
osds = [createOsd(1), createOsd(2), createOsd(3)];
component.getOsdList();
@@ -556,8 +562,9 @@ describe('OsdListComponent', () => {
beforeEach(() => {
component.permissions = fakeAuthStorageService.getPermissions();
- spyOn(osdService, 'getList').and.callFake(() => of(fakeOsds));
+ spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(fakeOsds)));
spyOn(osdService, 'getFlags').and.callFake(() => of([]));
+ component.getOsdList();
});
const testTableActions = async (
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
index 103b61e79f0..91cb0193f3c 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts
@@ -39,6 +39,8 @@ import { OsdRecvSpeedModalComponent } from '../osd-recv-speed-modal/osd-recv-spe
import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component';
import { OsdScrubModalComponent } from '../osd-scrub-modal/osd-scrub-modal.component';
import { ModalCdsService } from '~/app/shared/services/modal-cds.service';
+import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
+import { Osd } from '~/app/shared/models/osd.model';
const BASE_URL = 'osd';
@@ -71,6 +73,7 @@ export class OsdListComponent extends ListWithDetails implements OnInit {
clusterWideActions: CdTableAction[];
icons = Icons;
osdSettings = new OsdSettings();
+ count = 0;
selection = new CdTableSelection();
osds: any[] = [];
@@ -426,10 +429,13 @@ export class OsdListComponent extends ListWithDetails implements OnInit {
}
}
- getOsdList() {
- const observables = [this.osdService.getList(), this.osdService.getFlags()];
- observableForkJoin(observables).subscribe((resp: [any[], string[]]) => {
- this.osds = resp[0].map((osd) => {
+ getOsdList(context?: CdTableFetchDataContext) {
+ if (!context) context = new CdTableFetchDataContext();
+ const pagination_obs = this.osdService.getList(context.toParams());
+ const observables = [pagination_obs.observable, this.osdService.getFlags()];
+ observableForkJoin(observables).subscribe((resp: any) => {
+ this.osds = resp[0].map((osd: Osd) => {
+ this.count = pagination_obs.count;
osd.collectedStates = OsdListComponent.collectStates(osd);
osd.stats_history.out_bytes = osd.stats_history.op_out_bytes.map((i: string) => i[1]);
osd.stats_history.in_bytes = osd.stats_history.op_in_bytes.map((i: string) => i[1]);
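Note: the server-side fetch flow added here (convert the table's fetch context to query parameters, join the paginated OSD list with the flags call, then record the total count for the paginator) can be sketched in isolation. The stand-in shapes FetchContextLike and PaginateLike and the fetchOsds helper are illustrative only; the real classes are CdTableFetchDataContext and PaginateObservable.

    import { forkJoin, Observable } from 'rxjs';

    // Stand-in shapes for illustration only.
    interface FetchContextLike { toParams(): Record<string, string>; }
    interface PaginateLike<T> { observable: Observable<T>; count: number; }

    // Hypothetical fetch: combines the paginated OSD list with the flags call,
    // then reads the total count once the response has arrived.
    function fetchOsds(
      getList: (params: Record<string, string>) => PaginateLike<unknown[]>,
      getFlags: () => Observable<string[]>,
      context: FetchContextLike
    ): void {
      const paginated = getList(context.toParams());
      forkJoin([paginated.observable, getFlags()]).subscribe(([osds, flags]) => {
        // `paginated.count` is populated while mapping the HTTP response,
        // so it is safe to read inside the subscription.
        console.log(`received ${osds.length} of ${paginated.count} OSDs, flags:`, flags);
      });
    }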
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
index d3ea8c018f6..367418c752e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts
@@ -27,7 +27,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: '003c10beafc8c27b635bcdfed1ed832e4c1005be89bb1bb05ad4cc6c2b98e41b',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '3',
daemon_type: 'osd',
daemon_name: 'osd.3',
@@ -47,7 +47,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: 'baeec41a01374b3ed41016d542d19aef4a70d69c27274f271e26381a0cc58e7a',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '4',
daemon_type: 'osd',
daemon_name: 'osd.4',
@@ -63,7 +63,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'osd0',
container_id: '8483de277e365bea4365cee9e1f26606be85c471e4da5d51f57e4b85a42c616e',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: '5',
daemon_type: 'osd',
daemon_name: 'osd.5',
@@ -79,7 +79,7 @@ describe('ServiceDaemonListComponent', () => {
hostname: 'mon0',
container_id: '6ca0574f47e300a6979eaf4e7c283a8c4325c2235ae60358482fc4cd58844a21',
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
daemon_id: 'a',
daemon_name: 'mon.a',
daemon_type: 'mon',
@@ -99,7 +99,7 @@ describe('ServiceDaemonListComponent', () => {
service_name: 'osd',
status: {
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
size: 3,
running: 3,
last_refresh: '2020-02-25T04:33:26.465699'
@@ -111,7 +111,7 @@ describe('ServiceDaemonListComponent', () => {
service_name: 'crash',
status: {
container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23',
- container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel',
+ container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel',
size: 1,
running: 1,
last_refresh: '2020-02-25T04:33:26.465766'
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
index 1a73490175d..0da4913e9b8 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html
@@ -106,7 +106,6 @@
[invalid]="nfsForm.controls.fsal.controls.user_id.invalid && (nfsForm.controls.fsal.controls.user_id.dirty)"
[invalidText]="userIdError"
[skeleton]="allRGWUsers === null"
- (valueChange)="pathChangeHandler()"
i18n>
<option *ngIf="allRGWUsers === null"
value="">Loading...</option>
@@ -223,8 +222,6 @@
name="path"
formControlName="path"
[ngbTypeahead]="pathDataSource"
- (selectItem)="pathChangeHandler()"
- (blur)="pathChangeHandler()"
[invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)">
</cds-text-label>
<ng-template #pathError>
@@ -259,8 +256,6 @@
name="path"
formControlName="path"
[ngbTypeahead]="bucketDataSource"
- (selectItem)="pathChangeHandler()"
- (blur)="pathChangeHandler()"
[invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)">
</cds-text-label>
<ng-template #bucketPathError>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
index 2317671b022..d502524256e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts
@@ -434,7 +434,7 @@ export class NfsFormComponent extends CdForm implements OnInit {
fs_name: this.selectedFsName
}
});
- this.volumeChangeHandler();
+ this.getSubVolGrp(this.selectedFsName);
}
if (!_.isEmpty(this.selectedSubvolGroup)) {
this.nfsForm.patchValue({
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html
index ddc202152b9..463eac88b1e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html
@@ -158,8 +158,14 @@
</div>
</td>
<td>
- <pre *ngIf="lifecycleFormat === 'json'">{{selection.lifecycle | json}}</pre>
- <pre *ngIf="lifecycleFormat === 'xml'">{{ (selection.lifecycle | xml) || '-'}}</pre>
+ <cds-code-snippet display="multi"
+ *ngIf="lifecycleFormat === 'json'">
+ {{selection.lifecycle | json}}
+ </cds-code-snippet>
+ <cds-code-snippet display="multi"
+ *ngIf="lifecycleFormat === 'xml'">
+ {{ (selection.lifecycle | xml:{'Rules':'Rule'}) || '-'}}
+ </cds-code-snippet>
</td>
</tr>
<tr>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
index e50666cdeaa..767305958d4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html
@@ -64,6 +64,9 @@
i18n-placeholder
placeholder="Source Bucket Name..."
formControlName="source_bucket"/>
+ <cd-help-text>
+ <span i18n>{{ allBucketSelectedHelpText }}</span>
+ </cd-help-text>
</div>
</div>
<div class="form-group row">
@@ -78,6 +81,9 @@
i18n-placeholder
placeholder="Destination Bucket Name..."
formControlName="destination_bucket"/>
+ <cd-help-text>
+ <span i18n>{{ allBucketSelectedHelpText }}</span>
+ </cd-help-text>
</div>
</div>
</div>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
index 369658d7d42..1127db1c59a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts
@@ -89,6 +89,47 @@ describe('RgwMultisiteSyncPipeModalComponent', () => {
component.submit();
expect(spy).toHaveBeenCalled();
expect(putDataSpy).toHaveBeenCalled();
- expect(putDataSpy).toHaveBeenCalledWith(component.pipeForm.getRawValue());
+ expect(putDataSpy).toHaveBeenCalledWith({
+ ...component.pipeForm.getRawValue(),
+ mode: '',
+ user: ''
+ });
+ });
+
+ it('should pass "user" and "mode" while creating/editing pipe', () => {
+ component.editing = true;
+ component.pipeForm.patchValue({
+ pipe_id: 'pipe1',
+ group_id: 's3-bucket-replication:enabled',
+ source_bucket: '',
+ source_zones: { added: ['zone1-zg1-realm1'], removed: [] },
+ destination_bucket: '',
+ destination_zones: { added: ['zone2-zg1-realm1'], removed: [] }
+ });
+ component.pipeSelectedRow = {
+ dest: { bucket: '*', zones: ['zone2-zg1-realm1'] },
+ id: 'pipi1',
+ params: {
+ dest: {},
+ mode: 'user',
+ priority: 0,
+ source: { filter: { tags: [] } },
+ user: 'dashboard'
+ },
+ source: { bucket: '*', zones: ['zone1-zg1-realm1'] }
+ };
+
+ component.sourceZones.data.selected = ['zone1-zg1-realm1'];
+ component.destZones.data.selected = ['zone2-zg1-realm1'];
+ const spy = jest.spyOn(component, 'submit');
+ const putDataSpy = jest.spyOn(multisiteServiceMock, 'createEditSyncPipe');
+ component.submit();
+ expect(spy).toHaveBeenCalled();
+ expect(putDataSpy).toHaveBeenCalled();
+ expect(putDataSpy).toHaveBeenCalledWith({
+ ...component.pipeForm.getRawValue(),
+ mode: 'user',
+ user: 'dashboard'
+ });
});
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
index 2f41dbd23c8..43742ef60b8 100755
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts
@@ -18,6 +18,8 @@ import { ZoneData } from '../models/rgw-multisite-zone-selector';
import { SucceededActionLabelsI18n } from '~/app/shared/constants/app.constants';
const ALL_ZONES = $localize`All zones (*)`;
+const ALL_BUCKET_SELECTED_HELP_TEXT =
+ 'If no value is provided, all the buckets in the zone group will be selected.';
@Component({
selector: 'cd-rgw-multisite-sync-pipe-modal',
@@ -33,6 +35,7 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit {
sourceZones = new ZoneData(false, 'Filter Zones');
destZones = new ZoneData(false, 'Filter Zones');
icons = Icons;
+ allBucketSelectedHelpText = ALL_BUCKET_SELECTED_HELP_TEXT;
constructor(
public activeModal: NgbActiveModal,
@@ -187,7 +190,9 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit {
.createEditSyncPipe({
...this.pipeForm.getRawValue(),
source_zones: sourceZones,
- destination_zones: destZones
+ destination_zones: destZones,
+ user: this.editing ? this.pipeSelectedRow?.params?.user : '',
+ mode: this.editing ? this.pipeSelectedRow?.params?.mode : ''
})
.subscribe(
() => {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
index ee261db5042..03228856125 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts
@@ -88,12 +88,22 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements
{
name: $localize`Zonegroup`,
prop: 'zonegroup',
- flexGrow: 1
+ flexGrow: 1,
+ cellTransformation: CellTemplate.map,
+ customTemplateConfig: {
+ undefined: '-',
+ '': '-'
+ }
},
{
name: $localize`Bucket`,
prop: 'bucket',
- flexGrow: 1
+ flexGrow: 1,
+ cellTransformation: CellTemplate.map,
+ customTemplateConfig: {
+ undefined: '-',
+ '': '-'
+ }
}
];
this.rgwDaemonService.list().subscribe();
@@ -137,7 +147,7 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements
groupName: policy['id'],
status: policy['status'],
bucket: policy['bucketName'],
- zonegroup: ''
+ zonegroup: policy['zonegroup']
});
});
this.syncPolicyData = [...this.syncPolicyData];
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
index 8b5901769c3..00037a7235b 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts
@@ -91,7 +91,9 @@ export class RgwOverviewDashboardComponent implements OnInit, OnDestroy {
this.totalPoolUsedBytes = data['total_pool_bytes_used'];
this.averageObjectSize = data['average_object_size'];
});
- this.getSyncStatus();
+ setTimeout(() => {
+ this.getSyncStatus();
+ });
});
this.BucketSub = this.rgwBucketService
.getTotalBucketsAndUsersLength()
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
index 3439562c8e2..5f8c6f50135 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts
@@ -70,7 +70,8 @@ import {
IconModule,
LoadingModule,
ModalModule,
- ProgressIndicatorModule
+ ProgressIndicatorModule,
+ CodeSnippetModule
} from 'carbon-components-angular';
import { CephSharedModule } from '../shared/ceph-shared.module';
@@ -94,6 +95,7 @@ import { CephSharedModule } from '../shared/ceph-shared.module';
ModalModule,
GridModule,
ProgressIndicatorModule,
+ CodeSnippetModule,
ButtonModule,
LoadingModule,
IconModule,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
index 313db3445f2..a5c84e60b6f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts
@@ -27,6 +27,7 @@ describe('NvmeofService', () => {
expect(service).toBeTruthy();
});
+ // gateways
it('should call listGatewayGroups', () => {
service.listGatewayGroups().subscribe();
const req = httpTesting.expectOne('api/nvmeof/gateway/group');
@@ -39,6 +40,7 @@ describe('NvmeofService', () => {
expect(req.request.method).toBe('GET');
});
+ // subsystems
it('should call listSubsystems', () => {
service.listSubsystems(mockGroupName).subscribe();
const req = httpTesting.expectOne(`api/nvmeof/subsystem?gw_group=${mockGroupName}`);
@@ -69,9 +71,12 @@ describe('NvmeofService', () => {
expect(req.request.method).toBe('DELETE');
});
+ // initiators
it('should call getInitiators', () => {
- service.getInitiators(mockNQN).subscribe();
- const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}/host`);
+ service.getInitiators(mockNQN, mockGroupName).subscribe();
+ const req = httpTesting.expectOne(
+ `api/nvmeof/subsystem/${mockNQN}/host?gw_group=${mockGroupName}`
+ );
expect(req.request.method).toBe('GET');
});
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
index 40202d0d672..a2bbf507bc3 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
@@ -8,6 +8,7 @@ import { catchError, mapTo } from 'rxjs/operators';
export const MAX_NAMESPACE = 1024;
export interface ListenerRequest {
+ gw_group: string;
host_name: string;
traddr: string;
trsvcid: number;
@@ -17,14 +18,17 @@ export interface NamespaceCreateRequest {
rbd_image_name: string;
rbd_pool: string;
size: number;
+ gw_group: string;
}
export interface NamespaceEditRequest {
rbd_image_size: number;
+ gw_group: string;
}
export interface InitiatorRequest {
host_nqn: string;
+ gw_group: string;
}
const API_PATH = 'api/nvmeof';
@@ -81,8 +85,8 @@ export class NvmeofService {
}
// Initiators
- getInitiators(subsystemNQN: string) {
- return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host`);
+ getInitiators(subsystemNQN: string, group: string) {
+ return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host?gw_group=${group}`);
}
addInitiators(subsystemNQN: string, request: InitiatorRequest) {
@@ -92,14 +96,17 @@ export class NvmeofService {
}
removeInitiators(subsystemNQN: string, request: InitiatorRequest) {
- return this.http.delete(`${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}`, {
- observe: 'response'
- });
+ return this.http.delete(
+ `${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}/${request.gw_group}`,
+ {
+ observe: 'response'
+ }
+ );
}
// Listeners
- listListeners(subsystemNQN: string) {
- return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener`);
+ listListeners(subsystemNQN: string, group: string) {
+ return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener?gw_group=${group}`);
}
createListener(subsystemNQN: string, request: ListenerRequest) {
@@ -121,12 +128,14 @@ export class NvmeofService {
}
// Namespaces
- listNamespaces(subsystemNQN: string) {
- return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace`);
+ listNamespaces(subsystemNQN: string, group: string) {
+ return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace?gw_group=${group}`);
}
- getNamespace(subsystemNQN: string, nsid: string) {
- return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`);
+ getNamespace(subsystemNQN: string, nsid: string, group: string) {
+ return this.http.get(
+ `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}`
+ );
}
createNamespace(subsystemNQN: string, request: NamespaceCreateRequest) {
@@ -141,9 +150,12 @@ export class NvmeofService {
});
}
- deleteNamespace(subsystemNQN: string, nsid: string) {
- return this.http.delete(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`, {
- observe: 'response'
- });
+ deleteNamespace(subsystemNQN: string, nsid: string, group: string) {
+ return this.http.delete(
+ `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}`,
+ {
+ observe: 'response'
+ }
+ );
}
}
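Note: the gateway-group threading shown in this service amounts to appending gw_group to each subsystem-scoped endpoint. A minimal standalone sketch follows; the namespaceUrl helper is hypothetical, and the real service builds these URLs inline without URL-encoding the group name.

    // Illustrative helper: append the gateway group to a subsystem-scoped URL.
    const API_PATH = 'api/nvmeof';

    function namespaceUrl(subsystemNQN: string, nsid: string, group: string): string {
      // encodeURIComponent is an extra safeguard in this sketch only.
      return `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${encodeURIComponent(group)}`;
    }

    // Example:
    // namespaceUrl('nqn.2001-07.com.ceph:1234', '1', 'group-a')
    // -> 'api/nvmeof/subsystem/nqn.2001-07.com.ceph:1234/namespace/1?gw_group=group-a'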
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
index d1f9997791a..c81c9193a2e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts
@@ -3,6 +3,7 @@ import { TestBed } from '@angular/core/testing';
import { configureTestBed } from '~/testing/unit-test-helper';
import { OsdService } from './osd.service';
+import { CdTableFetchDataContext } from '../models/cd-table-fetch-data-context';
describe('OsdService', () => {
let service: OsdService;
@@ -64,8 +65,9 @@ describe('OsdService', () => {
});
it('should call getList', () => {
- service.getList().subscribe();
- const req = httpTesting.expectOne('api/osd');
+ const context = new CdTableFetchDataContext(() => {});
+ service.getList(context.toParams()).observable.subscribe();
+ const req = httpTesting.expectOne('api/osd?offset=0&limit=10&search=&sort=%2Bname');
expect(req.request.method).toBe('GET');
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
index f2ed4d7cc9e..85a75073dea 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts
@@ -1,4 +1,4 @@
-import { HttpClient } from '@angular/common/http';
+import { HttpClient, HttpParams } from '@angular/common/http';
import { Injectable } from '@angular/core';
import _ from 'lodash';
@@ -12,6 +12,9 @@ import { OsdSettings } from '../models/osd-settings';
import { SmartDataResponseV1 } from '../models/smart';
import { DeviceService } from '../services/device.service';
import { CdFormGroup } from '../forms/cd-form-group';
+import { PaginateObservable } from './paginate.model';
+import { PaginateParams } from '../classes/paginate-params.class';
+import { Osd } from '../models/osd.model';
@Injectable({
providedIn: 'root'
@@ -80,8 +83,10 @@ export class OsdService {
return this.http.post(this.path, request, { observe: 'response' });
}
- getList() {
- return this.http.get(`${this.path}`);
+ getList(params: HttpParams): PaginateObservable<Osd[]> {
+ return new PaginateObservable<Osd[]>(
+ this.http.get<Osd[]>(this.path, new PaginateParams(params, 1, 1))
+ );
}
getOsdSettings(): Observable<OsdSettings> {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
index 703792a7571..77ec4e43f7c 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts
@@ -9,7 +9,7 @@ export class PaginateObservable<Type> {
this.observable = obs.pipe(
map((response: any) => {
this.count = Number(response.headers?.get('X-Total-Count'));
- return response['body'];
+ return response['body'] || response;
})
);
}
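Note: with the response['body'] || response fallback, the wrapper now tolerates both full HttpResponse objects (when observe: 'response' is used) and plain bodies. A simplified sketch of the resulting behaviour, using a stand-in class rather than the real PaginateObservable:

    import { Observable, of } from 'rxjs';
    import { map } from 'rxjs/operators';

    // Simplified stand-in for PaginateObservable, for illustration only.
    class PaginateSketch<T> {
      observable: Observable<T>;
      count = 0;

      constructor(obs: Observable<any>) {
        this.observable = obs.pipe(
          map((response: any) => {
            // With a full HttpResponse, read X-Total-Count and unwrap the body;
            // with a plain body, fall back to the raw value.
            this.count = Number(response.headers?.get('X-Total-Count')) || 0;
            return response['body'] || response;
          })
        );
      }
    }

    // Wrapping a plain (non-HttpResponse) emission still works:
    new PaginateSketch<number[]>(of([1, 2, 3])).observable.subscribe((v) => console.log(v));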
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
index b3a77e32198..3dc886e172f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts
@@ -32,7 +32,9 @@ export class RgwMultisiteService {
}
getSyncStatus() {
- return this.http.get(`${this.url}/sync_status`);
+ return this.rgwDaemonService.request((params: HttpParams) => {
+ return this.http.get(`${this.url}/sync_status`, { params: params });
+ });
}
status() {
@@ -127,8 +129,15 @@ export class RgwMultisiteService {
);
}
- createEditSyncPipe(payload: any) {
- return this.http.put(`${this.url}/sync-pipe`, payload);
+ createEditSyncPipe(payload: any, user?: string, mode?: string) {
+ let params = new HttpParams();
+ if (user) {
+ params = params.append('user', user);
+ }
+ if (mode) {
+ params = params.append('mode', mode);
+ }
+ return this.http.put(`${this.url}/sync-pipe`, payload, { params });
}
removeSyncPipe(pipe_id: string, group_id: string, bucket_name?: string) {
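Note: the optional user/mode handling follows the usual conditional HttpParams pattern, where only non-empty values become query parameters. A minimal sketch; buildSyncPipeParams is a hypothetical helper, not part of the change.

    import { HttpParams } from '@angular/common/http';

    // Hypothetical helper showing the conditional-append pattern used above.
    function buildSyncPipeParams(user?: string, mode?: string): HttpParams {
      let params = new HttpParams();
      if (user) {
        params = params.append('user', user);
      }
      if (mode) {
        params = params.append('mode', mode);
      }
      return params;
    }

    // Example: buildSyncPipeParams('dashboard', 'user').toString()
    // -> 'user=dashboard&mode=user'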
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts
new file mode 100644
index 00000000000..a1b079b426b
--- /dev/null
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts
@@ -0,0 +1,15 @@
+import { HttpParams } from '@angular/common/http';
+
+export class PaginateParams {
+ constructor(params: HttpParams, majorVersion = 1, minorVersion = 0) {
+ const options = {
+ params: params,
+ headers: {
+ Accept: `application/vnd.ceph.api.v${majorVersion}.${minorVersion}+json`
+ }
+ };
+
+ options['observe'] = 'response';
+ return options;
+ }
+}
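Note: a usage sketch for the new PaginateParams class. The offset/limit values are placeholders; the point is that the constructor returns an options object carrying the versioned Accept header and observe: 'response', which keeps the X-Total-Count header available to PaginateObservable.

    import { HttpParams } from '@angular/common/http';
    import { PaginateParams } from '~/app/shared/classes/paginate-params.class';

    // Illustrative only: build the options object used for paginated requests.
    const params = new HttpParams().set('offset', '0').set('limit', '10');
    const options = new PaginateParams(params, 1, 1);
    // At runtime, `options` is:
    // { params, headers: { Accept: 'application/vnd.ceph.api.v1.1+json' }, observe: 'response' }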
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts
deleted file mode 100644
index 56890ff7214..00000000000
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts
+++ /dev/null
@@ -1 +0,0 @@
-export const DEFAULT_SUBVOLUME_GROUP = '_nogroup';
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
index 0df2d2ebbe0..6ea415bfee9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts
@@ -18,7 +18,7 @@ export class CdTableFetchDataContext {
search = '';
sort = '+name';
- constructor(error: () => void) {
+ constructor(error?: () => void) {
this.error = error;
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts
new file mode 100644
index 00000000000..f22987e439e
--- /dev/null
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts
@@ -0,0 +1,49 @@
+/* We still need to determine which values the UI actually
+   requires and make only those mandatory parameters here.
+   For now, based on the unit test file
+   osd-list.component.spec.ts, fields have been marked
+   optional or required accordingly. This should be re-evaluated. */
+
+export interface Osd {
+ id: number;
+ host: Host;
+ stats_history: StatsHistory;
+ state: string[];
+ stats: Stats;
+ collectedStates?: string[];
+ in?: number;
+ out?: number;
+ up?: number;
+ down?: number;
+ destroyed?: number;
+ cdIsBinary?: boolean;
+ cdIndivFlags?: string[];
+ cdClusterFlags?: string[];
+ cdExecuting?: any;
+ tree?: Tree;
+ operational_status?: string;
+}
+
+interface Tree {
+ device_class: string;
+}
+
+interface Host {
+ id: number;
+ name: string;
+}
+
+interface StatsHistory {
+ op_out_bytes: any[];
+ op_in_bytes: any[];
+ out_bytes?: any[];
+ in_bytes?: any[];
+}
+
+interface Stats {
+ stat_bytes_used: number;
+ stat_bytes: number;
+ op_w?: number;
+ op_r?: number;
+ usage?: number;
+}
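Note: given the interface above, a minimal object satisfying the required fields looks like the following; all values are placeholders chosen for illustration.

    import { Osd } from '~/app/shared/models/osd.model';

    // Placeholder values only; these are the non-optional fields of `Osd`.
    const exampleOsd: Osd = {
      id: 0,
      host: { id: 0, name: 'example-host' },
      stats_history: { op_out_bytes: [[0, 0]], op_in_bytes: [[0, 0]] },
      state: ['up', 'in'],
      stats: { stat_bytes_used: 0, stat_bytes: 0 }
    };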
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts
index 59d7572e9f0..45cca684dab 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts
@@ -7,9 +7,13 @@ import { JsonToXmlService } from '../services/json-to-xml.service';
export class XmlPipe implements PipeTransform {
constructor(private jsonToXmlService: JsonToXmlService) {}
- transform(value: string, valueFormat: string = 'json'): string {
+ transform(
+ value: string,
+ replaceKey: Record<string, string> = {},
+ valueFormat: string = 'json'
+ ): string {
if (valueFormat === 'json') {
- value = this.jsonToXmlService.format(value);
+ value = this.jsonToXmlService.format(value, replaceKey);
}
return value;
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts
index 8f1d128c0c5..e9d30f9b7f2 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts
@@ -6,29 +6,39 @@ import { Injectable } from '@angular/core';
export class JsonToXmlService {
constructor() {}
- format(json: any, indentSize: number = 2, currentIndent: number = 0): string {
+ format(
+ json: any,
+ replaceKey: Record<string, string> = null,
+ indentSize: number = 2,
+ currentIndent: number = 0
+ ): string {
if (!json) return null;
let xml = '';
if (typeof json === 'string') {
json = JSON.parse(json);
}
- for (const key in json) {
+ for (let key in json) {
if (json.hasOwnProperty(key)) {
const value = json[key];
const indentation = ' '.repeat(currentIndent);
-
+ if (replaceKey) {
+ const [oldKey, newKey] = Object.entries(replaceKey)[0];
+ if (key === oldKey) {
+ key = newKey;
+ }
+ }
if (Array.isArray(value)) {
value.forEach((item) => {
xml +=
`${indentation}<${key}>\n` +
- this.format(item, indentSize, currentIndent + indentSize) +
+ this.format(item, replaceKey, indentSize, currentIndent + indentSize) +
`${indentation}</${key}>\n`;
});
} else if (typeof value === 'object') {
xml +=
`${indentation}<${key}>\n` +
- this.format(value, indentSize, currentIndent + indentSize) +
+ this.format(value, replaceKey, indentSize, currentIndent + indentSize) +
`${indentation}</${key}>\n`;
} else {
xml += `${indentation}<${key}>${value}</${key}>\n`;
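Note: to illustrate the key replacement, with replaceKey {'Rules': 'Rule'} each element of a Rules array is emitted as a <Rule> element, matching the S3 lifecycle XML shape. A hedged sketch of the expected transformation (service wiring omitted; the input object is a made-up example):

    // Illustrative input for the replaceKey behaviour, invoked as in XmlPipe above.
    const lifecycleJson = {
      Rules: [{ ID: 'expire-tmp', Status: 'Enabled', Expiration: { Days: 7 } }]
    };

    // jsonToXmlService.format(lifecycleJson, { Rules: 'Rule' }) is expected to
    // produce roughly:
    //
    //   <Rule>
    //     <ID>expire-tmp</ID>
    //     <Status>Enabled</Status>
    //     <Expiration>
    //       <Days>7</Days>
    //     </Expiration>
    //   </Rule>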
diff --git a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss
index 1d12facaf6a..61ca421101e 100644
--- a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss
+++ b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss
@@ -142,3 +142,10 @@ Dashboard page
cd-dashboard {
font-size: 12px;
}
+
+/******************************************
+Code snippet
+******************************************/
+.cds--snippet {
+ width: fit-content;
+}
diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml
index e8ab663d0d5..aedee7e493d 100644
--- a/src/pybind/mgr/dashboard/openapi.yaml
+++ b/src/pybind/mgr/dashboard/openapi.yaml
@@ -8293,6 +8293,7 @@ paths:
description: Enable high availability
type: boolean
gw_group:
+ description: NVMeoF gateway group
type: string
max_namespaces:
default: 1024
@@ -8346,6 +8347,7 @@ paths:
schema:
type: boolean
- allowEmptyValue: true
+ description: NVMeoF gateway group
in: query
name: gw_group
schema:
@@ -8384,6 +8386,7 @@ paths:
schema:
type: string
- allowEmptyValue: true
+ description: NVMeoF gateway group
in: query
name: gw_group
schema:
@@ -8417,6 +8420,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8446,6 +8455,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8479,6 +8494,9 @@ paths:
application/json:
schema:
properties:
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
host_nqn:
description: NVMeoF host NQN. Use "*" to allow any host.
type: string
@@ -8525,6 +8543,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8559,6 +8583,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8596,6 +8626,9 @@ paths:
default: 0
description: NVMeoF address family (0 - IPv4, 1 - IPv6)
type: integer
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
host_name:
description: NVMeoF hostname
type: string
@@ -8673,6 +8706,12 @@ paths:
name: force
schema:
type: boolean
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8707,6 +8746,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8748,6 +8793,9 @@ paths:
default: true
description: Create RBD image
type: boolean
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
load_balancing_group:
description: Load balancing group
type: integer
@@ -8805,6 +8853,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8844,6 +8898,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8883,6 +8943,9 @@ paths:
application/json:
schema:
properties:
+ gw_group:
+ description: NVMeoF gateway group
+ type: string
load_balancing_group:
description: Load balancing group
type: integer
@@ -8937,6 +9000,12 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ description: NVMeoF gateway group
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8959,11 +9028,31 @@ paths:
- NVMe-oF Subsystem Namespace
/api/osd:
get:
- parameters: []
+ parameters:
+ - default: 0
+ in: query
+ name: offset
+ schema:
+ type: integer
+ - default: 10
+ in: query
+ name: limit
+ schema:
+ type: integer
+ - default: ''
+ in: query
+ name: search
+ schema:
+ type: string
+ - default: ''
+ in: query
+ name: sort
+ schema:
+ type: string
responses:
'200':
content:
- application/vnd.ceph.api.v1.0+json:
+ application/vnd.ceph.api.v1.1+json:
type: object
description: OK
'400':
@@ -11384,6 +11473,9 @@ paths:
type: string
group_id:
type: string
+ mode:
+ default: ''
+ type: string
pipe_id:
type: string
source_bucket:
@@ -11391,6 +11483,9 @@ paths:
type: string
source_zones:
type: string
+ user:
+ default: ''
+ type: string
required:
- group_id
- pipe_id
@@ -11447,11 +11542,6 @@ paths:
type: string
- default: ''
in: query
- name: destination_bucket
- schema:
- type: string
- - default: ''
- in: query
name: bucket_name
schema:
type: string
@@ -11677,7 +11767,12 @@ paths:
- RgwMultisite
/api/rgw/multisite/sync_status:
get:
- parameters: []
+ parameters:
+ - allowEmptyValue: true
+ in: query
+ name: daemon_name
+ schema:
+ type: string
responses:
'200':
content:
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
index 1802f8a5fce..2426c599078 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
@@ -177,6 +177,18 @@ def _get_running_daemon_svc_config(svc_config, running_daemons):
def _get_default_service(gateways):
if gateways:
- service_name = list(gateways.keys())[0]
+ gateway_keys = list(gateways.keys())
+        # If more than one gateway group is configured, raise an exception
+        # rather than choosing a gateway from an arbitrary group, so that the
+        # caller knows the 'gw_group' parameter must be specified in the API request.
+ if len(gateway_keys) > 1:
+ raise DashboardException(
+ msg=(
+ "Multiple NVMe-oF gateway groups are configured. "
+ "Please specify the 'gw_group' parameter in the request."
+ ),
+ component="nvmeof"
+ )
+ service_name = gateway_keys[0]
return service_name, gateways[service_name][0]['service_url']
return None
diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py
index 21eff2dd7f9..2fe09821694 100755
--- a/src/pybind/mgr/dashboard/services/rgw_client.py
+++ b/src/pybind/mgr/dashboard/services/rgw_client.py
@@ -10,6 +10,7 @@ import re
import time
import uuid
import xml.etree.ElementTree as ET # noqa: N814
+from collections import defaultdict
from enum import Enum
from subprocess import SubprocessError
from urllib.parse import urlparse
@@ -288,21 +289,22 @@ class RgwClient(RestClient):
daemon_keys = RgwClient._daemons.keys()
if not daemon_name:
- if len(daemon_keys) > 1:
- try:
- multiiste = RgwMultisite()
- default_zonegroup = multiiste.get_all_zonegroups_info()['default_zonegroup']
-
- # Iterate through _daemons.values() to find the daemon with the
- # matching zonegroup_id
- for daemon in RgwClient._daemons.values():
- if daemon.zonegroup_id == default_zonegroup:
- daemon_name = daemon.name
- break
- except Exception: # pylint: disable=broad-except
- daemon_name = next(iter(daemon_keys))
- else:
- # Handle the case where there is only one or no key in _daemons
+ try:
+ if len(daemon_keys) > 1:
+ default_zonegroup = (
+ RgwMultisite()
+ .get_all_zonegroups_info()['default_zonegroup']
+ )
+ if default_zonegroup:
+ daemon_name = next(
+ (daemon.name
+ for daemon in RgwClient._daemons.values()
+ if daemon.zonegroup_id == default_zonegroup),
+ None
+ )
+ daemon_name = daemon_name or next(iter(daemon_keys))
+ except Exception as e: # pylint: disable=broad-except
+ logger.exception('Failed to determine default RGW daemon: %s', str(e))
daemon_name = next(iter(daemon_keys))
# Discard all cached instances if any rgw setting has changed
@@ -700,12 +702,28 @@ class RgwClient(RestClient):
raise DashboardException(msg=str(e), component='rgw')
return result
+ @staticmethod
+ def _handle_rules(pairs):
+ result = defaultdict(list)
+ for key, value in pairs:
+ if key == 'Rule':
+ result['Rules'].append(value)
+ else:
+ result[key] = value
+ return result
+
@RestClient.api_get('/{bucket_name}?lifecycle')
def get_lifecycle(self, bucket_name, request=None):
# pylint: disable=unused-argument
try:
- result = request() # type: ignore
- result = {'LifecycleConfiguration': result}
+ decoded_request = request(raw_content=True).decode("utf-8") # type: ignore
+ result = {
+ 'LifecycleConfiguration':
+ json.loads(
+ decoded_request,
+ object_pairs_hook=RgwClient._handle_rules
+ )
+ }
except RequestException as e:
if e.content:
content = json_str_to_object(e.content)
@@ -757,15 +775,15 @@ class RgwClient(RestClient):
lifecycle = RgwClient.dict_to_xml(lifecycle)
try:
if lifecycle and '<LifecycleConfiguration>' not in str(lifecycle):
- lifecycle = f'<LifecycleConfiguration>{lifecycle}</LifecycleConfiguration>'
+ lifecycle = f'<LifecycleConfiguration>\n{lifecycle}\n</LifecycleConfiguration>'
result = request(data=lifecycle) # type: ignore
except RequestException as e:
+ msg = ''
if e.content:
content = json_str_to_object(e.content)
if content.get("Code") == "MalformedXML":
msg = "Invalid Lifecycle document"
- raise DashboardException(msg=msg, component='rgw')
- raise DashboardException(msg=str(e), component='rgw')
+ raise DashboardException(msg=msg or str(e), component='rgw')
return result
@RestClient.api_delete('/{bucket_name}?lifecycle')
@@ -1981,8 +1999,16 @@ class RgwMultisite:
is_multisite_configured = False
return is_multisite_configured
- def get_multisite_sync_status(self):
+ def get_multisite_sync_status(self, daemon_name: str):
rgw_multisite_sync_status_cmd = ['sync', 'status']
+ daemons = _get_daemons()
+ try:
+ realm_name = daemons[daemon_name].realm_name
+ except (KeyError, AttributeError):
+ raise DashboardException('Unable to get realm name from daemon',
+ http_status_code=500, component='rgw')
+ if realm_name:
+ rgw_multisite_sync_status_cmd.extend(['--rgw-realm', realm_name])
try:
exit_code, out, _ = mgr.send_rgwadmin_command(rgw_multisite_sync_status_cmd, False)
if exit_code > 0:
@@ -2236,7 +2262,8 @@ class RgwMultisite:
source_bucket: str = '',
destination_bucket: str = '',
bucket_name: str = '',
- update_period=False):
+ update_period=False,
+ user: str = '', mode: str = ''):
if source_zones['added'] or destination_zones['added']:
rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'create',
@@ -2245,11 +2272,9 @@ class RgwMultisite:
if bucket_name:
rgw_sync_policy_cmd += ['--bucket', bucket_name]
- if source_bucket:
- rgw_sync_policy_cmd += ['--source-bucket', source_bucket]
+ rgw_sync_policy_cmd += ['--source-bucket', source_bucket]
- if destination_bucket:
- rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
+ rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
if source_zones['added']:
rgw_sync_policy_cmd += ['--source-zones', ','.join(source_zones['added'])]
@@ -2257,6 +2282,12 @@ class RgwMultisite:
if destination_zones['added']:
rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones['added'])]
+ if user:
+ rgw_sync_policy_cmd += ['--uid', user]
+
+ if mode:
+ rgw_sync_policy_cmd += ['--mode', mode]
+
logger.info("Creating sync pipe!")
try:
exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd)
@@ -2271,13 +2302,13 @@ class RgwMultisite:
if ((source_zones['removed'] and '*' not in source_zones['added'])
or (destination_zones['removed'] and '*' not in destination_zones['added'])):
self.remove_sync_pipe(group_id, pipe_id, source_zones['removed'],
- destination_zones['removed'], destination_bucket,
- bucket_name)
+ destination_zones['removed'],
+ bucket_name, True)
def remove_sync_pipe(self, group_id: str, pipe_id: str,
source_zones: Optional[List[str]] = None,
destination_zones: Optional[List[str]] = None,
- destination_bucket: str = '', bucket_name: str = '',
+ bucket_name: str = '',
update_period=False):
rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'remove',
'--group-id', group_id, '--pipe-id', pipe_id]
@@ -2291,9 +2322,6 @@ class RgwMultisite:
if destination_zones:
rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones)]
- if destination_bucket:
- rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket]
-
logger.info("Removing sync pipe! %s", rgw_sync_policy_cmd)
try:
exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd)
diff --git a/src/pybind/mgr/dashboard/tests/test_osd.py b/src/pybind/mgr/dashboard/tests/test_osd.py
index c3cd0dca88d..9b6dbd10de1 100644
--- a/src/pybind/mgr/dashboard/tests/test_osd.py
+++ b/src/pybind/mgr/dashboard/tests/test_osd.py
@@ -8,6 +8,7 @@ from ceph.deployment.drive_group import DeviceSelection, DriveGroupSpec # type:
from ceph.deployment.service_spec import PlacementSpec
from .. import mgr
+from ..controllers._version import APIVersion
from ..controllers.osd import Osd, OsdUi
from ..services.osd import OsdDeploymentOptions
from ..tests import ControllerTestCase
@@ -274,7 +275,7 @@ class OsdTest(ControllerTestCase):
osds_leftover = [0, 1, 2]
with self._mock_osd_list(osd_stat_ids=osds_actual, osdmap_tree_node_ids=osds_leftover,
osdmap_ids=osds_actual):
- self._get('/api/osd')
+ self._get('/api/osd', version=APIVersion(1, 1))
self.assertEqual(len(self.json_body()), 2, 'It should display two OSDs without failure')
self.assertStatus(200)
diff --git a/src/pybind/mgr/dashboard/tools.py b/src/pybind/mgr/dashboard/tools.py
index 51ed9c471aa..14de970cceb 100644
--- a/src/pybind/mgr/dashboard/tools.py
+++ b/src/pybind/mgr/dashboard/tools.py
@@ -9,9 +9,9 @@ import threading
import time
import urllib
from datetime import datetime, timedelta
-from distutils.util import strtobool
import cherrypy
+from ceph.utils import strtobool
from mgr_util import build_url
from . import mgr
diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py
index 67246545eea..5d37d478de7 100644
--- a/src/pybind/mgr/mgr_util.py
+++ b/src/pybind/mgr/mgr_util.py
@@ -22,6 +22,7 @@ import sys
from ipaddress import ip_address
from threading import Lock, Condition
from typing import no_type_check, NewType
+from traceback import format_exc as tb_format_exc
import urllib
from functools import wraps
if sys.version_info >= (3, 3):
@@ -88,9 +89,9 @@ class RTimer(Timer):
while not self.finished.is_set():
self.finished.wait(self.interval)
self.function(*self.args, **self.kwargs)
- self.finished.set()
- except Exception as e:
- logger.error("task exception: %s", e)
+ except Exception:
+ logger.error(f'exception encountered in RTimer instance "{self}":'
+ f'\n{tb_format_exc()}')
raise
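The RTimer change above swaps a bare exception message for a full traceback. A self-contained sketch of that pattern, with simplified names rather than the mgr_util implementation:

    import logging
    from threading import Event, Thread
    from traceback import format_exc as tb_format_exc

    logger = logging.getLogger(__name__)

    class RepeatingTimer(Thread):
        """Simplified stand-in for mgr_util.RTimer: run a callback every
        `interval` seconds until cancelled, logging full tracebacks on failure."""

        def __init__(self, interval, function, *args, **kwargs):
            super().__init__(daemon=True)
            self.interval = interval
            self.function = function
            self.args = args
            self.kwargs = kwargs
            self.finished = Event()

        def cancel(self):
            self.finished.set()

        def run(self):
            try:
                while not self.finished.is_set():
                    self.finished.wait(self.interval)
                    if not self.finished.is_set():
                        self.function(*self.args, **self.kwargs)
            except Exception:
                # log the whole traceback, not just str(e)
                logger.error(f'exception encountered in timer "{self}":\n{tb_format_exc()}')
                raise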
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 82a8c13a9c1..d5c351fda7e 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -520,6 +520,15 @@ class Orchestrator(object):
"""
raise NotImplementedError()
+ def replace_device(self,
+ hostname: str,
+ device: str,
+ clear: bool = False,
+ yes_i_really_mean_it: bool = False) -> OrchResult:
+ """Perform all required operations in order to replace a device.
+ """
+ raise NotImplementedError()
+
def get_inventory(self, host_filter: Optional['InventoryFilter'] = None, refresh: bool = False) -> OrchResult[List['InventoryHost']]:
"""
Returns something that was created by `ceph-volume inventory`.
@@ -576,7 +585,12 @@ class Orchestrator(object):
raise NotImplementedError()
@handle_orch_error
- def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]:
+ def apply(
+ self,
+ specs: Sequence["GenericSpec"],
+ no_overwrite: bool = False,
+ continue_on_error: bool = False
+ ) -> List[str]:
"""
Applies any spec
"""
@@ -699,12 +713,18 @@ class Orchestrator(object):
def remove_osds(self, osd_ids: List[str],
replace: bool = False,
+ replace_block: bool = False,
+ replace_db: bool = False,
+ replace_wal: bool = False,
force: bool = False,
zap: bool = False,
no_destroy: bool = False) -> OrchResult[str]:
"""
:param osd_ids: list of OSD IDs
:param replace: marks the OSD as being destroyed. See :ref:`orchestrator-osd-replace`
+ :param replace_block: marks the corresponding block device as being replaced.
+ :param replace_db: marks the corresponding db device as being replaced.
+ :param replace_wal: marks the corresponding wal device as being replaced.
:param force: Forces the OSD removal process without waiting for the data to be drained first.
:param zap: Zap/Erase all devices associated with the OSDs (DESTROYS DATA)
:param no_destroy: Do not destroy associated VGs/LVs with the OSD.
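A rough sketch of how a backend might override the new interface hooks declared above; the class and return types are simplified stand-ins (plain strings instead of OrchResult), not the cephadm module's real code:

    from typing import List

    class SketchOrchestrator:
        def replace_device(self, hostname: str, device: str,
                           clear: bool = False,
                           yes_i_really_mean_it: bool = False) -> str:
            if not yes_i_really_mean_it:
                return (f'Would replace {device} on {hostname} '
                        '(pass --yes-i-really-mean-it to proceed)')
            # ... zap the device, mark the OSD destroyed, schedule redeploy ...
            return f'Scheduled replacement of {device} on {hostname}'

        def remove_osds(self, osd_ids: List[str],
                        replace: bool = False,
                        replace_block: bool = False,
                        replace_db: bool = False,
                        replace_wal: bool = False,
                        force: bool = False,
                        zap: bool = False,
                        no_destroy: bool = False) -> str:
            # collect the flags that were set, for illustration only
            flags = [name for name, val in (('replace', replace),
                                            ('replace_block', replace_block),
                                            ('replace_db', replace_db),
                                            ('replace_wal', replace_wal),
                                            ('force', force), ('zap', zap),
                                            ('no_destroy', no_destroy)) if val]
            return f'Removing OSDs {osd_ids} with flags {flags}'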
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index be0096bb2d9..dbfa10fb720 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -818,6 +818,21 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
return HandleCommandResult(stdout=completion.result_str())
return HandleCommandResult(stdout=completion.result_str().split('.')[0])
+ @_cli_read_command('orch device replace')
+ def _replace_device(self,
+ hostname: str,
+ device: str,
+ clear: bool = False,
+ yes_i_really_mean_it: bool = False) -> HandleCommandResult:
+ """Perform all required operations in order to replace a device.
+ """
+ completion = self.replace_device(hostname=hostname,
+ device=device,
+ clear=clear,
+ yes_i_really_mean_it=yes_i_really_mean_it)
+ raise_if_exception(completion)
+ return HandleCommandResult(stdout=completion.result_str())
+
@_cli_read_command('orch device ls')
def _list_devices(self,
hostname: Optional[List[str]] = None,
@@ -1415,8 +1430,9 @@ Usage:
zap: bool = False,
no_destroy: bool = False) -> HandleCommandResult:
"""Remove OSD daemons"""
- completion = self.remove_osds(osd_id, replace=replace, force=force,
- zap=zap, no_destroy=no_destroy)
+ completion = self.remove_osds(osd_id,
+ replace=replace,
+ force=force, zap=zap, no_destroy=no_destroy)
raise_if_exception(completion)
return HandleCommandResult(stdout=completion.result_str())
@@ -1635,12 +1651,14 @@ Usage:
format: Format = Format.plain,
unmanaged: bool = False,
no_overwrite: bool = False,
+ continue_on_error: bool = False,
inbuf: Optional[str] = None) -> HandleCommandResult:
"""Update the size or placement for a service or apply a large yaml spec"""
usage = """Usage:
ceph orch apply -i <yaml spec> [--dry-run]
ceph orch apply <service_type> [--placement=<placement_string>] [--unmanaged]
"""
+ errs: List[str] = []
if inbuf:
if service_type or placement or unmanaged:
raise OrchestratorValidationError(usage)
@@ -1650,7 +1668,14 @@ Usage:
# None entries in the output. Let's skip them silently.
content = [o for o in yaml_objs if o is not None]
for s in content:
- spec = json_to_generic_spec(s)
+ try:
+ spec = json_to_generic_spec(s)
+ except Exception as e:
+ if continue_on_error:
+ errs.append(f'Failed to convert {s} from json object: {str(e)}')
+ continue
+ else:
+ raise e
# validate the config (we need MgrModule for that)
if isinstance(spec, ServiceSpec) and spec.config:
@@ -1658,7 +1683,12 @@ Usage:
try:
self.get_foreign_ceph_option('mon', k)
except KeyError:
- raise SpecValidationError(f'Invalid config option {k} in spec')
+ err = SpecValidationError(f'Invalid config option {k} in spec')
+ if continue_on_error:
+ errs.append(str(err))
+ continue
+ else:
+ raise err
# There is a general "osd" service with no service id, but we use
# that to dump osds created individually with "ceph orch daemon add osd"
@@ -1673,7 +1703,12 @@ Usage:
and spec.service_type == 'osd'
and not spec.service_id
):
- raise SpecValidationError('Please provide the service_id field in your OSD spec')
+ err = SpecValidationError('Please provide the service_id field in your OSD spec')
+ if continue_on_error:
+ errs.append(str(err))
+ continue
+ else:
+ raise err
if dry_run and not isinstance(spec, HostSpec):
spec.preview_only = dry_run
@@ -1683,15 +1718,30 @@ Usage:
continue
specs.append(spec)
else:
+ # Note in this case there is only ever one spec
+ # being applied so there is no need to worry about
+ # handling of continue_on_error
placementspec = PlacementSpec.from_string(placement)
if not service_type:
raise OrchestratorValidationError(usage)
specs = [ServiceSpec(service_type.value, placement=placementspec,
unmanaged=unmanaged, preview_only=dry_run)]
- return self._apply_misc(specs, dry_run, format, no_overwrite)
-
- def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult:
- completion = self.apply(specs, no_overwrite)
+ cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error)
+ if errs:
+ # HandleCommandResult is a named tuple, so use
+ # _replace to modify it.
+ cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs))
+ return cmd_result
+
+ def _apply_misc(
+ self,
+ specs: Sequence[GenericSpec],
+ dry_run: bool,
+ format: Format,
+ no_overwrite: bool = False,
+ continue_on_error: bool = False
+ ) -> HandleCommandResult:
+ completion = self.apply(specs, no_overwrite, continue_on_error)
raise_if_exception(completion)
out = completion.result_str()
if dry_run:
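The --continue-on-error handling above follows a collect-or-raise pattern: bad specs are either accumulated as error strings or re-raised immediately. A standalone sketch of that pattern, assuming a generic parse callable (names are illustrative only):

    from typing import Callable, List, Tuple

    def apply_specs(raw_specs: List[dict], parse: Callable[[dict], object],
                    continue_on_error: bool = False) -> Tuple[List[object], List[str]]:
        """Parse each spec, either collecting errors (continue_on_error=True)
        or failing fast on the first bad spec."""
        specs: List[object] = []
        errs: List[str] = []
        for raw in raw_specs:
            try:
                specs.append(parse(raw))
            except Exception as e:
                if continue_on_error:
                    errs.append(f'Failed to convert {raw}: {e}')
                    continue
                raise
        return specs, errs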
diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
index 726a7ac7937..3247b06a399 100644
--- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
+++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
@@ -102,7 +102,7 @@ placement:
host_pattern: '*'
status:
container_image_id: 74803e884bea289d2d2d3ebdf6d37cd560499e955595695b1390a89800f4e37a
- container_image_name: docker.io/ceph/daemon-base:latest-master-devel
+ container_image_name: quay.io/ceph/daemon-base:latest-main-devel
created: '2020-06-10T10:37:31.051288Z'
last_refresh: '2020-06-10T10:57:40.715637Z'
running: 1
diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py
index dea45f951f8..3e8544f43cf 100644
--- a/src/pybind/mgr/smb/enums.py
+++ b/src/pybind/mgr/smb/enums.py
@@ -21,7 +21,7 @@ class CephFSStorageProvider(_StrEnum):
def expand(self) -> 'CephFSStorageProvider':
"""Expand abbreviated/default values into the full/expanded form."""
- if self == self.SAMBA_VFS:
+ if self is self.SAMBA_VFS:
# mypy gets confused by enums
return self.__class__(self.SAMBA_VFS_NEW)
return self
@@ -89,9 +89,9 @@ class LoginAccess(_StrEnum):
def expand(self) -> 'LoginAccess':
"""Exapend abbreviated enum values into their full forms."""
# the extra LoginAccess(...) calls are to appease mypy
- if self == self.READ_ONLY_SHORT:
+ if self is self.READ_ONLY_SHORT:
return LoginAccess(self.READ_ONLY)
- if self == self.READ_WRITE_SHORT:
+ if self is self.READ_WRITE_SHORT:
return LoginAccess(self.READ_WRITE)
return self
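A small standalone example of why the smb enums now compare members with `is`: enum members are singletons, so an identity check is unambiguous for mypy even on str-based enums, where `==` falls through to string comparison overloads. The Access class below is illustrative, not the smb module's LoginAccess:

    import enum

    class Access(str, enum.Enum):
        READ_ONLY = 'read'
        READ_ONLY_SHORT = 'r'

        def expand(self) -> 'Access':
            # identity comparison: unambiguous and immune to the
            # str/enum __eq__ overloads of a str-based enum
            if self is Access.READ_ONLY_SHORT:
                return Access.READ_ONLY
            return self

    assert Access('r').expand() is Access.READ_ONLY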
diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py
index 7483eb7964b..4512ad6add3 100644
--- a/src/pybind/mgr/smb/module.py
+++ b/src/pybind/mgr/smb/module.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
+from typing import TYPE_CHECKING, Any, List, Optional, cast
import logging
@@ -171,6 +171,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
custom_dns: Optional[List[str]] = None,
placement: Optional[str] = None,
clustering: Optional[SMBClustering] = None,
+ public_addrs: Optional[List[str]] = None,
) -> results.Result:
"""Create an smb cluster"""
domain_settings = None
@@ -255,6 +256,18 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
)
)
+ c_public_addrs = []
+ if public_addrs:
+ for pa in public_addrs:
+ pa_arr = pa.split('%', 1)
+ address = pa_arr[0]
+ destination = pa_arr[1] if len(pa_arr) > 1 else None
+ c_public_addrs.append(
+ resources.ClusterPublicIPAssignment(
+ address=address, destination=destination
+ )
+ )
+
pspec = resources.WrappedPlacementSpec.wrap(
PlacementSpec.from_string(placement)
)
@@ -266,6 +279,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
custom_dns=custom_dns,
placement=pspec,
clustering=clustering,
+ public_addrs=c_public_addrs,
)
to_apply.append(cluster)
return self._handler.apply(to_apply, create_only=True).squash(cluster)
@@ -336,45 +350,6 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
return resources[0].to_simplified()
return {'resources': [r.to_simplified() for r in resources]}
- @cli.SMBCommand('dump cluster-config', perm='r')
- def dump_config(self, cluster_id: str) -> Dict[str, Any]:
- """DEBUG: Generate an example configuration"""
- # TODO: Remove this command prior to release
- return self._handler.generate_config(cluster_id)
-
- @cli.SMBCommand('dump service-spec', perm='r')
- def dump_service_spec(self, cluster_id: str) -> Dict[str, Any]:
- """DEBUG: Generate an example smb service spec"""
- # TODO: Remove this command prior to release
- return dict(
- self._handler.generate_smb_service_spec(cluster_id).to_json()
- )
-
- @cli.SMBCommand('dump everything', perm='r')
- def dump_everything(self) -> Dict[str, Any]:
- """DEBUG: Show me everything"""
- # TODO: Remove this command prior to release
- everything: Dict[str, Any] = {}
- everything['PUBLIC'] = {}
- log.warning('dumping PUBLIC')
- for key in self._public_store:
- e = self._public_store[key]
- log.warning('dumping e: %s %r', e.uri, e.full_key)
- everything['PUBLIC'][e.uri] = e.get()
- log.warning('dumping PRIV')
- everything['PRIV'] = {}
- for key in self._priv_store:
- e = self._priv_store[key]
- log.warning('dumping e: %s %r', e.uri, e.full_key)
- everything['PRIV'][e.uri] = e.get()
- log.warning('dumping INTERNAL')
- everything['INTERNAL'] = {}
- for key in self._internal_store:
- e = self._internal_store[key]
- log.warning('dumping e: %s %r', e.uri, e.full_key)
- everything['INTERNAL'][e.uri] = e.get()
- return everything
-
def submit_smb_spec(self, spec: SMBSpec) -> None:
"""Submit a new or updated smb spec object to ceph orchestration."""
completion = self.apply_smb(spec)
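The public_addrs handling added above splits each entry on the first '%' into an address and an optional destination. A minimal standalone helper sketch with hypothetical names:

    from typing import List, NamedTuple, Optional

    class PublicAddr(NamedTuple):
        address: str
        destination: Optional[str]

    def parse_public_addrs(public_addrs: List[str]) -> List[PublicAddr]:
        """Split each 'ADDRESS[%DESTINATION]' string into its two parts."""
        out: List[PublicAddr] = []
        for pa in public_addrs:
            address, _, destination = pa.partition('%')
            out.append(PublicAddr(address=address, destination=destination or None))
        return out

    # e.g. parse_public_addrs(['192.168.1.10/24%eth1', '192.168.1.11/24'])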
diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py
index c9fd02968b9..0d3610326c2 100644
--- a/src/pybind/mgr/smb/tests/test_smb.py
+++ b/src/pybind/mgr/smb/tests/test_smb.py
@@ -410,72 +410,6 @@ def test_cmd_apply_share(tmodule):
assert bdata["results"][0]["state"] == "created"
-def test_share_dump_config(tmodule):
- _example_cfg_1(tmodule)
-
- cfg = tmodule.dump_config('foo')
- assert cfg == {
- 'samba-container-config': "v0",
- 'configs': {
- 'foo': {
- 'instance_name': 'foo',
- 'instance_features': [],
- 'shares': ['Ess One', 'Ess Two'],
- 'globals': ['default', 'foo'],
- },
- },
- 'shares': {
- 'Ess One': {
- 'options': {
- 'path': '/',
- 'read only': 'No',
- 'browseable': 'Yes',
- 'kernel share modes': 'no',
- 'x:ceph:id': 'foo.s1',
- 'vfs objects': 'acl_xattr ceph_new',
- 'acl_xattr:security_acl_name': 'user.NTACL',
- 'ceph_new:config_file': '/etc/ceph/ceph.conf',
- 'ceph_new:filesystem': 'cephfs',
- 'ceph_new:user_id': 'smb.fs.cluster.foo',
- },
- },
- 'Ess Two': {
- 'options': {
- 'path': '/two',
- 'read only': 'No',
- 'browseable': 'Yes',
- 'kernel share modes': 'no',
- 'x:ceph:id': 'foo.stwo',
- 'vfs objects': 'acl_xattr ceph_new',
- 'acl_xattr:security_acl_name': 'user.NTACL',
- 'ceph_new:config_file': '/etc/ceph/ceph.conf',
- 'ceph_new:filesystem': 'cephfs',
- 'ceph_new:user_id': 'smb.fs.cluster.foo',
- },
- },
- },
- 'globals': {
- 'default': {
- 'options': {
- 'load printers': 'No',
- 'printing': 'bsd',
- 'printcap name': '/dev/null',
- 'disable spoolss': 'Yes',
- },
- },
- 'foo': {
- 'options': {
- 'idmap config * : backend': 'autorid',
- 'idmap config * : range': '2000-9999999',
- 'realm': 'dom1.example.com',
- 'security': 'ads',
- 'workgroup': 'DOM1',
- },
- },
- },
- }
-
-
def test_cluster_create_ad1(tmodule):
_example_cfg_1(tmodule)
@@ -613,29 +547,6 @@ def test_cluster_rm(tmodule):
assert result.success
-def test_dump_service_spec(tmodule):
- _example_cfg_1(tmodule)
- tmodule._public_store.overwrite(
- {
- 'foo.config.smb': '',
- }
- )
- tmodule._priv_store.overwrite(
- {
- 'foo.join.2b9902c05d08bcba.json': '',
- 'foo.join.08129d4d3b8c37c7.json': '',
- }
- )
-
- cfg = tmodule.dump_service_spec('foo')
- assert cfg
- assert cfg['service_id'] == 'foo'
- assert cfg['spec']['cluster_id'] == 'foo'
- assert cfg['spec']['features'] == ['domain']
- assert cfg['spec']['config_uri'] == 'mem:foo/config.smb'
- assert len(cfg['spec']['join_sources']) == 2
-
-
def test_cmd_show_resource_json(tmodule):
_example_cfg_1(tmodule)
diff --git a/src/pybind/mgr/telemetry/tox.ini b/src/pybind/mgr/telemetry/tox.ini
index a887590eed8..b2210da54ea 100644
--- a/src/pybind/mgr/telemetry/tox.ini
+++ b/src/pybind/mgr/telemetry/tox.ini
@@ -1,7 +1,6 @@
[tox]
envlist =
py3
- mypy
skipsdist = true
[testenv]
diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini
index a8a2d39d01a..f39ececa93d 100644
--- a/src/pybind/mgr/tox.ini
+++ b/src/pybind/mgr/tox.ini
@@ -160,7 +160,8 @@ modules =
commands =
flake8 --config=tox.ini {posargs} \
{posargs:{[testenv:flake8]modules}}
- bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 13'
+ bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3'
+ bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26'
[testenv:jinjalint]
deps =
diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py
index 6834e3e240b..83a119ca556 100644
--- a/src/pybind/mgr/volumes/fs/async_job.py
+++ b/src/pybind/mgr/volumes/fs/async_job.py
@@ -167,7 +167,7 @@ class AsyncJobs(threading.Thread):
for i in range(c, self.nr_concurrent_jobs):
self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(self.name_pfx, time.time(), i)))
self.threads[-1].start()
- self.cv.wait(timeout=5)
+ self.cv.wait(timeout=self.wakeup_timeout)
def shutdown(self):
self.stopping.set()
diff --git a/src/pybind/mgr/volumes/fs/operations/pin_util.py b/src/pybind/mgr/volumes/fs/operations/pin_util.py
index a12ab5b4d4b..631fdd8fcaa 100644
--- a/src/pybind/mgr/volumes/fs/operations/pin_util.py
+++ b/src/pybind/mgr/volumes/fs/operations/pin_util.py
@@ -3,7 +3,7 @@ import errno
import cephfs
from ..exception import VolumeException
-from distutils.util import strtobool
+from ceph.utils import strtobool
_pin_value = {
"export": lambda x: int(x),
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py
index 610a61e6a4c..146d6d3f453 100644
--- a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py
+++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py
@@ -172,7 +172,7 @@ class MetadataManager(object):
metadata_dict[option] = self.config.get(section,option)
return metadata_dict
- def list_all_keys_with_specified_values_from_section(self, section, value):
+ def filter_keys(self, section, value):
keys = []
if self.config.has_section(section):
options = self.config.options(section)
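A self-contained sketch of what the renamed filter_keys does, using plain configparser: return every key in a section whose stored value matches the one given (illustrative code, not the MetadataManager implementation):

    import configparser
    from typing import List

    def filter_keys(config: configparser.ConfigParser, section: str, value: str) -> List[str]:
        """Return every option in `section` whose stored value equals `value`."""
        keys: List[str] = []
        if config.has_section(section):
            for option in config.options(section):
                if config.get(section, option) == value:
                    keys.append(option)
        return keys

    cfg = configparser.ConfigParser()
    cfg.read_dict({'clone snaps': {'track-1': 'snap-a', 'track-2': 'snap-b',
                                   'track-3': 'snap-a'}})
    assert filter_keys(cfg, 'clone snaps', 'snap-a') == ['track-1', 'track-3']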
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
index 33d364b8b45..72209ca61b5 100644
--- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
@@ -758,7 +758,7 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate):
try:
if self.has_pending_clones(snapname):
- pending_track_id_list = self.metadata_mgr.list_all_keys_with_specified_values_from_section('clone snaps', snapname)
+ pending_track_id_list = self.metadata_mgr.filter_keys('clone snaps', snapname)
else:
return pending_clones_info
except MetadataMgrException as me:
@@ -780,9 +780,9 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate):
raise VolumeException(-e.args[0], e.args[1])
else:
try:
- # If clone is completed between 'list_all_keys_with_specified_values_from_section'
- # and readlink(track_id_path) call then readlink will fail with error ENOENT (2)
- # Hence we double check whether track_id is exist in .meta file or not.
+ # If clone is completed between 'filter_keys' and readlink(track_id_path) call
+ # then readlink will fail with error ENOENT (2). Hence we double check whether
+ # track_id exists in .meta file or not.
# Edge case scenario.
# If track_id for clone exist but path /volumes/_index/clone/{track_id} not found
# then clone is orphan.
diff --git a/src/pybind/mgr/volumes/fs/stats_util.py b/src/pybind/mgr/volumes/fs/stats_util.py
index cec33eaa887..3334dc5a3d7 100644
--- a/src/pybind/mgr/volumes/fs/stats_util.py
+++ b/src/pybind/mgr/volumes/fs/stats_util.py
@@ -106,6 +106,11 @@ class CloneProgressReporter:
# reporting has already been initiated by calling RTimer.is_alive().
self.update_task = RTimer(1, self._update_progress_bars)
+ # progress event ID for ongoing clone jobs
+ self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones'
+ # progress event ID for ongoing+pending clone jobs
+ self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones'
+
def initiate_reporting(self):
if self.update_task.is_alive():
log.info('progress reporting thread is already alive, not '
@@ -113,11 +118,6 @@ class CloneProgressReporter:
return
log.info('initiating progress reporting for clones...')
- # progress event ID for ongoing clone jobs
- self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones'
- # progress event ID for ongoing+pending clone jobs
- self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones'
-
self.update_task = RTimer(1, self._update_progress_bars)
self.update_task.start()
log.info('progress reporting for clones has been initiated')
@@ -294,10 +294,7 @@ class CloneProgressReporter:
assert self.onpen_pev_id is not None
self.volclient.mgr.remote('progress', 'complete', self.on_pev_id)
- self.on_pev_id = None
-
self.volclient.mgr.remote('progress', 'complete', self.onpen_pev_id)
- self.onpen_pev_id = None
log.info('finished removing progress bars from "ceph status" output')
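The move above keeps the two progress-event IDs as stable attributes assigned at construction time, so the completion path can always reference them instead of recreating or clearing them per reporting run. A toy sketch of that pattern; ToyProgress stands in for the mgr 'progress' module and all names are hypothetical:

    from typing import Dict

    class ToyProgress:
        """Stand-in for the mgr 'progress' module remote interface."""
        def __init__(self) -> None:
            self.events: Dict[str, float] = {}

        def update(self, ev_id: str, message: str, fraction: float) -> None:
            self.events[ev_id] = fraction

        def complete(self, ev_id: str) -> None:
            self.events.pop(ev_id, None)

    class CloneReporter:
        def __init__(self, progress: ToyProgress) -> None:
            self.progress = progress
            # stable event IDs, assigned once and never reset to None
            self.on_pev_id = 'mgr-vol-ongoing-clones'
            self.onpen_pev_id = 'mgr-vol-total-clones'

        def update(self, ongoing_fraction: float, total_fraction: float) -> None:
            self.progress.update(self.on_pev_id, 'Ongoing clones', ongoing_fraction)
            self.progress.update(self.onpen_pev_id, 'Ongoing+pending clones', total_fraction)

        def finish(self) -> None:
            # safe to call repeatedly because the IDs still exist
            self.progress.complete(self.on_pev_id)
            self.progress.complete(self.onpen_pev_id)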
diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py
index 041f1ed3044..59ebbb6347e 100644
--- a/src/python-common/ceph/deployment/drive_selection/selector.py
+++ b/src/python-common/ceph/deployment/drive_selection/selector.py
@@ -131,6 +131,10 @@ class DriveSelection(object):
for disk in self.disks:
logger.debug("Processing disk {}".format(disk.path))
+ if disk.being_replaced:
+ logger.debug('Ignoring disk {} as it is being replaced.'.format(disk.path))
+ continue
+
if not disk.available and not disk.ceph_device:
logger.debug(
("Ignoring disk {}. "
diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py
index a3023882108..e2c1a5605f9 100644
--- a/src/python-common/ceph/deployment/inventory.py
+++ b/src/python-common/ceph/deployment/inventory.py
@@ -54,7 +54,8 @@ class Device(object):
'human_readable_type',
'device_id',
'lsm_data',
- 'crush_device_class'
+ 'crush_device_class',
+ 'being_replaced'
]
def __init__(self,
@@ -67,7 +68,8 @@ class Device(object):
lsm_data=None, # type: Optional[Dict[str, Dict[str, str]]]
created=None, # type: Optional[datetime.datetime]
ceph_device=None, # type: Optional[bool]
- crush_device_class=None # type: Optional[str]
+ crush_device_class=None, # type: Optional[str]
+ being_replaced=None, # type: Optional[bool]
):
self.path = path
@@ -80,6 +82,7 @@ class Device(object):
self.created = created if created is not None else datetime_now()
self.ceph_device = ceph_device
self.crush_device_class = crush_device_class
+ self.being_replaced = being_replaced
def __eq__(self, other):
# type: (Any) -> bool
@@ -129,7 +132,8 @@ class Device(object):
'lvs': self.lvs if self.lvs else 'None',
'available': str(self.available),
'ceph_device': str(self.ceph_device),
- 'crush_device_class': str(self.crush_device_class)
+ 'crush_device_class': str(self.crush_device_class),
+ 'being_replaced': str(self.being_replaced)
}
if not self.available and self.rejected_reasons:
device_desc['rejection reasons'] = self.rejected_reasons
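A minimal sketch of the new being_replaced flag: such a device still shows up in the inventory output but is skipped by drive selection. The dataclass and helper below are illustrative, not the real Device/DriveSelection classes:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class SketchDevice:
        path: str
        available: bool = True
        being_replaced: Optional[bool] = None  # mirrors the new inventory field

    def selectable(devices: List[SketchDevice]) -> List[SketchDevice]:
        """Skip devices that are unavailable or currently being replaced."""
        picked = []
        for d in devices:
            if d.being_replaced:
                continue
            if not d.available:
                continue
            picked.append(d)
        return picked

    devs = [SketchDevice('/dev/sdb'), SketchDevice('/dev/sdc', being_replaced=True)]
    assert [d.path for d in selectable(devs)] == ['/dev/sdb']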
diff --git a/src/python-common/ceph/utils.py b/src/python-common/ceph/utils.py
index e92a2d1de7d..0544e9f4173 100644
--- a/src/python-common/ceph/utils.py
+++ b/src/python-common/ceph/utils.py
@@ -167,3 +167,18 @@ def http_req(hostname: str = '',
log.error(e)
# handle error here if needed
raise
+
+
+_TRUE_VALS = {'y', 'yes', 't', 'true', 'on', '1'}
+_FALSE_VALS = {'n', 'no', 'f', 'false', 'off', '0'}
+
+
+def strtobool(value: str) -> bool:
+ """Convert a string to a boolean value.
+ Based on a similar function once available as distutils.util.strtobool.
+ """
+ if value.lower() in _TRUE_VALS:
+ return True
+ if value.lower() in _FALSE_VALS:
+ return False
+ raise ValueError(f'invalid truth value {value!r}')
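The strtobool helper added above replaces the removed distutils.util.strtobool, returning a real bool rather than 0/1 and raising ValueError on anything it does not recognise. A quick usage sketch, assuming src/python-common is importable:

    from ceph.utils import strtobool  # replacement for distutils.util.strtobool

    assert strtobool('Yes') is True
    assert strtobool('off') is False
    try:
        strtobool('maybe')
    except ValueError as e:
        print(e)  # invalid truth value 'maybe'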
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index a8874195217..b00dfaa1ec5 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -11187,22 +11187,22 @@ next:
}
formatter->open_object_section("result");
- formatter->open_array_section("topics");
- do {
- rgw_pubsub_topics result;
- int ret = ps.get_topics(dpp(), next_token, max_entries,
- result, next_token, null_yield);
- if (ret < 0 && ret != -ENOENT) {
- cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl;
- return -ret;
- }
- for (const auto& [_, topic] : result.topics) {
- if (owner && *owner != topic.owner) {
- continue;
+ rgw_pubsub_topics result;
+ if (rgw::all_zonegroups_support(*site, rgw::zone_features::notification_v2) &&
+ driver->stat_topics_v1(tenant, null_yield, dpp()) == -ENOENT) {
+ formatter->open_array_section("topics");
+ do {
+ int ret = ps.get_topics_v2(dpp(), next_token, max_entries,
+ result, next_token, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
}
- std::set<std::string> subscribed_buckets;
- if (rgw::all_zonegroups_support(*site, rgw::zone_features::notification_v2) &&
- driver->stat_topics_v1(tenant, null_yield, dpp()) == -ENOENT) {
+ for (const auto& [_, topic] : result.topics) {
+ if (owner && *owner != topic.owner) {
+ continue;
+ }
+ std::set<std::string> subscribed_buckets;
ret = driver->get_bucket_topic_mapping(topic, subscribed_buckets,
null_yield, dpp());
if (ret < 0) {
@@ -11210,15 +11210,21 @@ next:
<< topic.name << ", ret=" << ret << std::endl;
}
show_topics_info_v2(topic, subscribed_buckets, formatter.get());
- } else {
- encode_json("result", result, formatter.get());
- }
- if (max_entries_specified) {
- --max_entries;
+ if (max_entries_specified) {
+ --max_entries;
+ }
}
+ result.topics.clear();
+ } while (!next_token.empty() && max_entries > 0);
+ formatter->close_section(); // topics
+ } else { // v1, list all topics
+ int ret = ps.get_topics_v1(dpp(), result, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
}
- } while (!next_token.empty() && max_entries > 0);
- formatter->close_section(); // topics
+ encode_json("result", result, formatter.get());
+ }
if (max_entries_specified) {
encode_json("truncated", !next_token.empty(), formatter.get());
if (!next_token.empty()) {
diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h
index 7b8d12599f4..2973a753fff 100644
--- a/src/rgw/rgw_lua_background.h
+++ b/src/rgw/rgw_lua_background.h
@@ -153,9 +153,8 @@ private:
void run();
-protected:
std::string rgw_script;
- virtual int read_script();
+ int read_script();
public:
Background(rgw::sal::Driver* _driver,
@@ -173,7 +172,7 @@ public:
std::unique_lock cond_lock(table_mutex);
rgw_map[key] = value;
}
-
+
// update the manager after
void set_manager(rgw::sal::LuaManager* _lua_manager);
void pause() override;
diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc
index 92b65b0ebba..cb68d72d7da 100644
--- a/src/rgw/rgw_pubsub.cc
+++ b/src/rgw/rgw_pubsub.cc
@@ -570,22 +570,16 @@ RGWPubSub::RGWPubSub(rgw::sal::Driver* _driver,
{
}
-int RGWPubSub::get_topics(const DoutPrefixProvider* dpp,
- const std::string& start_marker, int max_items,
- rgw_pubsub_topics& result, std::string& next_marker,
- optional_yield y) const
+int RGWPubSub::get_topics_v2(const DoutPrefixProvider* dpp,
+ const std::string& start_marker, int max_items,
+ rgw_pubsub_topics& result, std::string& next_marker,
+ optional_yield y) const
{
if (rgw::account::validate_id(tenant)) {
// if our tenant is an account, return the account listing
return list_account_topics(dpp, start_marker, max_items,
result, next_marker, y);
}
-
- if (!use_notification_v2 || driver->stat_topics_v1(tenant, y, dpp) != -ENOENT) {
- // in case of v1 or during migration we use v1 topics
- // v1 returns all topics, ignoring marker/max_items
- return read_topics_v1(dpp, result, nullptr, y);
- }
// TODO: prefix filter on 'tenant:'
void* handle = NULL;
@@ -629,6 +623,13 @@ int RGWPubSub::get_topics(const DoutPrefixProvider* dpp,
return ret;
}
+int RGWPubSub::get_topics_v1(const DoutPrefixProvider* dpp,
+ rgw_pubsub_topics& result,
+ optional_yield y) const
+{
+ return read_topics_v1(dpp, result, nullptr, y);
+}
+
int RGWPubSub::list_account_topics(const DoutPrefixProvider* dpp,
const std::string& start_marker,
int max_items, rgw_pubsub_topics& result,
diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h
index b7ce443af03..8a6b290cb85 100644
--- a/src/rgw/rgw_pubsub.h
+++ b/src/rgw/rgw_pubsub.h
@@ -643,9 +643,14 @@ public:
// get a paginated list of topics
// return 0 on success, error code otherwise
- int get_topics(const DoutPrefixProvider* dpp,
- const std::string& start_marker, int max_items,
- rgw_pubsub_topics& result, std::string& next_marker,
+ int get_topics_v2(const DoutPrefixProvider* dpp,
+ const std::string& start_marker, int max_items,
+ rgw_pubsub_topics& result, std::string& next_marker,
+ optional_yield y) const;
+
+ // return 0 on success, error code otherwise
+ int get_topics_v1(const DoutPrefixProvider* dpp,
+ rgw_pubsub_topics& result,
optional_yield y) const;
// get a topic with by its name and populate it into "result"
diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc
index a3784ca95b0..c0345a4f88a 100644
--- a/src/rgw/rgw_rest_pubsub.cc
+++ b/src/rgw/rgw_rest_pubsub.cc
@@ -493,8 +493,13 @@ void RGWPSListTopicsOp::execute(optional_yield y) {
const std::string start_token = s->info.args.get("NextToken");
const RGWPubSub ps(driver, get_account_or_tenant(s->owner.id), *s->penv.site);
- constexpr int max_items = 100;
- op_ret = ps.get_topics(this, start_token, max_items, result, next_token, y);
+ if (rgw::all_zonegroups_support(*s->penv.site, rgw::zone_features::notification_v2) &&
+ driver->stat_topics_v1(s->bucket->get_tenant(), null_yield, this) == -ENOENT) {
+ op_ret = ps.get_topics_v1(this, result, y);
+ } else {
+ constexpr int max_items = 100;
+ op_ret = ps.get_topics_v2(this, start_token, max_items, result, next_token, y);
+ }
// if there are no topics it is not considered an error
op_ret = op_ret == -ENOENT ? 0 : op_ret;
if (op_ret < 0) {
diff --git a/src/test/client/nonblocking.cc b/src/test/client/nonblocking.cc
index d4aecb10ffc..93bcfabd3fc 100644
--- a/src/test/client/nonblocking.cc
+++ b/src/test/client/nonblocking.cc
@@ -111,6 +111,8 @@ TEST_F(TestClient, LlreadvLlwritev) {
writefinish.reset(new C_SaferCond("test-nonblocking-writefinish"));
readfinish.reset(new C_SaferCond("test-nonblocking-readfinish"));
ssize_t nwritten_a = iov_out_a[0].iov_len + iov_out_a[1].iov_len;
+ // reset bufferlist
+ bl.clear();
rc = client->ll_preadv_pwritev(fh, iov_out_a, 2, 100, true, writefinish.get(), nullptr);
ASSERT_EQ(0, rc);
@@ -130,6 +132,8 @@ TEST_F(TestClient, LlreadvLlwritev) {
writefinish.reset(new C_SaferCond("test-nonblocking-writefinish"));
readfinish.reset(new C_SaferCond("test-nonblocking-readfinish"));
ssize_t nwritten_b = iov_out_b[0].iov_len + iov_out_b[1].iov_len;
+ // reset bufferlist
+ bl.clear();
rc = client->ll_preadv_pwritev(fh, iov_out_b, 2, 1000, true, writefinish.get(), nullptr, true, false);
ASSERT_EQ(0, rc);
diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc
index 977dfe738a9..cee4b427770 100644
--- a/src/test/common/test_mutex_debug.cc
+++ b/src/test/common/test_mutex_debug.cc
@@ -1,5 +1,5 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 &smarttab
+// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
@@ -57,21 +57,13 @@ TEST(MutexDebug, Lock) {
test_lock<ceph::mutex_debug>();
}
-TEST(MutexDebug, NotRecursive) {
+TEST(MutexDebugDeathTest, NotRecursive) {
ceph::mutex_debug m("foo");
- auto ttl = &test_try_lock<mutex_debug>;
-
- ASSERT_NO_THROW(m.lock());
- ASSERT_TRUE(m.is_locked());
- ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get());
-
- ASSERT_THROW(m.lock(), std::system_error);
+ // avoid assert during test cleanup where the mutex is locked and cannot be
+ // pthread_mutex_destroy'd
+ std::unique_lock locker{m};
ASSERT_TRUE(m.is_locked());
- ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get());
-
- ASSERT_NO_THROW(m.unlock());
- ASSERT_FALSE(m.is_locked());
- ASSERT_TRUE(std::async(std::launch::async, ttl, &m).get());
+ ASSERT_DEATH(m.lock(), "FAILED ceph_assert(recursive || !is_locked_by_me())");
}
TEST(MutexRecursiveDebug, Lock) {
diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc
index 6648719c61c..df743327aaa 100644
--- a/src/test/crimson/test_backfill.cc
+++ b/src/test/crimson/test_backfill.cc
@@ -128,7 +128,8 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
void enqueue_push(
const hobject_t& obj,
- const eversion_t& v) override;
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) override;
void enqueue_drop(
const pg_shard_t& target,
@@ -243,6 +244,10 @@ struct BackfillFixture::PeeringFacade
void update_complete_backfill_object_stats(const hobject_t &hoid,
const pg_stat_t &stats) override {
}
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) override {}
bool is_backfilling() const override {
return true;
}
@@ -270,6 +275,9 @@ BackfillFixture::BackfillFixture(
this->backfill_targets),
std::make_unique<PGFacade>(this->backfill_source))
{
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
backfill_state.process_event(crimson::osd::BackfillState::Triggered{}.intrusive_from_this());
}
@@ -303,7 +311,8 @@ void BackfillFixture::request_primary_scan(
void BackfillFixture::enqueue_push(
const hobject_t& obj,
- const eversion_t& v)
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &)
{
for (auto& [ _, bt ] : backfill_targets) {
bt.store.push(obj, v);
diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc
index 907884fe35d..e24773886bc 100644
--- a/src/test/exporter/test_exporter.cc
+++ b/src/test/exporter/test_exporter.cc
@@ -1,6 +1,8 @@
#include "common/ceph_argparse.h"
#include "common/config.h"
#include "common/config_proxy.h"
+#include "common/admin_socket.h"
+#include "common/admin_socket_client.h"
#include <gmock/gmock.h>
#include "gtest/gtest.h"
#include "common/ceph_context.h"
@@ -8,6 +10,7 @@
#include "global/global_init.h"
#include "exporter/util.h"
#include "exporter/DaemonMetricCollector.h"
+#include <filesystem>
#include <regex>
#include <string>
@@ -674,6 +677,27 @@ static std::vector<std::pair<std::string, std::string>> promethize_data = {
{"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"}
};
+
+class AdminSocketTest
+{
+public:
+ explicit AdminSocketTest(AdminSocket *asokc)
+ : m_asokc(asokc)
+ {
+ }
+ bool init(const std::string &uri) {
+ return m_asokc->init(uri);
+ }
+ std::string bind_and_listen(const std::string &sock_path, int *fd) {
+ return m_asokc->bind_and_listen(sock_path, fd);
+ }
+ bool shutdown() {
+ m_asokc->shutdown();
+ return true;
+ }
+ AdminSocket *m_asokc;
+};
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
@@ -1289,8 +1313,11 @@ ceph_mon_session_rm{ceph_daemon="mon.a"} 577
# TYPE ceph_mon_session_trim counter
ceph_mon_session_trim{ceph_daemon="mon.a"} 9
)";
-
- ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos);
+
+ std::string actualMetrics = collector.metrics;
+ std::cout << "Actual MON Metrics: " << actualMetrics << std::endl;
+ ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos);
+ //ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos);
// Test for labeled metrics - RGW
daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064";
@@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) {
EXPECT_EQ(new_metric.first, expected_labels);
ASSERT_TRUE(new_metric.second == expected_metric_name);
}
+
+TEST(Exporter, UpdateSockets) {
+ const std::string mock_dir = "/tmp/fake_sock_dir";
+
+ // Create the mock directory
+ std::filesystem::create_directories(mock_dir);
+
+ // Create a mix of vstart and real cluster mock .asok files
+ std::ofstream(mock_dir + "/ceph-osd.0.asok").close();
+ std::ofstream(mock_dir + "/ceph-mds.a.asok").close();
+ std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close();
+ std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close();
+ std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close();
+ std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close();
+
+ g_conf().set_val("exporter_sock_dir", mock_dir);
+
+ DaemonMetricCollector collector;
+
+ // Run the function that interacts with the mock directory
+ collector.update_sockets();
+
+ // Verify the expected results
+ ASSERT_EQ(collector.clients.size(), 4);
+ ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end());
+ ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end());
+ ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end());
+ ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end());
+
+
+ // Remove the mock directory and files
+ std::filesystem::remove_all(mock_dir);
+}
+
+
+TEST(Exporter, HealthMetrics) {
+ std::map<std::string, AdminSocketClient> clients;
+ DaemonMetricCollector &collector = collector_instance();
+ std::string daemon = "test_daemon";
+ std::string expectedCounterDump = "";
+ std::string expectedCounterSchema = "";
+ std::string metricName = "ceph_daemon_socket_up";
+
+ // Fake admin socket
+ std::string asok_path = "/tmp/" + daemon + ".asok";
+ std::unique_ptr<AdminSocket> asokc = std::make_unique<AdminSocket>(g_ceph_context);
+ AdminSocketClient client(asok_path);
+
+ // Add the daemon clients to the collector
+ clients.insert({daemon, std::move(client)});
+ collector.clients = clients;
+
+ auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) {
+ collector.metrics = "";
+
+ if (shouldInitializeSocket) {
+ AdminSocketTest asoct(asokc.get());
+ ASSERT_TRUE(asoct.init(asok_path));
+ }
+
+ collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false);
+
+ if (shouldInitializeSocket) {
+ AdminSocketTest asoct(asokc.get());
+ ASSERT_TRUE(asoct.shutdown());
+ }
+
+ std::string retrievedMetrics = collector.metrics;
+ std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)";
+ std::regex regexPattern(pattern);
+ ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern));
+ };
+
+ // Test an admin socket not answering: metric value should be "0"
+ verifyMetricValue("0", false);
+
+ // Test an admin socket answering: metric value should be "1"
+ verifyMetricValue("1", true);
+}
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index f2c87168633..6f10d2bbd4e 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -976,6 +976,13 @@ TEST(LibCephFS, Symlinks) {
fd = ceph_open(cmount, test_symlink, O_NOFOLLOW, 0);
ASSERT_EQ(fd, -CEPHFS_ELOOP);
+#if defined(__linux__) && defined(O_PATH)
+ // test the O_NOFOLLOW with O_PATH case
+ fd = ceph_open(cmount, test_symlink, O_PATH|O_NOFOLLOW, 0);
+ ASSERT_GT(fd, 0);
+ ceph_close(cmount, fd);
+#endif /* __linux */
+
// stat the original file
struct ceph_statx stx_orig;
ASSERT_EQ(ceph_statx(cmount, test_file, &stx_orig, CEPH_STATX_ALL_STATS, 0), 0);
@@ -3012,6 +3019,18 @@ TEST(LibCephFS, Readlinkat) {
ASSERT_EQ(0, memcmp(target, rel_file_path, target_len));
ASSERT_EQ(0, ceph_close(cmount, fd));
+#if defined(__linux__) && defined(O_PATH)
+ // test readlinkat with empty pathname relative to O_PATH|O_NOFOLLOW fd
+ fd = ceph_open(cmount, link_path, O_PATH | O_NOFOLLOW, 0);
+ ASSERT_LE(0, fd);
+ size_t link_target_len = strlen(rel_file_path);
+ char link_target[link_target_len+1];
+ ASSERT_EQ(link_target_len, ceph_readlinkat(cmount, fd, "", link_target, link_target_len));
+ link_target[link_target_len] = '\0';
+ ASSERT_EQ(0, memcmp(link_target, rel_file_path, link_target_len));
+ ASSERT_EQ(0, ceph_close(cmount, fd));
+#endif /* __linux */
+
ASSERT_EQ(0, ceph_unlink(cmount, link_path));
ASSERT_EQ(0, ceph_unlink(cmount, file_path));
ASSERT_EQ(0, ceph_rmdir(cmount, dir_path));
diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc
index dfc0304d00b..5e340c5c9c5 100644
--- a/src/test/osd/ceph_test_rados_io_sequence.cc
+++ b/src/test/osd/ceph_test_rados_io_sequence.cc
@@ -27,15 +27,193 @@
#define dout_subsys ceph_subsys_rados
#define dout_context g_ceph_context
+namespace {
+ struct Size {};
+ void validate(boost::any& v, const std::vector<std::string>& values,
+ Size *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(size);
+ }
+
+ struct Pair {};
+ void validate(boost::any& v, const std::vector<std::string>& values,
+ Pair *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ auto part = ceph::split(s).begin();
+ std::string parse_error;
+ int first = strict_iecstrtoll(*part++, &parse_error);
+ int second = strict_iecstrtoll(*part, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(std::pair<int,int>{first,second});
+ }
+
+ struct PluginString {};
+ void validate(boost::any& v, const std::vector<std::string>& values,
+ PluginString *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ const std::string_view* pluginIt = std::find(
+ ceph::io_sequence::tester::pluginChoices.begin(),
+ ceph::io_sequence::tester::pluginChoices.end(),
+ s
+ );
+ if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt)
+ {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+
+ v = boost::any(*pluginIt);
+ }
+
+ constexpr std::string_view usage[] = {
+ "Basic usage:",
+ "",
+ "ceph_test_rados_io_sequence",
+ "\t Test I/O to a single object using default settings. Good for",
+ "\t testing boundary conditions",
+ "",
+ "ceph_test_rados_io_sequence --parallel <n>",
+ "\t Run parallel test to multiple objects. First object is tested with",
+ "\t default settings, other objects are tested with random settings",
+ "",
+ "Advanced usage:",
+ "",
+ "ceph_test_rados_io_sequence --blocksize <b> --km <k,m> --plugin <p>",
+ " --objectsize <min,max> --threads <t>",
+ "ceph_test_rados_io_sequence --blocksize <b> --pool <p> --object <oid>",
+ " --objectsize <min,max> --threads <t>",
+ "\tCustomize the test, if a pool is specified then it defines the",
+ "\tReplica/EC configuration",
+ "",
+ "ceph_test_rados_io_sequence --listsequence",
+ "\t Display list of supported I/O sequences",
+ "",
+ "ceph_test_rados_io_sequence --dryrun --sequence <n>",
+ "\t Show I/O that will be generated for a sequence, validate",
+ "\t seqeunce has correct I/O barriers to restrict concurrency",
+ "",
+ "ceph_test_rados_io_sequence --seed <seed>",
+ "\t Repeat a previous test with the same random numbers (seed is",
+ "\t displayed at start of test), if threads = 1 then this will produce",
+ "\t the exact same sequence of I/O, if threads > 1 then I/Os are issued",
+ "\t in parallel so ordering might be slightly different",
+ "",
+ "ceph_test_rados_io_sequence --sequence <n> --seqseed <n>",
+ "\t Repeat a sequence from a previous test with the same random",
+ "\t numbers (seqseed is displayed at start of sequence)",
+ "",
+ "ceph_test_rados_io_sequence --pool <p> --object <oid> --interactive",
+ "\t Execute sequence of I/O commands from stdin. Offset and length",
+ "\t are specified with unit of blocksize. Supported commands:",
+ "\t\t create <len>",
+ "\t\t remove",
+ "\t\t read|write <off> <len>",
+ "\t\t read2|write2 <off> <len> <off> <len>",
+ "\t\t read3|write3 <off> <len> <off> <len> <off> <len>",
+ "\t\t done"
+ };
+
+ po::options_description get_options_description()
+ {
+ po::options_description desc("ceph_test_rados_io options");
+ desc.add_options()
+ ("help,h",
+ "show help message")
+ ("listsequence,l",
+ "show list of sequences")
+ ("dryrun,d",
+ "test sequence, do not issue any I/O")
+ ("verbose",
+ "more verbose output during test")
+ ("sequence,s", po::value<int>(),
+ "test specified sequence")
+ ("seed", po::value<int>(),
+ "seed for whole test")
+ ("seqseed", po::value<int>(),
+ "seed for sequence")
+ ("blocksize,b", po::value<Size>(),
+ "block size (default 2048)")
+ ("chunksize,c", po::value<Size>(),
+ "chunk size (default 4096)")
+ ("pool,p", po::value<std::string>(),
+ "pool name")
+ ("object,o", po::value<std::string>()->default_value("test"),
+ "object name")
+ ("km", po::value<Pair>(),
+ "k,m EC pool profile (default 2,2)")
+ ("plugin", po::value<PluginString>(),
+ "EC plugin (isa or jerasure)")
+ ("objectsize", po::value<Pair>(),
+ "min,max object size in blocks (default 1,32)")
+ ("threads,t", po::value<int>(),
+ "number of threads of I/O per object (default 1)")
+ ("parallel,p", po::value<int>()->default_value(1),
+ "number of objects to exercise in parallel")
+ ("interactive",
+ "interactive mode, execute IO commands from stdin");
+
+ return desc;
+ }
+
+ int parse_io_seq_options(
+ po::variables_map& vm,
+ int argc,
+ char** argv)
+ {
+ std::vector<std::string> unrecognized_options;
+ try {
+ po::options_description desc = get_options_description();
+
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ po::notify(vm);
+ unrecognized_options = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+
+ if (!unrecognized_options.empty())
+ {
+ std::stringstream ss;
+ ss << "Unrecognised command options supplied: ";
+ while (unrecognized_options.size() > 1)
+ {
+ ss << unrecognized_options.back().c_str() << ", ";
+ unrecognized_options.pop_back();
+ }
+ ss << unrecognized_options.back();
+ dout(0) << ss.str() << dendl;
+ return 1;
+ }
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ return 0;
+ }
+}
+
template <typename T, int N, const std::array<T, N>& Ts>
ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>
::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
po::variables_map vm,
const std::string& option_name,
bool set_forced,
- bool select_first)
+ bool select_first)
: rng(rng),
- // choices(choices),
option_name(option_name) {
if (set_forced && vm.count(option_name)) {
force_value = vm[option_name].as<T>();
@@ -86,7 +264,7 @@ ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize(
ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads(
ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
+ po::variables_map vm)
: ProgramOptionSelector(rng, vm, "threads", true, true)
{
}
@@ -99,9 +277,9 @@ ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange(
: ProgramOptionSelector(rng, vm, "sequence", false, false)
{
if (vm.count(option_name)) {
- ceph::io_exerciser::Sequence s =
+ ceph::io_exerciser::Sequence s =
static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>());
- if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN ||
+ if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN ||
s >= ceph::io_exerciser::Sequence::SEQUENCE_END) {
dout(0) << "Sequence argument out of range" << dendl;
throw po::validation_error(po::validation_error::invalid_option_value);
@@ -127,7 +305,7 @@ const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence>
ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM(
- ceph::util::random_number_generator<int>& rng,
+ ceph::util::random_number_generator<int>& rng,
po::variables_map vm)
: ProgramOptionSelector(rng, vm, "km", true, true)
{
@@ -180,11 +358,11 @@ const std::string ceph::io_sequence::tester::SelectECPool::choose()
}
int k = value.first;
int m = value.second;
-
+
const std::string plugin = std::string(spl.choose());
const uint64_t chunk_size = scs.choose();
- std::string pool_name = "ec_" + plugin +
+ std::string pool_name = "ec_" + plugin +
"_cs" + std::to_string(chunk_size) +
"_k" + std::to_string(k) +
"_m" + std::to_string(m);
@@ -234,33 +412,32 @@ ceph::io_sequence::tester::TestObject::TestObject( const std::string oid,
SelectECPool& spo,
SelectObjectSize& sos,
SelectNumThreads& snt,
- SelectSeqRange & ssr,
+ SelectSeqRange& ssr,
ceph::util::random_number_generator<int>& rng,
ceph::mutex& lock,
ceph::condition_variable& cond,
bool dryrun,
bool verbose,
- bool has_seqseed,
- int seqseed) :
- rng(rng), verbose(verbose), has_seqseed(has_seqseed), seqseed(seqseed)
+ std::optional<int> seqseed) :
+ rng(rng), verbose(verbose), seqseed(seqseed)
{
if (dryrun) {
verbose = true;
exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid,
- sbs.choose(),
- rng());
+ sbs.choose(),
+ rng());
} else {
const std::string pool = spo.choose();
int threads = snt.choose();
exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados,
- asio,
- pool,
- oid,
- sbs.choose(),
- rng(),
- threads,
- lock,
- cond);
+ asio,
+ pool,
+ oid,
+ sbs.choose(),
+ rng(),
+ threads,
+ lock,
+ cond);
dout(0) << "= " << oid << " pool=" << pool
<< " threads=" << threads
<< " blocksize=" << exerciser_model->get_block_size()
@@ -271,14 +448,12 @@ ceph::io_sequence::tester::TestObject::TestObject( const std::string oid,
curseq = seq_range.first;
seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
obj_size_range,
- has_seqseed ?
- seqseed :
- rng());
+ seqseed.value_or(rng()));
op = seq->next();
done = false;
- dout(0) << "== " << exerciser_model->get_oid() << " "
- << curseq << " "
- << seq->get_name()
+ dout(0) << "== " << exerciser_model->get_oid() << " "
+ << curseq << " "
+ << seq->get_name()
<< " ==" <<dendl;
}
@@ -291,12 +466,12 @@ bool ceph::io_sequence::tester::TestObject::next()
{
if (!done) {
if (verbose) {
- dout(0) << exerciser_model->get_oid()
+ dout(0) << exerciser_model->get_oid()
<< " Step " << seq->get_step() << ": "
<< op->to_string(exerciser_model->get_block_size()) << dendl;
} else {
- dout(5) << exerciser_model->get_oid()
- << " Step " << seq->get_step() << ": "
+ dout(5) << exerciser_model->get_oid()
+ << " Step " << seq->get_step() << ": "
<< op->to_string(exerciser_model->get_block_size()) << dendl;
}
exerciser_model->applyIoOp(*op);
@@ -310,10 +485,8 @@ bool ceph::io_sequence::tester::TestObject::next()
} else {
seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
obj_size_range,
- has_seqseed ?
- seqseed :
- rng());
- dout(0) << "== " << exerciser_model->get_oid() << " "
+ seqseed.value_or(rng()));
+ dout(0) << "== " << exerciser_model->get_oid() << " "
<< curseq << " " << seq->get_name()
<< " ==" <<dendl;
op = seq->next();
@@ -335,97 +508,232 @@ int ceph::io_sequence::tester::TestObject::get_num_io()
return exerciser_model->get_num_io();
}
-struct Size {};
-void validate(boost::any& v, const std::vector<std::string>& values,
- Size *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
+ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm,
+ librados::Rados& rados) :
+ rados(rados),
+ seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)),
+ rng(ceph::util::random_number_generator<int>(seed)),
+ sbs{rng, vm},
+ sos{rng, vm},
+ spo{rng, vm, rados, vm.contains("dryrun")},
+ snt{rng, vm},
+ ssr{rng, vm}
+{
+ dout(0) << "Test using seed " << seed << dendl;
- std::string parse_error;
- uint64_t size = strict_iecstrtoll(s, &parse_error);
- if (!parse_error.empty()) {
- throw po::validation_error(po::validation_error::invalid_option_value);
+ verbose = vm.contains("verbose");
+ dryrun = vm.contains("dryrun");
+
+ seqseed = std::nullopt;
+ if (vm.contains("seqseed")) {
+ seqseed = vm["seqseed"].as<int>();
+ }
+ num_objects = vm["parallel"].as<int>();
+ object_name = vm["object"].as<std::string>();
+ interactive = vm.contains("interactive");
+
+ if (!dryrun)
+ {
+ guard.emplace(boost::asio::make_work_guard(asio));
+ thread = make_named_thread("io_thread",[&asio = asio] { asio.run(); });
}
- v = boost::any(size);
+
+ show_help = vm.contains("help");
+ show_sequence = vm.contains("listsequence");
}
-struct Pair {};
-void validate(boost::any& v, const std::vector<std::string>& values,
- Pair *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
- auto part = ceph::split(s).begin();
+ceph::io_sequence::tester::TestRunner::~TestRunner()
+{
+ if (!dryrun) {
+ guard = std::nullopt;
+ asio.stop();
+ thread.join();
+ rados.shutdown();
+ }
+}
+
+void ceph::io_sequence::tester::TestRunner::help()
+{
+ std::cout << get_options_description() << std::endl;
+ for (auto line : usage) {
+ std::cout << line << std::endl;
+ }
+}
+
+void ceph::io_sequence::tester::TestRunner::list_sequence()
+{
+ // List sequences
+ std::pair<int,int> obj_size_range = sos.choose();
+ for (ceph::io_exerciser::Sequence s
+ = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN;
+ s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) {
+ std::unique_ptr<ceph::io_exerciser::IoSequence> seq =
+ ceph::io_exerciser::IoSequence::generate_sequence(s,
+ obj_size_range,
+ seqseed.value_or(rng()));
+ dout(0) << s << " " << seq->get_name() << dendl;
+ }
+}
+
+std::string ceph::io_sequence::tester::TestRunner::get_token()
+{
+ static std::string line;
+ static ceph::split split = ceph::split("");
+ static ceph::spliterator tokens;
+ while (line.empty() || tokens == split.end()) {
+ if (!std::getline(std::cin, line)) {
+ throw std::runtime_error("End of input");
+ }
+ split = ceph::split(line);
+ tokens = split.begin();
+ }
+ return std::string(*tokens++);
+}
+
+uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token()
+{
std::string parse_error;
- int first = strict_iecstrtoll(*part++, &parse_error);
- int second = strict_iecstrtoll(*part, &parse_error);
+ std::string token = get_token();
+ uint64_t num = strict_iecstrtoll(token, &parse_error);
if (!parse_error.empty()) {
- throw po::validation_error(po::validation_error::invalid_option_value);
+ throw std::runtime_error("Invalid number "+token);
}
- v = boost::any(std::pair<int,int>{first,second});
+ return num;
}
-struct PluginString {};
-void validate(boost::any& v, const std::vector<std::string>& values,
- PluginString *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
-
- const std::string_view* pluginIt = std::find(
- ceph::io_sequence::tester::pluginChoices.begin(),
- ceph::io_sequence::tester::pluginChoices.end(),
- s
- );
- if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt)
+bool ceph::io_sequence::tester::TestRunner::run_test()
+{
+ if (show_help)
{
- throw po::validation_error(po::validation_error::invalid_option_value);
+ help();
+ return true;
+ }
+ else if (show_sequence)
+ {
+ list_sequence();
+ return true;
+ }
+ else if (interactive)
+ {
+ return run_interactive_test();
+ }
+ else
+ {
+ return run_automated_test();
}
-
- v = boost::any(*pluginIt);
}
-int parse_io_seq_options(
- po::variables_map& vm,
- const po::options_description& desc,
- int argc,
- char** argv)
-{
- std::vector<std::string> unrecognized_options;
- try {
- auto parsed = po::command_line_parser(argc, argv)
- .options(desc)
- .allow_unregistered()
- .run();
- po::store(parsed, vm);
- po::notify(vm);
- unrecognized_options = po::collect_unrecognized(parsed.options,
- po::include_positional);
-
- if (!unrecognized_options.empty())
- {
- std::stringstream ss;
- ss << "Unrecognised command options supplied: ";
- while (unrecognized_options.size() > 1)
- {
- ss << unrecognized_options.back().c_str() << ", ";
- unrecognized_options.pop_back();
- }
- ss << unrecognized_options.back();
- dout(0) << ss.str() << dendl;
- return 1;
+bool ceph::io_sequence::tester::TestRunner::run_interactive_test()
+{
+ bool done = false;
+ std::unique_ptr<ceph::io_exerciser::IoOp> ioop;
+ std::unique_ptr<ceph::io_exerciser::Model> model;
+
+ if (dryrun) {
+ model = std::make_unique<ceph::io_exerciser::ObjectModel>(object_name,
+ sbs.choose(),
+ rng());
+ } else {
+ const std::string pool = spo.choose();
+ model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, asio, pool,
+ object_name, sbs.choose(),
+ rng(), 1, // 1 thread
+ lock, cond);
+ }
+
+ while (!done) {
+ const std::string op = get_token();
+ if (!op.compare("done") || !op.compare("q") || !op.compare("quit")) {
+ ioop = ceph::io_exerciser::IoOp::generate_done();
+ } else if (!op.compare("create")) {
+ ioop = ceph::io_exerciser::IoOp::generate_create(get_numeric_token());
+ } else if (!op.compare("remove") || !op.compare("delete")) {
+ ioop = ceph::io_exerciser::IoOp::generate_remove();
+ } else if (!op.compare("read")) {
+ uint64_t offset = get_numeric_token();
+ uint64_t length = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_read(offset, length);
+ } else if (!op.compare("read2")) {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_read2(offset1, length1,
+ offset2, length2);
+ } else if (!op.compare("read3")) {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ uint64_t offset3 = get_numeric_token();
+ uint64_t length3 = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_read3(offset1, length1,
+ offset2, length2,
+ offset3, length3);
+ } else if (!op.compare("write")) {
+ uint64_t offset = get_numeric_token();
+ uint64_t length = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_write(offset, length);
+ } else if (!op.compare("write2")) {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_write2(offset1, length1,
+ offset2, length2);
+ } else if (!op.compare("write3")) {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ uint64_t offset3 = get_numeric_token();
+ uint64_t length3 = get_numeric_token();
+ ioop = ceph::io_exerciser::IoOp::generate_write3(offset1, length1,
+ offset2, length2,
+ offset3, length3);
+ } else {
+ throw std::runtime_error("Invalid operation "+op);
+ }
+ dout(0) << ioop->to_string(model->get_block_size()) << dendl;
+ model->applyIoOp(*ioop);
+ done = ioop->done();
+ if (!done) {
+ ioop = ceph::io_exerciser::IoOp::generate_barrier();
+ model->applyIoOp(*ioop);
}
- } catch(const po::error& e) {
- std::cerr << "error: " << e.what() << std::endl;
- return 1;
}
- return 0;
+ return true;
}
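
The interactive loop above reads whitespace-separated tokens from stdin, with offsets and lengths expressed in units of the block size. Below is a reduced standalone sketch of that token protocol; it only echoes the parsed operations instead of building ceph::io_exerciser::IoOp objects.

// Reduced model of the interactive command protocol: whitespace-
// separated tokens, offsets/lengths in block-size units.
#include <cstdint>
#include <iostream>
#include <string>

int main() {
  std::string op;
  while (std::cin >> op) {
    if (op == "done" || op == "q" || op == "quit") {
      break;
    } else if (op == "create") {
      uint64_t len = 0;
      std::cin >> len;
      std::cout << "create, length " << len << " blocks\n";
    } else if (op == "remove" || op == "delete") {
      std::cout << "remove\n";
    } else if (op == "read" || op == "write") {
      uint64_t off = 0, len = 0;
      std::cin >> off >> len;
      std::cout << op << " " << len << " blocks at offset " << off << "\n";
    } else {
      std::cerr << "Invalid operation " << op << "\n";
      return 1;
    }
  }
  return 0;
}

Feeding it, for example, "create 4", "write 0 2", "read 1 1" and "done" walks through the same command names the real loop dispatches on.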
-void run_test(const std::vector<
- std::shared_ptr<ceph::io_sequence::tester::TestObject>
- >& test_objects,
- ceph::mutex& lock)
+bool ceph::io_sequence::tester::TestRunner::run_automated_test()
{
+ // Create a test for each object
+ std::vector<std::shared_ptr<
+ ceph::io_sequence::tester::TestObject>> test_objects;
+
+ for (int obj = 0; obj < num_objects; obj++) {
+ std::string name;
+ if (obj == 0) {
+ name = object_name;
+ } else {
+ name = object_name + std::to_string(obj);
+ }
+ test_objects.push_back(
+ std::make_shared<ceph::io_sequence::tester::TestObject>(
+ name,
+ rados, asio,
+ sbs, spo, sos, snt, ssr,
+ rng, lock, cond,
+ dryrun, verbose,
+ seqseed
+ )
+ );
+ }
+ if (!dryrun) {
+ rados.wait_for_latest_osdmap();
+ }
+
// Main loop of test - while not all test objects have finished
// check to see if any are able to start a new I/O. If all test
// objects are waiting for I/O to complete then wait on a cond
@@ -475,56 +783,8 @@ void run_test(const std::vector<
ceph_assert(to->finished());
}
dout(0) << "Total number of IOs = " << total_io << dendl;
-}
-namespace {
- constexpr std::string_view usage[] = {
- "Basic usage:",
- "",
- "ceph_test_rados_io_sequence",
- "\t Test I/O to a single object using default settings. Good for",
- "\t testing boundary conditions",
- "",
- "ceph_test_rados_io_sequence --parallel <n>",
- "\t Run parallel test to multiple objects. First object is tested with",
- "\t default settings, other objects are tested with random settings",
- "",
- "Advanced usage:",
- "",
- "ceph_test_rados_io_sequence --blocksize <b> --km <k,m> --plugin <p>",
- " --objectsize <min,max> --threads <t>",
- "ceph_test_rados_io_sequence --blocksize <b> --pool <p> --object <oid>",
- " --objectsize <min,max> --threads <t>",
- "\tCustomize the test, if a pool is specified then it defines the",
- "\tReplica/EC configuration",
- "",
- "ceph_test_rados_io_sequence --listsequence",
- "\t Display list of supported I/O sequences",
- "",
- "ceph_test_rados_io_sequence --dryrun --sequence <n>",
- "\t Show I/O that will be generated for a sequence, validate",
- "\t seqeunce has correct I/O barriers to restrict concurrency",
- "",
- "ceph_test_rados_io_sequence --seed <seed>",
- "\t Repeat a previous test with the same random numbers (seed is",
- "\t displayed at start of test), if threads = 1 then this will produce",
- "\t the exact same sequence of I/O, if threads > 1 then I/Os are issued",
- "\t in parallel so ordering might be slightly different",
- "",
- "ceph_test_rados_io_sequence --sequence <n> --seqseed <n>",
- "\t Repeat a sequence from a previous test with the same random",
- "\t numbers (seqseed is displayed at start of sequence)",
- "",
- "ceph_test_rados_io_sequence --pool <p> --object <oid> --interactive",
- "\t Execute sequence of I/O commands from stdin. Offset and length",
- "\t are specified with unit of blocksize. Supported commands:",
- "\t\t create <len>",
- "\t\t remove",
- "\t\t read|write <off> <len>",
- "\t\t read2|write2 <off> <len> <off> <len>",
- "\t\t read3|write3 <off> <len> <off> <len> <off> <len>",
- "\t\t done"
- };
+ return true;
}
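
The main loop of run_automated_test() is unchanged and mostly elided above; per its retained comment, it keeps offering new I/O to any object that can start one and waits on the condition variable when every object is blocked on completions. A self-contained sketch of that pattern, with a dummy FakeObject standing in for TestObject (all names below are illustrative):

#include <condition_variable>
#include <mutex>
#include <vector>

// Dummy stand-in for TestObject: counts down a fixed number of I/Os.
struct FakeObject {
  int remaining = 3;
  bool finished() const { return remaining == 0; }
  bool ready_for_io() const { return remaining > 0; }
  void start_io() { --remaining; }  // real code issues async I/O here
};

int main() {
  std::mutex lock;
  std::condition_variable cond;
  std::vector<FakeObject> objects(4);

  std::unique_lock l(lock);
  bool all_done = false;
  while (!all_done) {
    bool started = false;
    all_done = true;
    for (auto& o : objects) {
      if (!o.finished()) {
        all_done = false;
        if (o.ready_for_io()) {
          o.start_io();
          started = true;
        }
      }
    }
    if (!all_done && !started) {
      // Real completions signal cond; the fake objects never block,
      // this branch is only here to show the shape of the loop.
      cond.wait(l);
    }
  }
  return 0;
}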
int main(int argc, char **argv)
@@ -535,161 +795,28 @@ int main(int argc, char **argv)
CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(cct.get());
- librados::Rados rados;
- boost::asio::io_context asio;
- std::thread thread;
- std::optional<boost::asio::executor_work_guard<
- boost::asio::io_context::executor_type>> guard;
- ceph::mutex lock = ceph::make_mutex("RadosIo::lock");
- ceph::condition_variable cond;
-
- po::options_description desc("ceph_test_rados_io options");
-
- desc.add_options()
- ("help,h",
- "show help message")
- ("listsequence,l",
- "show list of sequences")
- ("dryrun,d",
- "test sequence, do not issue any I/O")
- ("verbose",
- "more verbose output during test")
- ("sequence,s", po::value<int>(),
- "test specified sequence")
- ("seed", po::value<int>(),
- "seed for whole test")
- ("seqseed", po::value<int>(),
- "seed for sequence")
- ("blocksize,b", po::value<Size>(),
- "block size (default 2048)")
- ("chunksize,c", po::value<Size>(),
- "chunk size (default 4096)")
- ("pool,p", po::value<std::string>(),
- "pool name")
- ("km", po::value<Pair>(),
- "k,m EC pool profile (default 2,2)")
- ("plugin", po::value<PluginString>(),
- "EC plugin (isa or jerasure)")
- ("objectsize", po::value<Pair>(),
- "min,max object size in blocks (default 1,32)")
- ("threads,t", po::value<int>(),
- "number of threads of I/O per object (default 1)")
- ("objects,o", po::value<int>()->default_value(1),
- "number of objects to exercise in parallel");
-
po::variables_map vm;
- int rc = parse_io_seq_options(vm, desc, argc, argv);
+ int rc = parse_io_seq_options(vm, argc, argv);
if (rc != 0)
{
return rc;
}
- if (vm.count("help")) {
- std::cout << desc << std::endl;
- for (auto line : usage) {
- std::cout << line << std::endl;
- }
- return 0;
- }
-
- // Seed
- int seed = time(nullptr);
- if (vm.count("seed")) {
- seed = vm["seed"].as<int>();
- }
- dout(0) << "Test using seed " << seed << dendl;
- auto rng = ceph::util::random_number_generator<int>(seed);
-
- bool verbose = vm.count("verbose");
- bool dryrun = vm.count("dryrun");
- bool has_seqseed = vm.count("seqseed");
- int seqseed = 0;
- if (has_seqseed) {
- seqseed = vm["seqseed"].as<int>();
- }
- int num_objects = vm["objects"].as<int>();
-
- if (!dryrun) {
+ librados::Rados rados;
+ if (!vm.contains("dryrun")) {
rc = rados.init_with_context(g_ceph_context);
ceph_assert(rc == 0);
rc = rados.connect();
ceph_assert(rc == 0);
+ }
- guard.emplace(boost::asio::make_work_guard(asio));
- thread = make_named_thread("io_thread",[&asio] { asio.run(); });
- }
-
- // Select block size
- std::unique_ptr<ceph::io_sequence::tester::SelectBlockSize> sbs
- = std::make_unique<ceph::io_sequence::tester::SelectBlockSize>(rng, vm);
-
- // Select pool
- std::unique_ptr<ceph::io_sequence::tester::SelectECPool> spo
- = std::make_unique<ceph::io_sequence::tester::SelectECPool>(rng, vm,
- rados,
- dryrun);
-
- // Select object size range
- std::unique_ptr<ceph::io_sequence::tester::SelectObjectSize> sos
- = std::make_unique<ceph::io_sequence::tester::SelectObjectSize>(rng,
- vm);
-
- // Select number of threads
- std::unique_ptr<ceph::io_sequence::tester::SelectNumThreads> snt =
- std::make_unique<ceph::io_sequence::tester::SelectNumThreads>(rng, vm);
-
- // Select range of sequences
- std::unique_ptr<ceph::io_sequence::tester::SelectSeqRange> ssr;
+ std::unique_ptr<ceph::io_sequence::tester::TestRunner> runner;
try {
- ssr = std::make_unique<ceph::io_sequence::tester::SelectSeqRange>(rng, vm);
+ runner = std::make_unique<ceph::io_sequence::tester::TestRunner>(vm, rados);
} catch(const po::error& e) {
return 1;
}
+ runner->run_test();
- // List seqeunces
- if (vm.count("listsequence")) {
- std::pair<int,int> obj_size_range = sos->choose();
- for (ceph::io_exerciser::Sequence s
- = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN;
- s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) {
- std::unique_ptr<ceph::io_exerciser::IoSequence> seq =
- ceph::io_exerciser::IoSequence::generate_sequence(s,
- obj_size_range,
- has_seqseed ?
- seqseed :
- rng());
- dout(0) << s << " " << seq->get_name() << dendl;
- }
- return 0;
- }
-
- // Create a test for each object
- std::vector<std::shared_ptr<
- ceph::io_sequence::tester::TestObject>> test_objects;
-
- for (int obj = 0; obj < num_objects; obj++) {
- test_objects.push_back(
- std::make_shared<ceph::io_sequence::tester::TestObject>(
- "test" + std::to_string(obj),
- rados, asio,
- *sbs, *spo, *sos, *snt, *ssr,
- rng, lock, cond,
- dryrun, verbose,
- has_seqseed, seqseed
- )
- );
- }
- if (!dryrun) {
- rados.wait_for_latest_osdmap();
- }
-
- run_test(test_objects, lock);
-
- if (!dryrun) {
- guard = std::nullopt;
- asio.stop();
- thread.join();
- rados.shutdown();
- }
return 0;
}
diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h
index 3a84b7bc824..4f77c940274 100644
--- a/src/test/osd/ceph_test_rados_io_sequence.h
+++ b/src/test/osd/ceph_test_rados_io_sequence.h
@@ -1,5 +1,7 @@
#include <utility>
+#include "include/random.h"
+
#include "global/global_init.h"
#include "global/global_context.h"
@@ -267,8 +269,7 @@ namespace ceph
ceph::condition_variable& cond,
bool dryrun,
bool verbose,
- bool has_seqseed,
- int seqseed);
+ std::optional<int> seqseed);
int get_num_io();
bool readyForIo();
@@ -286,8 +287,57 @@ namespace ceph
bool done;
ceph::util::random_number_generator<int>& rng;
bool verbose;
- bool has_seqseed;
- int seqseed;
+ std::optional<int> seqseed;
+ };
+
+ class TestRunner
+ {
+ public:
+ TestRunner(po::variables_map& vm, librados::Rados& rados);
+ ~TestRunner();
+
+ bool run_test();
+
+ private:
+ librados::Rados& rados;
+ int seed;
+ ceph::util::random_number_generator<int> rng;
+
+ ceph::io_sequence::tester::SelectBlockSize sbs;
+ ceph::io_sequence::tester::SelectObjectSize sos;
+ ceph::io_sequence::tester::SelectECPool spo;
+ ceph::io_sequence::tester::SelectNumThreads snt;
+ ceph::io_sequence::tester::SelectSeqRange ssr;
+
+ boost::asio::io_context asio;
+ std::thread thread;
+ std::optional<boost::asio::executor_work_guard<
+ boost::asio::io_context::executor_type>> guard;
+ ceph::mutex lock = ceph::make_mutex("RadosIo::lock");
+ ceph::condition_variable cond;
+
+ bool input_valid;
+
+ bool verbose;
+ bool dryrun;
+ std::optional<int> seqseed;
+ bool interactive;
+
+ bool show_sequence;
+ bool show_help;
+
+ int num_objects;
+ std::string object_name;
+
+ std::string get_token();
+ uint64_t get_numeric_token();
+
+ bool run_automated_test();
+
+ bool run_interactive_test();
+
+ void help();
+ void list_sequence();
};
}
}
\ No newline at end of file
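
One small interface change above is worth calling out: the (has_seqseed, seqseed) pair on TestObject becomes a single std::optional<int>. A minimal sketch of the equivalent call-site pattern, where rng stands in for the test's random number generator:

#include <optional>
#include <random>

int main() {
  std::mt19937 gen{std::random_device{}()};
  auto rng = [&]() { return static_cast<int>(gen()); };

  std::optional<int> seqseed;  // replaces: bool has_seqseed + int seqseed
  int seed_for_sequence = seqseed ? *seqseed : rng();
  // Note: seqseed.value_or(rng()) reads the same but always calls
  // rng(), consuming a random number even when seqseed is set.
  (void)seed_for_sequence;
  return 0;
}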
diff --git a/src/test/rgw/bucket_notification/test_bn.py b/src/test/rgw/bucket_notification/test_bn.py
index 642ab6955a4..359990b3531 100644
--- a/src/test/rgw/bucket_notification/test_bn.py
+++ b/src/test/rgw/bucket_notification/test_bn.py
@@ -711,19 +711,16 @@ def test_ps_s3_topic_on_master():
assert_equal(status, 404)
# get the remaining 2 topics
- result, status = topic_conf1.get_list()
- assert_equal(status, 200)
- assert_equal(len(result['ListTopicsResponse']['ListTopicsResult']['Topics']['member']), 2)
+ list_topics(2, tenant)
# delete topics
- result = topic_conf2.del_config()
+ status = topic_conf2.del_config()
assert_equal(status, 200)
- result = topic_conf3.del_config()
+ status = topic_conf3.del_config()
assert_equal(status, 200)
# get topic list, make sure it is empty
- result, status = topic_conf1.get_list()
- assert_equal(result['ListTopicsResponse']['ListTopicsResult']['Topics'], None)
+ list_topics(0, tenant)
@attr('basic_test')
diff --git a/src/test/rgw/test_rgw_lua.cc b/src/test/rgw/test_rgw_lua.cc
index b2e11e442a2..ad923023a6d 100644
--- a/src/test/rgw/test_rgw_lua.cc
+++ b/src/test/rgw/test_rgw_lua.cc
@@ -9,6 +9,7 @@
#include "rgw_lua_background.h"
#include "rgw_lua_data_filter.h"
#include "rgw_sal_config.h"
+#include "rgw_perf_counters.h"
using namespace std;
using namespace rgw;
@@ -184,9 +185,51 @@ inline std::unique_ptr<sal::RadosStore> make_store() {
return std::make_unique<StoreBundle>(std::move(context_pool));
};
+class TestLuaManager : public rgw::sal::StoreLuaManager {
+ public:
+ std::string lua_script;
+ unsigned read_time = 0;
+ TestLuaManager() {
+ rgw_perf_start(g_cct);
+ }
+ int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override {
+ std::this_thread::sleep_for(std::chrono::seconds(read_time));
+ script = lua_script;
+ return 0;
+ }
+ int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override {
+ return 0;
+ }
+ int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override {
+ return 0;
+ }
+ int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override {
+ return 0;
+ }
+ int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override {
+ return 0;
+ }
+ int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override {
+ return 0;
+ }
+ int reload_packages(const DoutPrefixProvider* dpp, optional_yield y) override {
+ return 0;
+ }
+ ~TestLuaManager() {
+ rgw_perf_stop(g_cct);
+ }
+};
+
+void set_script(rgw::sal::LuaManager* manager, const std::string& script) {
+ static_cast<TestLuaManager*>(manager)->lua_script = script;
+}
+void set_read_time(rgw::sal::LuaManager* manager, unsigned read_time) {
+ static_cast<TestLuaManager*>(manager)->read_time = read_time;
+}
+
#define DEFINE_REQ_STATE RGWProcessEnv pe; \
auto store = make_store(); \
- pe.lua.manager = store->get_lua_manager(""); \
+ pe.lua.manager = std::make_unique<TestLuaManager>(); \
RGWEnv e; \
req_state s(g_cct, pe, &e, 0);
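
TestLuaManager above replaces the per-test script/read_time plumbing that used to live in TestBackground: the fake manager owns the script and an adjustable read delay, and set_script()/set_read_time() configure it from the tests. A standalone sketch of that fixture style under illustrative names (FakeScriptStore is not part of the patch):

// Standalone illustration of the pattern: a fake backend whose
// stored script and read latency are settable from the test, so the
// test controls timing instead of relying on fixed sleeps.
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

struct FakeScriptStore {
  std::string script;
  std::chrono::seconds read_delay{0};
  std::string get_script() const {
    std::this_thread::sleep_for(read_delay);  // simulate a slow read
    return script;
  }
};

int main() {
  FakeScriptStore store;
  store.script = "RGW[\"hello\"] = \"world\"";  // placeholder script
  store.read_delay = std::chrono::seconds(0);
  std::cout << store.get_script() << "\n";
  return 0;
}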
@@ -850,24 +893,12 @@ TEST(TestRGWLua, OpsLog)
}
class TestBackground : public rgw::lua::Background {
- const unsigned read_time;
-
-protected:
- int read_script() override {
- // don't read the object from the store
- std::this_thread::sleep_for(std::chrono::seconds(read_time));
- return 0;
- }
-
public:
- TestBackground(sal::RadosStore* store, const std::string& script, rgw::sal::LuaManager* manager, unsigned read_time = 0) :
+ TestBackground(sal::RadosStore* store, rgw::sal::LuaManager* manager) :
rgw::lua::Background(store,
g_cct,
manager,
- 1 /* run every second */),
- read_time(read_time) {
- // the script is passed in the constructor
- rgw_script = script;
+ 1 /* run every second */) {
}
~TestBackground() override {
@@ -878,20 +909,19 @@ public:
TEST(TestRGWLuaBackground, Start)
{
auto store = make_store();
- auto manager = store->get_lua_manager("");
+ auto manager = std::make_unique<TestLuaManager>();
{
// ctr and dtor without running
- TestBackground lua_background(store.get(), "", manager.get());
+ TestBackground lua_background(store.get(), manager.get());
}
{
// ctr and dtor with running
- TestBackground lua_background(store.get(), "", manager.get());
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
}
}
-
-constexpr auto wait_time = std::chrono::seconds(3);
+constexpr auto wait_time = std::chrono::milliseconds(100);
template<typename T>
const T& get_table_value(const TestBackground& b, const std::string& index) {
@@ -903,6 +933,15 @@ const T& get_table_value(const TestBackground& b, const std::string& index) {
}
}
+#define WAIT_FOR_BACKGROUND \
+{ \
+ unsigned max_tries = 100; \
+ do { \
+ std::this_thread::sleep_for(wait_time); \
+ --max_tries; \
+ } while (perfcounter->get(l_rgw_lua_script_ok) + perfcounter->get(l_rgw_lua_script_fail) == 0 && max_tries > 0); \
+}
+
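
WAIT_FOR_BACKGROUND swaps the old fixed sleeps for polling on the Lua script perf counters, bounded by max_tries. The same idea as a generic standalone helper; wait_until() is a hypothetical name, not part of the patch:

#include <atomic>
#include <chrono>
#include <functional>
#include <thread>

// Poll a predicate with short sleeps instead of one long fixed sleep;
// give up after max_tries so a broken test fails quickly.
static bool wait_until(const std::function<bool()>& pred,
                       std::chrono::milliseconds step = std::chrono::milliseconds(100),
                       unsigned max_tries = 100) {
  for (unsigned i = 0; i < max_tries; ++i) {
    if (pred()) {
      return true;
    }
    std::this_thread::sleep_for(step);
  }
  return pred();
}

int main() {
  std::atomic<bool> done{false};
  std::thread worker([&] {
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    done = true;
  });
  bool ok = wait_until([&] { return done.load(); });
  worker.join();
  return ok ? 0 : 1;
}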
TEST(TestRGWLuaBackground, Script)
{
const std::string script = R"(
@@ -912,10 +951,11 @@ TEST(TestRGWLuaBackground, Script)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world");
}
@@ -928,9 +968,10 @@ TEST(TestRGWLuaBackground, RequestScript)
)";
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), background_script, pe.lua.manager.get());
+ set_script(pe.lua.manager.get(), background_script);
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const std::string request_script = R"(
local key = "hello"
@@ -947,8 +988,9 @@ TEST(TestRGWLuaBackground, RequestScript)
ASSERT_EQ(rc, 0);
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from request");
// now we resume and let the background set the value
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from background");
}
@@ -965,14 +1007,16 @@ TEST(TestRGWLuaBackground, Pause)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// no change in len
EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size());
}
@@ -991,15 +1035,17 @@ TEST(TestRGWLuaBackground, PauseWhileReading)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get(), 2);
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ set_read_time(manager.get(), 2);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- constexpr auto long_wait_time = std::chrono::seconds(6);
- std::this_thread::sleep_for(long_wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(long_wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// one execution might occur after pause
EXPECT_TRUE(value_len + 1 >= get_table_value<std::string>(lua_background, "hello").size());
}
@@ -1013,14 +1059,16 @@ TEST(TestRGWLuaBackground, ReadWhilePaused)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.pause();
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "");
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world");
}
@@ -1037,18 +1085,21 @@ TEST(TestRGWLuaBackground, PauseResume)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.pause();
- std::this_thread::sleep_for(wait_time);
+ // make sure no execution occurs
+ std::this_thread::sleep_for(wait_time*10);
// no change in len
EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size());
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.resume(store.get());
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
// should be a change in len
EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len);
}
@@ -1066,18 +1117,19 @@ TEST(TestRGWLuaBackground, MultipleStarts)
)";
auto store = make_store();
- auto manager = store->get_lua_manager("");
- TestBackground lua_background(store.get(), script, manager.get());
+ auto manager = std::make_unique<TestLuaManager>();
+ set_script(manager.get(), script);
+ TestBackground lua_background(store.get(), manager.get());
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
const auto value_len = get_table_value<std::string>(lua_background, "hello").size();
EXPECT_GT(value_len, 0);
lua_background.start();
lua_background.shutdown();
lua_background.shutdown();
- std::this_thread::sleep_for(wait_time);
+ perfcounter->set(l_rgw_lua_script_ok, 0);
lua_background.start();
- std::this_thread::sleep_for(wait_time);
+ WAIT_FOR_BACKGROUND;
// should be a change in len
EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len);
}
@@ -1085,7 +1137,7 @@ TEST(TestRGWLuaBackground, MultipleStarts)
TEST(TestRGWLuaBackground, TableValues)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1107,7 +1159,7 @@ TEST(TestRGWLuaBackground, TableValues)
TEST(TestRGWLuaBackground, TablePersist)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1137,7 +1189,7 @@ TEST(TestRGWLuaBackground, TablePersist)
TEST(TestRGWLuaBackground, TableValuesFromRequest)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
const std::string request_script = R"(
@@ -1165,7 +1217,7 @@ TEST(TestRGWLuaBackground, TableValuesFromRequest)
TEST(TestRGWLuaBackground, TableInvalidValue)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
lua_background.start();
const std::string request_script = R"(
@@ -1191,7 +1243,7 @@ TEST(TestRGWLuaBackground, TableInvalidValue)
TEST(TestRGWLuaBackground, TableErase)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
RGW["size"] = 0
@@ -1229,7 +1281,7 @@ TEST(TestRGWLuaBackground, TableErase)
TEST(TestRGWLuaBackground, TableIterate)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = "string value"
@@ -1256,7 +1308,7 @@ TEST(TestRGWLuaBackground, TableIterate)
TEST(TestRGWLuaBackground, TableIterateWrite)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["a"] = 1
@@ -1286,7 +1338,7 @@ TEST(TestRGWLuaBackground, TableIterateWrite)
TEST(TestRGWLuaBackground, TableIncrement)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1306,7 +1358,7 @@ TEST(TestRGWLuaBackground, TableIncrement)
TEST(TestRGWLuaBackground, TableIncrementBy)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1328,7 +1380,7 @@ TEST(TestRGWLuaBackground, TableIncrementBy)
TEST(TestRGWLuaBackground, TableDecrement)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1348,7 +1400,7 @@ TEST(TestRGWLuaBackground, TableDecrement)
TEST(TestRGWLuaBackground, TableDecrementBy)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
const std::string request_script = R"(
RGW["key1"] = 42
@@ -1370,7 +1422,7 @@ TEST(TestRGWLuaBackground, TableDecrementBy)
TEST(TestRGWLuaBackground, TableIncrementValueError)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
-- cannot increment string values
@@ -1405,7 +1457,7 @@ TEST(TestRGWLuaBackground, TableIncrementValueError)
TEST(TestRGWLuaBackground, TableIncrementError)
{
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
std::string request_script = R"(
-- missing argument
@@ -1494,7 +1546,7 @@ TEST(TestRGWLua, Data)
)";
DEFINE_REQ_STATE;
- TestBackground lua_background(store.get(), "", pe.lua.manager.get());
+ TestBackground lua_background(store.get(), pe.lua.manager.get());
s.host_id = "foo";
pe.lua.background = &lua_background;
lua::RGWObjFilter filter(&s, script);
diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell
index 9df0f900604..f95a4afd057 100755
--- a/src/tools/cephfs/shell/cephfs-shell
+++ b/src/tools/cephfs/shell/cephfs-shell
@@ -15,14 +15,22 @@ import re
import shlex
import stat
import errno
+import distro
from cmd2 import Cmd
from cmd2 import __version__ as cmd2_version
from packaging.version import Version
+# DFLAG overrides the cmd2 version checks done by cephfs-shell.
+# It is needed because on Ubuntu 22.04 the reported cmd2 version
+# is always "0.0.0" rather than the actual version, which breaks
+# the version comparisons below.
+DFLAG = False
+if distro.name() == "Ubuntu" and distro.version() == "22.04":
+ DFLAG = True
# XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of
# Cmd2ArgparseError
-if Version(cmd2_version) >= Version("1.0.1"):
+if Version(cmd2_version) >= Version("1.0.1") or DFLAG is True:
from cmd2.exceptions import Cmd2ArgparseError
else:
# HACK: so that we don't have check for version everywhere
@@ -1700,7 +1708,7 @@ def read_shell_conf(shell, shell_conf_file):
sec = 'cephfs-shell'
opts = []
- if Version(cmd2_version) >= Version("0.10.0"):
+ if Version(cmd2_version) >= Version("0.10.0") or DFLAG is True:
for attr in shell.settables.keys():
opts.append(attr)
else:
@@ -1768,7 +1776,7 @@ def manage_args():
args.exe_and_quit = False # Execute and quit, don't launch the shell.
if args.batch:
- if Version(cmd2_version) <= Version("0.9.13"):
+ if Version(cmd2_version) <= Version("0.9.13") and DFLAG is not True:
args.commands = ['load ' + args.batch, ',quit']
else:
args.commands = ['run_script ' + args.batch, ',quit']
@@ -1813,7 +1821,7 @@ def execute_cmds_and_quit(args):
# value to indicate whether the execution of the commands should stop, but
# since 0.9.7 it returns the return value of do_* methods only if it's
# not None. When it is None it returns False instead of None.
- if Version(cmd2_version) <= Version("0.9.6"):
+ if Version(cmd2_version) <= Version("0.9.6") and DFLAG is not True:
stop_exec_val = None
else:
stop_exec_val = False